In [59]:
import pandas as pd
import spacy
from tqdm import tqdm
from sklearn.metrics import classification_report
tqdm.pandas()
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin
from sklearn.metrics import f1_score


In [29]:
df = pd.read_csv('../raw_data/data_prep.csv')

In [19]:
categories = df['category'].unique()

In [20]:
categories

array(['BB', 'AAA', 'AA', 'A', 'BBB', 'B', 'C'], dtype=object)

In [44]:
ratings = df['rating'].unique()

In [45]:
syms = ['A', 'B']

In [46]:
sims_a = ['AAA', 'AA+', 'A', 'A-', 'A+', 'AA', 'AA-']

In [47]:
sims_b = ['BB', 'BB+', 'BBB', 'BBB+', 'B-', 'B+', 'BB-', 'B', 'BBB-']

In [48]:
train, dev = train_test_split(df, test_size=0.2, random_state=42)


In [49]:
train.shape, dev.shape

((960, 8), (240, 8))

In [50]:
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)

In [33]:
nlp_cat = spacy.load('../cat/cat_model/model-best/')

In [34]:
nlp_sym = spacy.load('../sym/sym_model/model-best/')

In [35]:
nlp_sub_a = spacy.load('../subsym/subsym_a/subsym_model/model-best/')

In [36]:
nlp_sub_b = spacy.load('../subsym/subsym_b/subsym_model/model-best/')

In [37]:
def get_prection(text, model):
    doc = model(text)
    scores = doc.cats
    return max(scores, key=scores.get)

In [38]:
def get_prection_rating(row, model1, model2):
    if row['sym'] == 'A':     
        doc = model1(row['tokenized_str'])
        scores = doc.cats
        return max(scores, key=scores.get)
    else:
        doc = model2(row['tokenized_str'])
        scores = doc.cats
        return max(scores, key=scores.get)

In [51]:
dev['y'] = dev['clear_text'].apply(lambda x: get_prection(x, nlp_cat))

In [52]:
dev['sym'] = dev['clear_text'].apply(lambda x: get_prection(x, nlp_sym))

In [53]:
dev['rating_y'] = dev.apply(lambda x: get_prection_rating(x, nlp_sub_a, nlp_sub_b), axis=1)

In [66]:
dev['rating_y'].unique()

array(['B+', 'A+', 'AA-', 'BB-', 'A-', 'BB', 'AAA', 'AA+', 'BBB+', 'BBB',
       'B-', 'AA', 'BBB-', 'A', 'BB+', 'B'], dtype=object)

In [70]:
dev.query('rating_y == "C"')

Unnamed: 0.1,Unnamed: 0,Id,pr_txt,category,rating,clear_text,tokenized,tokenized_str,y,sym,rating_y


In [56]:
print(classification_report(dev['category'], dev['y'], target_names=categories))

              precision    recall  f1-score   support

          BB       0.75      0.92      0.83        83
         AAA       0.79      0.71      0.75        38
          AA       0.91      0.91      0.91        32
           A       1.00      0.86      0.92         7
         BBB       0.77      0.87      0.82        23
           B       0.92      0.67      0.77        54
           C       1.00      0.67      0.80         3

    accuracy                           0.82       240
   macro avg       0.88      0.80      0.83       240
weighted avg       0.83      0.82      0.81       240



In [69]:
print(classification_report(dev['rating'], dev['rating_y'], target_names=ratings))

              precision    recall  f1-score   support

          BB       1.00      1.00      1.00        18
         AAA       0.93      1.00      0.97        28
         AA+       0.90      1.00      0.95        37
           A       1.00      1.00      1.00        13
         BB+       1.00      1.00      1.00        13
          A-       1.00      1.00      1.00        12
          A+       1.00      1.00      1.00        32
         BBB       1.00      1.00      1.00         2
        BBB+       1.00      1.00      1.00         4
          B-       1.00      1.00      1.00         1
          AA       1.00      0.89      0.94         9
          B+       0.90      1.00      0.95         9
         BB-       1.00      1.00      1.00         5
         AA-       0.95      1.00      0.97        19
           C       1.00      0.83      0.90        23
           B       1.00      1.00      1.00        12
        BBB-       0.00      0.00      0.00         3

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
(f1_score(dev['category'], dev['y'], average='weighted') * 0.35) + (f1_score(dev['rating'], dev['rating_y'], average='weighted') * 0.65)

0.9092693461243028

0.6241524661891374