In [35]:
import pandas as pd
import spacy

from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin

In [36]:
# Read the prepared dataset (CSV) from the raw_data directory four levels up.
data_prep_path = '../../../../raw_data/data_prep.csv'
df = pd.read_csv(data_prep_path)

In [37]:
# Hold out 10% of the rows as the dev set; the fixed seed keeps the split
# reproducible between runs.
train, dev = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [38]:
# Display (rows, columns) of the train and dev splits.
(train.shape, dev.shape)

((1080, 8), (120, 8))

In [39]:
# Show the distinct values present in the `rating` column of the full dataset.
df['rating'].unique()

array(['A', 'BB', 'AAA', 'BBB', 'AA+', 'BB+', 'BB-', 'A-', 'A+', 'B',
       'AA-', 'BBB+', 'BBB-', 'B-', 'AA', 'B+', 'C'], dtype=object)

In [40]:
# Rating labels kept for classification; rows with any other rating are
# filtered out of both splits below. Order matters: it fixes the key order of
# every doc.cats dict built in create_docbin.
categories = [
    'C',
    'BBB', 'BBB+', 'BBB-',
    'BB', 'BB-', 'BB+',
]

In [41]:
# Keep only the dev rows whose rating is one of the selected categories.
dev_keep_mask = dev['rating'].isin(categories)
dev = dev[dev_keep_mask]

In [42]:
# Keep only the train rows whose rating is one of the selected categories.
train_keep_mask = train['rating'].isin(categories)
train = train[train_keep_mask]

In [43]:
# Filtering left gaps in both indices; renumber each split from 0 and discard
# the old index instead of keeping it as a column.
train, dev = (
    split.reset_index(drop=True)
    for split in (train, dev)
)

In [44]:
# Load the `ru_core_news_lg` spaCy pipeline. In the cells shown here it is
# only used through `nlp.make_doc`, i.e. to tokenize text into Doc objects.
nlp = spacy.load('ru_core_news_lg')

In [45]:
def create_docbin(data):
    """Serialize a labelled DataFrame into a spaCy ``DocBin`` for text classification.

    Every row becomes one ``Doc`` produced by ``nlp.make_doc`` (tokenization
    only). Its ``cats`` dict holds 0 for each label in the module-level
    ``categories`` list and 1 for the row's own rating.

    Rows are read positionally, so the frame does not need a fresh ``0..n-1``
    index; a filtered or shuffled frame works without ``reset_index``.

    Args:
        data: DataFrame with a ``tokenized_str`` column (the text, coerced
            with ``str``) and a ``rating`` column (the gold label).

    Returns:
        DocBin holding one ``Doc`` per row of ``data``, in row order.
    """
    db = DocBin()
    # Iterate over the column values rather than data[col][i]: that lookup is
    # label-based and raises KeyError (or reads a different row) whenever the
    # index is not exactly 0..n-1.
    for text, rating in zip(data["tokenized_str"], data["rating"]):
        doc = nlp.make_doc(str(text))
        # Start with every known label negative, then mark the row's rating.
        doc.cats = {category: 0 for category in categories}
        doc.cats[rating] = 1
        db.add(doc)
    return db

In [46]:
# Serialize each split to the .spacy file that the training command reads.
for split_frame, spacy_path in ((train, "train.spacy"), (dev, "dev.spacy")):
    create_docbin(split_frame).to_disk(spacy_path)

In [47]:
# Train with the spaCy CLI using config.cfg, reading the two DocBin files
# written above and saving the trained pipeline under ./subsym_model.
# NOTE(review): the training log reports the pipeline as ['textcat_multilabel'],
# while create_docbin marks exactly one positive label per doc - confirm that
# config.cfg is meant to use the multilabel component rather than the
# mutually-exclusive `textcat`.
! python -m spacy train config.cfg --output ./subsym_model --paths.train train.spacy --paths.dev dev.spacy

[38;5;4mℹ Saving to output directory: subsym_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       48.81    0.49
  0     200          24.74       71.41    0.71
  1     400          21.56       76.88    0.77
  1     600          10.80       76.02    0.76
  2     800           9.94       79.20    0.79
  2    1000           6.22       76.37    0.76
  3    1200           5.34       79.55    0.80
  3    1400           3.04       76.70    0.77
  4    1600           3.73       79.71    0.80
  4    1800           3.14       80.35    0.80
  5    2000           2.32       82.81    0.83
  5    2200           2.36       79.95    0.80
  6    2400           1.82       79.00    0.79
  6    2600           1.07       81.79    0.82
  7    2800           1.42      