In [7]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin

In [8]:
# Load data_prep.csv (path is relative to this notebook's working directory) into a DataFrame.
df = pd.read_csv('../../../../raw_data/data_prep.csv')

In [9]:
# Hold out 20% of the rows as the dev set; random_state is fixed so the split is repeatable.
# NOTE(review): the split is made before the rating filter below and is not stratified,
# so the per-class balance between train and dev is left to chance.
train, dev = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Row and column counts of each split right after splitting.
train.shape, dev.shape

((960, 8), (240, 8))

In [19]:
# Number of rows per rating label in the full dataset.
df['rating'].value_counts()

AAA     153
A-      150
A+      126
BBB+    115
A       110
BBB      90
AA       74
AA+      71
BBB-     65
BB+      57
AA-      54
BB       50
BB-      29
B+       20
B        12
B-       12
C        12
Name: rating, dtype: int64

In [11]:
# Rating labels kept for the classifier; rows with any other rating are
# filtered out below. The order here is the key order of each doc.cats dict.
categories = [
    'AAA',
    'AA+',
    'AA-',
    'AA',
]

In [12]:
# Keep only the rows whose rating is one of the labels listed in `categories`.
train = train[train['rating'].isin(categories)]
dev = dev[dev['rating'].isin(categories)]

In [13]:
# Renumber both splits 0..n-1, discarding the index labels left over from filtering.
train, dev = (split.reset_index(drop=True) for split in (train, dev))

In [14]:
# Row and column counts of each split after filtering to `categories`.
train.shape, dev.shape

((274, 8), (78, 8))

In [15]:
# Load the ru_core_news_lg spaCy pipeline (must be installed in this environment).
# NOTE(review): the visible cells only call nlp.make_doc, i.e. the tokenizer — confirm
# whether the full pipeline is needed here.
nlp = spacy.load('ru_core_news_lg')

In [16]:
def create_docbin(data):
    """Build a spaCy ``DocBin`` of text-classification examples from a DataFrame.

    Each row becomes one ``Doc`` created with ``nlp.make_doc`` whose ``cats``
    dict maps the row's rating to 1 and every other label in the module-level
    ``categories`` list to 0.

    Args:
        data: DataFrame with a ``tokenized_str`` column (the text, passed
            through ``str``) and a ``rating`` column (the label). Any index
            is accepted; rows are read by position.

    Returns:
        A ``DocBin`` holding one ``Doc`` per row, in row order. An empty
        frame yields an empty ``DocBin``.

    Raises:
        ValueError: if a row's rating is not one of ``categories``.
    """
    db = DocBin()
    # Walk the two columns by position instead of looking rows up by index
    # label: label lookups with a positional counter fail (or hit the wrong
    # row) whenever the frame was filtered or shuffled without reset_index.
    for text, rating in zip(data["tokenized_str"], data["rating"]):
        if rating not in categories:
            # Assigning an unknown label would silently add an extra key to
            # this doc's cats and make the label set inconsistent across docs.
            raise ValueError(
                f"rating {rating!r} is not one of the expected categories {categories!r}"
            )
        doc = nlp.make_doc(str(text))
        cats = dict.fromkeys(categories, 0)
        cats[rating] = 1
        doc.cats = cats
        db.add(doc)
    return db

In [17]:
# Serialize each split to the .spacy file that the training command reads;
# dev is written first, then train.
for split_frame, out_path in ((dev, "dev.spacy"), (train, "train.spacy")):
    create_docbin(split_frame).to_disk(out_path)

In [18]:
# Train the pipeline described by config.cfg on the two files written above,
# saving the result under ./subsym_model.
! python -m spacy train config.cfg --output ./subsym_model --paths.train train.spacy --paths.dev dev.spacy

[38;5;4mℹ Saving to output directory: subsym_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.19        8.51    0.09
  0     200         39.03       37.96    0.38
  1     400         17.01       69.64    0.70
  2     600         13.52       70.83    0.71
  2     800          7.85       76.87    0.77
  3    1000          3.58       70.04    0.70
  4    1200          4.16       74.83    0.75
  5    1400          3.09       80.76    0.81
  5    1600          2.90       74.63    0.75
  6    1800          2.23       70.30    0.70
  7    2000          1.28       76.13    0.76
  8    2200          1.91       79.04    0.79
  8    2400          2.77       68.52    0.69
  9    2600          0.90       69.58    0.70
 10    2800          1.33       72.23    0.72
 10    3000  