In [1]:
import pandas as pd
import spacy

from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin

In [2]:
# Load the prepared dataset into a DataFrame. The path is relative, so it
# resolves against the working directory the notebook kernel runs in.
df = pd.read_csv('../../../../raw_data/data_prep.csv')

In [4]:
# Hold out 20% of the rows as the dev set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
train, dev = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Show (rows, columns) of each split as produced by train_test_split.
train.shape, dev.shape

((960, 8), (240, 8))

In [3]:
# List the distinct rating labels present in the full dataset.
df['rating'].unique()

array(['A', 'BB', 'AAA', 'BBB', 'AA+', 'BB+', 'BB-', 'A-', 'A+', 'B',
       'AA-', 'BBB+', 'BBB-', 'B-', 'AA', 'B+', 'C'], dtype=object)

In [6]:
# Rating labels used as classification targets. Both splits are filtered down
# to these values, and create_docbin writes one `cats` entry per label here.
categories = ['C', 'BBB', 'BBB+', 'BBB-', 'BB', 'BB-', 'BB+']

In [7]:
# Keep only the dev rows whose rating is one of the target categories.
dev = dev[dev['rating'].isin(categories)]

In [8]:
# Keep only the training rows whose rating is one of the target categories.
train = train[train['rating'].isin(categories)]

In [9]:
# Filtering left gaps in the row labels; renumber both splits from 0 and
# discard the old index instead of keeping it as a column.
train, dev = (split.reset_index(drop=True) for split in (train, dev))

In [10]:
# Load the spaCy pipeline used to build Doc objects for the corpus files.
# NOTE(review): in this section only nlp.make_doc is called on it, so a
# lighter pipeline may be sufficient — confirm before changing.
nlp = spacy.load('ru_core_news_lg')

In [11]:
def create_docbin(data):
    """Convert a labelled DataFrame into a spaCy ``DocBin`` for text classification.

    Every row becomes one ``Doc`` built with ``nlp.make_doc``. Its ``cats``
    dict gets an entry for each label in the module-level ``categories`` list,
    set to 0, and the row's own rating is then set to 1.

    Rows are read in positional order, so the function works with any
    DataFrame index (it does not require a prior ``reset_index``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tokenized_str`` column (converted with ``str`` before
        tokenization) and a ``rating`` column holding the label.

    Returns
    -------
    spacy.tokens.DocBin
        One document per input row, in row order. Empty input yields an
        empty ``DocBin``.

    Notes
    -----
    A rating that is not in ``categories`` is still written as an extra
    ``cats`` key; filter ``data`` beforehand to avoid unexpected labels.
    """
    db = DocBin()
    # Iterate the column values directly instead of using data[col][i]:
    # integer lookup on a Series is label-based and fails (or picks the wrong
    # row) when the index is not exactly 0..n-1.
    for text, rating in zip(data["tokenized_str"], data["rating"]):
        doc = nlp.make_doc(str(text))
        doc.cats = {category: 0 for category in categories}
        doc.cats[rating] = 1
        db.add(doc)
    return db

In [12]:
# Serialize both splits to the .spacy files that the training command reads
# through --paths.train and --paths.dev.
create_docbin(train).to_disk("train.spacy")
create_docbin(dev).to_disk("dev.spacy")

In [13]:
# Train the pipeline described in config.cfg on the serialized splits and
# write the resulting model under ./subsym_model.
# NOTE(review): the training log reports a 'textcat_multilabel' pipeline, while
# create_docbin marks exactly one label per document — confirm that the
# multilabel component in config.cfg is intended.
! python -m spacy train config.cfg --output ./subsym_model --paths.train train.spacy --paths.dev dev.spacy

[38;5;4mℹ Saving to output directory: subsym_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       55.99    0.56
  1     200          23.52       81.06    0.81
  3     400           1.54       86.83    0.87
  4     600           0.11       87.09    0.87
  6     800           0.05       87.31    0.87
  8    1000           0.03       87.16    0.87
  9    1200           0.02       87.75    0.88
 11    1400           0.02       87.89    0.88
 13    1600           0.01       88.07    0.88
 14    1800           0.01       87.71    0.88
 16    2000           0.01       87.86    0.88
 18    2200           0.01       88.22    0.88
 19    2400           0.01       87.91    0.88
 21    2600           0.01       87.55    0.88
 23    2800           0.01      