In [15]:
import numpy as np 
import pandas as pd
import csv

In [2]:
#!pip install fasttext
import fasttext

In [4]:
#!pip install gensim
from gensim.utils import simple_preprocess

Не забудьте перенести в папку с этим .ipynb-файлом parquet-файлы 'train_undersampled_data.parquet' и 'train_oversampled_data.parquet', полученные в "02 Balanced".

In [5]:
# Load the under- and over-sampled training sets; both parquet files are
# expected to sit next to this notebook.
train_data_undersampled, train_data_oversampled = map(
    pd.read_parquet,
    ('train_undersampled_data.parquet', 'train_oversampled_data.parquet'),
)

In [6]:
# Stack the under- and over-sampled frames into one training frame with a
# fresh 0..N-1 index. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
train_data = pd.concat(
    [train_data_undersampled, train_data_oversampled],
    ignore_index=True,
)

In [7]:
# Preview the first five rows of the combined training frame
train_data.head(n=5)

Unnamed: 0,category_id,Docs
0,11937,чехлы плотный чехол для xiaomi poco m3 redmi...
1,11937,чехлы силиконовый чехол с принтом для iphone 6...
2,11937,чехлы чехол oneplus 7t pro черный силиконовый ...
3,11937,чехлы чехол samsung galaxy a50 a50s a30s с...
4,11937,чехлы силиконовый чехол для xiaomi redmi 4x с...


In [8]:
# The sample is balanced: summary statistics over the per-category row counts
# show the same number of rows for every category (std = 0).
category_counts = train_data['category_id'].value_counts()
category_counts.describe()

count    874.0
mean      50.0
std        0.0
min       50.0
25%       50.0
50%       50.0
75%       50.0
max       50.0
Name: category_id, dtype: float64

In [9]:
# Prepare the data for FastText.
#  * Docs: gensim's simple_preprocess turns each text into a list of lowercase
#    tokens (with its default settings very short / very long tokens are
#    dropped); the tokens are re-joined with single spaces.
#  * category_id: FastText recognises labels by the '__label__' prefix.
# A new frame is built instead of mutating the columns in place, and the prefix
# is only added when it is missing, so re-running this cell does not produce
# '__label____label__<id>' labels.
train_data = train_data.assign(
    Docs=train_data['Docs'].map(lambda text: ' '.join(simple_preprocess(text))),
    category_id=train_data['category_id'].map(
        lambda label: str(label)
        if str(label).startswith('__label__')
        else '__label__' + str(label)
    ),
)

In [10]:
# Shuffle all rows; the fixed seed keeps the resulting order reproducible
train_data_new = train_data.sample(
    frac=1.0,
    random_state=17,
)

In [11]:
# The full sample has 43700 rows; take 5000 of them so training is faster.
# A fixed random_state makes the subset (and therefore every score computed
# from it) reproducible across kernel restarts, and min() keeps the cell from
# raising if the frame ever holds fewer than 5000 rows.
train_data_learn = train_data_new.sample(
    n=min(5000, len(train_data_new)),
    random_state=17,
)

In [13]:
# Split into train / validation: the first 80% of the (already shuffled) rows
# go to train, the remaining rows to validation.
n = int(0.8*len(train_data_learn))
train_df, val_df = train_data_learn.iloc[:n], train_data_learn.iloc[n:]

In [16]:
# Dump both splits as plain-text files for FastText: one example per line,
# the text followed by its label column, space-separated, with no header,
# no index and no quoting.
fasttext_dump_options = dict(index=False, sep=' ', header=None,
                             quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

for split_df, split_path in ((train_df, 'fasttext_balanced_train.txt'),
                             (val_df, 'fasttext_balanced_val.txt')):
    split_df[['Docs', 'category_id']].to_csv(split_path, **fasttext_dump_options)

In [None]:
# Notes kept from the previous hyper-parameter search (bare string, not used
# by any code). Best score recorded there: 0.7773 with lr=3.0, dim=60, ws=3,
# epoch=70, neg=3, wordNgrams=3. Conclusion written in the notes: 70 epochs is
# ideal for training, together with those same parameter values.
'''
ИЗ ПРЕДЫДУЩЕГО ПЕРЕБОБРА ПАРАМЕТРОВ
********CURRENT BEST PARAMS*******
0.7773279352226721
lr =  3.0
dim =  60
ws =  3
epoch =  70
neg =  3
wordNgrams =  3
******************

ВЫВОД:
1) 70 эпох для обучения идеально
2) lr=3.0
dim 60
ws = 3
neg =  3
wordNgrams =  3
'''

In [17]:
# Hyper-parameter grid for the balanced sample. The candidate values include
# the best settings noted from the previous search
# (lr=3.0, dim=60, ws=3, epoch=70, neg=3, wordNgrams=3).
lr_lst = [2.0, 3.0, 4.0, 5.0]    # learning rates
epochs_list = [60, 70, 80]       # training epochs
dim_lst = [40, 50, 60]           # embedding dimension
neg_lst = [3, 5]                 # `neg` argument of train_supervised
ws_lst = [3]                     # `ws` argument of train_supervised
wordNgrams_lst = [3]             # `wordNgrams` argument of train_supervised

In [18]:
# Exhaustive grid search over the FastText hyper-parameter lists defined
# earlier in the notebook. For every combination a supervised model is trained
# on 'fasttext_balanced_train.txt' and scored on 'fasttext_balanced_val.txt'.
# The best F1 seen so far is kept in `val_f1` and the settings that produced it
# in `best_params` (keyed by the train_supervised argument names), so the
# winner stays available to later cells instead of only being printed.
# `model` always holds the most recently trained model, not the best one.
val_f1 = 0
best_params = None
for epochs in epochs_list:
    print('Epochs: ', epochs)
    for dim in dim_lst:
        print('* dim: ', dim)
        for neg in neg_lst:
            print('** neg: ', neg)
            for ws in ws_lst:
                print('*** ws = ', ws)
                for w in wordNgrams_lst:
                    print('**** wordNgrams = ', w)
                    for lr in lr_lst:
                        print('***** lr = ', lr)
                        model = fasttext.train_supervised('fasttext_balanced_train.txt',
                                                          lr=lr,
                                                          dim=dim,
                                                          ws=ws,
                                                          epoch=epochs,
                                                          neg=neg,
                                                          wordNgrams=w)

                        # model.test returns (number of samples, precision, recall)
                        _, precision, recall = model.test('fasttext_balanced_val.txt')

                        # F1 is undefined when precision + recall == 0; score such
                        # a model as 0 instead of raising ZeroDivisionError.
                        pr_sum = precision + recall
                        val_cur = (2*precision*recall) / pr_sum if pr_sum > 0 else 0.0
                        if (val_cur > val_f1):
                            val_f1 = val_cur
                            best_params = {
                                'lr': lr,
                                'dim': dim,
                                'ws': ws,
                                'epoch': epochs,
                                'neg': neg,
                                'wordNgrams': w,
                            }
                            print('********CURRENT BEST PARAMS*******')
                            print(val_f1)
                            print('lr = ', lr)
                            print('dim = ', dim)
                            print('ws = ', ws)
                            print('epoch = ', epochs)
                            print('neg = ', neg)
                            print('wordNgrams = ', w)
                            print('******************')
                            print(' ')

Epochs:  60
* dim:  40
** neg:  3
*** ws =  3
**** wordNgrams =  3
***** lr =  2.0
********CURRENT BEST PARAMS*******
0.7439271255060729
lr =  2.0
dim =  40
ws =  3
epoch =  60
neg =  3
wordNgrams =  3
******************
 
***** lr =  3.0
********CURRENT BEST PARAMS*******
0.7631578947368421
lr =  3.0
dim =  40
ws =  3
epoch =  60
neg =  3
wordNgrams =  3
******************
 
***** lr =  4.0
***** lr =  5.0
** neg:  5
*** ws =  3
**** wordNgrams =  3
***** lr =  2.0
***** lr =  3.0
***** lr =  4.0
***** lr =  5.0
* dim:  50
** neg:  3
*** ws =  3
**** wordNgrams =  3
***** lr =  2.0
***** lr =  3.0
***** lr =  4.0
***** lr =  5.0
** neg:  5
*** ws =  3
**** wordNgrams =  3
***** lr =  2.0
***** lr =  3.0
***** lr =  4.0
***** lr =  5.0
* dim:  60
** neg:  3
*** ws =  3
**** wordNgrams =  3
***** lr =  2.0
***** lr =  3.0
***** lr =  4.0
***** lr =  5.0
** neg:  5
*** ws =  3
**** wordNgrams =  3
***** lr =  2.0
***** lr =  3.0
***** lr =  4.0
***** lr =  5.0
Epochs:  70
* dim:  40
** n