#Predict score field by text and parent_text fields.

In [0]:
!pip install transformers
!pip install --upgrade wandb
!wandb login

In [0]:
import wandb

import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import torch
from torchtext  import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel

SEED = 43
np.random.seed(SEED)
torch.manual_seed(SEED)

import warnings
warnings.filterwarnings('ignore')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [0]:
from google.colab import drive
drive.mount('/content/drive')

# Data Preparation

In [0]:
columns = ["text", "parent_text", "score"]
df = pd.concat([
    pd.read_csv("/content/drive/My Drive/jetbrains/comments_positive.csv", usecols=columns, na_filter=False),
    pd.read_csv("/content/drive/My Drive/jetbrains/comments_negative.csv", usecols=columns, na_filter=False)
], ignore_index=True)

In [0]:
y = df['score']
df.drop(columns='score', inplace=True)
X = df

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=SEED)

# To be sure we don't use indices to predict something
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print("Train shape: {}".format(X_train.shape))
print("Test shape: {}".format(X_test.shape))

Train shape: (3800000, 2)
Test shape: (200000, 2)


In [9]:
X_train.head()

Unnamed: 0,text,parent_text
0,"I was a whole bunch of folks on Recess, yup. :...",Your IMDB just blew my mind. You were Upside D...
1,"I'm not quite sure how that was relevant, but ...",http://i.imgur.com/GKLI7.jpg
2,Sounds like you weren't paying attention.,Sadly high school history doesn't teach you ab...
3,&gt;Polygamy doesn't mean that there's just wo...,&gt; from a feminist perspective \n\nSo what i...
4,The New Orleans Gargle-Glub-Glubs.,My high school's mascot was water...


In [10]:
y_train.head()

0    454
1     71
2     -9
3    -12
4     91
Name: score, dtype: int64

In [11]:
y_train.describe()

count    3.800000e+06
mean     9.188734e+01
std      2.118246e+02
min     -2.946000e+03
25%     -1.000000e+01
50%      6.600000e+01
75%      1.160000e+02
max      9.582000e+03
Name: score, dtype: float64

In [12]:
y_test.describe()

count    200000.000000
mean         91.988825
std         210.331459
min        -812.000000
25%         -10.000000
50%          -6.000000
75%         116.000000
max        5665.000000
Name: score, dtype: float64

## Cleaning the data

Т.к. некоторые тексты - это просто смайлики или нечитаемые бессмысленные вещи, то их было решено удалить. Также были удалены строки, если после очисти они были пустыми. 

In [0]:
def clean_text(text):
    text = re.sub(r'\<[^>]*\>', '', text)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'&[a-z]{0,7};', ' ', text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text)
    text = re.sub('@[^\s]+','AT_USER',text)
    text = re.sub(r'\\x\d{1,4}', '', text)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'\r', ' ', text)
    text = re.sub('[\s]+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text

def clean_df(df, y, text_cleaner, train=True):
    """
    Apply text_cleaner to df columns and deleting empty rows.

    Args:
        df : initial DataFrame
        y : target values
        text_cleaner: function to apply to 'text' and 'parent_text' columns

    Returns:
        Pandas DataFrame with rating coumns as well
    """
    df_cleaned = pd.DataFrame()
    df_cleaned['text'] = df['text'].apply(lambda x: text_cleaner(x))
    df_cleaned['parent_text'] = df['parent_text'].apply(lambda x: text_cleaner(x))
    df_cleaned['rating'] = y    

    if train:
        replacing = {'':np.nan, 'null':np.nan, 'NaN':np.nan, 'NA':np.nan}
    else:
        replacing = {'':'empty', 'null':'empty', 'NaN':'empty', 'NA':'empty'}
    df_cleaned['text'].replace(to_replace=replacing, value=None, inplace=True)
    df_cleaned['parent_text'].replace(to_replace=replacing, value=None, inplace=True)
    df_cleaned.dropna(subset=['text', 'parent_text', 'rating'], inplace=True)
    return df_cleaned

def scale_y(df, standart=True, training=True, scaler=None):
    """
    Transforms rating score with StandardScaler or MinMaxScaler
    Returns pd.DataFrame with new column 'rating_scaled', scaler
    """
    if training:  
        scaler = StandardScaler() if standart else MinMaxScaler()     
        X = np.array(list(df['rating'])).reshape(-1,1)
        scaler.fit(X)
    df['rating_scaled'] = df['rating'].apply(lambda x : scaler.transform(np.array(x).reshape(1,-1))[0][0])
    return df, scaler

Т.к. данных очень много, было решено использовать часть выборки для подбора архитектуры модели, гиперпараметров (подробнее будет ниже). 

Так, N - размер тренировочной выборки. После выбранные модели уже были обучены на всех данных. 

In [0]:
# N = 1000000
# train = clean_df(X_train[:N], y_train[:N], clean_text)

In [14]:
train = clean_df(X_train, y_train, clean_text)
test = clean_df(X_test, y_test, clean_text, train=False)
train.shape, X_train.shape, test.shape, X_test.shape

((3795473, 3), (3800000, 2), (200000, 3), (200000, 2))

In [15]:
train, scaler = scale_y(train)
test, scaler  = scale_y(test, training=False, scaler=scaler)
train.head()

Unnamed: 0,text,parent_text,rating,rating_scaled
0,i was a whole bunch of folks on recess yup th...,your imdb just blew my mind you were upside do...,454,1.709521
1,im not quite sure how that was relevant but im...,url,71,-0.098651
2,sounds like you werent paying attention,sadly high school history doesnt teach you abo...,-9,-0.476337
3,polygamy doesnt mean that theres just women i...,from a feminist perspective so what if you wo...,-12,-0.4905
4,the new orleans gargleglubglubs,my high schools mascot was water,91,-0.004229


In [0]:
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

Убедимся, что ничего не съехало:

In [18]:
train.iloc[-1], X_train.iloc[-1], y_train.iloc[-1]

(text             4x the pixels has nothing to do with the resol...
 parent_text      the new gpu is 2x as powerful as the ipad 2s b...
 rating                                                          -8
 rating_scaled                                            -0.471616
 Name: 3799999, dtype: object,
 text           4x the pixels has nothing to do with the resol...
 parent_text    The new GPU is 2x as powerful as the iPad 2's,...
 Name: 3799999, dtype: object,
 -8)

In [19]:
test.iloc[-1], X_test.iloc[-1], y_test.iloc[-1]

(text             ha people on reddit go on the side of whoever ...
 parent_text      i just hope you guys dont ruin it for the play...
 rating                                                          91
 rating_scaled                                          -0.00422926
 Name: 199999, dtype: object,
 text           Ha, people on Reddit go on the side of whoever...
 parent_text    I just hope you guys don't ruin it for the pla...
 Name: 199999, dtype: object,
 91)

# Model

In [20]:
pretrained_weights = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
pad_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

bert = BertModel.from_pretrained(pretrained_weights)
bert.to(device)
def tokenize(text, tokenizer=tokenizer):
    return tokenizer.encode(text, max_length=100)

embeddings_pretrained = bert.get_input_embeddings()
embeddings_pretrained

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




Embedding(30522, 768, padding_idx=0)

In [0]:
LABEL = data.Field(dtype=torch.float64,
                   use_vocab=False,
                   sequential=False, 
                   preprocessing=data.Pipeline(int),
                   )
SCALED = data.Field(dtype=torch.float64,
                    use_vocab=False,
                    sequential=False,
                    preprocessing=data.Pipeline(float))

TEXT = data.Field(sequential=True, 
                  include_lengths=False,
                  batch_first=True, 
                  tokenize=tokenize, 
                  pad_first=False, 
                  lower=False,
                  use_vocab=False,
                  preprocessing=data.Pipeline(int),
                  pad_token=pad_index)

fields = [('text', TEXT), ('parent_text', TEXT), ('label',LABEL),  ('rating_scaled', SCALED)]

train = data.TabularDataset('/content/train.csv', 
                            format='csv', 
                            fields=fields, 
                            skip_header=True)

test = data.TabularDataset('/content/test.csv',  
                           format='csv', 
                           fields=fields, 
                           skip_header=True)

train, valid = train.split(0.9)

In [22]:
len(train), len(valid), len(test), len(train)+len(valid)

(3415926, 379547, 200000, 3795473)

In [0]:
EMBEDDINGS_DIM = embeddings_pretrained.embedding_dim
VOCAB_SIZE = embeddings_pretrained.num_embeddings
EMB_PRETRAINED = True

# Models

В тетрадке запущена одна из лучших моделей (Подробнее о ее выборе ниже).

## Neural Network with attention

In [0]:
class Attention(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.linear = nn.Linear(embed_size, embed_size)

    def forward(self, input_):
        mean_input = torch.mean(input_, dim=1)
        lin_out = self.linear(mean_input).unsqueeze(2)
        bmm_out = torch.bmm(input_, lin_out).tanh()
        res = bmm_out / sum(torch.exp(bmm_out))
        return res.squeeze(1)

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size,
                 emb_pretrained, embeddings):
        super(MyModel, self).__init__()

        self.embedding =  embeddings if emb_pretrained else nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(embed_size)
        self.fc = nn.Linear(embed_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, batch):

        text, parent = self.embedding(batch.text), self.embedding(batch.parent_text)

        text_a = self.attention(text)
        parent_a = self.attention(parent)
        text = torch.bmm(text.transpose(1,2), text_a).squeeze(2)
        parent = torch.bmm(parent.transpose(1,2), parent_a).squeeze(2)
        
        # result = torch.cat((text,parent), dim=-1) # concat
        # result = text + parent  # summ
        result = (text + parent) / 2 # mean
        
        result = self.fc(result)
        result = F.relu(result)
        result = self.fc2(result)

        return result

## Bert

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, bert, embed_size, hidden_size):
        super(MyModel, self).__init__()

        self.bert = bert
        self.fc = nn.Linear(2*embed_size, 1)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, batch):

        text, parent = self.bert(batch.text)[0], self.bert(batch.parent_text)[0]
        cat = torch.cat((text[:,0,:],parent[:,0,:]), dim=-1)
        cat = self.fc(cat)

        return cat

## Model initialisation

In [27]:
BATCH_SIZE = 512
HIDDEN_SIZE = 1024
LR = 0.01

model = MyModel(vocab_size=VOCAB_SIZE,
                embed_size=EMBEDDINGS_DIM,
                hidden_size=HIDDEN_SIZE,
                emb_pretrained = EMB_PRETRAINED,
                embeddings = embeddings_pretrained
               )
bert = BertModel.from_pretrained(pretrained_weights)
bert.to(device)

# model = MyModel(bert=bert,
#                 embed_size=EMBEDDINGS_DIM,
#                 hidden_size=HIDDEN_SIZE)

model.to(device)
wandb.init(project="reddit-comments-score", name='att_full', config = {"Train size": 'All', 
                                                                 "Batch size": BATCH_SIZE,
                                                                 "Hidden_size": HIDDEN_SIZE,
                                                                 "Dropout": 'None',
                                                                 "Optimizer": "SGD",
                                                                 "Learning rate": LR})
wandb.watch(model)

[<wandb.wandb_torch.TorchGraph at 0x7f5153402ef0>]

In [0]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    shuffle=True,
    device=device,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
)

optimizer = optim.SGD(model.parameters(), lr=LR)
# scheduler = optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.01, max_lr=0.1)
criterion = torch.nn.MSELoss()
criterion.to(device);

In [0]:
# Freesing some of bert layers if needed

# params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(params)

# for p in model.bert.encoder.parameters(): 
#     p.requires_grad = False 

# for p in model.bert.pooler.parameters():
#     p.requires_grad = True

# for p in model.bert.embeddings.parameters(): 
#     p.requires_grad = False

# for p in model.bert.encoder.layer[-1].parameters():
#     p.requires_grad = True

# params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(params)

108312834
7681026


## Training and Testing

In [30]:
def clean_tqdm():
    for instance in list(tqdm._instances): 
        tqdm._decr_instances(instance)

for e in tqdm([1,2,3]):
    pass

100%|██████████| 3/3 [00:00<00:00, 2301.19it/s]


In [0]:
train_losses = []
valid_losses = []

def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):
    
    model.train()
    running_loss = 0
    n_batches = len(iterator)    
    
    clean_tqdm()
    for batch in tqdm(iterator):
        x = batch
        y = batch.label.squeeze(0)
        optimizer.zero_grad()
        preds = model(x).squeeze(1).double()
        loss = criterion(preds, y)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()
        
        curr_loss = loss.data.detach().item()
        train_losses.append(curr_loss)

        wandb.log({"Train Loss": curr_loss})

    return curr_loss

def _test_epoch(model, iterator, criterion, scheduler):
    model.eval()
    
    epoch_loss = 0
    n_batches = len(iterator)

    with torch.no_grad():
        for batch in iterator:
            x = batch
            y = batch.label.squeeze(0)
            preds = model(x).squeeze(1).double()

            loss = criterion(preds, y)
            valid_losses.append(loss.item())
            epoch_loss += loss.data.item()

    if scheduler:
        scheduler.step(loss.item())
    wandb.log({"Valid Loss": epoch_loss/n_batches})
    
    return epoch_loss / n_batches

def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, n_epochs=20, early_stopping=0):

    prev_loss = 1000500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):

        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion, scheduler)

        print('Epoch {} is finished'.format(epoch))
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)
        
        torch.save(model.state_dict(), '/content/drive/My Drive/jetbrains/model_full_{}'.format(epoch))
        wandb.save('model_{}.h5'.format(epoch))
    return history

In [32]:
clean_tqdm()
history = nn_train(model, train_iterator, valid_iterator,
          criterion, optimizer, scheduler=None, n_epochs=20, early_stopping=5)

100%|██████████| 6672/6672 [09:33<00:00, 11.63it/s]


Epoch 0 is finished
validation loss 44751.32853


100%|██████████| 6672/6672 [09:19<00:00, 11.93it/s]


Epoch 1 is finished
validation loss 44738.77572


100%|██████████| 6672/6672 [09:25<00:00, 11.81it/s]


Epoch 2 is finished
validation loss 44709.11157


100%|██████████| 6672/6672 [09:36<00:00, 11.58it/s]


Epoch 3 is finished
validation loss 44592.92259


100%|██████████| 6672/6672 [09:24<00:00, 11.81it/s]


Epoch 4 is finished
validation loss 43848.15903


100%|██████████| 6672/6672 [09:27<00:00, 11.75it/s]


Epoch 5 is finished
validation loss 43015.17140


100%|██████████| 6672/6672 [09:24<00:00, 11.81it/s]


Epoch 6 is finished
validation loss 42575.60597


100%|██████████| 6672/6672 [09:20<00:00, 11.91it/s]


Epoch 7 is finished
validation loss 42215.66126


100%|██████████| 6672/6672 [09:23<00:00, 11.83it/s]


Epoch 8 is finished
validation loss 42052.12775


100%|██████████| 6672/6672 [09:35<00:00, 11.60it/s]


Epoch 9 is finished
validation loss 41871.61899


100%|██████████| 6672/6672 [09:26<00:00, 11.77it/s]


Epoch 10 is finished
validation loss 42079.49998


100%|██████████| 6672/6672 [09:23<00:00, 11.85it/s]


Epoch 11 is finished
validation loss 41652.04128


100%|██████████| 6672/6672 [09:27<00:00, 11.77it/s]


Epoch 12 is finished
validation loss 41576.53803


100%|██████████| 6672/6672 [09:18<00:00, 11.95it/s]


Epoch 13 is finished
validation loss 41511.80905


100%|██████████| 6672/6672 [09:11<00:00, 12.10it/s]


Epoch 14 is finished
validation loss 41450.31199


100%|██████████| 6672/6672 [08:56<00:00, 12.44it/s]


Epoch 15 is finished
validation loss 41721.40660


100%|██████████| 6672/6672 [08:54<00:00, 12.48it/s]


Epoch 16 is finished
validation loss 41517.63726


100%|██████████| 6672/6672 [08:53<00:00, 12.51it/s]


Epoch 17 is finished
validation loss 41377.02292


100%|██████████| 6672/6672 [08:58<00:00, 12.40it/s]
  0%|          | 0/6672 [00:00<?, ?it/s]

Epoch 18 is finished
validation loss 41359.56560


100%|██████████| 6672/6672 [08:59<00:00, 12.36it/s]


Epoch 19 is finished
validation loss 41262.21379


In [0]:
def test_model(model, test_iterator):
    """
    Return mse for test_iterator
    """
    epoch_loss = 0
    n_batches = len(test_iterator)
    model.eval()
    with torch.no_grad():
        for batch in test_iterator:
            x = batch
            y = batch.label.squeeze(0)
            preds = model(x).squeeze(1).double()
            loss = criterion(preds, y)
            epoch_loss += loss.data.item()
    return epoch_loss / n_batches

In [34]:
mse_test = test_model(model, test_iterator)
wandb.log({"MSE Test": mse_test})
print('MSE for test data: {}'.format(mse_test))

MSE for test data: 40591.091599407155


# Wandb

Из-за большого кол-ва данных было решено посмотреть на работу моделей на небольшой выборке с разными параметрами.

Страница проекта на wandb: https://app.wandb.ai/2ispany3/reddit-comments-score

1. **Neural Network with attention.**

Для эмбеддингов токенов использовались предобученные эмбеддинги Берта из библиотеки transformers. Получаем эмбеддинги text и parent_text через механизм внимания и переможение, конкатенируем их и выполняем линейное преобразование. 

Лосс на валидации в зависимости от итерации:
![att_sgd_200k:](https://github.com/ovbystrova/Reddit-Comment-Score-Prediction/raw/master/data/att_sgd_200k.png)

Таблица моделей с параметрами:
![att_sgd__200k_table:](https://github.com/ovbystrova/Reddit-Comment-Score-Prediction/raw/master/data/att_sgd_200k_table.png)


Тут лучшая модель на тесте показывает MSE: 41 859

2. **Bert**

Также я хотела пропустить данные через Берт (заморозив почти все его слои). После Берта я брала CLS токен для text и parent, конкатенировала их и пропускала через линейные слои (от 1 до 3х). По сравнению с att_nn, модель лучше показывает себя на валидации, но она в разы медленнее (в att_nn одна эпоха на всех данных занимает 9 минут, в bert - 3 часа).

Лосс на валидации в зависимости от итерации:
![bert_small:](https://github.com/ovbystrova/Reddit-Comment-Score-Prediction/raw/master/data/bert_small.png)

В легенде указан размер тренировочной выборки, на которой обучалась модель (от 50к до 400к) и размер батча (от 64 до 256)

Таблица моделей с параметрами:
![bert_small_table:](https://github.com/ovbystrova/Reddit-Comment-Score-Prediction/raw/master/data/bert_small_table.png)

Тут лучшая модель на тесте показывает MSE: 40961

3. **Bert Large with different optimizers**

Дальше я попробовала модель ber-large из transformers, и заодно посмотрела на разные оптимизаторы (разницы особо нет).

Лосс на тренирвоочной выборке в зависимости от оптимизатора:
![bert_large_optims:](https://github.com/ovbystrova/Reddit-Comment-Score-Prediction/raw/master/data/bert_large_optims.png)


4. **Neural Network with attention part 2.**

Эмбеддинги text + parent_text можно получить не только конкатенацией, так что я попробовала еще их просто складывать и усреднять. 

Лосс на валидации в зависимости от операции с эмбеддингами text и parent:
![att_nn_embeddings](https://github.com/ovbystrova/Reddit-Comment-Score-Prediction/raw/master/data/att_nn_embeddings.png)

На выборке размером 100000 модели с суммированием и усреднением быстро переобучались, но показывали за меньшее кол-во итераций результаты лучше, чем конкатенация в целом, поэтому я остановилась на усреднении.

5. **Bert and NN+Attention Full**

Т.к. раньше использовалась только часть тренировочной выборки, я запустила лучшую модель Bert и лучшую модель NN+Attention на всей выборке. Для Берта одна эпоха заняла 3 часа, для NN было выстрее,  поэтому я на ней и остановилась. Но в принципе, Берт за одну эпоху дошел до лосса на валидации ниже, чем обычная модель за 20 эпох.

![full_models](https://github.com/ovbystrova/Reddit-Comment-Score-Prediction/raw/master/data/full_models.png)

In [42]:
# results for all runs
results = pd.read_csv('https://raw.githubusercontent.com/ovbystrova/Reddit-Comment-Score-Prediction/master/data/wandb_table.csv')
results

Unnamed: 0,Name,Train size,Learning rate,Optimizer,Batch size,MSE Test,Train Loss,Valid Loss
0,att_nn_summ,1000000,0.01,SGD,256,40411.170471,35426.627636,44090.727514
1,att_nn_base,1000000,0.01,SGD,256,40460.41456,58660.257789,44109.030938
2,att_full,All,0.01,SGD,512,40591.091599,45558.605985,41262.213786
3,att_nn_mean,1000000,0.01,SGD,512,40628.581322,25764.752528,44044.973093
4,bert_full,3000000,0.1,SGD,64,40808.274171,24684.542065,40164.464237
5,bert_sgd,400000,,SGD,64,40961.242078,32785.515115,38891.07826
6,bert_best,400000,0.1,SGD,64,40961.242078,28634.016796,39845.152171
7,bert,50000,0.1,SGD,256,41214.770787,22553.148164,43417.641305
8,bert_small,200000,,SGD,128,41437.572631,11304.61395,44515.601628
9,bert_small,200000,,SGD,256,41736.540656,111928.105778,44359.050189


Стоит отметить, что модели, натренированные на 100к примерах даже лучше, чем те, которые тренировались на большем кол-ве данных.