# Контекст

- модель регрессии (схожесть двух предложений)
- модель distilbert
- структура input_ids: индексы токенов из словаря первого sentence SEP индексы токенов из словаря второго sentence
- SEP - токен конца + токен разделения двух sentence
- метрики: MSE, RMSE, MAE, Pirson, Spierman

In [82]:
import pandas as pd
import numpy as np

from transformers import DistilBertConfig,\
                         DistilBertTokenizerFast,\
                         DistilBertForSequenceClassification,\
                         TrainingArguments,\
                         Trainer

from sklearn.model_selection import train_test_split

from scipy.stats import pearsonr,\
                        spearmanr


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Скачиваем модель и токенизатор

In [83]:
config = DistilBertConfig.from_pretrained('distilbert-base-uncased',num_labels=1)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', max_length = 512)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Чтение файла

In [84]:
spisok = []
with open('sts-test.csv', 'r') as file:
    for line in file.readlines():
        line = line.replace('\n', '')
        to_frame = line.split('\t')
        if len(to_frame) == 7:
            spisok.append(to_frame[-3:])


data = pd.DataFrame(spisok, columns = ['label', 'sentence_1', 'sentence_2'])
data['label'] = data['label'].astype('Float32')
data.head()

Unnamed: 0,label,sentence_1,sentence_2
0,2.5,A girl is styling her hair.,A girl is brushing her hair.
1,3.6,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
2,5.0,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
3,4.2,A man is cutting up a cucumber.,A man is slicing a cucumber.
4,1.5,A man is playing a harp.,A man is playing a keyboard.


# train-val-test split

In [85]:
train, test = train_test_split(
    data,
    train_size = int(len(data)*0.6),
    random_state = 42
)

test, val = train_test_split(
    test,
    test_size = 0.5,
    random_state = 42
)

print(len(train), len(val), len(test))

657 219 219


# Dataset

In [86]:
class RegressionDataset(Dataset):

    def __init__(self, df):
        self.data = df

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]

        tokenized_data = tokenizer(
            row['sentence_1'],
            row['sentence_2'],
            truncation = True,
            padding = True,
            return_attention_mask = True
            )
        
        label = row['label']
        
        return {
            'input_ids': tokenized_data['input_ids'],
            'attention_mask': tokenized_data['attention_mask'],
            'labels': label
        }
    

RegressionDataset(train)[0]

{'input_ids': [101,
  1037,
  2711,
  2003,
  16018,
  27130,
  1012,
  102,
  1037,
  4937,
  2003,
  17033,
  1037,
  5835,
  1012,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': 0.0}

# Training args

In [95]:
training_args = TrainingArguments(

        output_dir='training/model_points',
        do_train=True,
        do_eval=True,
        num_train_epochs=10,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        warmup_steps=100,
        weight_decay=0.01,
        logging_strategy='steps',
        logging_dir='training/logs',
        logging_steps=50,
        evaluation_strategy="steps",
        save_steps = 500,
        fp16=True,
        load_best_model_at_end=True,
        report_to = 'wandb'

)

# Метрики

In [96]:
def compute_metrics(pred):

    preds = np.squeeze(pred.predictions)

    return {"MSE": ((preds - pred.label_ids)**2).mean().item(),
            "RMSE": (np.sqrt (( (preds - pred.label_ids)** 2).mean())).item(),
            "MAE": (np.abs(preds - pred.label_ids)).mean().item(),
            "Pearson" : pearsonr(preds,pred.label_ids)[0],
            "Spearman's Rank":spearmanr(preds,pred.label_ids)[0]
            }


# Обучение

In [97]:
trainer = Trainer(

        model = model,
        args = training_args,
        train_dataset = RegressionDataset(train),
        eval_dataset = RegressionDataset(val),
        compute_metrics = compute_metrics,
        tokenizer = tokenizer

 )

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/210 [00:00<?, ?it/s]

{'loss': 1.4086, 'grad_norm': 22.844818115234375, 'learning_rate': 2.3000000000000003e-05, 'epoch': 2.38}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.7941495180130005, 'eval_MSE': 0.7941495180130005, 'eval_RMSE': 0.8911506533622742, 'eval_MAE': 0.7150071263313293, 'eval_Pearson': 0.8217444159643199, "eval_Spearman's Rank": 0.7844024366328554, 'eval_runtime': 0.1396, 'eval_samples_per_second': 1568.556, 'eval_steps_per_second': 28.649, 'epoch': 2.38}
{'loss': 0.5632, 'grad_norm': 28.229604721069336, 'learning_rate': 4.8e-05, 'epoch': 4.76}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.826815128326416, 'eval_MSE': 0.826815128326416, 'eval_RMSE': 0.9092937707901001, 'eval_MAE': 0.7466901540756226, 'eval_Pearson': 0.8332740408136717, "eval_Spearman's Rank": 0.8077182829360665, 'eval_runtime': 0.1448, 'eval_samples_per_second': 1512.748, 'eval_steps_per_second': 27.63, 'epoch': 4.76}
{'loss': 0.3328, 'grad_norm': 7.859753608703613, 'learning_rate': 2.909090909090909e-05, 'epoch': 7.14}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.5644059777259827, 'eval_MSE': 0.5644059777259827, 'eval_RMSE': 0.7512695789337158, 'eval_MAE': 0.6069604754447937, 'eval_Pearson': 0.8552356505469322, "eval_Spearman's Rank": 0.8280123801555602, 'eval_runtime': 0.1528, 'eval_samples_per_second': 1432.901, 'eval_steps_per_second': 26.172, 'epoch': 7.14}
{'loss': 0.1599, 'grad_norm': 7.392002582550049, 'learning_rate': 6.363636363636363e-06, 'epoch': 9.52}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.581851065158844, 'eval_MSE': 0.5818511247634888, 'eval_RMSE': 0.7627916932106018, 'eval_MAE': 0.5855435132980347, 'eval_Pearson': 0.8491636079985622, "eval_Spearman's Rank": 0.8249211146562015, 'eval_runtime': 0.1509, 'eval_samples_per_second': 1451.578, 'eval_steps_per_second': 26.513, 'epoch': 9.52}
{'train_runtime': 12.1191, 'train_samples_per_second': 542.12, 'train_steps_per_second': 17.328, 'train_loss': 0.5920652423586165, 'epoch': 10.0}


TrainOutput(global_step=210, training_loss=0.5920652423586165, metrics={'train_runtime': 12.1191, 'train_samples_per_second': 542.12, 'train_steps_per_second': 17.328, 'train_loss': 0.5920652423586165, 'epoch': 10.0})

# Подсчет метрик для каждого сета

In [100]:
q=[trainer.evaluate(eval_dataset=data) for data in [RegressionDataset(train), RegressionDataset(val), RegressionDataset(test)]]
pd.DataFrame(q, index=["train","val","test"]).iloc[:,:6]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,eval_loss,eval_MSE,eval_RMSE,eval_MAE,eval_Pearson,eval_Spearman's Rank
train,0.063369,0.063369,0.251733,0.200834,0.986426,0.985437
val,0.5924,0.5924,0.769675,0.58818,0.849793,0.827138
test,0.659939,0.659939,0.812366,0.616345,0.850734,0.831835


# Проверка на двух схожих предложениях

In [104]:
s1, s2="A plane is taking off.", "An air plane is taking off."

encoding = tokenizer(
    s1,s2,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512
    )

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
model = model.cpu()
outputs = model(input_ids, attention_mask = attention_mask)
outputs.logits.item()

4.8192057609558105

# Проверка на двух разных предложениях

In [105]:
s1, s2="The men are playing soccer.", "A man is riding a motorcycle."

encoding = tokenizer(
    s1, s2,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512
    )

input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
outputs = model(input_ids, attention_mask=attention_mask)
outputs.logits.item()

0.3200218379497528