# Контекст

- модель 'distilbert-base-uncased'
- оптимизатор AdamW (реализация алгоритма Adam, но с исправлением спада	веса)
- используем Pytorch а не Trainer


In [131]:
import pandas as pd
import numpy as np
import warnings

from transformers import DistilBertForSequenceClassification,\
                         DistilBertTokenizerFast,\
                         AdamW

import torch
from torch.utils.data import Dataset,\
                             DataLoader


from sklearn.model_selection import train_test_split


warnings.filterwarnings("ignore")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA ?: ', torch.cuda.is_available())

CUDA ?:  True


# Скачиваем DistilBert оболочку

In [132]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 

The tokenizer class you load from this checkpoint is 'BertTokenizer'. 

The class this function is called from is 'DistilBertTokenizerFast'.


# Чтение файла

In [134]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,text,sentiment
0,"Now, I won't deny that when I purchased this o...",neg
1,"The saddest thing about this ""tribute"" is that...",neg
2,Last night I decided to watch the prequel or s...,neg
3,I have to admit that i liked the first half of...,neg
4,I was not impressed about this film especially...,neg


In [135]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,text,sentiment
0,"My daughter liked it but I was aghast, that a ...",neg
1,I... No words. No words can describe this. I w...,neg
2,this film is basically a poor take on the old ...,neg
3,"This is a terrible movie, and I'm not even sur...",neg
4,First of all this movie is a piece of reality ...,pos


In [136]:
val, test = train_test_split(
    test,
    test_size = 0.5,
    random_state = 42,
    stratify = test['sentiment']
)

val['sentiment'].value_counts()

sentiment
neg    6250
pos    6250
Name: count, dtype: int64

In [137]:
# обработка меток
train['sentiment'] = train['sentiment'].map(lambda x: np.where(x == 'neg', 0, 1))
val['sentiment'] = val['sentiment'].map(lambda x: np.where(x == 'neg', 0, 1))
test['sentiment'] = test['sentiment'].map(lambda x: np.where(x == 'neg', 0, 1))

# Dataset

In [138]:
class ImdbDataset(Dataset):

    def __init__(self, df):

        self.data = df # берем не весь, для упрощения обучения
        self.tokenize_texts = tokenizer(
            self.data['text'].to_list(),
            return_tensors = 'pt',
            padding = True,
            truncation = True,
            max_length = 512
            )

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        input_ids = self.tokenize_texts['input_ids'][index]
        attention_mask = self.tokenize_texts['attention_mask'][index]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': row['sentiment']
        }

# Dataloader

In [139]:
torch.manual_seed(42)

train_dataloader = DataLoader(
    dataset = ImdbDataset(train),
    batch_size = 64,
    shuffle = True,
    drop_last = True
)

val_dataloader = DataLoader(
    dataset = ImdbDataset(val),
    batch_size = 64,
    shuffle = False,
    drop_last = True
)

In [None]:
optimizer = AdamW(model.parameters(), lr = 0.001)


for epoch in range(3):

    model = model.to(device)
    model.train()

    loss_train_all = 0.0
    batch_size = 0.0

    for batch in train_dataloader:

        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        loss_train_all += loss.item()


        loss.backward()
        optimizer.step()

    loss_train = loss_train_all / len(train_dataloader)

    model.eval()

    kol_vo_correct_predict = 0
    all_kol_vo = 0

    for batch in val_dataloader:

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        predictions = outputs.logits.argmax(dim=-1)

        kol_vo_correct_predict += (predictions == labels).sum().item()
        all_kol_vo += labels.size(0)
    
    val_accuracy = kol_vo_correct_predict / all_kol_vo

    print('Epoch {} || train_loss: {:.3f} || val accuracy: {:.3f}'.format(epoch+1,
                                                                          loss_train,
                                                                          val_accuracy))