# Контекст

- бинарная классификация текста с одни предложением (анализ тональности)
- тонкая настройка BERT (DistilBertForSequenceClassification)
- IMDB датасет
- 25000 train
- 12500 val
- 12500 test

In [11]:
# база
import pandas as pd
import warnings
import numpy as np
import json
import time

# визуализация
import seaborn as sns
import matplotlib.pyplot as plt

# nltk
from nltk.lm import Vocabulary

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,\
                            precision_recall_fscore_support

# transformers
from transformers import DistilBertForSequenceClassification,\
                         DistilBertTokenizerFast,\
                         TrainingArguments,\
                         Trainer

# datasets
from datasets import load_dataset

# константы
RANDOM_STATE = 42
warnings.filterwarnings("ignore")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA ?: ', torch.cuda.is_available())

CUDA ?:  True


# Загрузка модели - токенизатора

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    id2label = {0: 'NEG', 1: 'POS'},
    label2id = {'NEG': 0, 'POS': 1})

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Чтение

In [28]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,text,sentiment
0,"Now, I won't deny that when I purchased this o...",neg
1,"The saddest thing about this ""tribute"" is that...",neg
2,Last night I decided to watch the prequel or s...,neg
3,I have to admit that i liked the first half of...,neg
4,I was not impressed about this film especially...,neg


In [29]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,text,sentiment
0,"My daughter liked it but I was aghast, that a ...",neg
1,I... No words. No words can describe this. I w...,neg
2,this film is basically a poor take on the old ...,neg
3,"This is a terrible movie, and I'm not even sur...",neg
4,First of all this movie is a piece of reality ...,pos


In [30]:
val, test = train_test_split(
    test,
    test_size = 0.5,
    random_state = RANDOM_STATE,
    stratify = test['sentiment']
)

In [None]:
val['sentiment'].value_counts()

sentiment
neg    6250
pos    6250
Name: count, dtype: int64

In [None]:
test['sentiment'].value_counts()

sentiment
pos    6250
neg    6250
Name: count, dtype: int64

# Маскируем 'neg' --> 0, 'pos' --> 1

In [None]:
train['sentiment'] = train['sentiment'].map(lambda x: np.where(x == 'neg', 0, 1))
val['sentiment'] = val['sentiment'].map(lambda x: np.where(x == 'neg', 0, 1))
test['sentiment'] = test['sentiment'].map(lambda x: np.where(x == 'neg', 0, 1))

In [None]:
train['sentiment'].value_counts()

sentiment
0    12500
1    12500
Name: count, dtype: int64

# Токенизируем train

In [None]:
tokenize_data_train = tokenizer(
                    train['text'].to_list(),
                    padding = True,
                    truncation = True,
                    return_attention_mask=True
                    )

tokenize_data_val = tokenizer(
                    val['text'].to_list(),
                    padding = True,
                    truncation = True,
                    return_attention_mask=True
                    )

train['input_ids'], train['attention_mask'] = tokenize_data_train['input_ids'], tokenize_data_train['attention_mask']
val['input_ids'], val['attention_mask'] = tokenize_data_val['input_ids'], tokenize_data_val['attention_mask']

In [None]:
train.head()

Unnamed: 0,text,sentiment,input_ids,attention_mask
0,"Now, I won't deny that when I purchased this o...",0,"[101, 2085, 1010, 1045, 2180, 1005, 1056, 9772...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"The saddest thing about this ""tribute"" is that...",0,"[101, 1996, 6517, 6155, 2102, 2518, 2055, 2023...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Last night I decided to watch the prequel or s...,0,"[101, 2197, 2305, 1045, 2787, 2000, 3422, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,I have to admit that i liked the first half of...,0,"[101, 1045, 2031, 2000, 6449, 2008, 1045, 4669...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,I was not impressed about this film especially...,0,"[101, 1045, 2001, 2025, 7622, 2055, 2023, 2143...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Trainer args

In [None]:
training_args = TrainingArguments(

    output_dir = 'training/model_points', # сохранение контрольных точек модели
    do_train = True, # мониторинг производительности
    do_eval = True,
    num_train_epochs = 3,
    per_device_train_batch_size = 32,
    per_gpu_eval_batch_size = 64,
    warmup_steps = 100, # оптмизация скорости обучения
    weight_decay = 0.01, # регуляризация весов модели
    logging_strategy = 'steps', # аналог verbose с сохранением логов (также есть 'epoch')
    logging_dir = 'training/logs',
    save_steps = 200,
    logging_steps = 100,
    evaluation_strategy = 'steps',
    fp16 = True, # Указывает на использование смешанной точности и использует как 16-, так и 32-битные типы с плавающей запятой, чтобы 
                 # обучение проходило быстрее и занимало меньше памяти
    load_best_model_at_end = True
    
)

# Функция подсчета метрик

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    Precision, Recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
            'Accuracy': acc,
            'F1': f1,
            'Precision': Precision,
            'Recall': Recall
            }

# Dataset

In [None]:
class ImdbDataset(Dataset):

    def __init__(self, df):
        self.data = df

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        return {
            'input_ids': row['input_ids'],
            'attention_mask': row['attention_mask'],
            'labels': row['sentiment']
        }

# Обучение

In [None]:
trainer = Trainer(

    model = model,
    args = training_args,
    train_dataset = ImdbDataset(train),
    eval_dataset = ImdbDataset(val),
    compute_metrics = compute_metrics
    
)

In [None]:
results = trainer.train()

# Проверка модели

In [None]:
q=[trainer.evaluate(eval_dataset=ImdbDataset(data)) for data in [train, val]]
pd.DataFrame(q, index=["train","val"]).iloc[:,:5]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


  0%|          | 0/391 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


  0%|          | 0/196 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,0.140489,0.94988,0.949877,0.949987,0.94988
val,0.198395,0.9216,0.92157,0.922254,0.9216


In [None]:
def get_prediction(text):
    inputs = tokenizer(text, padding=True,truncation=True, max_length=250, return_tensors="pt").to(device)
    outputs = model(
        inputs["input_ids"].to(device),
        inputs["attention_mask"].to(device)
        )
    probs = outputs[0].softmax(1)
    return probs, probs.argmax()


# text = 'I don`t like this movie'
# get_prediction(text)[1].item()

# Сохранение

In [None]:
model_save_path = "MyBestIMDB_binary_Model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('MyBestIMDB_binary_Model\\tokenizer_config.json',
 'MyBestIMDB_binary_Model\\special_tokens_map.json',
 'MyBestIMDB_binary_Model\\vocab.txt',
 'MyBestIMDB_binary_Model\\added_tokens.json',
 'MyBestIMDB_binary_Model\\tokenizer.json')

In [31]:
from transformers import pipeline,\
                         DistilBertForSequenceClassification,\
                         DistilBertTokenizerFast


model = DistilBertForSequenceClassification.from_pretrained('MyBestIMDB_binary_Model')
tokenizer = DistilBertTokenizerFast.from_pretrained('MyBestIMDB_binary_Model')

nlp = pipeline(
    'sentiment-analysis',
    model = model,
    tokenizer = tokenizer
)

results = []
for text in test['text']:
    text = text[:512]
    results.append(nlp(text)[0]['label'])

In [32]:
test['predict'] = results
test.head()

Unnamed: 0,text,sentiment,predict
17392,This is one of the best Czech movies I have ev...,pos,POS
11969,I saw this movie because every review I read o...,neg,NEG
8306,I've just watched this with my three children ...,pos,POS
17625,"This film did well at the box office, and the ...",neg,NEG
22088,Surface was one of the few truly unique shows ...,pos,POS


In [33]:
test['predict'] = test['predict'].map(lambda x: x.lower())
test.head()

Unnamed: 0,text,sentiment,predict
17392,This is one of the best Czech movies I have ev...,pos,pos
11969,I saw this movie because every review I read o...,neg,neg
8306,I've just watched this with my three children ...,pos,pos
17625,"This film did well at the box office, and the ...",neg,neg
22088,Surface was one of the few truly unique shows ...,pos,pos


In [38]:
print('Точность на тестовом наборе: ', (test['sentiment'] == test['predict']).sum() / len(test))

Точность на тестовом наборе:  0.85664
