In [None]:
!pip install datasets transformers

In [None]:
!pip3 install accelerate -U

In [3]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification, AdamW
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import random

import pandas as pd
import numpy as np

In [4]:
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

np.random.seed(seed)
random.seed(seed)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
 #загрузим датасет
imdb = load_dataset("imdb")

In [7]:
imdb['test']['text'][0]

'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as they have

In [8]:
len(imdb['test']['text'])

25000

In [9]:
train_data = pd.DataFrame({'text': imdb['train']['text'], 'label': imdb['train']['label']})
test_data = pd.DataFrame({'text': imdb['test']['text'], 'label': imdb['test']['label']})

train_data = train_data.sample(n=2000, random_state=42)
test_data = test_data.sample(n=500, random_state=42)

In [10]:
train_data['label'].value_counts()

0    1040
1     960
Name: label, dtype: int64

In [11]:
train_texts = list(train_data['text'])
test_texts = list(test_data['text'])

train_labels = torch.tensor(list(train_data['label']))
test_labels = torch.tensor(list(test_data['label']))

Замерим качество на предуобченной модели для text classification без файнтюнинга

In [12]:
#загрузим модель
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = TensorDataset(torch.tensor(train_encodings["input_ids"]),
                              torch.tensor(train_encodings["attention_mask"]),
                              train_labels)
test_dataset = TensorDataset(torch.tensor(test_encodings["input_ids"]),
                             torch.tensor(test_encodings["attention_mask"]),
                             test_labels)

In [None]:
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True)

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

In [15]:
def evaluate_model(model, dataloader, device):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask)
            predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

    return predictions


train_preds = evaluate_model(model, train_loader, device)
test_preds = evaluate_model(model, test_loader, device)

100%|██████████| 125/125 [00:40<00:00,  3.11it/s]
100%|██████████| 32/32 [00:09<00:00,  3.22it/s]


In [16]:
train_accuracy = accuracy_score(train_labels, train_preds)
train_f1 = f1_score(train_labels, train_preds)

test_accuracy = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds)

print('####Train metrics#####')
print(f"Accuracy: {train_accuracy:.4f}")
print(f"F1 Score: {train_accuracy:.4f}")

print('####Test metrics#####')
print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {test_f1:.4f}")

####Train metrics#####
Accuracy: 0.5200
F1 Score: 0.5200
####Test metrics#####
Accuracy: 0.5300
F1 Score: 0.0000


**Файнтюнинг модели**

In [17]:
def compute_metrics(eval_preds):
    metric = load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
def tokenize(examples):
    outputs = tokenizer(examples['text'], truncation=True)
    return outputs

tokenized_ds = imdb.map(tokenize, batched=True)

In [38]:
training_args = TrainingArguments(num_train_epochs=5,
                                  output_dir="distilbert-imdb",
                                  push_to_hub=False,
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  evaluation_strategy="epoch",
                                  fp16=True)


In [39]:
data_collator = DataCollatorWithPadding(tokenizer)

In [40]:
trainer = Trainer(model=model, tokenizer=tokenizer,
                  data_collator=data_collator,
                  args=training_args,
                  train_dataset=tokenized_ds["train"],
                  eval_dataset=tokenized_ds["test"],
                  compute_metrics=compute_metrics)

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2462,0.220878,0.92108
2,0.1458,0.246636,0.9242
3,0.0735,0.322792,0.92768


  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2462,0.220878,0.92108
2,0.1458,0.246636,0.9242
3,0.0735,0.322792,0.92768
4,0.0274,0.373432,0.9306
5,0.012,0.432736,0.93228


TrainOutput(global_step=7815, training_loss=0.10578999012918405, metrics={'train_runtime': 3323.6013, 'train_samples_per_second': 37.61, 'train_steps_per_second': 2.351, 'total_flos': 1.6394784128794656e+16, 'train_loss': 0.10578999012918405, 'epoch': 5.0})

In [43]:
torch.save(model.state_dict(), "fine_tuned_bert")

In [44]:
#inference model

train_preds = evaluate_model(model, train_loader, device)
test_preds = evaluate_model(model, test_loader, device)

100%|██████████| 125/125 [00:13<00:00,  9.27it/s]
100%|██████████| 32/32 [00:03<00:00,  9.39it/s]


In [45]:
train_accuracy = accuracy_score(train_labels, train_preds)
train_f1 = f1_score(train_labels, train_preds)

test_accuracy = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds)

print('####Fine-tuned Train metrics#####')
print(f"Accuracy: {train_accuracy:.4f}")
print(f"F1 Score: {train_accuracy:.4f}")

print('####Fine-tuned Test metrics#####')
print(f"Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {test_f1:.4f}")

####Fine-tuned Train metrics#####
Accuracy: 0.4980
F1 Score: 0.4980
####Fine-tuned Test metrics#####
Accuracy: 0.9320
F1 Score: 0.9280
