## Классификация текстов с использованием предобученных языковых моделей.

В данном задании вам предстоит обратиться к задаче классификации текстов и решить ее с использованием предобученной модели BERT.

In [59]:
import json
# do not change the code in the block below
# __________start of block__________
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython import display
# from IPython.display import clear_output
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

%matplotlib inline
# __________end of block__________

In [39]:
try:
    from torchinfo import summary
except:
    !pip install torchinfo
    from torchinfo import summary

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


Обратимся к набору данных SST-2. Holdout часть данных (которая понадобится вам для посылки) доступна по ссылке ниже.

In [2]:
# do not change the code in the block below
# __________start of block__________

!wget https://raw.githubusercontent.com/girafe-ai/ml-course/refs/heads/24f_yandex_ml_trainings/homeworks/hw04_bert_and_co/texts_holdout.json
# __________end of block__________

--2024-11-21 23:10:56--  https://raw.githubusercontent.com/girafe-ai/ml-course/refs/heads/24f_yandex_ml_trainings/homeworks/hw04_bert_and_co/texts_holdout.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51581 (50K) [text/plain]
Saving to: ‘texts_holdout.json’


2024-11-21 23:10:56 (5.53 MB/s) - ‘texts_holdout.json’ saved [51581/51581]



In [125]:
# do not change the code in the block below
# __________start of block__________
df = pd.read_csv(
    "https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv",
    delimiter="\t",
    header=None,
)
texts_train = df[0].values[:5000]
y_train = df[1].values[:5000]
texts_test = df[0].values[5000:]
y_test = df[1].values[5000:]
with open("texts_holdout.json") as iofile:
    texts_holdout = json.load(iofile)
# __________end of block__________

Весь остальной код предстоит написать вам.

Для успешной сдачи на максимальный балл необходимо добиться хотя бы __84.5% accuracy на тестовой части выборки__.

In [4]:
# your beautiful experiments here

#### get embeddings

In [127]:
from transformers import DistilBertModel, DistilBertTokenizer

In [128]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [129]:
tokenized_train = tokenizer(texts_train.tolist(), return_tensors='pt', padding=True)
tokenized_test = tokenizer(texts_test.tolist(), return_tensors='pt', padding=True)
tokenized_holdout = tokenizer(texts_holdout, return_tensors='pt', padding=True)

In [130]:
for key, value in tokenized_train.items():
    value_type = type(value).__name__
    print(f'{key}: {value_type}. {value.shape}')

input_ids: Tensor. torch.Size([5000, 67])
attention_mask: Tensor. torch.Size([5000, 67])


In [131]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [132]:
def get_features(model, tokenized_text, num_batches, batch_size):
    features = []
    with torch.no_grad():
        for i in range(0, num_batches, batch_size):
            text_batch = tokenized_text['input_ids'][i:i + batch_size].to(device)
            masks_batch = tokenized_text['attention_mask'][i:i + batch_size].to(device)
            output = model(text_batch, masks_batch)
            batch_features = output.last_hidden_state[:, 0, :].cpu().numpy()
            features.append(batch_features)

    features = np.concatenate(features, axis=0)

    return features

In [133]:
batch_size = 32

train_features = get_features(model, tokenized_train, len(texts_train), batch_size)
test_features = get_features(model, tokenized_test, len(texts_test), batch_size)
holdout_features = get_features(model, tokenized_holdout, len(texts_holdout), batch_size)

#### manual logreg

In [69]:
class logreg(nn.Module):
    def __init__(self, in_size, out_size):
        super().__init__()

        self.linear = nn.Linear(in_features=in_size, out_features=out_size)

    def forward(self, x):
        return self.linear(x)

In [None]:
train_features = torch.FloatTensor(train_features).to(device)
test_features = torch.FloatTensor(test_features).to(device)
holdout_features = torch.FloatTensor(holdout_features).to(device)
y_train = torch.LongTensor(y_train).to(device)
y_test = torch.LongTensor(y_test).to(device)

In [83]:
manual_logreg = logreg(768, 2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(manual_logreg.parameters(), lr=0.001)

In [85]:
num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    manual_logreg.train()
    epoch_loss = 0
    for i in range(0, len(train_features), batch_size):
        # Получаем батч данных
        batch_features = train_features[i:i+batch_size]
        batch_labels = y_train[i:i+batch_size]  # CrossEntropyLoss ожидает long (целые числа)

        # Прямой проход
        outputs = manual_logreg(batch_features)  # Логиты
        loss = criterion(outputs, batch_labels)

        # Обратный проход и обновление весов
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_features):.4f}")

Epoch 1/10, Loss: 0.0166
Epoch 2/10, Loss: 0.0134
Epoch 3/10, Loss: 0.0125
Epoch 4/10, Loss: 0.0120
Epoch 5/10, Loss: 0.0117
Epoch 6/10, Loss: 0.0115
Epoch 7/10, Loss: 0.0113
Epoch 8/10, Loss: 0.0111
Epoch 9/10, Loss: 0.0110
Epoch 10/10, Loss: 0.0109


In [92]:
from torch.nn.functional import softmax

manual_logreg.eval()
with torch.no_grad():
    train_outputs = manual_logreg(train_features)
    train_probabilities = softmax(train_outputs, dim=1)
    train_predictions = torch.argmax(train_outputs, dim=1)

    # Рассчитайте точность
    accuracy = (train_predictions == y_train).float().mean()
    print(f"Train Accuracy: {accuracy:.4f}")

Train Accuracy: 0.8564


In [93]:
manual_logreg.eval()
with torch.no_grad():
    test_outputs = manual_logreg(test_features)  # Логиты
    test_probabilities = softmax(test_outputs, dim=1)  # Вероятности по каждому классу
    test_predictions = torch.argmax(test_probabilities, dim=1)  # Индексы максимальных вероятностей

    # Рассчитайте точность
    accuracy = (test_predictions == y_test).float().mean()
    print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8432


In [94]:
with torch.no_grad():
    holdout_outputs = manual_logreg(holdout_features)  # Логиты
    holdout_probabilities = softmax(holdout_outputs, dim=1)  # Вероятности
    holdout_predictions = torch.argmax(holdout_probabilities, dim=1)

In [120]:
pos_train = [probs[0].item() for probs in train_probabilities]
pos_test = [probs[0].item() for probs in test_probabilities]
pos_holdout = [probs[0].item() for probs in holdout_probabilities]

#### sklean logreg

In [134]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(train_features, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [135]:
log_reg.score(test_features, y_test)

0.8458333333333333

In [136]:
train_pred_proba = log_reg.predict_proba(train_features)
test_pred_proba = log_reg.predict_proba(test_features)
holdout_pred_proba = log_reg.predict_proba(holdout_features)

In [143]:
pos_train = [probs[1] for probs in train_pred_proba]
pos_test = [probs[1] for probs in test_pred_proba]
pos_holdout = [probs[1] for probs in holdout_pred_proba]

#### Сдача взадания в контест
Сохраните в словарь `out_dict` вероятности принадлежности к первому (положительному) классу

In [144]:
out_dict = {
    'train': pos_train,  # list of length 5000 with probas
    'test': pos_test,  # list of length 1920 with probas
    'holdout': pos_holdout  # list of length 500 with probas
}

Несколько `assert`'ов для проверки вашей посылки:

In [145]:
assert isinstance(out_dict["train"], list), "Object must be a list of floats"
assert isinstance(out_dict["train"][0], float), "Object must be a list of floats"
assert (
    len(out_dict["train"]) == 5000
), "The predicted probas list length does not match the train set size"

assert isinstance(out_dict["test"], list), "Object must be a list of floats"
assert isinstance(out_dict["test"][0], float), "Object must be a list of floats"
assert (
    len(out_dict["test"]) == 1920
), "The predicted probas list length does not match the test set size"

assert isinstance(out_dict["holdout"], list), "Object must be a list of floats"
assert isinstance(out_dict["holdout"][0], float), "Object must be a list of floats"
assert (len(out_dict["holdout"]) == 500
), "The predicted probas list length does not match the holdout set size"

Запустите код ниже для генерации посылки.

In [146]:
# do not change the code in the block below
# __________start of block__________
FILENAME = "submission_dict_hw_text_classification_with_bert.json"

with open(FILENAME, "w") as iofile:
    json.dump(out_dict, iofile)
print(f"File saved to `{FILENAME}`")
# __________end of block__________

File saved to `submission_dict_hw_text_classification_with_bert.json`


На этом задание завершено. Поздравляем!