Необходимо решить задачу классификации текстов, используя рекуррентные сети.

## Data cleaning

In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('data/train.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41159 entries, 0 to 41158
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41158 non-null  object
 1   Text        41158 non-null  object
 2   Sentiment   41155 non-null  object
dtypes: object(3)
memory usage: 964.8+ KB


In [4]:
train_df[train_df.Sentiment.isna()]

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
33122,33122,@PrivyCouncilCA #SocialDistancing isnÂt enoug...,
33123,,Neutral,
39205,39204,@TanDhesi @foreignoffice @Afzal4Gorton @Expres...,
39206,Neutral,,


Стоит удалить из обучающей выборки неразмеченные данные

In [5]:
# Удаляем пустышки и ненужные индексы
train_df = train_df.dropna(subset=['Sentiment'])
train_df = train_df.drop('Unnamed: 0', axis=1).reset_index(drop=True)

In [6]:
train_df.Sentiment.value_counts()

Positive              11422
Negative               9917
Neutral                7711
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['Sentiment_encoded'] = le.fit_transform(train_df['Sentiment'])
text_labels = train_df['Sentiment'].unique()
numeric_labels = le.fit_transform(text_labels)
print(text_labels, numeric_labels)

['Neutral' 'Positive' 'Extremely Negative' 'Negative' 'Extremely Positive'] [3 4 0 2 1]


In [8]:
train_df.head(5)

Unnamed: 0,Text,Sentiment,Sentiment_encoded
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,3
1,advice Talk to your neighbours family to excha...,Positive,4
2,Coronavirus Australia: Woolworths to give elde...,Positive,4
3,My food stock is not the only one which is emp...,Positive,4
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,0


In [9]:
# Чистим текст от мусора

import re

def clean_text(text):
    # Удаление слов, начинающихся с "@"
    text = re.sub(r'@\w+', '', text)

    # Удаление URL, начинающихся с "https://"
    text = re.sub(r'https?://\S+', '', text)

    # Удаление '#' из начала слов
    text = re.sub(r'#\w+\b', lambda x: x.group()[1:], text)

    return text

train_df.Text = train_df.Text.apply(lambda text: clean_text(text))

In [10]:
train_df.head(5)

Unnamed: 0,Text,Sentiment,Sentiment_encoded
0,and and,Neutral,3
1,advice Talk to your neighbours family to excha...,Positive,4
2,Coronavirus Australia: Woolworths to give elde...,Positive,4
3,My food stock is not the only one which is emp...,Positive,4
4,"Me, ready to go at supermarket during the COVI...",Extremely Negative,0


In [11]:
# train_data.to_csv('/content/drive/MyDrive/NLP/text_data.csv', index=False, mode='w')

## Tokenization

In [12]:
#pip install -U torchtext==0.6

In [13]:

import torch
import torch.nn as nn
import torchtext
from torchtext.data import TabularDataset, Field, BucketIterator, Example, Dataset

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [14]:
"""TEXT = Field(sequential=True, tokenize='basic_english', lower=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.int64)

dataset = TabularDataset(
    path='/content/drive/MyDrive/NLP/text_data.csv',
    format='csv',
    fields={'Text': ('text', TEXT), 'Sentiment_encoded': ('label', LABEL)},
    skip_header=False
)

train_data, valid_data, test_data = dataset.split(split_ratio=[0.7, 0.2, 0.1])"""

"TEXT = Field(sequential=True, tokenize='basic_english', lower=True)\nLABEL = Field(sequential=False, use_vocab=False, dtype=torch.int64)\n\ndataset = TabularDataset(\n    path='/content/drive/MyDrive/NLP/text_data.csv',\n    format='csv',\n    fields={'Text': ('text', TEXT), 'Sentiment_encoded': ('label', LABEL)},\n    skip_header=False\n)\n\ntrain_data, valid_data, test_data = dataset.split(split_ratio=[0.7, 0.2, 0.1])"

In [15]:
TEXT = Field(sequential=True, tokenize='basic_english', lower=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.int64)

# Создаем Dataset
examples = []
for i, row in train_df.iterrows():
    text = row['Text']
    label = row['Sentiment_encoded']
    examples.append(Example.fromlist([text, label], fields=[('text', TEXT), ('label', LABEL)]))

dataset = Dataset(examples, fields=[('text', TEXT), ('label', LABEL)])

# Разбиваем train_data, valid_data, test_data
train_data, valid_data, test_data = dataset.split(
    split_ratio=[0.7, 0.2, 0.1],
    stratified=True,
    strata_field='label'
)

# Построение словаря
TEXT.build_vocab(train_data, min_freq=1)

In [16]:
for example in dataset.examples:
    print("Label:", example.label)
    print("Text:", example.text)
    print("\n")
    break

Label: 3
Text: ['and', 'and']




## Архитектуры RNN

In [17]:
# Определяем архитектуру модели RNN
"""class RNNClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))"""

# Определяем архитектуру модели RNN c Dropout
class RNNClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)  # Добавлен Dropout-слой

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        # Применяем Dropout к выходу перед полносвязным слоем
        output = self.dropout(output)
        return self.fc(hidden.squeeze(0))

# Определение архитектуры модели RNN c GRU
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.gru(embedded)
        hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

# Определение архитектуры модели RNN c LSTM
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1, :, :])
        return self.fc(hidden)

In [18]:

# Функция для обучения модели
def train(model, iterator, optimizer, criterion):
    model.train()
    total_loss = 0

    for batch in iterator:
        optimizer.zero_grad()
        text = batch.text
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(iterator)


# Функция для оценки модели
def evaluate(model, iterator, criterion):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, batch.label)
            total_loss += loss.item()

    return total_loss / len(iterator)


# Функция для оценки модели
def evaluate_metrics(model, iterator):
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            predictions = model(text).squeeze(1).argmax(dim=1).cpu().numpy()
            labels = batch.label.cpu().numpy()

            all_labels.extend(labels)
            all_predictions.extend(predictions)

    accuracy = accuracy_score(all_labels, all_predictions)
    confusion = confusion_matrix(all_labels, all_predictions)
    class_report = classification_report(all_labels, all_predictions)

    return accuracy, confusion, class_report


## vanilla RNN

In [20]:

# Определяем параметры модели RNN
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = 5
n_layers = 1
dropout = 0.2

model = RNNClassifier(input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
model.to(device)

# Определяем функцию потерь и оптимизатор
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())

# Создаем итераторы для данных
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
#    repeat=True, # Repeat the iterator for multiple epochs - слишком требовательна к ресурсам
    shuffle=True,  # Shuffle data on each epoch run
    sort_key = lambda x: len(x.text), # Function to use for sorting examples
    sort=False, # Sort all examples in data using sort_key
    sort_within_batch=True # Use `sort_key` to sort examples in each batch
)

# Обучаем модель
best_valid_loss = float('inf')  # Инициализация переменной для отслеживания лучшей потери
patience = 3  # Установка порога терпимости

N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
#        torch.save(model.state_dict(), 'best_model.pt')  # Сохранение лучших весов модели

    # Проверка Early Stopping
    if epoch > patience and valid_loss >= best_valid_loss:
        print(f'Early stopping after {epoch - patience} epochs of no improvement.')
        break

# Загрузка лучших весов модели
# model.load_state_dict(torch.load('best_model.pt'))

# Оцениваем модель на тестовых данных
test_loss = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}')

test_accuracy, test_confusion, test_class_report = evaluate_metrics(model, test_iterator)
print(f'Test Accuracy: {test_accuracy:.4f}')
print('Test Confusion Matrix:')
print(test_confusion)
print('Test Classification Report:')
print(test_class_report)

Epoch: 01
	Train Loss: 1.587
	Validation Loss: 1.586
Epoch: 02
	Train Loss: 1.533
	Validation Loss: 1.507
Epoch: 03
	Train Loss: 1.431
	Validation Loss: 1.459
Epoch: 04
	Train Loss: 1.392
	Validation Loss: 1.439
Epoch: 05
	Train Loss: 1.267
	Validation Loss: 1.420
Early stopping after 1 epochs of no improvement.
Test Loss: 1.397
Test Accuracy: 0.3936
Test Confusion Matrix:
[[664  44 296  25  67]
 [ 49 755  78  43 400]
 [654 178 668 166 317]
 [215 181 364 388 394]
 [211 729 378 202 764]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.61      0.46      1096
           1       0.40      0.57      0.47      1325
           2       0.37      0.34      0.35      1983
           3       0.47      0.25      0.33      1542
           4       0.39      0.33      0.36      2284

    accuracy                           0.39      8230
   macro avg       0.40      0.42      0.39      8230
weighted avg       0.40      0.39      0.38   

## RNN с LSTM (Long Short-Term Memory)

In [21]:
# Определение гиперпараметров модели
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = 5
n_layers = 2
dropout = 0.2

# Создание экземпляра модели
model = LSTMClassifier(input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
model.to(device)

# Определяем функцию потерь и оптимизатор
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())

# Создаем итераторы для данных
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
#    repeat=True, # Repeat the iterator for multiple epochs - слишком требовательна к ресурсам
    shuffle=True,  # Shuffle data on each epoch run
    sort_key = lambda x: len(x.text), # Function to use for sorting examples
    sort=False, # Sort all examples in data using sort_key
    sort_within_batch=True # Use `sort_key` to sort examples in each batch
)

# Обучаем модель
best_valid_loss = float('inf')  # Инициализация переменной для отслеживания лучшей потери
patience = 3  # Установка порога терпимости

N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
#        torch.save(model.state_dict(), 'best_model.pt')  # Сохранение лучших весов модели

    # Проверка Early Stopping
    if epoch > patience and valid_loss >= best_valid_loss:
        print(f'Early stopping after {epoch - patience} epochs of no improvement.')
        break

# Загрузка лучших весов модели
# model.load_state_dict(torch.load('best_model.pt'))

# Оцениваем модель на тестовых данных
test_loss = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}')

test_accuracy, test_confusion, test_class_report = evaluate_metrics(model, test_iterator)
print(f'Test Accuracy: {test_accuracy:.4f}')
print('Test Confusion Matrix:')
print(test_confusion)
print('Test Classification Report:')
print(test_class_report)

Epoch: 01
	Train Loss: 1.518
	Validation Loss: 1.398
Epoch: 02
	Train Loss: 1.310
	Validation Loss: 1.286
Epoch: 03
	Train Loss: 1.122
	Validation Loss: 1.181
Epoch: 04
	Train Loss: 0.945
	Validation Loss: 1.042
Epoch: 05
	Train Loss: 0.782
	Validation Loss: 1.057
Early stopping after 1 epochs of no improvement.
Test Loss: 1.026
Test Accuracy: 0.6180
Test Confusion Matrix:
[[ 535   15  466   25   55]
 [   2  922   46   16  339]
 [ 142   46 1216  246  333]
 [   4   23  223 1047  245]
 [  12  367  325  214 1366]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.49      0.60      1096
           1       0.67      0.70      0.68      1325
           2       0.53      0.61      0.57      1983
           3       0.68      0.68      0.68      1542
           4       0.58      0.60      0.59      2284

    accuracy                           0.62      8230
   macro avg       0.65      0.61      0.62      8230
weighted avg       0.

## RNN с GRU (Gated Recurrent Unit)

In [24]:

# Определение гиперпараметров модели
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = 5
n_layers = 2
dropout = 0.1

# Создание экземпляра модели
model = GRUClassifier(input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
model.to(device)

# Определяем функцию потерь и оптимизатор
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)

# Создаем итераторы для данных
BATCH_SIZE = 16

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
#    repeat=True, # Repeat the iterator for multiple epochs - слишком требовательна к ресурсам
    shuffle=True,  # Shuffle data on each epoch run
    sort_key = lambda x: len(x.text), # Function to use for sorting examples
    sort=False, # Sort all examples in data using sort_key
    sort_within_batch=True # Use `sort_key` to sort examples in each batch
)

# Обучаем модель
best_valid_loss = float('inf')  # Инициализация переменной для отслеживания лучшей потери
patience = 3  # Установка порога терпимости

N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, valid_iterator, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # torch.save(model.state_dict(), 'best_model.pt')  # Сохранение лучших весов модели

    # Проверка Early Stopping
    if epoch > patience and valid_loss >= best_valid_loss:
        print(f'Early stopping after {epoch - patience} epochs of no improvement.')
        break

# Загрузка лучших весов модели
# model.load_state_dict(torch.load('best_model.pt'))

# Оцениваем модель на тестовых данных
test_loss = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}')

test_accuracy, test_confusion, test_class_report = evaluate_metrics(model, test_iterator)
print(f'Test Accuracy: {test_accuracy:.4f}')
print('Test Confusion Matrix:')
print(test_confusion)
print('Test Classification Report:')
print(test_class_report)

Epoch: 01
	Train Loss: 1.319
	Validation Loss: 1.033
Epoch: 02
	Train Loss: 0.813
	Validation Loss: 0.746
Epoch: 03
	Train Loss: 0.546
	Validation Loss: 0.643
Epoch: 04
	Train Loss: 0.366
	Validation Loss: 0.635
Epoch: 05
	Train Loss: 0.251
	Validation Loss: 0.709
Early stopping after 1 epochs of no improvement.
Test Loss: 0.687
Test Accuracy: 0.7836
Test Confusion Matrix:
[[ 801    2  277    1   15]
 [   1 1004   12    2  306]
 [  99    3 1568   93  220]
 [   4    9  153 1236  140]
 [  11  140  193  100 1840]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.73      0.80      1096
           1       0.87      0.76      0.81      1325
           2       0.71      0.79      0.75      1983
           3       0.86      0.80      0.83      1542
           4       0.73      0.81      0.77      2284

    accuracy                           0.78      8230
   macro avg       0.81      0.78      0.79      8230
weighted avg       0.

## Выгрузка результата

Для финальной модели используем RNN с GRU как более быструю. Переобучим ее на полном датасете и выгрузим результат.

In [25]:
TEXT = Field(sequential=True, tokenize='basic_english', lower=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.int64)

# Создаем Dataset
examples = []
for i, row in train_df.iterrows():
    text = row['Text']
    label = row['Sentiment_encoded']
    examples.append(Example.fromlist([text, label], fields=[('text', TEXT), ('label', LABEL)]))

dataset = Dataset(examples, fields=[('text', TEXT), ('label', LABEL)])

# Разбиваем train_data, valid_data, test_data
train_data, test_data = dataset.split(
    split_ratio=[0.9, 0.1],
    stratified=True,
    strata_field='label'
)

# Построение словаря
TEXT.build_vocab(train_data, min_freq=1)

# Определение гиперпараметров модели
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = 5
n_layers = 2
dropout = 0.1

# Создание экземпляра модели
model = GRUClassifier(input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
model.to(device)

# Определяем функцию потерь и оптимизатор
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)

# Создаем итераторы для данных
BATCH_SIZE = 16

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
#    repeat=True, # Repeat the iterator for multiple epochs - слишком требовательна к ресурсам
    shuffle=True,  # Shuffle data on each epoch run
    sort_key = lambda x: len(x.text), # Function to use for sorting examples
    sort=False, # Sort all examples in data using sort_key
    sort_within_batch=True # Use `sort_key` to sort examples in each batch
)

# Обучаем модель
best_valid_loss = float('inf')  # Инициализация переменной для отслеживания лучшей потери
patience = 3  # Установка порога терпимости

N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)
    valid_loss = evaluate(model, test_iterator, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # torch.save(model.state_dict(), '/content/drive/MyDrive/NLP/best_model.pt')  # Сохранение лучших весов модели

    # Проверка Early Stopping
    if epoch > patience and valid_loss >= best_valid_loss:
        print(f'Early stopping after {epoch - patience} epochs of no improvement.')
        break

# Загрузка лучших весов модели
# model.load_state_dict(torch.load('/content/drive/MyDrive/NLP/best_model.pt'))

# Оцениваем модель на тестовых данных
test_loss = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}')

test_accuracy, test_confusion, test_class_report = evaluate_metrics(model, test_iterator)
print(f'Test Accuracy: {test_accuracy:.4f}')
print('')
print('Test Confusion Matrix:')
print(test_confusion)
print('')
print('Test Classification Report:')
print(test_class_report)

Epoch: 01
	Train Loss: 1.232
	Validation Loss: 0.873
Epoch: 02
	Train Loss: 0.700
	Validation Loss: 0.620
Epoch: 03
	Train Loss: 0.471
	Validation Loss: 0.563
Epoch: 04
	Train Loss: 0.329
	Validation Loss: 0.557
Epoch: 05
	Train Loss: 0.237
	Validation Loss: 0.605
Early stopping after 1 epochs of no improvement.
Test Loss: 0.603
Test Accuracy: 0.8044

Test Confusion Matrix:
[[447   0  96   1   4]
 [  1 489   7   0 165]
 [102   1 801  16  72]
 [  4   1  72 638  56]
 [  2  48 130  27 935]]

Test Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       548
           1       0.91      0.74      0.81       662
           2       0.72      0.81      0.76       992
           3       0.94      0.83      0.88       771
           4       0.76      0.82      0.79      1142

    accuracy                           0.80      4115
   macro avg       0.83      0.80      0.81      4115
weighted avg       0.81      0.80      0.81 

In [None]:
# Загрузка данных из test.csv
test_df = pd.read_csv('data/test.csv')
test_df.head(3)

Unnamed: 0,id,Text
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING: New Yorkers encounter empty supermar...
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldn't find hand sanitizer at Fred Me...
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...


In [None]:
# Чистим текст от мусора
test_df.Text = test_df.Text.apply(lambda text: clean_text(text))

# Переводим модель в режим оценки
model.eval()

# Создаем Dataset
examples = []
for i, row in test_df.iterrows():
    text = row['Text']
    examples.append(Example.fromlist([text], fields=[('text', TEXT)]))

test_dataset = Dataset(examples, fields=[('text', TEXT)])

# Создаем итератор для данных
BATCH_SIZE = 128
test_iterator = BucketIterator(
    test_dataset,
    batch_size=BATCH_SIZE,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    shuffle=False
)

# Строим предсказания
all_predictions = []

for batch in test_iterator:
    with torch.no_grad():
        predictions = model(batch.text)
    predicted_classes = torch.argmax(predictions, dim=1)
    predicted_classes = predicted_classes.cpu().numpy()
    all_predictions.extend(predicted_classes)

test_df['Sentiment_encoded'] = all_predictions

# Обратно перекодируем Sentiment
test_df['Sentiment'] = le.inverse_transform(test_df['Sentiment_encoded'])

# Выгружаем данные в submission.csv
submission_df = test_df[['id', 'Sentiment']]
submission_df.to_csv('data/sample_submission.csv', index=False)