<a href="https://colab.research.google.com/github/lonsst/ML_practice/blob/main/ML_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## 1. Классификация фамилий (RNN)

Датасет: https://disk.yandex.ru/d/frNchuaBQVLxyA?w=1

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('surnames.csv')
df.head(5)

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [None]:
# Encode nationality names as integer class ids.
# The column is overwritten in place, so only encode while it still holds names:
# re-running the cell on already-encoded data would otherwise rebuild
# `index_to_nationality` from integers and lose the human-readable labels.
if not pd.api.types.is_numeric_dtype(df['nationality']):
    nationalities = df['nationality'].unique()
    nationality_to_index = {nationality: index for index, nationality in enumerate(nationalities)}
    index_to_nationality = {index: nationality for index, nationality in enumerate(nationalities)}

    df['nationality'] = df['nationality'].map(nationality_to_index)

print(df.head())

    surname  nationality
0  Woodford            0
1      Coté            1
2      Kore            0
3     Koury            2
4    Lebzak            3


In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 8784
Test size: 2196


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset
import torch

In [None]:
class Vocab:
    """Character-level vocabulary for surnames.

    Index 0 is always reserved for the ``<PAD>`` token so that it agrees with the
    ``padding_idx=0`` used by the embedding layers in this notebook. The remaining
    characters are numbered in sorted order, which makes the mapping reproducible
    across kernel restarts (plain ``set`` iteration order is not).
    """

    def __init__(self):
        self.char_to_index = {'<PAD>': 0}
        self.index_to_char = {0: '<PAD>'}
        self.vocab_size = 1
        self.max_seq_len = 0
        self.vectorizer = None

    def build_vocab(self, data):
        """Build the character <-> index mappings from an iterable of strings.

        Args:
            data: iterable of strings (e.g. a pandas Series of surnames).

        Side effects:
            Sets ``char_to_index``, ``index_to_char``, ``vocab_size`` and
            ``max_seq_len`` (longest string length + 1).
        """
        data = list(data)
        # Joining with a space keeps ' ' in the alphabet, as the original mapping did.
        chars = sorted(set(' '.join(data)))
        tokens = ['<PAD>'] + chars

        self.char_to_index = {char: index for index, char in enumerate(tokens)}
        self.index_to_char = {index: char for char, index in self.char_to_index.items()}
        self.vocab_size = len(tokens)
        # default=0 keeps an empty dataset from raising inside max().
        self.max_seq_len = max((len(token) for token in data), default=0) + 1

    def vectorize(self, data):
        """Fit a character-level CountVectorizer restricted to this vocabulary and return it."""
        self.vectorizer = CountVectorizer(analyzer='char', vocabulary=self.char_to_index)
        self.vectorizer.fit(data)
        return self.vectorizer

    def get_max_seq_len(self):
        """Return the padded sequence length computed by ``build_vocab``."""
        return self.max_seq_len

In [None]:
vocab = Vocab()
# The vocabulary is built from every surname so that characters occurring only in the
# held-out rows still have an index.
vocab.build_vocab(df['surname'])
# Train on the training split only; taking the columns from `df` would put the
# held-out rows produced by train_test_split into the training set.
train_data = train_df[['surname', 'nationality']]
print("Vocabulary size:", vocab.vocab_size)

Vocabulary size: 86


In [None]:
print("Char to Index:", vocab.char_to_index)
print("Max Sequence Length:", vocab.get_max_seq_len())

Char to Index: {'B': 0, 'H': 1, 'ł': 2, 'k': 3, 'í': 4, 'è': 5, 'ö': 6, 'r': 7, 'v': 8, 'ú': 9, 'b': 10, 'ą': 11, 'Z': 12, 'n': 13, 'ß': 14, 'ê': 15, 'ń': 16, 'y': 17, 'Q': 18, 'D': 19, 'õ': 20, 'z': 21, 'U': 22, 'X': 23, 'P': 24, 'C': 25, '1': 26, 'g': 27, '-': 28, 'e': 29, 'E': 30, 'á': 31, 'a': 32, 'L': 33, 'q': 34, 'ó': 35, 'I': 36, 'w': 37, 'J': 38, 'N': 39, 'A': 40, 'Ś': 41, 'ñ': 42, 'f': 43, 'o': 44, '/': 45, 'ü': 46, 'R': 47, 'V': 48, 't': 49, 'i': 50, 'j': 51, 'h': 52, 'ż': 53, 'm': 54, 's': 55, 'S': 56, 'M': 57, 'ç': 58, 'à': 59, 'x': 60, 'u': 61, 'Ż': 62, 'é': 63, 'É': 64, ':': 65, 'ã': 66, 'c': 67, "'": 68, 'O': 69, '<PAD>': 70, 'ì': 71, ' ': 72, 'K': 73, 'Á': 74, 'W': 75, 'p': 76, 'F': 77, 'l': 78, 'ä': 79, 'G': 80, 'd': 81, 'Y': 82, 'ò': 83, 'T': 84, 'ù': 85}
Max Sequence Length: 18


In [None]:
max_seq_len = vocab.get_max_seq_len()
print("Max Sequence Length:", max_seq_len)

Max Sequence Length: 18


In [None]:
vectorizer = vocab.vectorize(train_data['surname'])



In [None]:
from torch.utils.data import Dataset
import torch
from torch.nn.utils.rnn import pad_sequence

In [None]:
class SurnamesDataset(Dataset):
    """Torch dataset that turns (surname, label) rows into index tensors.

    Each item is a pair ``(char_indices, target)`` where ``char_indices`` is the
    surname encoded with ``vocab.char_to_index`` and right-padded with the
    ``<PAD>`` index up to ``vocab.get_max_seq_len()``.
    """

    def __init__(self, data, vocab, target_col='nationality'):
        self.data = data
        self.vocab = vocab
        self.target_col = target_col

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        lookup = self.vocab.char_to_index
        char_ids = [lookup[symbol] for symbol in row['surname']]

        # Right-pad with the <PAD> index up to the vocabulary's maximum sequence length.
        missing = self.vocab.get_max_seq_len() - len(char_ids)
        if missing > 0:
            char_ids.extend([lookup['<PAD>']] * missing)

        features = torch.tensor(char_ids, dtype=torch.long)
        label = torch.tensor(row[self.target_col], dtype=torch.long)
        return (features, label)

In [None]:
dataset = SurnamesDataset(train_df, vocab)
sample = dataset[0]
print(sample)

(tensor([57, 61,  7,  7, 29, 78, 78, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70]), tensor(0))


In [None]:
# Wrap the (surname, nationality) frame in a Dataset and batch it for training.
# Batches hold 64 examples and are reshuffled every epoch.
train_dataset = SurnamesDataset(train_data, vocab)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
vocab_size = vocab.vocab_size
num_classes = len(nationalities)

1.1 Используя класс `nn.RNNCell` (абстракцию для отдельного временного шага RNN), реализуйте простейшую рекуррентную сеть Элмана в виде класса `RNN`. Используя созданный класс `RNN`, решите задачу классификации фамилий.


In [None]:
class RNNClassifier(nn.Module):
    """Elman-style recurrent classifier unrolled by hand with ``nn.RNNCell``.

    Pipeline: embedding -> RNNCell applied once per time step -> linear layer on the
    hidden state left after the final step.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        # padding_idx=0 assumes the vocabulary maps its padding token to index 0 -- TODO confirm.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn_cell = nn.RNNCell(embedding_dim, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits for a batch of index sequences ``x`` (batch-first)."""
        token_vectors = self.embedding(x)
        # Zero initial hidden state, created with the embeddings' dtype and device.
        hidden = token_vectors.new_zeros(x.size(0), self.rnn_cell.hidden_size)

        # Walk along the time axis, feeding one step of every sequence at a time.
        for step_input in token_vectors.unbind(dim=1):
            hidden = self.rnn_cell(step_input, hidden)

        return self.fc(hidden)

1.2 Замените модуль `RNN` из 1.1 на модули `nn.RNN`, `nn.LSTM` и `nn.GRU` (не забудьте указать аргумент `batch_first=True`). Сравните результаты работы.

In [None]:
class RNNClassifier(nn.Module):
    """Sequence classifier: embedding -> recurrent layer -> linear head.

    ``rnn_type`` selects the recurrent layer: ``'rnn'`` (nn.RNN), ``'lstm'`` (nn.LSTM)
    or ``'gru'`` (nn.GRU). Any other value raises ``ValueError``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, rnn_type='rnn'):
        super().__init__()

        # padding_idx=0 assumes the vocabulary maps its padding token to index 0 -- TODO confirm.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        layer_cls = recurrent_layers.get(rnn_type)
        if layer_cls is None:
            raise ValueError("Invalid RNN type. Supported types: 'rnn', 'lstm', 'gru'.")
        self.rnn = layer_cls(embedding_dim, hidden_size, batch_first=True)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits computed from the recurrent output at the last time step."""
        step_outputs, _ = self.rnn(self.embedding(x))
        # The last position of the (padded) sequence is used as the sequence summary.
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

In [None]:
hidden_size = 128
num_epochs = 10

In [None]:
model = RNNClassifier(vocab_size, embedding_dim=32, hidden_size=hidden_size, output_size=num_classes)

In [None]:
model_rnn = RNNClassifier(vocab_size, embedding_dim=32, hidden_size=hidden_size, output_size=num_classes, rnn_type='rnn')

In [None]:
model_lstm = RNNClassifier(vocab_size, embedding_dim=32, hidden_size=hidden_size, output_size=num_classes, rnn_type='lstm')

In [None]:
model_gru = RNNClassifier(vocab_size, embedding_dim=32, hidden_size=hidden_size, output_size=num_classes, rnn_type='gru')

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# Multi-class cross-entropy over the raw logits returned by the classifiers.
criterion = nn.CrossEntropyLoss()
# Exactly one optimizer line is active at a time. As written, the optimizer is bound to
# model_gru only: a training loop that runs a different model but calls
# optimizer.step() on this object does not update that model's weights.
#optimizer = optim.Adam(model.parameters(), lr=0.001)
#optimizer = optim.Adam(model_rnn.parameters(), lr=0.001)
#optimizer = optim.Adam(model_lstm.parameters(), lr=0.001)
optimizer = optim.Adam(model_gru.parameters(), lr=0.001)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# The shared `optimizer` created earlier is built from model_gru.parameters(), so stepping
# it here would leave `model` untrained. Use an optimizer bound to the model in this cell.
optimizer_simple = optim.Adam(model.parameters(), lr=0.001)
model.train()

for epoch in range(num_epochs):
    total_rnn, correct_rnn = 0, 0
    all_labels, all_predicted = [], []

    for inputs, labels in train_loader:
        optimizer_simple.zero_grad()

        outputs_rnn = model(inputs)

        loss_rnn = criterion(outputs_rnn, labels)
        loss_rnn.backward()
        optimizer_simple.step()

        # Predicted class = index of the largest logit.
        predicted_rnn = outputs_rnn.argmax(dim=1)
        total_rnn += labels.size(0)
        correct_rnn += (predicted_rnn == labels).sum().item()

        all_labels.extend(labels.cpu().numpy())
        all_predicted.extend(predicted_rnn.cpu().numpy())

    accuracy_rnn = 100 * correct_rnn / total_rnn

    # Weighted precision / recall / F1 over the epoch's training predictions.
    # zero_division=0 keeps the values for never-predicted classes at 0 without a warning per epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predicted, average='weighted', zero_division=0
    )

    print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy (RNN): {accuracy_rnn:.2f}%')
    print(f'Precision (RNN): {precision:.2f}, Recall (RNN): {recall:.2f}, F1 Score (RNN): {f1:.2f}')

  _warn_prf(average, modifier, msg_start, len(result))


Epoch [1/10], Accuracy (RNN): 25.91%
Precision (RNN): 0.12, Recall (RNN): 0.26, F1 Score (RNN): 0.15


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/10], Accuracy (RNN): 26.60%
Precision (RNN): 0.12, Recall (RNN): 0.27, F1 Score (RNN): 0.14


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [3/10], Accuracy (RNN): 26.79%
Precision (RNN): 0.12, Recall (RNN): 0.27, F1 Score (RNN): 0.15


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [4/10], Accuracy (RNN): 26.68%
Precision (RNN): 0.13, Recall (RNN): 0.27, F1 Score (RNN): 0.14


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [5/10], Accuracy (RNN): 33.01%
Precision (RNN): 0.21, Recall (RNN): 0.33, F1 Score (RNN): 0.25


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [6/10], Accuracy (RNN): 46.71%
Precision (RNN): 0.32, Recall (RNN): 0.47, F1 Score (RNN): 0.37


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [7/10], Accuracy (RNN): 51.22%
Precision (RNN): 0.38, Recall (RNN): 0.51, F1 Score (RNN): 0.41


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [8/10], Accuracy (RNN): 55.38%
Precision (RNN): 0.43, Recall (RNN): 0.55, F1 Score (RNN): 0.46


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [9/10], Accuracy (RNN): 57.62%
Precision (RNN): 0.46, Recall (RNN): 0.58, F1 Score (RNN): 0.49
Epoch [10/10], Accuracy (RNN): 60.38%
Precision (RNN): 0.49, Recall (RNN): 0.60, F1 Score (RNN): 0.53


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# The shared `optimizer` created earlier is built from model_gru.parameters(), so stepping
# it here would leave `model_rnn` untrained. Use an optimizer bound to this model.
optimizer_rnn = optim.Adam(model_rnn.parameters(), lr=0.001)
model_rnn.train()

for epoch in range(num_epochs):
    total_rnn, correct_rnn = 0, 0
    all_labels, all_predicted = [], []

    for inputs, labels in train_loader:
        optimizer_rnn.zero_grad()

        outputs_rnn = model_rnn(inputs)

        loss_rnn = criterion(outputs_rnn, labels)
        loss_rnn.backward()
        optimizer_rnn.step()

        # Predicted class = index of the largest logit.
        predicted_rnn = outputs_rnn.argmax(dim=1)
        total_rnn += labels.size(0)
        correct_rnn += (predicted_rnn == labels).sum().item()

        all_labels.extend(labels.cpu().numpy())
        all_predicted.extend(predicted_rnn.cpu().numpy())

    accuracy_rnn = 100 * correct_rnn / total_rnn

    # Weighted precision / recall / F1 over the epoch's training predictions.
    # zero_division=0 keeps the values for never-predicted classes at 0 without a warning per epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predicted, average='weighted', zero_division=0
    )

    print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy_rnn:.2f}%')
    print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

  _warn_prf(average, modifier, msg_start, len(result))


Epoch [1/10], Accuracy: 26.83%
Precision: 0.13, Recall: 0.27, F1 Score: 0.15


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/10], Accuracy: 26.82%
Precision: 0.12, Recall: 0.27, F1 Score: 0.13


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [3/10], Accuracy: 29.56%
Precision: 0.18, Recall: 0.30, F1 Score: 0.19


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [4/10], Accuracy: 40.27%
Precision: 0.28, Recall: 0.40, F1 Score: 0.30


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [5/10], Accuracy: 44.08%
Precision: 0.32, Recall: 0.44, F1 Score: 0.36


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [6/10], Accuracy: 49.91%
Precision: 0.35, Recall: 0.50, F1 Score: 0.40


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [7/10], Accuracy: 54.31%
Precision: 0.44, Recall: 0.54, F1 Score: 0.46


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [8/10], Accuracy: 58.64%
Precision: 0.47, Recall: 0.59, F1 Score: 0.51


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [9/10], Accuracy: 61.97%
Precision: 0.50, Recall: 0.62, F1 Score: 0.55
Epoch [10/10], Accuracy: 64.85%
Precision: 0.53, Recall: 0.65, F1 Score: 0.57


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# The shared `optimizer` created earlier is built from model_gru.parameters(), so stepping
# it here would leave `model_lstm` untrained. Use an optimizer bound to this model.
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=0.001)
model_lstm.train()

for epoch in range(num_epochs):
    total_rnn, correct_rnn = 0, 0
    all_labels, all_predicted = [], []

    for inputs, labels in train_loader:
        optimizer_lstm.zero_grad()

        outputs_rnn = model_lstm(inputs)

        loss_rnn = criterion(outputs_rnn, labels)
        loss_rnn.backward()
        optimizer_lstm.step()

        # Predicted class = index of the largest logit.
        predicted_rnn = outputs_rnn.argmax(dim=1)
        total_rnn += labels.size(0)
        correct_rnn += (predicted_rnn == labels).sum().item()

        all_labels.extend(labels.cpu().numpy())
        all_predicted.extend(predicted_rnn.cpu().numpy())

    accuracy_rnn = 100 * correct_rnn / total_rnn

    # Weighted precision / recall / F1 over the epoch's training predictions.
    # zero_division=0 keeps the values for never-predicted classes at 0 without a warning per epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predicted, average='weighted', zero_division=0
    )

    print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy_rnn:.2f}%')
    print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

  _warn_prf(average, modifier, msg_start, len(result))


Epoch [1/10], Accuracy: 33.26%
Precision: 0.24, Recall: 0.33, F1 Score: 0.24


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/10], Accuracy: 51.07%
Precision: 0.39, Recall: 0.51, F1 Score: 0.41


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [3/10], Accuracy: 56.61%
Precision: 0.43, Recall: 0.57, F1 Score: 0.47


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [4/10], Accuracy: 60.90%
Precision: 0.50, Recall: 0.61, F1 Score: 0.52


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [5/10], Accuracy: 66.94%
Precision: 0.58, Recall: 0.67, F1 Score: 0.60


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [6/10], Accuracy: 70.00%
Precision: 0.64, Recall: 0.70, F1 Score: 0.65


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [7/10], Accuracy: 71.61%
Precision: 0.67, Recall: 0.72, F1 Score: 0.67


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [8/10], Accuracy: 73.60%
Precision: 0.70, Recall: 0.74, F1 Score: 0.70


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [9/10], Accuracy: 74.78%
Precision: 0.73, Recall: 0.75, F1 Score: 0.71
Epoch [10/10], Accuracy: 76.31%
Precision: 0.74, Recall: 0.76, F1 Score: 0.73


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Build the optimizer next to the loop that uses it, so training model_gru does not depend
# on which line of the shared optimizer cell happens to be uncommented.
optimizer_gru = optim.Adam(model_gru.parameters(), lr=0.001)
model_gru.train()

for epoch in range(num_epochs):
    total_rnn, correct_rnn = 0, 0
    all_labels, all_predicted = [], []

    for inputs, labels in train_loader:
        optimizer_gru.zero_grad()

        outputs_rnn = model_gru(inputs)

        loss_rnn = criterion(outputs_rnn, labels)
        loss_rnn.backward()
        optimizer_gru.step()

        # Predicted class = index of the largest logit.
        predicted_rnn = outputs_rnn.argmax(dim=1)
        total_rnn += labels.size(0)
        correct_rnn += (predicted_rnn == labels).sum().item()

        all_labels.extend(labels.cpu().numpy())
        all_predicted.extend(predicted_rnn.cpu().numpy())

    accuracy_rnn = 100 * correct_rnn / total_rnn

    # Weighted precision / recall / F1 over the epoch's training predictions.
    # zero_division=0 keeps the values for never-predicted classes at 0 without a warning per epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predicted, average='weighted', zero_division=0
    )

    print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy_rnn:.2f}%')
    print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

  _warn_prf(average, modifier, msg_start, len(result))


Epoch [1/10], Accuracy: 40.02%
Precision: 0.31, Recall: 0.40, F1 Score: 0.31


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/10], Accuracy: 60.77%
Precision: 0.53, Recall: 0.61, F1 Score: 0.53


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [3/10], Accuracy: 68.56%
Precision: 0.62, Recall: 0.69, F1 Score: 0.63


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [4/10], Accuracy: 72.47%
Precision: 0.68, Recall: 0.72, F1 Score: 0.68


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [5/10], Accuracy: 74.65%
Precision: 0.71, Recall: 0.75, F1 Score: 0.71


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [6/10], Accuracy: 76.01%
Precision: 0.73, Recall: 0.76, F1 Score: 0.73


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [7/10], Accuracy: 77.26%
Precision: 0.75, Recall: 0.77, F1 Score: 0.74


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [8/10], Accuracy: 78.23%
Precision: 0.76, Recall: 0.78, F1 Score: 0.76


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [9/10], Accuracy: 79.50%
Precision: 0.77, Recall: 0.79, F1 Score: 0.77
Epoch [10/10], Accuracy: 81.03%
Precision: 0.79, Recall: 0.81, F1 Score: 0.79


  _warn_prf(average, modifier, msg_start, len(result))


1.3 Загрузите предобученные эмбеддинги (https://disk.yandex.ru/d/BHuT2tEXr_yBOQ?w=1) в модуль `nn.Embedding` и обучите модели из 1.2.

In [None]:
class PretrainedEmbeddingRNNClassifier(nn.Module):
    """Sequence classifier whose embedding table is loaded from two text files.

    The two tables are stacked row-wise and passed to ``nn.Embedding.from_pretrained``
    (which freezes them unless told otherwise); a recurrent layer and a linear head follow.

    Args:
        embedding_path1, embedding_path2: whitespace-separated embedding files,
            one ``token v1 v2 ...`` entry per line.
        vocab_size: number of leading lines to read from each file.
        hidden_size: hidden size of the recurrent layer.
        output_size: number of output classes.
        rnn_type: ``'rnn'``, ``'lstm'`` or ``'gru'``.

    Raises:
        ValueError: if ``rnn_type`` is not supported or a file yields no rows.
    """

    def __init__(self, embedding_path1, embedding_path2, vocab_size, hidden_size, output_size, rnn_type='gru'):
        super().__init__()

        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        # Fail at construction time; previously an unknown type produced a model
        # without `self.rnn` that only crashed on the first forward pass.
        if rnn_type not in recurrent_layers:
            raise ValueError("Invalid RNN type. Supported types: 'rnn', 'lstm', 'gru'.")

        embeddings1 = self.load_embeddings(embedding_path1, vocab_size)
        embeddings2 = self.load_embeddings(embedding_path2, vocab_size)
        embeddings = torch.cat([embeddings1, embeddings2], dim=0)

        # Row 0 is the all-zero row prepended by load_embeddings.
        self.embedding = nn.Embedding.from_pretrained(embeddings, padding_idx=0)

        self.rnn = recurrent_layers[rnn_type](embeddings.size(1), hidden_size, batch_first=True)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits computed from the recurrent output at the last time step."""
        embedded = self.embedding(x)
        rnn_output, _ = self.rnn(embedded)
        output = self.fc(rnn_output[:, -1, :])
        return output

    def load_embeddings(self, embedding_path, vocab_size):
        """Read the first ``vocab_size`` vectors from ``embedding_path``.

        The leading token of every line is discarded and an all-zero row is prepended.
        NOTE(review): rows are taken by file position, not looked up by token, so row i
        is not tied to the token with vocabulary index i -- confirm this is intended.

        Returns:
            float32 tensor of shape (rows_read + 1, embedding_dim).
        """
        from itertools import islice

        # Stream only the lines that are needed instead of loading the whole file.
        with open(embedding_path, 'r', encoding='utf-8') as file:
            rows = [list(map(float, line.strip().split()[1:])) for line in islice(file, vocab_size)]

        if not rows:
            raise ValueError(f"No embedding rows could be read from {embedding_path!r}.")

        zero_row = [0.0] * len(rows[0])
        return torch.tensor([zero_row] + rows, dtype=torch.float32)

In [None]:
# Configuration for the classifier with pretrained embeddings.
rnn_type = 'gru'
# Embedding files are expected in the current working directory.
pretrained_embeddings_path1 = './globe_100_rows.txt'
pretrained_embeddings_path2 = './glove.6B.50d.txt'
# Self-assignment: keeps the hidden size chosen earlier in the notebook.
hidden_size = hidden_size
output_size = num_classes

model_pretrained = PretrainedEmbeddingRNNClassifier(pretrained_embeddings_path1, pretrained_embeddings_path2, vocab_size, hidden_size, output_size, rnn_type)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_pretrained.parameters(), lr=0.001)

In [None]:
# `optimizer` is the Adam instance created for model_pretrained in the previous cell.
model_pretrained.train()

for epoch in range(num_epochs):
    total_rnn, correct_rnn = 0, 0
    all_labels, all_predicted = [], []

    for inputs, labels in train_loader:
        optimizer.zero_grad()

        outputs_rnn = model_pretrained(inputs)

        loss_rnn = criterion(outputs_rnn, labels)
        loss_rnn.backward()
        optimizer.step()

        # Predicted class = index of the largest logit.
        predicted_rnn = outputs_rnn.argmax(dim=1)
        total_rnn += labels.size(0)
        correct_rnn += (predicted_rnn == labels).sum().item()

        all_labels.extend(labels.cpu().numpy())
        all_predicted.extend(predicted_rnn.cpu().numpy())

    accuracy_rnn = 100 * correct_rnn / total_rnn

    # Weighted precision / recall / F1 over the epoch's training predictions.
    # zero_division=0 keeps the values for never-predicted classes at 0 without a warning per epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predicted, average='weighted', zero_division=0
    )

    print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy_rnn:.2f}%')
    print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

  _warn_prf(average, modifier, msg_start, len(result))


Epoch [1/10], Accuracy: 32.59%
Precision: 0.22, Recall: 0.33, F1 Score: 0.23


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/10], Accuracy: 49.54%
Precision: 0.41, Recall: 0.50, F1 Score: 0.41


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [3/10], Accuracy: 57.70%
Precision: 0.50, Recall: 0.58, F1 Score: 0.51


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [4/10], Accuracy: 63.32%
Precision: 0.56, Recall: 0.63, F1 Score: 0.57


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [5/10], Accuracy: 66.70%
Precision: 0.62, Recall: 0.67, F1 Score: 0.62


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [6/10], Accuracy: 69.79%
Precision: 0.64, Recall: 0.70, F1 Score: 0.65


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [7/10], Accuracy: 71.68%
Precision: 0.67, Recall: 0.72, F1 Score: 0.68


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [8/10], Accuracy: 73.12%
Precision: 0.68, Recall: 0.73, F1 Score: 0.69


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [9/10], Accuracy: 74.46%
Precision: 0.71, Recall: 0.74, F1 Score: 0.71
Epoch [10/10], Accuracy: 74.76%
Precision: 0.71, Recall: 0.75, F1 Score: 0.71


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def predict_surname(model, vocab, surname, top_k=3):
    """Return the ``top_k`` most likely nationalities for ``surname``.

    The surname is encoded with ``vocab.char_to_index`` and, when the vocabulary has
    a ``<PAD>`` token, right-padded to ``vocab.get_max_seq_len()`` so the input has
    the same layout as the padded training examples. Class indices are translated
    with the notebook-level ``index_to_nationality`` mapping.

    Args:
        model: classifier returning logits of shape (1, num_classes).
        vocab: character vocabulary used for training.
        surname: string to classify.
        top_k: number of predictions to return (capped at the number of classes).

    Raises:
        KeyError: if ``surname`` contains a character missing from the vocabulary.
    """
    encoded_surname = [vocab.char_to_index[char] for char in surname]

    # Mirror the training-time padding; the models read the last time step.
    pad_index = vocab.char_to_index.get('<PAD>')
    if pad_index is not None:
        missing = vocab.get_max_seq_len() - len(encoded_surname)
        encoded_surname += [pad_index] * max(missing, 0)

    input_tensor = torch.tensor(encoded_surname, dtype=torch.long).view(1, -1)

    # Run in eval mode without gradients, then restore the caller's training mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_tensor)
    finally:
        model.train(was_training)

    _, predicted_index = torch.topk(output, min(top_k, output.size(1)))
    predicted_nationalities = [index_to_nationality[idx.item()] for idx in predicted_index[0]]
    return predicted_nationalities

example_surnames = ["Chevachin", "Makhovoy", "Higashikata"]

In [None]:
for surname in example_surnames:
    predictions = predict_surname(model, vocab, surname)
    print(f"Predictions for '{surname}': {predictions}")

Predictions for 'Chevachin': ['Vietnamese', 'Portuguese', 'Korean']
Predictions for 'Makhovoy': ['Vietnamese', 'Portuguese', 'Korean']
Predictions for 'Higashikata': ['Portuguese', 'Japanese', 'Arabic']


In [None]:
for surname in example_surnames:
    predictions = predict_surname(model_pretrained, vocab, surname)
    print(f"Predictions for '{surname}': {predictions}")

Predictions for 'Chevachin': ['Italian', 'Vietnamese', 'Polish']
Predictions for 'Makhovoy': ['Greek', 'Korean', 'Russian']
Predictions for 'Higashikata': ['Japanese', 'Greek', 'Russian']


In [None]:
for surname in example_surnames:
    predictions = predict_surname(model_gru, vocab, surname)
    print(f"Predictions for '{surname}': {predictions}")

Predictions for 'Chevachin': ['Russian', 'Irish', 'Polish']
Predictions for 'Makhovoy': ['Russian', 'Greek', 'Irish']
Predictions for 'Higashikata': ['Japanese', 'Russian', 'Irish']


## 2. Классификация обзоров на фильмы (RNN)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

2.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). Разбейте на обучающую и тестовую выборку.
  * токен = __слово__
  * данные для обучения в датасете представляются в виде последовательности индексов токенов
  * словарь создается на основе _только_ обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен `<UNK>`
  * добавьте предобработку текста

2.2. Обучите классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding`
    - подберите адекватную размерность вектора эмбеддинга;
    - модуль `nn.Embedding` обучается

  * Используйте рекуррентные слои (`nn.RNN`, `nn.LSTM`, `nn.GRU`)


2.3 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%

In [None]:
import os

In [None]:
def load_reviews_from_folder(label, folder_path, name_contains=None):
    """Load one review per line from the ``.txt`` files in ``folder_path``.

    Args:
        label: value stored under ``'label'`` for every review that is read.
        folder_path: directory to scan (not recursive).
        name_contains: optional substring; when given, only files whose name contains
            it are read. ``None`` (default) reads every ``.txt`` file.

    Returns:
        list of ``{'text': str, 'label': label}`` dicts. Files are visited in sorted
        name order so the result does not depend on the directory listing order, and
        blank lines are skipped.
    """
    reviews = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        if name_contains is not None and name_contains not in filename:
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                review_text = line.strip()
                if review_text:
                    reviews.append({'text': review_text, 'label': label})
    return reviews

In [None]:
def _load_reviews_by_keyword(label, folder_path, keyword):
    """Read one review per line from the .txt files in folder_path whose name contains keyword."""
    reviews = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.txt') and keyword in filename.lower():
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                reviews.extend({'text': line.strip(), 'label': label} for line in file)
    return reviews


# Both sentiment files live in the same folder. Loading the whole folder once per label
# would attach label 1 AND label 0 to every review, so each label reads only its own file.
# NOTE(review): assumes the file names contain 'pos' / 'neg' (e.g. positive_reviews.txt,
# negative_reviews.txt) -- adjust the keywords if the files are named differently.
folder_path_positive_reviews = './polarity'
positive_label = 1
positive_reviews = _load_reviews_by_keyword(positive_label, folder_path_positive_reviews, 'pos')

folder_path_negative_reviews = './polarity'
negative_label = 0
negative_reviews = _load_reviews_by_keyword(negative_label, folder_path_negative_reviews, 'neg')

# Fail loudly instead of silently training on a single class.
assert positive_reviews and negative_reviews, (
    "No reviews found for one of the classes; check the file names in ./polarity"
)

In [None]:
df = pd.DataFrame(positive_reviews + negative_reviews)

In [None]:
df.head(5)

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [None]:
df.tail(5)

Unnamed: 0,text,label
21319,a terrible movie that some people will neverth...,0
21320,there are many definitions of 'time waster' bu...,0
21321,"as it stands , crocodile hunter has the hurrie...",0
21322,the thing looks like a made-for-home-video qui...,0
21323,"enigma is well-made , but it's just too dry an...",0


In [None]:
# Разбиение на обучающую и тестовую выборку
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 17059
Test size: 4265


In [None]:
def preprocess_text(text):
    """Lower-case ``text`` and delete every ASCII punctuation character.

    Punctuation is removed, not replaced by a space, so ``"too-tepid"`` becomes
    ``"tootepid"``.
    """
    # Imported locally so the function does not rely on a later cell's `import string`.
    import string

    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

In [None]:
import torch
from torch.utils.data import Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string

In [None]:
class Vocab:
    """Word-level vocabulary with reserved ``<PAD>`` (index 0) and ``<UNK>`` (index 1) tokens.

    Index 0 is reserved because the review batches are padded with zeros, so no real
    token may share that index. ``<UNK>`` is always present, which lets ``vectorize``
    work on unseen words without any extra set-up.
    """

    def __init__(self):
        self.token_to_index = {'<PAD>': 0, '<UNK>': 1}
        self.index_to_token = {0: '<PAD>', 1: '<UNK>'}
        self.vocab_size = 2

    def build_vocab(self, data):
        """Build the token <-> index mappings from an iterable of texts.

        Texts are split on whitespace by a CountVectorizer with English stop words
        removed; the resulting feature names are numbered after the two reserved tokens.
        """
        # token_pattern=None: the pattern is unused when a custom tokenizer is supplied.
        vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), stop_words='english', token_pattern=None)
        vectorizer.fit(data)

        reserved = ['<PAD>', '<UNK>']
        tokens = reserved + [token for token in vectorizer.get_feature_names_out() if token not in reserved]

        self.token_to_index = {token: index for index, token in enumerate(tokens)}
        self.index_to_token = {index: token for token, index in self.token_to_index.items()}
        self.vocab_size = len(self.token_to_index)

    def vectorize(self, text):
        """Return the list of token indices for ``text``; unknown tokens map to ``<UNK>``."""
        unk_index = self.token_to_index['<UNK>']
        return [self.token_to_index.get(token, unk_index) for token in text.split()]

In [None]:
class ReviewsDataset(Dataset):
    """Torch dataset over a frame with ``'text'`` and ``'label'`` columns.

    Each item is ``(token_indices, label)``: the text is cleaned with
    ``preprocess_text`` and then encoded with ``vocab.vectorize``. Sequences are
    returned unpadded, so their lengths vary.
    """

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        token_ids = self.vocab.vectorize(preprocess_text(row['text']))
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(row['label'], dtype=torch.long)
        return features, target

In [None]:
vocab = Vocab()
# Build the vocabulary from the same preprocessed text the dataset looks up later.
# Building it from raw text keeps punctuation inside tokens (e.g. "century's"), which
# never match the punctuation-free tokens produced by preprocess_text.
vocab.build_vocab(train_df['text'].map(preprocess_text))



In [None]:
# Register the out-of-vocabulary token exactly once. Without the guard, re-running the
# cell moves '<UNK>' to a new index and inflates vocab_size on every execution.
if '<UNK>' not in vocab.token_to_index:
    vocab.token_to_index['<UNK>'] = len(vocab.token_to_index)
    vocab.index_to_token[len(vocab.index_to_token)] = '<UNK>'
    vocab.vocab_size += 1

In [None]:
# The training set must come from the training split; building it from test_df would make
# the model train and be evaluated on the same rows.
train_dataset = ReviewsDataset(train_df, vocab)
test_dataset = ReviewsDataset(test_df, vocab)

In [None]:
print(vocab.vocab_size)

20688


In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class TextClassifierRNN(nn.Module):
    """Review classifier: trainable embedding -> single-layer LSTM -> linear head.

    The logits are computed from the LSTM's final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences (batch-first)."""
        embedded = self.embedding(x)
        # Only the final hidden state is needed; per-step outputs and cell state are dropped.
        _, (final_hidden, _) = self.lstm(embedded)
        # Drop the leading num_layers axis (size 1 for this single-layer LSTM).
        sequence_summary = final_hidden.squeeze(0)
        return self.fc(sequence_summary)

In [None]:
# Hyperparameters for the review classifier.
num_epochs = 15
# Embedding table size: one row more than the vocabulary currently contains.
input_size = vocab.vocab_size + 1
hidden_size = 128
# Two sentiment classes: 0 = negative, 1 = positive.
output_size = 2
batch_size = 32
embedding_dim = 100

In [None]:
# The task is binary, so the head gets exactly `output_size` (= 2) logits; the previous
# `output_size + 1` added a third class that has no label.
model2 = TextClassifierRNN(input_size, embedding_dim, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)

In [None]:
def _pad_collate(batch):
    """Collate (sequence, label) pairs: zero-pad the sequences to a common length and stack the labels."""
    padded_sequences = pad_sequence([item[0] for item in batch], batch_first=True)
    targets = torch.tensor([item[1] for item in batch])
    return (padded_sequences, targets)


# Reviews have different lengths, so both loaders share the padding collate function.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=_pad_collate)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=_pad_collate)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

for epoch in range(num_epochs):
    # --- training pass ---
    model2.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model2(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # --- evaluation on the test loader ---
    model2.eval()
    correct = 0
    total = 0
    all_labels, all_predicted = [], []

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model2(inputs)
            # Predicted class = index of the largest logit.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Collect labels and predictions for the entire test set
            all_labels.extend(labels.cpu().numpy())
            all_predicted.extend(predicted.cpu().numpy())

    accuracy = 100 * correct / total

    # Weighted precision, recall and F1; zero_division=0 avoids warnings when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_predicted, average='weighted', zero_division=0
    )

    print(f'Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy:.2f}%')
    print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')

print('Training finished')


Epoch [1/15], Accuracy: 50.79%
Precision: 0.55, Recall: 0.51, F1 Score: 0.36
Epoch [2/15], Accuracy: 50.88%
Precision: 0.54, Recall: 0.51, F1 Score: 0.37
Epoch [3/15], Accuracy: 50.79%
Precision: 0.52, Recall: 0.51, F1 Score: 0.37
Epoch [4/15], Accuracy: 50.76%
Precision: 0.60, Recall: 0.51, F1 Score: 0.35
Epoch [5/15], Accuracy: 50.13%
Precision: 0.57, Recall: 0.50, F1 Score: 0.35
Epoch [6/15], Accuracy: 50.18%
Precision: 0.67, Recall: 0.50, F1 Score: 0.35
Epoch [7/15], Accuracy: 50.20%
Precision: 0.58, Recall: 0.50, F1 Score: 0.36
Epoch [8/15], Accuracy: 51.86%
Precision: 0.52, Recall: 0.52, F1 Score: 0.51
Epoch [9/15], Accuracy: 51.49%
Precision: 0.63, Recall: 0.51, F1 Score: 0.37
Epoch [10/15], Accuracy: 63.59%
Precision: 0.64, Recall: 0.64, F1 Score: 0.63
Epoch [11/15], Accuracy: 72.40%
Precision: 0.73, Recall: 0.72, F1 Score: 0.72
Epoch [12/15], Accuracy: 79.55%
Precision: 0.80, Recall: 0.80, F1 Score: 0.80
Epoch [13/15], Accuracy: 83.38%
Precision: 0.83, Recall: 0.83, F1 Score: 

In [None]:
# Persist the trained surname classifiers as state dicts in the working directory.
# NOTE(review): `model` was instantiated after RNNClassifier was redefined on top of
# nn.RNN, so this checkpoint presumably holds nn.RNN weights rather than the hand-unrolled
# nn.RNNCell model -- confirm which one is meant by "simple RNN".
model_simple_rnn = model
torch.save(model_simple_rnn.state_dict(), 'model_simple_rnn.pth')

# Self-assignment; kept only for symmetry with the other two aliases.
model_gru = model_gru
torch.save(model_gru.state_dict(), 'model_gru.pth')

model_gru_embed = model_pretrained
torch.save(model_gru_embed.state_dict(), 'model_gru_embed.pth')

In [None]:
model_task3 = model2
torch.save(model_task3.state_dict(), 'model_task3.pth')

In [None]:
# придумываем короткие отзывы для проверки работоспособности модели
positive_review = "Great! Fantastic plot, destined to be one of the greatest movies of the century!"
negative_review = "A terrible movie with massive amount of plotholes and boring characters."

positive_review = preprocess_text(positive_review)
negative_review = preprocess_text(negative_review)

In [None]:
# кодировка
# Encode the reviews with the vocabulary's own lookup (unknown words map to <UNK>).
encoded_positive_review = vocab.vectorize(positive_review)
encoded_negative_review = vocab.vectorize(negative_review)

# Add a batch dimension of size 1 before passing the sequences to the model.
tensor_positive_review = torch.tensor(encoded_positive_review, dtype=torch.long).unsqueeze(0)
tensor_negative_review = torch.tensor(encoded_negative_review, dtype=torch.long).unsqueeze(0)

# Inference only: eval mode and no autograd graph.
model2.eval()
with torch.no_grad():
    output_positive = model2(tensor_positive_review)
    output_negative = model2(tensor_negative_review)

predicted_positive = output_positive.argmax(dim=1)
predicted_negative = output_negative.argmax(dim=1)

print("Предсказанный класс для Позитивного отзыва:", predicted_positive.item())
print("Предсказанный класс для Негативного отзыва:", predicted_negative.item())

Предсказанный класс для Позитивного отзыва: 1
Предсказанный класс для Негативного отзыва: 0
