
# Reti neurali ricorrenti (RNN)

In [11]:
%cd ~/src/laboratori

/home/jovyan/src/laboratori


Librerie necessarie:
- `pytorch` 
    - https://pytorch.org/
    - https://docs.pytorch.org/docs/stable/index.html
- Utilit√†: `os`, `glob`

#### Caricamento del dataset

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim

import os
import glob
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Device configuration
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

# Embedding
def embedding(text, vocab_size, embedding_dim):
    char_indices = torch.tensor([ord(c) for c in text], dtype=torch.long)
    embed = nn.Embedding(vocab_size, embedding_dim)

    return embed(char_indices)


# Create a DataLoader for the dataset
class NamesDataset(Dataset):
    def __init__(self, folder_path):
        self.data = []
        self.labels = []
        self.labels_uniq = set()

        # Read all .txt files in the folder
        for file_path in glob.glob(os.path.join(folder_path, "*.txt")):
            label = os.path.splitext(os.path.basename(file_path))[0]
            self.labels_uniq.add(label)
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    self.data.append(line.strip())
                    self.labels.append(label)

        # Map labels to indices
        self.label_to_idx = {label: idx for idx, label in enumerate(self.labels_uniq)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.labels[idx]
        label_tensor = torch.tensor(self.label_to_idx[label], dtype=torch.long)
        text_tensor = embedding(text, len(self.data), 10)
        return label_tensor, text_tensor, label, text

# Initialize the dataset
dataset = NamesDataset("data/names")
print(f"There are {len(dataset)} names in the dataset.")
print(f"Labels are {dataset.labels_uniq}")
print(f"For example, {dataset[0][3]} is a {dataset[0][2]} name,")
print(f"{dataset[0][1]} is the embedding and {dataset[0][0]} the label idx.")

batch_size = 64


There are 20074 names in the dataset.
Labels are {'Korean', 'Polish', 'Greek', 'German', 'Japanese', 'Irish', 'Spanish', 'Italian', 'Arabic', 'Portuguese', 'Russian', 'Scottish', 'English', 'French', 'Dutch', 'Vietnamese', 'Chinese', 'Czech'}
For example, Abl is a Czech name,
tensor([[ 0.4759, -0.1484,  1.3907,  1.0443, -1.0916,  0.5134,  0.8396,  0.8863,
          0.9279, -2.0891],
        [ 0.4183,  0.4901,  1.4757, -0.2952,  0.2672,  0.8611, -0.8653,  1.8523,
          0.5517, -0.8969],
        [ 2.2401, -0.1645, -1.0844, -0.0219,  1.3170, -0.4733, -1.4697,  0.9314,
          0.7578, -0.3100]], grad_fn=<EmbeddingBackward0>) is the embedding and 17 the label idx.


In [13]:
# Create the DataLoader
train_set, test_set = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), len(dataset) - int(0.8 * len(dataset))])

def collate_fn(batch):
    labels, text_tensors, label_names, texts = zip(*batch)
    return (
        torch.stack(labels),
        torch.nn.utils.rnn.pad_sequence(text_tensors, batch_first=True),
        label_names,
        texts,
    )
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(f"train examples = {len(train_set)}, validation examples = {len(test_set)}")

train examples = 16059, validation examples = 4015


#### Configurazione degli iperparametri

In [14]:
# Hyperparameters
num_epochs = 10
learning_rate = 0.1

#### Definizione della rete neurale

Creare un modulo pytorch (classe derivata da `nn.Module`) che definisce l'architettura della rete.  
Dev'essere presente uno strato ricorrente e un layer FC seguito da una funzione di attivazione log-softmax.

In [15]:
# Define the RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # x shape: (batch, seq_len, input_size)
        out, _ = self.rnn(x)  # out: (batch, seq_len, hidden_size)
        out = out[:, -1, :]   # Take the output at the last time step
        out = self.fc(out)
        out = self.log_softmax(out)
        return out

# Initialize the model, loss function, and optimizer
input_size = 10  # Size of the embedding
hidden_size = 128  # Size of the hidden state
num_classes = len(dataset.labels_uniq)  # Number of unique labels
model = SimpleRNN(input_size, hidden_size, num_classes).to(device)

In [16]:
input = embedding('Albert', len(dataset.data), input_size).unsqueeze(0).to(device)

output = model(input)

_, predicted_label_idx = torch.max(output, 1)
predicted_label = list(dataset.label_to_idx.keys())[list(dataset.label_to_idx.values()).index(predicted_label_idx.item())]
print(f"The predicted label for the input 'Albert' is: {predicted_label}")

The predicted label for the input 'Albert' is: Italian


#### Scelta della funzione Loss e dell'ottimizzatore

- Cross-entropy loss
- SGD optimizer

In [17]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

Implementare l'algoritmo di training della rete neurale sopra definita, ricordando che:
- per un certo numero di epoche, si itera su tutto il dataset
    - inoltre, i comandi `inputs.to(device), labels.to(device)` assicurano di utilizzare la GPU
- l'ottimizzatore va resettato a ogni iterazione, con `optimizer.zero_grad()`
- per ogni iterazione, si procede con il passo forward
- per ogni iterazione, si calcola la loss relativa a quell'output della rete
    - con `loss = criterion(outputs, labels)`
- per ogni iterazione, si propaga all'indietro la loss
- per ogni iterazione, si esegue uno step dell'ottimizzatore

In [18]:
# Training the model
model.train()
for epoch in range(num_epochs):
    for i, (labels, text_tensor, label, text) in enumerate(train_loader):
        labels = labels.to(device)
        text_tensor = text_tensor.to(device)

        optimizer.zero_grad()                # Reset gradients
        outputs = model(text_tensor)         # Forward pass
        loss = criterion(outputs, labels)    # Compute loss
        loss.backward()                      # Backward pass
        optimizer.step()                     # Update parameters

        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')


Epoch [1/10], Step [100/251], Loss: 2.1638
Epoch [1/10], Step [200/251], Loss: 1.9488
Epoch [2/10], Step [100/251], Loss: 2.0503
Epoch [2/10], Step [200/251], Loss: 1.8754
Epoch [3/10], Step [100/251], Loss: 1.9993
Epoch [3/10], Step [200/251], Loss: 1.8462
Epoch [4/10], Step [100/251], Loss: 1.8380
Epoch [4/10], Step [200/251], Loss: 2.1286
Epoch [5/10], Step [100/251], Loss: 1.9515
Epoch [5/10], Step [200/251], Loss: 1.5676
Epoch [6/10], Step [100/251], Loss: 1.7754
Epoch [6/10], Step [200/251], Loss: 1.7205
Epoch [7/10], Step [100/251], Loss: 1.8606
Epoch [7/10], Step [200/251], Loss: 1.8796
Epoch [8/10], Step [100/251], Loss: 1.9494
Epoch [8/10], Step [200/251], Loss: 1.9913
Epoch [9/10], Step [100/251], Loss: 1.7358
Epoch [9/10], Step [200/251], Loss: 1.6650
Epoch [10/10], Step [100/251], Loss: 1.7773
Epoch [10/10], Step [200/251], Loss: 1.7837


#### Inferenza

In [19]:
# Make inference on the first 64 instances of the test set
model.eval()
with torch.no_grad():
    for i, (labels, text_tensor, label_names, texts) in enumerate(test_loader):
        if i == 0:  # Only process the first batch
            text_tensor = text_tensor.to(device)
            outputs = model(text_tensor)
            _, predicted = torch.max(outputs, 1)
            predicted_labels = [list(dataset.label_to_idx.keys())[list(dataset.label_to_idx.values()).index(idx.item())] for idx in predicted]
            for text, predicted_label in zip(texts, predicted_labels):
                print(f"Text: {text}, Predicted Label: {predicted_label}")
            break


Text: Daryalov, Predicted Label: Russian
Text: Sztegon, Predicted Label: Russian
Text: Grankin, Predicted Label: Russian
Text: Shalhoub, Predicted Label: Russian
Text: Jigily, Predicted Label: Russian
Text: Bekleshov, Predicted Label: Russian
Text: Matocha, Predicted Label: Russian
Text: Yahnyuk, Predicted Label: Russian
Text: Gribnov, Predicted Label: Russian
Text: Blackburn, Predicted Label: Russian
Text: Abbas, Predicted Label: Russian
Text: Grossman, Predicted Label: Russian
Text: Nakadai, Predicted Label: Russian
Text: Oatway, Predicted Label: Russian
Text: Hlopiev, Predicted Label: Russian
Text: Partlett, Predicted Label: Russian
Text: Vakulentchuk, Predicted Label: Russian
Text: Awturhanoff, Predicted Label: Russian
Text: Sam, Predicted Label: Russian
Text: Chuhray, Predicted Label: Russian
Text: Holstov, Predicted Label: Russian
Text: Podshivalov, Predicted Label: Russian
Text: Vozdvijensky, Predicted Label: Russian
Text: Nazari, Predicted Label: Russian
Text: Tsarakov, Predict