<center>

# Natural language processing
## Project - Sentence segmentation
## Model training

### 2023./2024.
## Matea Kunac, Marijana Rendulić
</center>

# 1. Introduction

This notebook focuses on the model training for sentence segmentation.

# 2. Code

## Libraries

In [None]:
# Mount Google Drive (Colab only) so that paths under /content/drive are readable and writable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import copy
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

## Functions

In [None]:
class TextChunkDataset(Dataset):
    """
    Dataset pairing encoded text chunks with their label sequences.

    Both inputs are pickle files that are loaded fully into memory when the
    dataset is constructed. Item ``i`` pairs ``chunks[i]`` with ``labels[i]``.

    Args:
        chunks_file: Path to the pickle file holding the encoded chunks.
        labels_file: Path to the pickle file holding the matching labels.

    Raises:
        ValueError: If the two files do not contain the same number of entries,
            since chunks and labels would then be misaligned.
    """
    def __init__(self, chunks_file, labels_file):
        # NOTE: pickle.load can execute arbitrary code - only load trusted files.
        with open(chunks_file, 'rb') as f:
            self.chunks = pickle.load(f)
        with open(labels_file, 'rb') as f:
            self.labels = pickle.load(f)

        # Fail early instead of silently training on misaligned pairs
        # (or hitting an IndexError somewhere in the middle of an epoch).
        if len(self.chunks) != len(self.labels):
            raise ValueError(
                f"Number of chunks ({len(self.chunks)}) does not match "
                f"number of label sequences ({len(self.labels)})"
            )

    def __len__(self):
        """Return the number of chunks in the dataset."""
        return len(self.chunks)

    def __getitem__(self, idx):
        """Return ``(chunk, labels)`` as a long tensor and a float32 tensor."""
        chunk = torch.tensor(self.chunks[idx], dtype=torch.long)
        # Labels are float32 because they are later compared against logits by a BCE loss.
        labels = torch.tensor(self.labels[idx], dtype=torch.float32)
        return chunk, labels

In [None]:
class MultiLayerBiGRUModel(nn.Module):
    """
    Per-token tagger built around a stacked bidirectional GRU.

    Layers, in the order the input flows through them:
    - ``embedding``: maps every token id to a dense vector of size ``embedding_dim``
    - ``gru``: ``num_layers`` stacked bidirectional GRU layers (batch-first layout)
    - ``fc``: linear projection of every time step to ``output_dim`` values
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim=1, num_layers=3):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, so the GRU emits 2 * hidden_dim features.
        self.fc = nn.Linear(in_features=hidden_dim * 2, out_features=output_dim)

    def forward(self, text):
        """Return the un-activated scores for every position of ``text``."""
        gru_out, _ = self.gru(self.embedding(text))
        return self.fc(gru_out)

In [None]:
def pad_collate(batch):
    """
    Collate ``(chunk, labels)`` pairs into two batch-first tensors.

    Every sequence is right-padded with 0 up to the length of the longest
    sequence in the batch; the same padding value is used for the labels.
    """
    # Transpose the list of pairs into one collection of inputs and one of targets.
    token_seqs, label_seqs = map(list, zip(*batch))

    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    padded_labels = pad_sequence(label_seqs, batch_first=True, padding_value=0)
    return padded_tokens, padded_labels

In [None]:
def evaluate(model, data_loader, device=None, pad_token_id=None):
    """
    Evaluate a binary per-token tagger on a dataset.

    The model is switched to evaluation mode (and left in it), predictions are
    obtained by rounding the sigmoid of the model outputs, and all positions of
    all batches are pooled before the metrics are computed.

    Args:
        model: Module returning one score per token, shaped ``(batch, seq, 1)``.
        data_loader: Iterable yielding ``(chunks, labels)`` tensor batches.
        device: Device to run on. Defaults to the device the model's parameters
            live on, so the function does not depend on a global ``device``.
        pad_token_id: Optional token id marking padded positions. When given,
            positions where ``chunks == pad_token_id`` are excluded from the
            metrics. With the default ``None`` every position is scored,
            including padding - the original behaviour.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; accuracy is a percentage,
        the other three are the scikit-learn scores for the positive class.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    prediction_batches = []
    label_batches = []
    with torch.no_grad():
        for chunks, labels in data_loader:
            chunks, labels = chunks.to(device), labels.to(device)
            outputs = model(chunks).squeeze(-1)
            # Threshold the probabilities at 0.5.
            predicted = torch.round(torch.sigmoid(outputs))

            if pad_token_id is not None:
                # Drop padded positions so they do not count as (trivially correct) negatives.
                keep = chunks != pad_token_id
                predicted, labels = predicted[keep], labels[keep]

            prediction_batches.append(predicted.reshape(-1).cpu().numpy())
            label_batches.append(labels.reshape(-1).cpu().numpy())

    if not prediction_batches:
        raise ValueError("data_loader produced no batches to evaluate")

    # One concatenation per array instead of extending a Python list element by element.
    all_predictions = np.concatenate(prediction_batches)
    all_labels = np.concatenate(label_batches)

    accuracy = 100 * (all_predictions == all_labels).mean()
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)

    return accuracy, precision, recall, f1

## Data loading

In [None]:
# Number of chunks per batch, shared by the training and dev loaders.
batch_size = 64

# Encoded chunks and labels for the training and development splits.
train_dataset = TextChunkDataset(
    chunks_file='/content/drive/MyDrive/Sentence_segmentation_popravak/data/train_encoded_chunks.pkl',
    labels_file='/content/drive/MyDrive/Sentence_segmentation_popravak/data/train_encoded_labels.pkl',
)
dev_dataset = TextChunkDataset(
    chunks_file='/content/drive/MyDrive/Sentence_segmentation_popravak/data/dev_encoded_chunks.pkl',
    labels_file='/content/drive/MyDrive/Sentence_segmentation_popravak/data/dev_encoded_labels.pkl',
)

# Only the training data is shuffled; both loaders pad each batch via pad_collate.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    collate_fn=pad_collate,
    shuffle=True,
)
dev_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=batch_size,
    collate_fn=pad_collate,
    shuffle=False,
)

In [None]:
def load_pickle(file_path):
    """
    Deserialize the pickle file at ``file_path`` and return the stored object.
    """
    with open(file_path, mode='rb') as handle:
        contents = pickle.load(handle)
    return contents

# Vocabulary object loaded from the pickle file stored on Drive.
vocab = load_pickle('/content/drive/MyDrive/Sentence_segmentation_popravak/data/vocab.pkl')

# Model parameters
# One embedding row more than there are vocabulary entries; presumably this
# reserves id 0 for the padding value used by pad_collate - TODO confirm
# against the encoding step.
vocab_size = len(vocab) + 1
embedding_dim = 100  # size of each token embedding vector
hidden_dim = 128  # hidden size of the GRU (per direction)

## Weighted loss

- `pos_weight` parameter: scales the loss contribution of the positive class. This helps with imbalanced datasets such as ours, where negative labels outnumber positive ones by roughly 19 to 1. It is set to the ratio of negative to positive labels in the training set.

In [None]:
# Class balance of the training labels, counted over every label sequence.
total_labels = sum(len(label_seq) for label_seq in train_dataset.labels)
positive_labels = sum(
    value
    for label_seq in train_dataset.labels
    for value in label_seq
    if value == 1
)
negative_labels = total_labels - positive_labels

print(positive_labels)
print(negative_labels)

# Ratio of negatives to positives; fall back to a neutral weight if there are no positives.
pos_weight = negative_labels / positive_labels if positive_labels > 0 else 1

print(pos_weight)

19791
379141
19.157243191349604


## Model training

In [None]:
# Seed PyTorch's RNG so weight initialisation and batch shuffling are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

# Initialize the model and move it to the target device *before* building the
# optimizer, as recommended by the torch.optim documentation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiLayerBiGRUModel(vocab_size, embedding_dim, hidden_dim)
model.to(device)

# Up-weight the positive class in the loss. The dtype is explicit so that the
# integer fallback (pos_weight = 1) still produces a float tensor.
pos_weight_tensor = torch.tensor([pos_weight], dtype=torch.float32, device=device)

# NOTE(review): pad_collate pads labels with 0, so padded positions are scored as
# negatives both in this loss and in evaluate(); consider masking them out.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Early Stopping Parameters
patience = 10  # Number of epochs to wait for improvement before stopping
counter = 0  # Tracks how many epochs have gone by without improvement
early_stop = False

best_f1_score = 0.0
best_model_state = None
model_save_dir = '/content/drive/MyDrive/Sentence_segmentation_popravak/models'
# Make sure the checkpoint directory exists so torch.save cannot fail on a missing folder.
os.makedirs(model_save_dir, exist_ok=True)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    if early_stop:
        print("Early stopping triggered")
        break

    model.train()
    epoch_loss = 0  # Initialize epoch loss
    for chunks, labels in train_loader:
        chunks, labels = chunks.to(device), labels.to(device)

        # Forward pass: drop the trailing size-1 output dimension to match the label shape
        predictions = model(chunks).squeeze(-1)
        loss = criterion(predictions, labels)
        epoch_loss += loss.item()  # Accumulate loss over the epoch

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Calculate average loss for the epoch (mean of the per-batch losses)
    avg_epoch_loss = epoch_loss / len(train_loader)

    # Evaluate on dev set
    accuracy, precision, recall, f1 = evaluate(model, dev_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss:.4f}, Accuracy: {accuracy:.2f}%, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')

    # Check if current model is the best so far (model selection is by dev F1)
    if f1 > best_f1_score:
        best_f1_score = f1
        counter = 0  # Reset counter
        # state_dict() returns references to the live parameters, so take a deep
        # copy; otherwise later epochs would overwrite the "best" weights in memory.
        best_model_state = copy.deepcopy(model.state_dict())
        # Save the current best model
        best_model_path = os.path.join(model_save_dir, f'best_model_epoch_{epoch+1}_f1_{f1:.2f}.pth')
        torch.save(best_model_state, best_model_path)
    else:
        counter += 1
        if counter >= patience:
            early_stop = True
            print(f"No improvement in {counter} epochs, stopping early.")

Epoch [1/100], Loss: 0.4444, Accuracy: 98.91%, Precision: 0.86, Recall: 0.89, F1: 0.88
Epoch [2/100], Loss: 0.2122, Accuracy: 97.89%, Precision: 0.69, Recall: 0.91, F1: 0.79
Epoch [3/100], Loss: 0.1646, Accuracy: 98.42%, Precision: 0.77, Recall: 0.90, F1: 0.83
Epoch [4/100], Loss: 0.1209, Accuracy: 98.81%, Precision: 0.84, Recall: 0.89, F1: 0.87
Epoch [5/100], Loss: 0.0768, Accuracy: 98.77%, Precision: 0.84, Recall: 0.88, F1: 0.86
Epoch [6/100], Loss: 0.0456, Accuracy: 98.66%, Precision: 0.82, Recall: 0.88, F1: 0.85
Epoch [7/100], Loss: 0.0294, Accuracy: 98.96%, Precision: 0.89, Recall: 0.87, F1: 0.88
Epoch [8/100], Loss: 0.0202, Accuracy: 98.97%, Precision: 0.89, Recall: 0.87, F1: 0.88
Epoch [9/100], Loss: 0.0147, Accuracy: 99.03%, Precision: 0.90, Recall: 0.87, F1: 0.89
Epoch [10/100], Loss: 0.0118, Accuracy: 99.15%, Precision: 0.94, Recall: 0.86, F1: 0.90
Epoch [11/100], Loss: 0.0088, Accuracy: 99.07%, Precision: 0.92, Recall: 0.86, F1: 0.89
Epoch [12/100], Loss: 0.0073, Accuracy: 9