# Libraries

In [1]:
# IO
import os
import csv
import pathlib
from pathlib import Path
import chardet
import warnings
from tqdm.notebook import tqdm

# Utilities
import numpy as np 
import pandas as pd
import copy
from functools import partial

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from torchtext.vocab import vocab
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling and training
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F

# Evaluation
from sklearn.metrics import (
    roc_curve, auc, roc_auc_score,
    accuracy_score, precision_score, recall_score, 
    f1_score, confusion_matrix
)
from sklearn.preprocessing import label_binarize

warnings.filterwarnings("ignore", category=FutureWarning)



Download `nltk` resources (only needs to run once per machine)

In [2]:
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/exterior/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/exterior/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Classes

Early stopping class (pytorch does not have a built in early stopping callback)

In [3]:
class EarlyStopping:
    def __init__(self, patience=1, restore_best_weights=True, mode='min'):
        self.patience = patience
        self.restore_best_weights = restore_best_weights
        self.mode = mode  # 'min' for loss, 'max' for accuracy, etc.
        self.best_score = None
        self.counter = 0
        self.early_stop = False
        self.best_model_state = None

    def __call__(self, current_score, model):
        # Determine if the current score is better
        if self.best_score is None:
            self.best_score = current_score
            if self.restore_best_weights:
                self.best_model_state = copy.deepcopy(model.state_dict())
        elif (self.mode == 'min' and current_score < self.best_score) or \
             (self.mode == 'max' and current_score > self.best_score):
            self.best_score = current_score
            self.counter = 0
            if self.restore_best_weights:
                self.best_model_state = copy.deepcopy(model.state_dict())
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def restore(self, model):
        if self.restore_best_weights and self.best_model_state:
            model.load_state_dict(self.best_model_state)

In [4]:
class TextDataset(Dataset):
    def __init__(self, df, vocab_inst):
        self.vocab_inst = vocab_inst
        self.texts = df['Sentence'].tolist()
        self.labels = df['Sentiment_encoded'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = torch.tensor(encode_text(self.texts[idx], self.vocab_inst), dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return encoded, label

## Models

In [5]:
class CorpoSpeakDecoder(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        embed_dim=128, 
        lstm_out=196,
        dropout_spatial=0.5, 
        dropout_lstm=0.3,
        dropout_1=0.2,
        dropout_2=0.4, 
        dense_1=100,
        output_classes=3):
        
        super(CorpoSpeakDecoder, self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.spatial_dropout = nn.Dropout2d(p=dropout_spatial)  # Approximate SpatialDropout1D
        self.lstm = nn.LSTM(embed_dim, lstm_out, batch_first=True, dropout=dropout_lstm)
        
        self.dropout_1 = nn.Dropout(p=dropout_1)
        self.dense_1 = nn.Linear(lstm_out, dense_1)
        self.dropout_2 = nn.Dropout(p=dropout_2)
        self.output_layer = nn.Linear(dense_1, output_classes)

    def forward(self, x):
        x = self.embedding(x)                      # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)                     # For Dropout2d: (batch, embed_dim, seq_len)
        x = self.spatial_dropout(x)
        x = x.permute(0, 2, 1)                     # Back to (batch, seq_len, embed_dim)
        
        x, _ = self.lstm(x)                        # LSTM returns (output, (h_n, c_n))
        x = x[:, -1, :]                            # Get the output from the last timestep
        
        x = self.dropout_1(x)
        x = F.relu(self.dense_1(x))
        x = self.dropout_2(x)
        x = self.output_layer(x)
        return x       

In [6]:
class SimpleRNNClassifier(nn.Module):
    def __init__(
        self, 
        max_features, 
        embedding_dim=16, 
        rnn_units=64, 
        num_classes=3
        ):
        super(SimpleRNNClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=max_features, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=rnn_units, batch_first=True, nonlinearity='tanh')
        self.fc = nn.Linear(rnn_units, num_classes)

    def forward(self, x):
        x = self.embedding(x)                        
        output, hidden = self.rnn(x)                 
        x = hidden.squeeze(0)                        
        x = self.fc(x)                    
        return x

# Functions

### Preprocessing

The encoding of the text file is automatically detected here with some confidence. It could be extracted from terminal using
```
file -I file.txt
```

Terminal method (not os agnostic):

In [7]:
#!file -i ~/Documents/StudyResources/IML/Project/Part2/_data/FinancialPhraseBank-v1.0/Sentences_50Agree.txt

Python method:

In [8]:
def extract_sentences_from_file(filepath):
    """Given an input txt file, extract sentences 
    and associated sentiments

    Args:
        filepath (string): path to string 

    Returns:
        list: list of sentences and sentiments
    """
    sentences = []
    # automatically detect endoding (best guess)
    with open(filepath, 'rb') as file:
        encoding = chardet.detect(file.read())['encoding']
    # read and split
    with open(filepath, 'r', encoding=encoding) as file:
        for line in file:
            line = line.strip()
            if '.@' in line:
                sentence, sentiment = line.rsplit('.@', 1)
                sentence = sentence.strip()
                sentiment = sentiment.strip().lower()
                sentence = fix_common_mojibake(sentence)
                sentences.append((sentence, sentiment))
    return sentences

In [9]:
def txt_to_csv(txt_path, output_csv):
    """Takes a file path as input and generates a .csv file
    which contains sentences and sentiments as columns, extracted
    from the .txt file corresponding to the path

    Args:
        txt_path (string): path to the .txt file
        output_csv (string): path to desired output .csv file
    """
    # Skip processing if CSV already exists
    if os.path.exists(output_csv):
        print(f"{output_csv} already exists. Skipping processing.")
        return

    sentences = extract_sentences_from_file(txt_path)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Sentence', 'Sentiment'])  # Header
        writer.writerows(sentences)
    print(f"Processed files and wrote output to {output_csv}")


This function is needed as the text files seem to be corrupted, for example one sentence is:
```
Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004
```
The characters `+ñ` are a result of choosing the wrong encoding; in fact, here the detected encoding is latin-1, and those characters correspond to `ä` in utf-8. The function below is used to fix these common mistakes

In [10]:
def fix_common_mojibake(text):
    """Function to manually handle encoding errors

    Args:
        text (string): input sentence

    Returns:
        string: output corrected sentence
    """
    replacements = {
        '+ñ': 'ä',
        '+í': 'é',
        '+ô': 'ö',
        '+ü': 'ü'
        }
    for wrong, right in replacements.items():
        text = text.replace(wrong, right)
    return text

This function summarizes the common preprocessing pipeline for natural language models.

In [11]:
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

Function that creates csv files (if not created already) and returns the dataframe extracted from them

In [12]:
def create_df():
    """Import data and creates csv's and pandas dataframes

    Returns:
        pandas dataframes for training and testing
    """
    print("Please select which dataset to use. Data is categorized based on the percentage of agreement in the sentiment estimator.")
    print("Options:")
    print("(1) 50")
    print("(2) 66")
    print("(3) 75")
    print("(4) 100")
    percentage = input()
    if percentage=='50':
        txt_path = RAW_DATA_FOLDER + 'Sentences_50Agree.txt'
        csv_path = DATASET_FOLDER + 'sentences_50.csv'
        txt_to_csv(txt_path, csv_path)
    elif percentage=='66':
        txt_path = RAW_DATA_FOLDER + 'Sentences_66Agree.txt'
        csv_path = DATASET_FOLDER + 'sentences_66.csv'
        txt_to_csv(txt_path, csv_path)
    elif percentage=='75':
        txt_path = RAW_DATA_FOLDER + 'Sentences_75Agree.txt'
        csv_path = DATASET_FOLDER + 'sentences_75.csv'
        txt_to_csv(txt_path, csv_path)
    elif percentage=='100':
        txt_path = RAW_DATA_FOLDER + 'Sentences_AllAgree.txt'
        csv_path = DATASET_FOLDER + 'sentences_100.csv'
        txt_to_csv(txt_path, csv_path)
    else:
        print("The percentage provided is not admitted, skipping.")
        return

    df = pd.read_csv(csv_path)

    le = LabelEncoder()
    le.fit(df['Sentiment'])
    df['Sentiment_encoded'] = le.transform(df['Sentiment'])

    return df, le

In [13]:
def create_dataloaders(train_df, val_df, vocab_inst=None):
    if vocab_inst is None:
        vocab_inst = build_vocab(train_df)

    train_dataset = TextDataset(train_df, vocab_inst)
    val_dataset = TextDataset(val_df, vocab_inst)

    collate_with_vocab = partial(collate_fn, vocab_inst=vocab_inst)

    train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_with_vocab, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_with_vocab)

    return train_loader, val_loader, vocab_inst


The following functions are needed to create token vocabulary and dataloaders

In [14]:
def build_vocab(df):
    counter = Counter()
    for text in df['Sentence']:
        counter.update(preprocess_text(text))

    vocab_inst = vocab(counter, specials=["<PAD>", "<UNK>"])
    vocab_inst.set_default_index(vocab_inst["<UNK>"])
    return vocab_inst

In [15]:
def encode_text(text, vocab_inst):
    tokens = preprocess_text(text)
    return [vocab_inst[token] for token in tokens]

In [16]:
def collate_fn(batch, vocab_inst):
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=vocab_inst["<PAD>"])
    labels = torch.stack(labels)
    return padded_texts, labels

## Hyperparameter tuning

TODO
Maybe it's better to feed it the dictionary generated outside of the objective function?

In [17]:
def objective_corpo(
    trial, 
    df,
    cv,
    device
    ):
    # Sample hyperparameters
    embed_dim = trial.suggest_int("embed_dim", 64, 256, step=32)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    wd = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True)
    lstm_out = trial.suggest_int("lstm_out", 64, 256, step=32)
    dropout_spatial = trial.suggest_float("dropout_spatial", 0.1, 0.5, step=0.2)
    dropout_lstm = trial.suggest_float("dropout_lstm", 0.1, 0.5, step=0.2)
    dropout_1 = trial.suggest_float("dropout_1", 0.1, 0.5, step=0.2)
    dropout_2 = trial.suggest_float("dropout_2", 0.1, 0.5, step=0.2)
    dense_1 = trial.suggest_int("dense_1", 64, 256, step=64)

    accuracies = []

    # Cross-validation loop
    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(cv.split(df, df['Sentiment_encoded']), desc="CV Folds", total=cv.get_n_splits(), leave=True)):

        early_stop = EarlyStopping(patience=3, mode='max')

        train_df = df.iloc[train_idx].reset_index(drop=True)
        val_df = df.iloc[val_idx].reset_index(drop=True)

        # Build vocab only on training fold
        vocab_inst = build_vocab(train_df)
        vocab_size = len(vocab_inst)

        train_loader, val_loader, _ = create_dataloaders(train_df, val_df, vocab_inst=vocab_inst)

        # Define model (custom RNN or LSTM)
        model = CorpoSpeakDecoder(
            vocab_size, 
            embed_dim=embed_dim, 
            lstm_out=lstm_out,
            dropout_spatial=dropout_spatial,
            dropout_lstm=dropout_lstm, 
            dropout_1=dropout_1,
            dropout_2=dropout_2,
            dense_1=dense_1
            )
        model = model.to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
        loss_crit = nn.CrossEntropyLoss().to(device)

        # Train for 5 epochs for speed (or early stopping)
        for epoch in tqdm(range(5), desc=f"Training Fold {fold_idx+1}", leave=False):
            _, _ = training_epoch(model, optimizer, loss_crit, train_loader, device)
            val_loss, val_acc = validation_epoch(model, loss_crit, val_loader, device)

            early_stop(val_acc, model)
            if early_stop.early_stop:
                print(f"Early stopping triggered on fold {fold_idx+1}, epoch {epoch+1}.")
                break

        #early_stop.restore(model)
        
        # Save best accuracy score
        accuracies.append(early_stop.best_score)

    # Return average CV accuracy
    return sum(accuracies) / len(accuracies)


In [18]:
def objective_rnn(
    trial, 
    df,
    cv,
    device
    ):
    # Sample hyperparameters
    embed_dim = trial.suggest_int("embed_dim", 16, 112, step=32)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    wd = trial.suggest_float("weight_decay", 1e-4, 1e-1, log=True)
    rnn_units = trial.suggest_int("rnn_units", 32, 128, step=32)

    accuracies = []

    # Cross-validation loop
    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(cv.split(df, df['Sentiment_encoded']), desc="CV Folds", total=cv.get_n_splits(), leave=True)):

        early_stop = EarlyStopping(patience=3, mode='max')

        train_df = df.iloc[train_idx].reset_index(drop=True)
        val_df = df.iloc[val_idx].reset_index(drop=True)

        # Build vocab only on training fold
        vocab_inst = build_vocab(train_df)
        vocab_size = len(vocab_inst)

        train_loader, val_loader, _ = create_dataloaders(train_df, val_df, vocab_inst=vocab_inst)

        # Define model (custom RNN or LSTM)
        model = SimpleRNNClassifier(
            vocab_size, 
            embedding_dim=embed_dim,
            rnn_units=rnn_units, 
            num_classes=3
            )
        model = model.to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
        loss_crit = nn.CrossEntropyLoss().to(device)

        # Train for 5 epochs for speed (or early stopping)
        for epoch in tqdm(range(5), desc=f"Training Fold {fold_idx+1}", leave=False):
            _, _ = training_epoch(model, optimizer, loss_crit, train_loader, device)
            val_loss, val_acc = validation_epoch(model, loss_crit, val_loader, device)

            early_stop(val_acc, model)
            if early_stop.early_stop:
                print(f"Early stopping triggered on fold {fold_idx+1}, epoch {epoch+1}.")
                break

        #early_stop.restore(model)
        
        # Save best accuracy score
        accuracies.append(early_stop.best_score)

    # Return average CV accuracy
    return sum(accuracies) / len(accuracies)


## Training

Define training and validation epochs

In [19]:
def training_epoch(model, optimizer, loss_crit, train_loader, device):
    model.train()
    avg_train_loss = 0
    correct_train = 0
    total_train = 0
    
    # Load batches and train
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)  # Move data to device
        optimizer.zero_grad()
        outputs = model(batch_x)

        # Compute loss
        loss = loss_crit(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # Accumulate train loss
        avg_train_loss += loss.item() * batch_x.size(0)

        # Accumulate train accuracy
        _, predicted = torch.max(outputs, 1)
        correct_train += (predicted == batch_y).sum().item()
        total_train += batch_y.size(0)
    
    # Average training loss and accuracy over batches
    avg_train_loss /= total_train
    avg_train_accuracy = correct_train / total_train

    return avg_train_loss, avg_train_accuracy

In [20]:
def validation_epoch(model, criterion, val_loader, device):
    model.eval()
    avg_val_loss = 0
    correct_val = 0
    total_val = 0
    
    with torch.no_grad(): # No training
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)  # Move data to device
            outputs = model(batch_x)

            # Compute loss
            loss = criterion(outputs, batch_y)

            # Accumulate train loss
            avg_val_loss += loss.item() * batch_x.size(0)

            # Accumulate validation accuracy
            _, predicted = torch.max(outputs, 1)
            correct_val += (predicted == batch_y).sum().item()
            total_val += batch_y.size(0)

    
    # Average validation loss for the fold
    avg_val_loss /= total_val
    avg_val_accuracy = correct_val / total_val
    
    return avg_val_loss, avg_val_accuracy

In [21]:
def train_model(
    model,
    df_train,
    vocab_inst,
    device,
    best_params=None,
    num_epochs=10,
    early_stopping=None
    ):

    train_df, val_df = train_test_split(
        df_train,
        test_size=0.2,
        stratify=df_train["Sentiment_encoded"],
        random_state=42
        )

    train_loader, val_loader, vocab_inst = create_dataloaders(train_df, val_df, vocab_inst=vocab_inst)

    history = {
        "train_loss": [],
        "train_acc": [],
        "val_loss": [],
        "val_acc": []
        }
        
    if best_params:
        wd = best_params['weight_decay']
        lr = best_params['lr']
    else:
        wd = 0.01
        lr = 0.001

    early_stop = EarlyStopping(patience=5, mode='max')

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    loss_crit = nn.CrossEntropyLoss().to(device)

    for epoch in tqdm(range(1, num_epochs + 1), desc="Training Epochs"):

        print(f"Epoch {epoch}/{num_epochs}")

        # Training step
        train_loss, train_acc = training_epoch(model, optimizer, loss_crit, train_loader, device)

        # Validation step
        val_loss, val_acc = validation_epoch(model, loss_crit, val_loader, device)

        # Log metrics
        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

        early_stop(val_acc, model)
        if early_stop.early_stop:
            print(f"Early stopping triggered")
            break

    early_stop.restore(model)

    return model, history


### Preliminary evaluation

In [22]:
def cross_validate_model(
    model, 
    df_train, 
    device,
    cv, 
    best_params = None,
    epochs=10,
    ):
    
    val_scores = []

    if best_params:
        wd = best_params['weight_decay']
        lr = best_params['lr']
    else:
        wd = 0.01
        lr = 0.001
    

    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(cv.split(df_train, df_train['Sentiment_encoded']), desc="CV Folds", total=cv.get_n_splits(), leave=True)):
        
        early_stop = EarlyStopping(patience=3, mode='max')

        train_df = df.iloc[train_idx].reset_index(drop=True)
        val_df = df.iloc[val_idx].reset_index(drop=True)

        train_loader, val_loader, _ = create_dataloaders(train_df, val_df, vocab_inst=vocab_inst)

        # New model for each fold
        model_copy = model
        model_copy = model_copy.to(device)

        optimizer = torch.optim.Adam(model_copy.parameters(), lr=lr, weight_decay=wd)
        loss_crit = nn.CrossEntropyLoss().to(device)

        # Training
        for epoch in tqdm(range(epochs), desc=f"Training Fold {fold_idx+1}", leave=False):
            _, _ = training_epoch(model_copy, optimizer, loss_crit, train_loader, device)
            val_loss, val_acc = validation_epoch(model_copy, loss_crit, val_loader, device)

            early_stop(val_acc, model_copy)
            if early_stop.early_stop:
                print(f"Early stopping triggered on fold {fold_idx+1}, epoch {epoch+1}.")
                break

        val_scores.append(early_stop.best_score)

    avg_score = np.mean(val_scores)
    dev_score = np.std(val_scores)
    print(f"Validation Score: {avg_score:.4f} ± {dev_score:.4f}")
    return val_scores

## Evaluation

In [23]:
def evaluate_model_sklearn(model, dataloader, num_classes, device='cpu'):
    model.eval()
    all_preds = []
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            probs = outputs.detach().cpu().numpy()
            preds = np.argmax(probs, axis=1)

            all_probs.extend(probs)
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    all_probs = np.array(all_probs)
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    print("📊 Evaluation Results")
    print("-------------------------")
    print(f"Accuracy         : {accuracy_score(all_labels, all_preds):.4f}")
    print(f"Precision (macro): {precision_score(all_labels, all_preds, average='macro'):.4f}")
    print(f"Recall    (macro): {recall_score(all_labels, all_preds, average='macro'):.4f}")
    print(f"F1 Score  (macro): {f1_score(all_labels, all_preds, average='macro'):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))

    # Binarize the labels for ROC computation
    y_true_bin = label_binarize(all_labels, classes=np.arange(num_classes))

    # Compute ROC curve and AUC for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], all_probs[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute macro-average ROC curve and AUC
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))
    mean_tpr = np.zeros_like(all_fpr)

    for i in range(num_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    mean_tpr /= num_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    print(f"\nROC-AUC (macro-average): {roc_auc['macro']:.4f}")

    # Plot all ROC curves
    plt.figure(figsize=(8, 6))
    for i in range(num_classes):
        plt.plot(fpr[i], tpr[i], label=f"Class {i} (AUC = {roc_auc[i]:.2f})")
    plt.plot(fpr["macro"], tpr["macro"], label=f"Macro Avg (AUC = {roc_auc['macro']:.2f})", 
             linestyle='--', color='navy')
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multiclass ROC Curves')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

TODO multiple folds

# Main

## Import data and create dataframes

OS-agnostic working folder and data folder definition

In [24]:
CodeDirectory = Path(os.path.abspath(''))
DATASET_FOLDER = os.path.join(str(CodeDirectory.parent.absolute()), "_data","")
RAW_DATA_FOLDER = os.path.join(str(DATASET_FOLDER), "FinancialPhraseBank-v1.0","")

In [25]:
data_csv_path = DATASET_FOLDER+'data.csv'

Import data into pandas dataframe

In [26]:
df, le = create_df()

Please select which dataset to use. Data is categorized based on the percentage of agreement in the sentiment estimator.
Options:
(1) 50
(2) 66
(3) 75
(4) 100
/Users/exterior/Documents/IML/Project/Part2/_data/sentences_100.csv already exists. Skipping processing.


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2208 entries, 0 to 2207
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sentence           2208 non-null   object
 1   Sentiment          2208 non-null   object
 2   Sentiment_encoded  2208 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 51.9+ KB


Split data into testing and training

In [28]:
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Sentiment_encoded"],
    random_state=42
    )

## Preliminary analysis

## Train

### Hyperparameter search

Select device

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [31]:
rskf = RepeatedStratifiedKFold(n_repeats=3, n_splits=3, random_state=42)

Hyperparameter search

In [32]:
study = optuna.create_study(direction="maximize")  # You want to maximize accuracy
study.optimize(
    partial(objective_rnn, df=df, cv=rskf, device=device),
    n_trials=10,  # Number of trials to try
)

[I 2025-06-19 17:41:42,168] A new study created in memory with name: no-name-6b830db7-5c53-4c94-ba55-fdf89ff00b18


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 4, epoch 4.


Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 5, epoch 4.


Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 17:44:40,630] Trial 0 finished with value: 0.616243961352657 and parameters: {'embed_dim': 112, 'lr': 0.0014789211435075106, 'weight_decay': 0.012800954826601414, 'rnn_units': 96}. Best is trial 0 with value: 0.616243961352657.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 5.


Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 6, epoch 5.


Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 8, epoch 5.


Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 17:47:40,075] Trial 1 finished with value: 0.6091485507246377 and parameters: {'embed_dim': 80, 'lr': 0.00013514866837327085, 'weight_decay': 0.05268203716976238, 'rnn_units': 96}. Best is trial 0 with value: 0.616243961352657.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 17:50:38,959] Trial 2 finished with value: 0.6139794685990339 and parameters: {'embed_dim': 80, 'lr': 0.0007611779549887681, 'weight_decay': 0.0018229077246780577, 'rnn_units': 96}. Best is trial 0 with value: 0.616243961352657.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 1, epoch 5.


Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 5.


Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 3, epoch 5.


Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 4, epoch 5.


Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 5, epoch 4.


Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 6, epoch 4.


Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 8, epoch 5.


Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 17:52:41,754] Trial 3 finished with value: 0.6159420289855073 and parameters: {'embed_dim': 80, 'lr': 0.005709454068067647, 'weight_decay': 0.0004034989014055676, 'rnn_units': 32}. Best is trial 0 with value: 0.616243961352657.


Early stopping triggered on fold 9, epoch 4.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 1, epoch 5.


Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 4.


Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 3, epoch 5.


Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 6, epoch 5.


Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 7, epoch 5.


Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 8, epoch 5.


Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 17:54:53,533] Trial 4 finished with value: 0.6156400966183574 and parameters: {'embed_dim': 16, 'lr': 0.0013518912611376842, 'weight_decay': 0.02608289229362822, 'rnn_units': 64}. Best is trial 0 with value: 0.616243961352657.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 1, epoch 4.


Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 5.


Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 4, epoch 4.


Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 6, epoch 5.


Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 8, epoch 5.


Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 17:59:35,767] Trial 5 finished with value: 0.6153381642512077 and parameters: {'embed_dim': 112, 'lr': 0.005988977898358928, 'weight_decay': 0.06798857983141247, 'rnn_units': 128}. Best is trial 0 with value: 0.616243961352657.


Early stopping triggered on fold 9, epoch 5.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 1, epoch 4.


Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 4.


Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 3, epoch 5.


Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 4, epoch 5.


Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 6, epoch 5.


Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 7, epoch 4.


Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 8, epoch 5.


Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 18:01:52,298] Trial 6 finished with value: 0.6163949275362319 and parameters: {'embed_dim': 112, 'lr': 0.005510571920516659, 'weight_decay': 0.0002572717683751624, 'rnn_units': 64}. Best is trial 6 with value: 0.6163949275362319.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 1, epoch 5.


Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 4.


Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 4, epoch 5.


Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 5, epoch 5.


Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 6, epoch 5.


Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 7, epoch 4.


Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 8, epoch 5.


Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 18:06:14,333] Trial 7 finished with value: 0.6145833333333334 and parameters: {'embed_dim': 16, 'lr': 0.009836756974255415, 'weight_decay': 0.002801842899497931, 'rnn_units': 128}. Best is trial 6 with value: 0.6163949275362319.


Early stopping triggered on fold 9, epoch 5.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 1, epoch 5.


Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 4.


Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 18:09:09,194] Trial 8 finished with value: 0.613677536231884 and parameters: {'embed_dim': 48, 'lr': 0.0002856313255110819, 'weight_decay': 0.0016721498700646635, 'rnn_units': 96}. Best is trial 6 with value: 0.6163949275362319.


CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 2:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 4:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 5:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 6:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 7:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 8:   0%|          | 0/5 [00:00<?, ?it/s]

Training Fold 9:   0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-06-19 18:13:48,450] Trial 9 finished with value: 0.611262077294686 and parameters: {'embed_dim': 112, 'lr': 0.00014422329043224617, 'weight_decay': 0.002143532861057693, 'rnn_units': 128}. Best is trial 6 with value: 0.6163949275362319.


In [33]:
study.best_params

{'embed_dim': 112,
 'lr': 0.005510571920516659,
 'weight_decay': 0.0002572717683751624,
 'rnn_units': 64}

## Train

In [34]:
vocab_inst = build_vocab(df_train)

### Preliminary evaluation

In [35]:
rnn_classifier = SimpleRNNClassifier(
    len(vocab_inst),
    embedding_dim=study.best_params['embed_dim'],
    rnn_units=study.best_params['rnn_units'],
    num_classes=3
    )

In [36]:
cv_scores = cross_validate_model(
    rnn_classifier,
    df_train,
    device,
    rskf,
    best_params = study.best_params
    )

CV Folds:   0%|          | 0/9 [00:00<?, ?it/s]

Training Fold 1:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 1, epoch 4.


Training Fold 2:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 2, epoch 4.


Training Fold 3:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 3, epoch 4.


Training Fold 4:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 4, epoch 4.


Training Fold 5:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 5, epoch 4.


Training Fold 6:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 6, epoch 7.


Training Fold 7:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 7, epoch 4.


Training Fold 8:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 8, epoch 7.


Training Fold 9:   0%|          | 0/10 [00:00<?, ?it/s]

Early stopping triggered on fold 9, epoch 5.
Validation Score: 0.6840 ± 0.0162


In [37]:
cv_scores

[0.6808149405772496,
 0.6689303904923599,
 0.6989795918367347,
 0.6960950764006791,
 0.6723259762308998,
 0.6870748299319728,
 0.6723259762308998,
 0.7164685908319185,
 0.6632653061224489]

### Actual training

In [38]:
rnn_classifier, history = train_model(
    rnn_classifier,
    df_train,
    vocab_inst,
    device,
    study.best_params
    )

Training Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10
Train Loss: 0.9399, Acc: 0.6140 | Val Loss: 0.9157, Acc: 0.6158
Epoch 2/10
Train Loss: 0.9454, Acc: 0.6147 | Val Loss: 0.9192, Acc: 0.6186
Epoch 3/10
Train Loss: 0.9271, Acc: 0.6147 | Val Loss: 0.9305, Acc: 0.6158
Epoch 4/10
Train Loss: 0.9302, Acc: 0.6147 | Val Loss: 0.9146, Acc: 0.6158
Epoch 5/10
Train Loss: 0.9220, Acc: 0.6147 | Val Loss: 0.9581, Acc: 0.6186
Epoch 6/10
Train Loss: 0.9257, Acc: 0.6147 | Val Loss: 0.9352, Acc: 0.6158
Epoch 7/10
Train Loss: 0.9338, Acc: 0.6147 | Val Loss: 0.9218, Acc: 0.6158
Early stopping triggered
