Note: i'm using colab for training, so i need to import my github repository and do all the steps here

# Clone Repo

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/MyLabs/

Mounted at /content/drive
/content/drive/MyDrive/MyLabs


In [5]:
import os
if not os.path.exists("twitter_emo_classification"):
  !git clone https://github.com/moka-co/twitter_emo_classification.git

%cd twitter_emo_classification
!git pull

/content/drive/MyDrive/MyLabs/twitter_emo_classification
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 4 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects: 100% (4/4), 2.63 KiB | 2.00 KiB/s, done.
From https://github.com/moka-co/twitter_emo_classification
   3e699c5..97eb52a  main       -> origin/main
Updating 3e699c5..97eb52a
Fast-forward
 notebooks/train.ipynb | 203 [32m+++++++++++++++++++++++++[m[31m-------------------------[m
 1 file changed, 103 insertions(+), 100 deletions(-)


In [6]:
!pip install -e .

!chmod +x scripts/run_scripts.sh
!./scripts/run_scripts.sh

Obtaining file:///content/drive/MyDrive/MyLabs/twitter_emo_classification
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting demoji (from emotion-classification==1.0)
  Downloading demoji-1.1.0-py3-none-any.whl.metadata (9.2 kB)
Collecting emoji (from emotion-classification==1.0)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: emotion-classification
  Building editable for em

In [25]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import random
from sklearn.metrics import f1_score, confusion_matrix, matthews_corrcoef, classification_report
from collections import defaultdict


Set seed for reproducibility

In [12]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(42)

## Vocabulary Construction

- Words are initialized with GloVe weight if the word is present
- otherwise they are initialized with zero.

Considering just a small subset i.e only the words present in the dataset, instead of full GloVe greatly improved the memory footpring from 480 MB (full GloVe) to less for 80k tokens.

For Out of Vocabulary words, they are initialized with zero and their embedding are learned during the training.

Since the dataset comes from twitter, i expect to have some out of vocabulary words because of twitter slangs.

Example: "sick" in GloVe (illness) vs tweets (slang for "awesome")

In [13]:
# Function that defines a glove embeddings matrix
def load_glove_embeddings(path, word2idx, embedding_dim=100):
    """
    path: path to glove.6B.100d.txt
    word2idx: dictionary mapping words to integers from your dataset
    """
    vocab_size = len(word2idx)
    # Initialize matrix with random values (or zeros)
    embedding_matrix = torch.randn(vocab_size, embedding_dim)
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            if word in word2idx:
                vector = torch.tensor([float(x) for x in values[1:]])
                idx = word2idx[word]
                embedding_matrix[idx] = vector

    return embedding_matrix


# Define Class Wrapper for Dataset
class EmoDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = [torch.tensor(s) for s in sequences]
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [18]:
# Load vocabulary
vocabulary_path = "data/vocab.json"

with open(vocabulary_path, 'r', encoding='utf-8') as f:
    word2idx = json.load(f)

print(f"Loaded vocabulary with {len(word2idx)} tokens.")

# Load dataset
dataset_path = "data/datasets/final/dataset.parquet"
df = pd.read_parquet(dataset_path)


# Divide dataset into train and test
from sklearn.model_selection import train_test_split
# x = your sequences (list of lists of integers)
# y = your labels (0 to 5)
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    df['sequences'].values,
    df['label'].values,
    test_size=0.2,          # 20% for testing
    random_state=42,        # For reproducibility
    stratify=df['label'].values
)

Loaded vocabulary with 79287 tokens.


In [19]:
def collate_fn(batch):
    # Sort by length (optional but helps LSTM efficiency)
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    sequences, labels = zip(*batch)

    # Pad sequences to the length of the longest one in this batch
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    labels = torch.stack(labels)

    return padded_sequences, labels

# Define seed for reproducibility
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Create the final DataLoaders
BATCH_SIZE=32

train_loader = DataLoader(
    EmoDataset(train_sequences, train_labels),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    worker_init_fn=seed_worker
)

test_loader = DataLoader(
    EmoDataset(test_sequences, test_labels),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)

In [20]:
# Define train, validate and compute top-k function
def train(model, train_loader, device, optimizer, criterion):
  model.train()
  running_loss = 0.0
  correct_train = 0
  total_train = 0

  for inputs, labels in train_loader:
        # Cast labels to long (int64) which is required by CrossEntropyLoss
        inputs, labels = inputs.to(device), labels.to(device).long()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        if hasattr(model, 'attention'):
          outputs, _ = model(inputs) # this is different for baseline and lstm model
        else:
          outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

  train_loss = running_loss / len(train_loader)
  train_acc = 100 * correct_train / total_train

  return train_loss, train_acc


def top_k_accuracy(output, target, k=2):
  """
  Computes the accuracy over the k top predictions
  """
  with torch.no_grad():
      batch_size = target.size(0)

      # Get the indices of the top k predictions
      # _, pred shape: [batch_size, k]
      _, pred = output.topk(k, 1, True, True)

      # Transpose to [k, batch_size] to compare with target
      pred = pred.t()

      # Compare pred with target (target is broadcasted)
      # correct shape: [k, batch_size] (Boolean)
      correct = pred.eq(target.view(1, -1).expand_as(pred))

      # Sum the correct predictions
      correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
      return correct_k.item()


def validate(model, test_loader, device, criterion):
  model.eval()

  val_loss = 0.0 # Validation Loss
  correct_val = 0 # Accuracy counter
  correct_topk = 0 # Top-k Accuracy counter
  total_val = 0 # Total validation loss

  all_preds = []
  all_labels = []

  with torch.no_grad():
      for inputs, labels in test_loader:
          # Cast labels to long for validation as well
          inputs, labels = inputs.to(device), labels.to(device).long()

          # Get output
          if hasattr(model, 'attention'):
            outputs, _ = model(inputs) # this is different for baseline and lstm model
          else:
            outputs = model(inputs)

          # Standard Validation Loss
          loss = criterion(outputs, labels)
          val_loss += loss.item()

          # Standard Accuracy (Top-1)
          _, predicted = torch.max(outputs.data, 1)
          total_val += labels.size(0)
          correct_val += (predicted == labels).sum().item()

          # Top-2 accuracy
          correct_topk += top_k_accuracy(outputs, labels, k=2)

          all_preds.extend(predicted.cpu().numpy())
          all_labels.extend(labels.cpu().numpy())

  val_loss = val_loss / len(test_loader)
  val_acc = 100 * correct_val / total_val
  topk_acc = 100 * correct_topk / total_val
  f1_weighted = f1_score(all_labels, all_preds, average='weighted')
  f1_macro = f1_score(all_labels, all_preds, average='macro')

  return val_loss, val_acc, f1_weighted, f1_macro, topk_acc


## Weighted CrossEntropy Loss
Cross Entropy Loss is a loss function used for classification, defined as:
$$
\ell(\hat{\mathbf y}, t) = -\log \hat{y}_t = -\sum_i p(i)\log \hat{y}_i
$$
- $p(i)=1[i-t]$ is one hot vector representing true labels distributions

Since the problem is unbalanced, i weighted the cross entropy loss in such way:
1. $w_i$ weights are chosen inversely proportional to class frequency
2. Cross Entropy Loss is initialized with this bias: `nn.CrossEntropyLoss(weight=class_weights.to(device))`

In this way the loss function becomes:
$$
\ell(\hat{\mathbf y}, t) = - w_t \log \hat{y}_t = -\sum_i w_i \; p(i)\log \hat{y}_i
$$

This penalizes the model more if it missclassifies rarer classes rather than common ones.

In [36]:
# Load TOML configuration for Hyperparameters
import tomllib
load_path_toml = "config.toml"
def load_config(path=load_path_toml):
    with open(path, "rb") as f:
        return tomllib.load(f)

config = load_config()

# Save shared configuration parameters
embedding_dim = config["project"]["embedding_dim"]
output_dim = config["project"]["output_dim"]
epoch_num = config["project"]["epoch_num"]


# Compute distributions
distributions=df['emotions'].value_counts(normalize=True)

# Create directory where to save models
if not os.path.exists("models"):
  os.makedirs("models")

# Set glove path
glove_path = 'data/glove/glove.6B.300d.txt'

# Train Baseline Model

In [32]:
from src.models.EmoBaseline import EmoBaseline

lr = config["model"]["baseline"]["lr"]
wd = config["model"]["baseline"]["wd"]
hidden_dim = config["model"]["baseline"]["hidden_dim"]

# 1. Hyperparameters
hyps = {
    "embedding_dim" : embedding_dim, # Embedding dimension
    "hidden_dim" : hidden_dim, # Hidden dimension
    "output_dim" : output_dim, # Output dimension
    "lr" : lr, # Learning rate
    "wd" : wd, # weight decay
    "epoch_num" : epoch_num,
    } # Number of training epoch

# 2. Create obj glove weight matrix and model
weights = load_glove_embeddings(glove_path, word2idx, embedding_dim=hyps["embedding_dim"])
model_base = EmoBaseline(len(word2idx), # Dict
                                       hyps["embedding_dim"],
                                       hyps["hidden_dim"],
                                       hyps["output_dim"],
                                       weights, # Glove Weights
                                       distributions=list(distributions) # Prior Initialization
                                       )

# 3. Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_base = model_base.to(device)

# 4. Loss and Optimizer
# add bias to Cross Entropy Loss
freqs = torch.tensor(list(distributions))
class_weights = 1.0 / freqs
class_weights = class_weights / class_weights.sum() * len(freqs)
criterion_base = nn.CrossEntropyLoss(weight=class_weights.to(device))

optimizer_base = torch.optim.Adam(model_base.parameters(), lr=hyps['lr'], weight_decay=hyps["wd"])

In [33]:
# Save F1-Score history, both weighted and macro
history_base = {'weighted': [], 'macro': [], 'train_loss': [], 'val_loss': []}
best_val_loss = float('inf')
patience_counter = 0
patience_limit = 5

# Train loop
try:
  for epoch in range(hyps["epoch_num"]):
    # Calls train and validate custom functions
    train_loss, train_acc = train(model_base, train_loader, device, optimizer_base, criterion_base)
    val_loss, val_acc, f1_weighted, f1_macro, topk_acc = validate(model_base, test_loader, device, criterion_base)

    # Print metrics
    print(f'Epoch [{epoch+1}/{hyps["epoch_num"]}], '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%,  '
          f"Top-2 Val Accuracy: {topk_acc:.2f}%, "
          f'Weighted F1-Score: {f1_weighted:.4f},  '
          f'Macro F1-Score: {f1_macro:.4f},  ')

    # Add results to history dictionary
    history_base['weighted'].append(f1_weighted)
    history_base['macro'].append(f1_macro)
    history_base['train_loss'].append(train_loss)
    history_base['val_loss'].append(val_loss)


    # Early stopping
    patience_counter += 1
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        file_path_base = f"./models/best_model_baseline_v1_f1_{f1_macro:.4f}.pt"
        torch.save({
          'epoch': epoch + 1,
          'model_state_dict': model_base.state_dict(),
          'optimizer_state_dict': optimizer_base.state_dict(),
          'loss': best_val_loss,
          'history': history_base
          }, file_path_base)

    if patience_counter == patience_limit:
      print(f"Early stopping, model didn't improved for {patience_limit} epochs")
      break

except KeyboardInterrupt:
  print("\n" + "-"*30)
  print("Training manually interrupted")


Epoch [1/20], Train Loss: 1.1096, Train Acc: 55.93%, Val Loss: 0.4311, Val Acc: 86.41%,  Top-2 Val Accuracy: 95.96%, Weighted F1-Score: 0.8654,  Macro F1-Score: 0.8247,  
Epoch [2/20], Train Loss: 0.3820, Train Acc: 87.50%, Val Loss: 0.3347, Val Acc: 88.30%,  Top-2 Val Accuracy: 96.62%, Weighted F1-Score: 0.8862,  Macro F1-Score: 0.8523,  
Epoch [3/20], Train Loss: 0.3175, Train Acc: 88.68%, Val Loss: 0.3131, Val Acc: 88.56%,  Top-2 Val Accuracy: 97.23%, Weighted F1-Score: 0.8883,  Macro F1-Score: 0.8573,  
Epoch [4/20], Train Loss: 0.2823, Train Acc: 89.35%, Val Loss: 0.3140, Val Acc: 88.77%,  Top-2 Val Accuracy: 97.93%, Weighted F1-Score: 0.8891,  Macro F1-Score: 0.8471,  
Epoch [5/20], Train Loss: 0.2629, Train Acc: 89.64%, Val Loss: 0.2686, Val Acc: 89.23%,  Top-2 Val Accuracy: 98.35%, Weighted F1-Score: 0.8942,  Macro F1-Score: 0.8586,  
Epoch [6/20], Train Loss: 0.2505, Train Acc: 89.77%, Val Loss: 0.3051, Val Acc: 88.30%,  Top-2 Val Accuracy: 97.68%, Weighted F1-Score: 0.8893,  

# Train LSTM with attention
For LSTM with Attention, i'm considering a smaller **hidden dimension** because the LSTM already has more parameters than the baseline and risks to overfit the dataset.

In [38]:
from src.models.EmoLSTM import EmoLSTM

lr = config["model"]["lstm"]["lr"]
wd = config["model"]["lstm"]["wd"]
hidden_dim = config["model"]["lstm"]["hidden_dim"]

# 1. Hyperparameters
hyps = {
    "embedding_dim" : embedding_dim, # Embedding dimension
    "hidden_dim" : hidden_dim, # Hidden dimension
    "output_dim" : output_dim, # Output dimension
    "lr" : lr, # Learning rate
    "wd" : wd, #Weight Decay
    "epoch_num" : epoch_num} # Number of training epoch

# 2. Create obj glove weight matrix and model
weights = load_glove_embeddings(glove_path, word2idx, embedding_dim=hyps["embedding_dim"])
model_lstm = EmoLSTM(len(word2idx), # Vocab Size
                                   hyps["embedding_dim"],
                                   hyps["hidden_dim"],
                                   hyps["output_dim"],
                                   weights, # Glove Weights
                                   distributions=list(distributions)) # Prior Initialization


# 3. Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_lstm = model_lstm.to(device)

# 4. Loss and Optimizer
# Add bias to CrossEntropyLoss
freqs = torch.tensor(list(distributions))
class_weights = 1.0 / freqs
class_weights = class_weights / class_weights.sum() * len(freqs)
criterion_lstm = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer_lstm = torch.optim.Adam(model_lstm.parameters(), lr=hyps['lr'], weight_decay=hyps["wd"])

# 5. Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_lstm, mode='min', factor=0.5, patience=2)


In [39]:
# Save F1-Score history, both weighted and macro
history_lstm = {'weighted': [], 'macro': [], 'train_loss' : [], 'val_loss': []}
best_val_loss = float('inf')
patience_counter = 0
patience_limit = 5

# Train loop, calls train and validate custom functions
try:
  for epoch in range(hyps["epoch_num"]):
    # Calls train and validate custom functions
    train_loss, train_acc = train(model_lstm, train_loader, device, optimizer_lstm, criterion_lstm)
    val_loss, val_acc, f1_weighted, f1_macro, topk_acc = validate(model_lstm, test_loader, device, criterion_lstm)
    scheduler.step(val_loss)

    # Print metrics
    print(f'Epoch [{epoch+1}/{hyps["epoch_num"]}], '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%,  '
          f"Top-2 Val Accuracy: {topk_acc:.2f}%, "
          f'Weighted F1-Score: {f1_weighted:.4f},  '
          f'Macro F1-Score: {f1_macro:.4f},  ')

    # Add results to history dictionary
    history_lstm['weighted'].append(f1_weighted)
    history_lstm['macro'].append(f1_macro)
    history_lstm['train_loss'].append(train_loss)
    history_lstm['val_loss'].append(val_loss)


    # Early stopping
    patience_counter += 1
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        file_path = f"models/best_model_lstm_attn_v1_f1_{f1_macro:.4f}.pt"
        torch.save({
          'epoch': epoch + 1,
          'model_state_dict': model_lstm.state_dict(),
          'optimizer_state_dict': optimizer_lstm.state_dict(),
          'loss': best_val_loss,
          'history' : history_lstm
          }, file_path)

    if patience_counter == patience_limit:
      print(f"\nEarly stopping, model didn't improved for {patience_limit} epochs")
      break


except KeyboardInterrupt:
  print("\n" + "-"*30)
  print("Training manually interrupted")
  print("-"*30)


Epoch [1/20], Train Loss: 0.2132, Train Acc: 90.55%, Val Loss: 0.1377, Val Acc: 93.31%,  Top-2 Val Accuracy: 99.28%, Weighted F1-Score: 0.9349,  Macro F1-Score: 0.9078,  
Epoch [2/20], Train Loss: 0.1317, Train Acc: 93.56%, Val Loss: 0.1339, Val Acc: 93.41%,  Top-2 Val Accuracy: 99.48%, Weighted F1-Score: 0.9358,  Macro F1-Score: 0.9083,  
Epoch [3/20], Train Loss: 0.1238, Train Acc: 93.75%, Val Loss: 0.1317, Val Acc: 93.56%,  Top-2 Val Accuracy: 99.46%, Weighted F1-Score: 0.9375,  Macro F1-Score: 0.9101,  

------------------------------
Training manually interrupted
------------------------------
