In [None]:
import torch
import torch.nn as nn
import datasets
import pandas as pd
import random
import os
from collections import Counter
import re

# Data Acquisition

In [None]:
# Clone the repo
# The clone (and the copy below) is skipped when the "twitter_emo_classification"
# directory already exists, so re-running this cell does not fetch it again.
if not os.path.exists("twitter_emo_classification"):
  !git clone https://github.com/moka-co/twitter_emo_classification.git
  # Copy the repository's data files into the working directory
  # (presumably including the pickle that a later cell reads by bare filename).
  %cp twitter_emo_classification/data/* .

Cloning into 'twitter_emo_classification'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 0), reused 7 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (10/10), 17.44 MiB | 15.78 MiB/s, done.


In [None]:
# Load the raw training data from the pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a source you trust.
dataset = pd.read_pickle("merged_training.pkl")

In [None]:
# Sanity check: confirm what kind of object the pickle contained ...
print(type(dataset))

# ... and print per-column summary statistics for it.
print(dataset.describe())



<class 'pandas.core.frame.DataFrame'>
                                                 text emotions
count                                          416809   416809
unique                                         393822        6
top     i feel more adventurous willing to take risks      joy
freq                                               16   141067


In [None]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,text,emotions
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love


In [None]:
# Get Glove embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2026-01-08 00:14:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2026-01-08 00:14:42--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2026-01-08 00:14:43--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Function that defines a glove embeddings matrix
def load_glove_embeddings(path, word2idx, embedding_dim=100):
    """
    Build an embedding matrix aligned with ``word2idx`` from a GloVe text file.

    path: path to a GloVe text file (e.g. glove.6B.100d.txt); each line holds a
        token followed by its whitespace-separated vector components
    word2idx: dictionary mapping words to integer row indices from your dataset
    embedding_dim: number of components per vector; must match the file

    Returns a float tensor with ``len(word2idx)`` rows and ``embedding_dim``
    columns. Rows of words found in the file hold the file's vector; every
    other row keeps its random normal initialisation.

    Raises ValueError when a vocabulary word's vector in the file does not
    have exactly ``embedding_dim`` components.
    """
    vocab_size = len(word2idx)
    # Initialize matrix with random values; rows for known words are overwritten below
    embedding_matrix = torch.randn(vocab_size, embedding_dim)

    with open(path, 'r', encoding='utf8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue

            idx = word2idx.get(values[0])
            if idx is None:
                # Word is not part of our vocabulary; skip parsing its vector.
                continue

            components = values[1:]
            if len(components) != embedding_dim:
                raise ValueError(
                    f"{path}, line {line_number}: expected {embedding_dim} vector "
                    f"components for '{values[0]}' but found {len(components)}"
                )
            embedding_matrix[idx] = torch.tensor([float(x) for x in components])

    return embedding_matrix

# Data Preprocessing

In [None]:
# Work on a copy: the columns added to `df` further down would otherwise be
# written into the raw `dataset` frame as well (plain assignment only aliases it),
# leaving the earlier previews of `dataset` stale.
df = dataset.copy()

# Count unique words in the dataset
all_words = []

def simple_tokenizer(text):
    """Lowercase ``text``, strip every character other than a-z, 0-9 and whitespace, then split on whitespace."""
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return cleaned.split()

# Tokenize every tweet and pool all tokens into one flat list.
for tweet in df['text']:
    all_words.extend(simple_tokenizer(tweet))


# count words
word_counts = Counter(all_words)

# Show only the most frequent tokens: printing the whole Counter would dump
# every distinct word of the corpus into the cell output.
print(word_counts.most_common(20))

# Vocabulary ordered from most to least frequent.
# This allows you to limit vocab size later if needed
most_common_words = sorted(word_counts, key=word_counts.get, reverse=True)

# Special tokens take the first two ids; id 0 is the padding id.
word2idx = {"<PAD>": 0, "<UNK>": 1}

# Dataset words follow in frequency order, numbered from 2 upwards.
for word_id, vocab_word in enumerate(most_common_words, start=2):
    word2idx[vocab_word] = word_id

# Reverse lookup: id -> word.
idx2word = {word_id: token for token, word_id in word2idx.items()}

print(f"Unique words found: {len(word2idx)}")

def tweet_to_indices(text, word2idx):
    """Tokenize ``text`` and return the list of ids from ``word2idx``; unknown tokens map to 1 (<UNK>)."""
    unk_id = 1
    indices = []
    for token in simple_tokenizer(text):
        indices.append(word2idx.get(token, unk_id))
    return indices

# Encode every tweet as its list of vocabulary ids and store it in a new column.
df['sequences'] = df['text'].map(lambda tweet_text: tweet_to_indices(tweet_text, word2idx))

Unique words found: 75304


In [None]:
# Create a mapping from emotion strings to numerical IDs
# Convert to a categorical once so the integer codes and the category names
# are read from the same encoding.
emotion_categories = df['emotions'].astype('category')
df['label_id'] = emotion_categories.cat.codes

# NOTE: the two names below are inverted relative to their contents. They are
# kept unchanged because the inference cell indexes `emotion_to_id` with a
# predicted integer id.
#   emotion_to_id : integer id     -> emotion string
#   id_to_emotion : emotion string -> integer id
emotion_to_id = dict(enumerate(emotion_categories.cat.categories))
id_to_emotion = {v: k for k, v in emotion_to_id.items()}

# Display the mappings and a sample of the converted data.
# Each label is paired with the dictionary that actually has that direction.
print("Emotion to ID mapping:", id_to_emotion)
print("ID to Emotion mapping:", emotion_to_id)
print("\nDataFrame with numerical labels:")
display(df.head())

Emotion to ID mapping: {0: 'anger', 1: 'fear', 2: 'joy', 3: 'love', 4: 'sadness', 5: 'surprise'}
ID to Emotion mapping: {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}

DataFrame with numerical labels:


Unnamed: 0,text,emotions,sequences,label_id
27383,i feel awful about it too because it s my job ...,sadness,"[2, 3, 473, 28, 13, 95, 37, 13, 84, 11, 330, 5...",4
110083,im alone i feel awful,sadness,"[17, 217, 2, 3, 473]",4
140764,ive probably mentioned this before but i reall...,joy,"[73, 313, 1352, 23, 167, 20, 2, 41, 39, 3, 390...",2
100071,i was feeling a little low few days back,sadness,"[2, 21, 8, 7, 56, 408, 189, 164, 102]",4
2837,i beleive that i am much more sensitive to oth...,love,"[2, 15334, 9, 2, 24, 76, 38, 1820, 5, 117, 149...",3


# Model Architecture

In [None]:
class EmotionClassifier(nn.Module):
    """
    Bidirectional LSTM text classifier on top of pretrained, fine-tunable embeddings.

    Args:
        vocab_size: vocabulary size. Not used by the layers (the embedding table
            takes its shape from ``glove_weights``); kept so existing callers work.
        embedding_dim: size of each embedding vector, i.e. the LSTM input size.
        hidden_dim: hidden size of each LSTM direction.
        output_dim: number of logits produced per example.
        glove_weights: 2-D float tensor used to initialise the embedding table.
        padding_idx: token id used to right-pad batches (default 0). Padded
            positions are excluded from the LSTM computation.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, glove_weights, padding_idx=0):
        super(EmotionClassifier, self).__init__()
        self.padding_idx = padding_idx

        # Define the embedding layer
        self.embedding = nn.Embedding.from_pretrained(
            glove_weights,
            freeze=False  # Set to True if you don't want to fine-tune the embeddings
        )

        # Single-layer Bi-LSTM following the embedding
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        # Forward and backward final states are concatenated, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """
        Compute class logits for a batch of token-id sequences.

        text: integer tensor [batch_size, seq_len], right-padded with ``padding_idx``.
        Returns a tensor [batch_size, output_dim] of unnormalised logits.
        """
        # text shape: [batch_size, seq_len]
        embedded = self.embedding(text)
        # embedded shape: [batch_size, seq_len, embedding_dim]

        # Number of real (non-padding) tokens per row. Without this, the LSTM would
        # also step through the padding, so a sample's prediction would depend on
        # how long the other sequences in its batch are.
        # clamp(min=1): packing rejects zero-length sequences.
        # .cpu(): pack_padded_sequence requires the lengths tensor on the CPU.
        lengths = (text != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        _, (hidden, _) = self.lstm(packed)

        # Concatenate the final forward and backward hidden states
        hidden_cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        return self.fc(hidden_cat)

# Training

In [None]:
from sklearn.model_selection import train_test_split
# Split the encoded tweets into a training part and a held-out part.
# x = the 'sequences' column (lists of word ids built during preprocessing)
# y = the 'label_id' column (integer emotion codes)
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    df['sequences'].values,
    df['label_id'].values,
    test_size=0.2,          # 20% for testing
    random_state=42,        # For reproducibility
    stratify=df['label_id'].values  # stratified split on the label column
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class EmoDataset(Dataset):
    """
    Map-style dataset pairing token-id sequences with integer class labels.

    sequences: iterable of lists of token ids (variable length).
    labels: array-like of integer class ids, one per sequence.

    Indexing returns ``(sequence_tensor, label_tensor)``, both int64.
    """

    def __init__(self, sequences, labels):
        # Explicit int64: torch.tensor([]) would otherwise infer float32 for an
        # empty sequence, and embedding lookups require integer indices.
        self.sequences = [torch.tensor(s, dtype=torch.long) for s in sequences]
        # int64 is the target dtype CrossEntropyLoss expects for class indices.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [None]:
def collate_fn(batch):
    """
    Turn a list of ``(sequence, label)`` pairs into padded batch tensors.

    batch: list of ``(1-D sequence tensor, scalar label tensor)`` pairs.

    Returns ``(padded_sequences, labels)``: sequences are ordered longest first
    and right-padded with 0 to the longest length in the batch; ``labels`` is
    stacked in the same order. The caller's list is left untouched.
    """
    # Sort by length, longest first (optional but helps LSTM efficiency).
    # sorted() builds a new list instead of reordering the caller's batch in place.
    ordered = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    sequences, labels = zip(*ordered)

    # Pad sequences to the length of the longest one in this batch
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    labels = torch.stack(labels)

    return padded_sequences, labels

# Create the final DataLoaders
# Both loaders batch 32 examples at a time and pad each batch via collate_fn.
# Training data is reshuffled (shuffle=True) ...
train_loader = DataLoader(
    EmoDataset(train_sequences, train_labels),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

# ... while the held-out data is iterated in a fixed order (shuffle=False).
test_loader = DataLoader(
    EmoDataset(test_sequences, test_labels),
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn
)

In [None]:
# Build the GloVe weight matrix for our vocabulary, then the classifier on top of it.
# 100-d embeddings, 256 hidden units per LSTM direction, 6 output classes.
weights = load_glove_embeddings(path='glove.6B.100d.txt', word2idx=word2idx)
model = EmotionClassifier(
    vocab_size=len(word2idx),
    embedding_dim=100,
    hidden_dim=256,
    output_dim=6,
    glove_weights=weights,
)

In [None]:
# Hyperparameters
LEARNING_RATE = 0.001  # step size passed to the Adam optimizer below
EPOCH_NUM = 10         # number of full passes over the training loader

# Device configuration
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Loss and Optimizer
# CrossEntropyLoss takes raw logits plus integer class targets.
criterion = nn.CrossEntropyLoss()
# The optimizer is created after model.to(device), so it holds the moved parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


In [None]:
# Training Loop
# Each epoch: one optimisation pass over train_loader, then one gradient-free
# evaluation pass over test_loader. Losses are reported as per-example means.
for epoch in range(EPOCH_NUM):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for inputs, labels in train_loader:
        # Cast labels to long (int64) which is required by CrossEntropyLoss
        inputs, labels = inputs.to(device), labels.to(device).long()
        n_examples = labels.size(0)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # criterion returns the mean over the batch; weight it by the batch size so
        # the (usually smaller) last batch does not count as much as a full one.
        running_loss += loss.item() * n_examples
        # argmax over the class dimension gives the predicted class id
        predicted = outputs.argmax(dim=1)
        total_train += n_examples
        correct_train += (predicted == labels).sum().item()

    train_loss = running_loss / total_train
    train_acc = 100 * correct_train / total_train

    # Validation Loop
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            # Cast labels to long for validation as well
            inputs, labels = inputs.to(device), labels.to(device).long()
            n_examples = labels.size(0)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Same per-example weighting as in the training pass
            val_loss += loss.item() * n_examples
            predicted = outputs.argmax(dim=1)
            total_val += n_examples
            correct_val += (predicted == labels).sum().item()

    val_loss = val_loss / total_val
    val_acc = 100 * correct_val / total_val

    print(f'Epoch [{epoch+1}/{EPOCH_NUM}], '
          f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')

Epoch [1/5], Train Loss: 0.1839, Train Acc: 91.26%, Val Loss: 0.0940, Val Acc: 93.91%
Epoch [2/5], Train Loss: 0.0915, Train Acc: 94.06%, Val Loss: 0.0919, Val Acc: 93.93%
Epoch [3/5], Train Loss: 0.0862, Train Acc: 94.22%, Val Loss: 0.0902, Val Acc: 93.85%
Epoch [4/5], Train Loss: 0.0832, Train Acc: 94.32%, Val Loss: 0.0937, Val Acc: 93.83%
Epoch [5/5], Train Loss: 0.0810, Train Acc: 94.50%, Val Loss: 0.0976, Val Acc: 93.85%


In [None]:
# Single-example smoke test of the trained model.
# Switch model to evaluation mode
model.eval()

# Select a random tweet from the dataset
# NOTE: the row is drawn from the full `df`, which also contains the rows used
# for training, so this is a demonstration rather than a held-out evaluation.
random_row = df.sample(1).iloc[0]
tweet = random_row['text']
true_label = random_row['emotions']

# Preprocess the tweet with the same tokenizer/vocabulary used to build the training sequences
indices = tweet_to_indices(tweet, word2idx)

# Convert to tensor and add batch dimension (batch_size=1)
input_tensor = torch.tensor(indices).unsqueeze(0).to(device)

# Perform inference (no gradients needed)
with torch.no_grad():
    output = model(input_tensor)
    # Index of the largest logit = predicted class id
    predicted_idx = torch.argmax(output, dim=1).item()
    # Use emotion_to_id which maps int -> string (variable names were swapped in definition)
    predicted_label = emotion_to_id[predicted_idx]

# Display results
print(f"Tweet: {tweet}")
print(f"True Emotion: {true_label}")
print(f"Predicted Emotion: {predicted_label}")

Tweet: i feel like it is my sweet dream to be able to be debt free and to be able to use my money to help on things i previously felt helpless
True Emotion: love
Predicted Emotion: love
