In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import random
import nltk
import pickle
from nltk.corpus import wordnet
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re
from collections import Counter

nltk.download('wordnet')  # Only needed once


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Configuration
MAX_LEN = 100
BATCH_SIZE = 32
EPOCHS = 2
EMBED_DIM = 128
HIDDEN_DIM = 64
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [9]:
# Load CSV
df = pd.read_csv("cleaned_data.csv")  # Ensure columns: 'cleaned_text' & 'emotion'
df = df.dropna(subset=['cleaned_text', 'emotion'])  # Safety

# Encode labels
le = LabelEncoder()
df['emotion'] = le.fit_transform(df['emotion'])


In [10]:
# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['cleaned_text'], df['emotion'], test_size=0.2, stratify=df['emotion'], random_state=42)


In [11]:
# Build vocabulary
vocab_dict = {'<pad>': 0, '<unk>': 1}
index = 2

def tokenize(text):
    return re.findall(r"\b\w+\b", text.lower())

for text in train_texts:
    for token in tokenize(text):
        if token not in vocab_dict:
            vocab_dict[token] = index
            index += 1


In [7]:
# Convert text to padded sequences
def text_to_sequence(text):
    tokens = tokenize(text)
    ids = [vocab_dict.get(token, vocab_dict['<unk>']) for token in tokens[:MAX_LEN]]
    padded = ids + [vocab_dict['<pad>']] * (MAX_LEN - len(ids))
    return padded

X_train = torch.tensor([text_to_sequence(text) for text in train_texts])
X_val = torch.tensor([text_to_sequence(text) for text in val_texts])
y_train = torch.tensor(train_labels.tolist())
y_val = torch.tensor(val_labels.tolist())


In [12]:
# Dataloaders
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


In [13]:
class BiLSTMWithAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(BiLSTMWithAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)
        attn_weights = torch.softmax(self.attention(outputs).squeeze(-1), dim=1)
        context = torch.sum(outputs * attn_weights.unsqueeze(-1), dim=1)
        return self.fc(context)


In [14]:
model = BiLSTMWithAttention(
    vocab_size=len(vocab_dict),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=len(le.classes_)
).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [15]:
def train(model, loader):
    model.train()
    total_loss = 0
    for inputs, labels in loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


In [16]:
def evaluate(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    print(classification_report(all_labels, all_preds, target_names=le.classes_))


In [17]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    loss = train(model, train_loader)
    print(f"Train Loss: {loss:.4f}")
    evaluate(model, val_loader)


Epoch 1/2
Train Loss: 1.7740


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       angry       0.45      0.67      0.54       875
    confused       0.74      0.56      0.64       511
   depressed       0.52      0.61      0.56      1109
     excited       0.51      0.40      0.44       534
        fear       0.78      0.73      0.75       873
    grateful       0.43      0.18      0.25       502
       happy       0.00      0.00      0.00       296
     hopeful       0.68      0.71      0.70      1124
      lonely       0.51      0.69      0.59       987
      regret       0.62      0.33      0.43       573
    rejected       0.96      0.41      0.58       303
         sad       0.31      0.40      0.35       965

    accuracy                           0.54      8652
   macro avg       0.54      0.47      0.48      8652
weighted avg       0.54      0.54      0.53      8652

Epoch 2/2
Train Loss: 1.2702
              precision    recall  f1-score   support

       angry       0.58      0.62      0.60      

In [18]:
# Save model
torch.save(model.state_dict(), "bilstm_model.pt")

# Save vocab and label encoder
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab_dict, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)
