In [1]:
import torch # Importing PyTorch for deep learning tasks
import torch.nn as nn # Importing neural network module from PyTorch
import torch.optim as optim # Importing optimization algorithms from PyTorch
from torch.utils.data import DataLoader, Dataset # Importing DataLoader and Dataset classes for handling data
import pandas as pd
import numpy as np
import random #
import nltk # Importing Natural Language Toolkit for text processing
import pickle # Importing pickle for saving and loading Python objects
from nltk.corpus import wordnet # Importing WordNet corpus from NLTK
from sklearn.preprocessing import LabelEncoder # Importing LabelEncoder for encoding labels
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from collections import defaultdict # Importing defaultdict for creating dictionaries with default values
import streamlit as st 
import re # Importing regular expressions for text processing
from collections import Counter # Importing Counter for counting hashable objects

# Fetch the WordNet corpus into the local NLTK data directory; when the corpus
# is already present NLTK only reports that the package is up to date.
# NOTE(review): no cell visible in this notebook uses `wordnet` -- confirm the
# download is still required.
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# --- Model dimensions ---
EMBED_DIM = 128   # width of each token embedding vector
HIDDEN_DIM = 64   # LSTM hidden size per direction

# --- Data / training schedule ---
MAX_LEN = 100     # every text is truncated or padded to this many token ids
BATCH_SIZE = 32   # examples per DataLoader batch
EPOCHS = 2        # full passes over the training data

# --- Hardware: use the GPU when CUDA is available, otherwise the CPU ---
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [3]:
class BiLSTMWithAttention(nn.Module):
    """Bidirectional LSTM classifier that pools token states with learned attention.

    Token ids are embedded, encoded by a bidirectional LSTM, and the per-token
    LSTM outputs are collapsed into a single vector by a softmax-weighted sum
    whose scores come from a linear layer. A final linear layer maps that
    vector to class scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction; the LSTM output is
            ``2 * hidden_dim`` wide because the LSTM is bidirectional.
        output_dim: Number of class scores produced per sequence.
        pad_idx: Token id that marks padding. Positions holding this id are
            excluded from the attention softmax. Defaults to 0, the id this
            notebook assigns to ``'<pad>'``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, return_attention=False):
        """Compute class scores for a batch of padded token-id sequences.

        Args:
            x: Integer tensor of token ids shaped ``(batch, seq_len)``.
            return_attention: When True, also return the attention weights so
                callers can inspect which tokens drove the prediction.

        Returns:
            Class scores shaped ``(batch, output_dim)``; or, when
            ``return_attention`` is True, a ``(scores, attn_weights)`` tuple
            where ``attn_weights`` is ``(batch, seq_len)`` and each row sums to 1.
        """
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)
        scores = self.attention(outputs).squeeze(-1)

        # Padding must not receive attention mass: without this mask a short
        # text padded to a long fixed length spreads most of its softmax over
        # '<pad>' positions.
        pad_mask = x.eq(self.pad_idx)
        # A row made only of padding would turn into softmax(all -inf) = NaN,
        # so such rows are left unmasked (plain softmax over their scores).
        pad_mask = pad_mask & ~pad_mask.all(dim=1, keepdim=True)
        scores = scores.masked_fill(pad_mask, float("-inf"))

        attn_weights = torch.softmax(scores, dim=1)
        context = torch.sum(outputs * attn_weights.unsqueeze(-1), dim=1)
        logits = self.fc(context)
        if return_attention:
            return logits, attn_weights
        return logits



**Key Features:**
- **Embedding Layer:** Converts word indices to dense vectors.
- **Bidirectional LSTM:** Captures context from both directions in the sequence.
- **Attention Mechanism:** Learns to focus on the most relevant words for emotion detection.
- **Fully Connected Layer:** Outputs class scores for each emotion.

---

## 7. **Training and Evaluation**

- **Training Loop:**  
  The `train` function performs forward and backward passes, updating model weights and tracking loss.

- **Evaluation:**  
  The `evaluate` function computes predictions on the validation set and prints a detailed classification report (precision, recall, F1-score) for each emotion class.

- **Epochs:**  
  The model is trained for the specified number of epochs, with performance metrics displayed after each epoch.

---

## 8. **Saving Model and Artifacts**

After training, the following artifacts are saved for deployment:

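- **Model Weights:** `bilstm_model.pt` (the trained model's `state_dict`)
- **Vocabulary:** `vocab.pkl` (pickled token-to-index dictionary)
- **Label Encoder:** `label_encoder.pkl` (pickled fitted `LabelEncoder`)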

These files are essential for inference and integration with applications such as Streamlit.

---

## 9. **Integration and Deployment**

- The saved model and assets can be loaded in a Streamlit app (`app.py` or `new.py`) for real-time emotion detection and recommendations.
- The modular design allows easy extension to other NLP tasks or integration with external APIs.

---

## 10. **Best Practices and Recommendations**

- **Data Quality:** Ensure `cleaned_data.csv` is properly preprocessed and contains the `cleaned_text` and `emotion` columns that the training cell reads, for optimal model performance.
- **Hyperparameter Tuning:** Experiment with `EMBED_DIM`, `HIDDEN_DIM`, and `EPOCHS` for best results.
- **Model Interpretability:** The attention mechanism provides insights into which words influence predictions.
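- **Scalability:** The script is designed to handle large datasets efficiently using PyTorch’s `DataLoader`, which feeds the model mini-batches of `BATCH_SIZE` examples. Note that all texts are first encoded into in-memory tensors, so the encoded dataset must fit in RAM.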

---

## 11. **Conclusion**

This script forms the backbone of the Vibe Bot’s emotion classification engine. It demonstrates best practices in NLP preprocessing, deep learning model construction, and artifact management for deployment. The approach is robust, scalable, and ready for integration into production systems.

---

**For further details or live demonstrations, refer to the Streamlit application and associated deployment scripts in the project directory.**

In [4]:

# ========== CONFIG ========== #
# NOTE: these values repeat the standalone configuration cell earlier in the
# notebook; keep the two in sync when tuning.

# Model dimensions
EMBED_DIM = 128   # width of each token embedding vector
HIDDEN_DIM = 64   # LSTM hidden size per direction

# Data / training schedule
MAX_LEN = 100     # every text is truncated or padded to this many token ids
BATCH_SIZE = 32   # examples per DataLoader batch
EPOCHS = 2        # full passes over the training data

# Use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# ========== DATA LOADING ========== #
# The CSV must provide a 'cleaned_text' column (model input) and an 'emotion'
# column (class label); both are read below.
DATA_PATH = "cleaned_data.csv"  # resolved relative to the notebook's working directory
df = pd.read_csv(DATA_PATH)

# Fail early, naming the file, if the expected schema is not present.
missing_columns = {'cleaned_text', 'emotion'} - set(df.columns)
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}")

# read_csv loads empty cells as NaN (a float). A NaN text would crash
# tokenize() on `.lower()`, and a row without a label cannot be used for
# supervised training, so drop both kinds of row. `.copy()` keeps the column
# assignment below from writing into a derived frame.
df = df.dropna(subset=['cleaned_text', 'emotion']).copy()

le = LabelEncoder()  # maps string emotion labels to integer class ids
df['emotion'] = le.fit_transform(df['emotion'])

# Train-validation split (stratified so both parts keep the class proportions)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['cleaned_text'], df['emotion'], test_size=0.2, stratify=df['emotion'], random_state=42)

# Build vocab without torchtext
vocab_dict = {'<pad>': 0, '<unk>': 1}  # ids 0 and 1 are reserved for padding / unknown tokens
index = 2  # next free token id

def tokenize(text):
    """Split text into lowercase word tokens.

    Args:
        text: The text to tokenize. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as "no text"; any other
            non-string value is converted with ``str()`` first.

    Returns:
        A list of lowercase ``\\w+`` runs in order of appearance; an empty
        list when there is no text.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return re.findall(r"\b\w+\b", str(text).lower())

# Give every token seen in the training texts its own id, in first-seen order.
# Only training texts are scanned, so words that appear solely in other texts
# are mapped to '<unk>' when sequences are encoded.
for token in (tok for sentence in train_texts for tok in tokenize(sentence)):
    if token in vocab_dict:
        continue  # already has an id
    vocab_dict[token] = index
    index += 1

# Text to indices
def text_to_sequence(text):
    """Encode text as a list of exactly MAX_LEN vocabulary ids.

    The text is tokenized, cut to at most MAX_LEN tokens, each token is looked
    up in ``vocab_dict`` (falling back to the '<unk>' id), and the result is
    right-padded with the '<pad>' id.
    """
    unk_id = vocab_dict['<unk>']
    pad_id = vocab_dict['<pad>']

    sequence = []
    for word in tokenize(text)[:MAX_LEN]:
        sequence.append(vocab_dict.get(word, unk_id))

    # Right-pad short sequences so every example has the same length
    sequence.extend([pad_id] * (MAX_LEN - len(sequence)))
    return sequence

# --- Training split: encode texts, wrap with labels, batch with shuffling ---
X_train = torch.tensor(list(map(text_to_sequence, train_texts)))
y_train = torch.tensor(train_labels.tolist())
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# --- Validation split: same encoding, batches served in dataset order ---
X_val = torch.tensor(list(map(text_to_sequence, val_texts)))
y_val = torch.tensor(val_labels.tolist())
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# ========== MODEL ========== #
class BiLSTMWithAttention(nn.Module):
    """Bidirectional LSTM classifier that pools token states with learned attention.

    Token ids are embedded, encoded by a bidirectional LSTM, and the per-token
    LSTM outputs are collapsed into a single vector by a softmax-weighted sum
    whose scores come from a linear layer. A final linear layer maps that
    vector to class scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction; the LSTM output is
            ``2 * hidden_dim`` wide because the LSTM is bidirectional.
        output_dim: Number of class scores produced per sequence.
        pad_idx: Token id that marks padding. Positions holding this id are
            excluded from the attention softmax. Defaults to 0, the id this
            notebook assigns to ``'<pad>'``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, return_attention=False):
        """Compute class scores for a batch of padded token-id sequences.

        Args:
            x: Integer tensor of token ids shaped ``(batch, seq_len)``.
            return_attention: When True, also return the attention weights so
                callers can inspect which tokens drove the prediction.

        Returns:
            Class scores shaped ``(batch, output_dim)``; or, when
            ``return_attention`` is True, a ``(scores, attn_weights)`` tuple
            where ``attn_weights`` is ``(batch, seq_len)`` and each row sums to 1.
        """
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)
        scores = self.attention(outputs).squeeze(-1)

        # Padding must not receive attention mass: without this mask a short
        # text padded to a long fixed length spreads most of its softmax over
        # '<pad>' positions.
        pad_mask = x.eq(self.pad_idx)
        # A row made only of padding would turn into softmax(all -inf) = NaN,
        # so such rows are left unmasked (plain softmax over their scores).
        pad_mask = pad_mask & ~pad_mask.all(dim=1, keepdim=True)
        scores = scores.masked_fill(pad_mask, float("-inf"))

        attn_weights = torch.softmax(scores, dim=1)
        context = torch.sum(outputs * attn_weights.unsqueeze(-1), dim=1)
        logits = self.fc(context)
        if return_attention:
            return logits, attn_weights
        return logits

# Seed PyTorch's global RNG so weight initialisation and training-batch
# shuffling repeat from run to run; the train/validation split is already
# pinned with random_state=42. GPU kernels may still introduce small numeric
# differences between runs.
SEED = 42
torch.manual_seed(SEED)

# One embedding row per vocabulary entry, one output score per encoded emotion class
model = BiLSTMWithAttention(len(vocab_dict), embed_dim=EMBED_DIM, hidden_dim=HIDDEN_DIM, output_dim=len(le.classes_)).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()  # expects raw class scores and integer class ids

# ========== TRAINING LOOP ========== #
def train(model, loader):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Uses the module-level `optimizer`, `criterion` and `DEVICE`. The returned
    value is the sum of per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for batch_inputs, batch_labels in loader:
        batch_inputs = batch_inputs.to(DEVICE)
        batch_labels = batch_labels.to(DEVICE)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        batch_loss = criterion(model(batch_inputs), batch_labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

def evaluate(model, loader):
    """Print a per-class classification report for `model` over `loader`.

    Runs the model in eval mode without gradient tracking, takes the arg-max
    class for every example, and prints scikit-learn's classification report
    with the label encoder's class names. Uses the module-level `DEVICE` and
    `le`.

    Args:
        model: The classifier to evaluate; called as ``model(inputs)``.
        loader: Iterable yielding ``(inputs, labels)`` tensor batches.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs.to(DEVICE))
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Pass the full label set explicitly: otherwise the report infers labels
    # from the data and raises when a class is absent from both the targets
    # and the predictions (label count no longer matches `target_names`).
    # zero_division=0 keeps the score at 0 for classes that were never
    # predicted, without emitting a warning for each one.
    print(classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(le.classes_))),
        target_names=le.classes_,
        zero_division=0,
    ))

# Train for the configured number of epochs; after each one report the mean
# training loss and print the validation classification report
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    loss = train(model, train_loader)
    print(f"Train Loss: {loss:.4f}")
    evaluate(model, val_loader)

# Persist the artifacts needed for inference: the model weights, then the
# vocabulary and fitted label encoder as pickles
torch.save(model.state_dict(), "bilstm_model.pt")
for artifact_path, artifact in (("vocab.pkl", vocab_dict), ("label_encoder.pkl", le)):
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)



FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_data.csv'