<a href="https://colab.research.google.com/github/og-hayden/ai-ml/blob/main/Multiclass_Sentiment_Analysis_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiclass Sentiment Analysis with RNN

In [232]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [233]:
device = "cuda" if torch.cuda.is_available else "cpu"

# 1. Preprocessing the Data

In [234]:
from torch.utils.data import Dataset

# Initializing custom dataset from training data .txt files
class EmotionData(Dataset):
  def __init__(self, folder):
    self.data = []
    self.labels = []
    self.label_to_index = {'joy': 0, 'sadness': 1, 'anger': 2}
    self.index_to_label = self.index_to_label = {v: k for k, v in self.label_to_index.items()} # reverse of `self.label_to_index`

    # Loop over files and add them to dataset with correct labeling
    for label in self.label_to_index:
      file_path = f"/{folder}/{label}.txt" # File path for each emotion
      with open(file_path, 'r') as file:
        words = file.read().splitlines() # Returns list of words
        self.data.extend(words) # Add `words` list to data
        self.labels.extend([self.label_to_index[label]] * len(words)) # Add corrosponding labels to data

  # Length method
  def __len__(self):
    return len(self.data)

  # Search dataset by index method
  def __getitem__(self, index):
    return self.data[index], self.labels[index]

In [235]:
# Initializing the datasets
training_dataset = EmotionData("training-data")
testing_dataset = EmotionData("testing-data")

In [236]:
# Number of characters in longest word
max_train_length = max(len(word) for word, _ in training_dataset)
max_test_length = max(len(word) for word, _ in testing_dataset)
max_length = max(max_train_length, max_test_length)

In [237]:
from torch.nn.utils.rnn import pad_sequence

CHARS = sorted(set("abcdefghijklmnopqrstuvwxyz")) # List of possible characters

def word_to_matrix(word, chars=CHARS, max_length=max_length):
    # Initialize a zero tensor with the shape of (max_length, len(chars))
    tensor = torch.zeros((max_length, len(chars)), dtype=torch.float)
    for i, letter in enumerate(word.lower()):
        if letter in chars:
            tensor[i, chars.index(letter)] = 1
    return tensor

In [238]:
word_to_matrix("Hello")

tensor([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0.,

# 2. Defining the Model

In [239]:
class EmotionRNN(nn.Module):
  def __init__(self):
    super().__init__()
    self.rnn = nn.RNN(input_size=26, hidden_size=128, batch_first=True) # `batch_first=True` denotes an input shape representing (batch_size, seq_length, features)
    self.fc = nn.Linear(in_features=128, out_features=3) # Outputs logits
  def forward(self, x):
    out, _ = self.rnn(x)
    out = self.fc(out[:, -1, :]) # Get last step output for classification (batch_size, seq_length, features)
    return out

In [240]:
model = EmotionRNN().to(device)

# 3. Picking a Loss Function and Optimizer

In [241]:
loss_fn = nn.CrossEntropyLoss() # Input: raw logit vector, true label
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=0.01)

# 4. Building a Training Loop

In [242]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(training_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(testing_dataset, batch_size=64, shuffle=True)

epochs = 500

for epoch in range(epochs):
  train_loss = 0
  test_loss = 0

  # Model training
  model.train()
  for words, labels in train_dataloader:
    # Convert words to one-hot matrices
    words_tensor = pad_sequence([word_to_matrix(w) for w in words],
                                batch_first=True).to(device)
    labels = labels.to(dtype=torch.long, device=device)

    outputs = model(words_tensor)
    loss = loss_fn(outputs, labels)
    train_loss = train_loss + loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # Model testing
  model.eval()
  for t_words, labels in test_dataloader:
    # Convert words to one-hot matrices
    words_tensor = pad_sequence([word_to_matrix(w) for w in t_words],
                                batch_first=True).to(device)
    labels = labels.to(dtype=torch.long, device=device)
    with torch.inference_mode():
      outputs = model(words_tensor)
    loss = loss_fn(outputs, labels)
    test_loss = test_loss + loss.item()

  # Normalizing loss
  train_loss /= len(words)
  test_loss /= len(t_words)

  # Printing losses
  if (epoch+1) % 50 == 0:
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

Epoch [50/500], Loss: 0.1960, Test Loss: 0.2238
Epoch [100/500], Loss: 0.2016, Test Loss: 0.2282
Epoch [150/500], Loss: 0.2088, Test Loss: 0.2324
Epoch [200/500], Loss: 0.1995, Test Loss: 0.2230
Epoch [250/500], Loss: 0.2003, Test Loss: 0.2467
Epoch [300/500], Loss: 0.2058, Test Loss: 0.2214
Epoch [350/500], Loss: 0.2011, Test Loss: 0.2338
Epoch [400/500], Loss: 0.2078, Test Loss: 0.2729
Epoch [450/500], Loss: 0.2054, Test Loss: 0.2253
Epoch [500/500], Loss: 0.2050, Test Loss: 0.2273
