In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter
import re
import string

In [2]:
# Step 1: Data Preprocessing
class TextPreprocessor:
    def __init__(self, max_vocab_size=10000, max_seq_length=100):
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.word2idx = {"<PAD>": 0, "<UNK>": 1}
        self.idx2word = {0: "<PAD>", 1: "<UNK>"}
        self.word_counts = Counter()
        self.vocab_size = 2  # Starting with PAD and UNK tokens

    def clean_text(self, text):
        """Basic text cleaning"""
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation
        text = re.sub(f'[{string.punctuation}]', ' ', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    def fit(self, texts):
        """Build vocabulary from texts"""
        # Clean and tokenize all texts
        for text in texts:
            clean_text = self.clean_text(text)
            tokens = clean_text.split()
            self.word_counts.update(tokens)
        
        # Keep only the most common words (minus PAD and UNK which we already have)
        vocab_words = [word for word, count in self.word_counts.most_common(self.max_vocab_size - 2)]
        
        # Create word to index mapping
        for word in vocab_words:
            self.word2idx[word] = self.vocab_size
            self.idx2word[self.vocab_size] = word
            self.vocab_size += 1
            
        print(f"Vocabulary size: {self.vocab_size}")
    
    def transform(self, texts):
        """Convert texts to sequences of indices"""
        sequences = []
        for text in texts:
            clean_text = self.clean_text(text)
            tokens = clean_text.split()
            # Truncate if longer than max_seq_length
            if len(tokens) > self.max_seq_length:
                tokens = tokens[:self.max_seq_length]
            
            # Convert tokens to indices
            seq = [self.word2idx.get(word, self.word2idx["<UNK>"]) for word in tokens]
            sequences.append(seq)
        
        return sequences

In [3]:
# Step 2: Create PyTorch Dataset
class TextClassificationDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx]), torch.tensor(self.labels[idx])

In [4]:
# Step 3: Create collate function for batching
def collate_fn(batch):
    """Collate function to pad sequences in a batch"""
    texts, labels = zip(*batch)
    
    # Pad sequences to the length of the longest sequence in the batch
    padded_texts = pad_sequence([text for text in texts], batch_first=True, padding_value=0)
    
    return padded_texts, torch.tensor(labels)

In [5]:
from lstm import LSTM
# Step 4: Build LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1, 
                 bidirectional=False, dropout=0.5):
        super().__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # LSTM layer
        self.lstm = LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           )
        
        # Calculate the size of the output from LSTM
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)
        
        # Fully connected layer
        self.fc = nn.Linear(lstm_output_dim, output_dim)
        
    def forward(self, text):
        # text = [batch size, sentence length]
        
        # Embed the words
        embedded = self.embedding(text)
        # embedded = [batch size, sentence length, embedding dim]
        
        # Pass through LSTM
        output, (hidden, cell) = self.lstm(embedded)
        # If bidirectional, concatenate the final forward and backward hidden states
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden = hidden[-1,:,:]
        
        # Apply dropout
        hidden = self.dropout(hidden)
        
        # Pass through fully connected layer
        return self.fc(hidden)

In [6]:
# Step 5: Training Function
def train_model(model, train_loader, valid_loader, criterion, optimizer, device, num_epochs=5):
    # Initialize best validation loss
    best_valid_loss = float('inf')
    
    # Training history
    train_losses = []
    valid_losses = []
    train_accs = []
    valid_accs = []
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        epoch_train_loss = 0
        epoch_train_acc = 0
        train_samples = 0
        
        for batch_idx, (texts, labels) in enumerate(train_loader):
            texts, labels = texts.to(device), labels.to(device)
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            predictions = model(texts)
            
            # Calculate loss
            loss = criterion(predictions, labels)
            
            # Backward pass
            loss.backward()
            
            # Update parameters
            optimizer.step()
            
            # Calculate accuracy
            predictions_class = torch.argmax(predictions, dim=1)
            correct = (predictions_class == labels).float().sum()
            
            # Update metrics
            epoch_train_loss += loss.item() * len(labels)
            epoch_train_acc += correct.item()
            train_samples += len(labels)
            
            if (batch_idx + 1) % 10 == 0:
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{len(train_loader)}, ' 
                      f'Loss: {loss.item():.4f}, Acc: {correct.item()/len(labels):.4f}')
        
        # Calculate average loss and accuracy for the epoch
        epoch_train_loss /= train_samples
        epoch_train_acc /= train_samples
        
        # Validation
        model.eval()
        epoch_valid_loss = 0
        epoch_valid_acc = 0
        valid_samples = 0
        
        with torch.no_grad():
            for texts, labels in valid_loader:
                texts, labels = texts.to(device), labels.to(device)
                
                # Forward pass
                predictions = model(texts)
                
                # Calculate loss
                loss = criterion(predictions, labels)
                
                # Calculate accuracy
                predictions_class = torch.argmax(predictions, dim=1)
                correct = (predictions_class == labels).float().sum()
                
                # Update metrics
                epoch_valid_loss += loss.item() * len(labels)
                epoch_valid_acc += correct.item()
                valid_samples += len(labels)
        
        # Calculate average validation loss and accuracy
        epoch_valid_loss /= valid_samples
        epoch_valid_acc /= valid_samples
        
        # Save the best model
        if epoch_valid_loss < best_valid_loss:
            best_valid_loss = epoch_valid_loss
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'Model saved with validation loss: {best_valid_loss:.4f}')
        
        # Update history
        train_losses.append(epoch_train_loss)
        valid_losses.append(epoch_valid_loss)
        train_accs.append(epoch_train_acc)
        valid_accs.append(epoch_valid_acc)
        
        # Print epoch statistics
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.4f}')
        print(f'Valid Loss: {epoch_valid_loss:.4f}, Valid Acc: {epoch_valid_acc:.4f}')
        print('-' * 60)
    
    return {
        'train_losses': train_losses,
        'valid_losses': valid_losses,
        'train_accs': train_accs,
        'valid_accs': valid_accs
    }

In [7]:
# Step 6: Evaluation Function
def evaluate_model(model, test_loader, criterion, device, label_names=None):
    model.eval()
    test_loss = 0
    test_acc = 0
    test_samples = 0
    
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            
            # Forward pass
            predictions = model(texts)
            
            # Calculate loss
            loss = criterion(predictions, labels)
            
            # Calculate accuracy
            predictions_class = torch.argmax(predictions, dim=1)
            correct = (predictions_class == labels).float().sum()
            
            # Collect predictions and labels for classification report
            all_predictions.extend(predictions_class.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
            # Update metrics
            test_loss += loss.item() * len(labels)
            test_acc += correct.item()
            test_samples += len(labels)
    
    # Calculate average test loss and accuracy
    test_loss /= test_samples
    test_acc /= test_samples
    
    print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
    
    # Print classification report
    if label_names is not None:
        print('\nClassification Report:')
        print(classification_report(all_labels, all_predictions, target_names=label_names))
    else:
        print('\nClassification Report:')
        print(classification_report(all_labels, all_predictions))
    
    return test_loss, test_acc

In [8]:
df = pd.read_csv('../dataset/rotten_tomatoes_movies.csv')
df = df.dropna()
texts = df['critics_consensus'].values

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(df['tomatometer_status'].unique())
label_names = le.classes_
labels = le.transform(df['tomatometer_status'].values)

In [9]:
# Step 7: Main function to run everything
def main():
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )
    
    # Further split training data into train and validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
    )
    
    # Preprocess text data
    preprocessor = TextPreprocessor(max_vocab_size=5000, max_seq_length=50)
    preprocessor.fit(X_train)
    
    # Transform texts to sequences
    X_train_seq = preprocessor.transform(X_train)
    X_val_seq = preprocessor.transform(X_val)
    X_test_seq = preprocessor.transform(X_test)
    
    # Create datasets
    train_dataset = TextClassificationDataset(X_train_seq, y_train)
    val_dataset = TextClassificationDataset(X_val_seq, y_val)
    test_dataset = TextClassificationDataset(X_test_seq, y_test)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)
    
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Model hyperparameters
    VOCAB_SIZE = preprocessor.vocab_size
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 128
    OUTPUT_DIM = len(set(labels))  # Number of unique classes
    N_LAYERS = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.5
    
    # Initialize model
    model = LSTMClassifier(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=OUTPUT_DIM,
        n_layers=N_LAYERS,
        bidirectional=BIDIRECTIONAL,
        dropout=DROPOUT
    )
    
    # Move model to device
    model = model.to(device)
    
    # Print model summary
    print(model)
    
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Train the model
    history = train_model(
        model=model, 
        train_loader=train_loader, 
        valid_loader=val_loader, 
        criterion=criterion, 
        optimizer=optimizer, 
        device=device, 
        num_epochs=10
    )
    
    # Load best model
    model.load_state_dict(torch.load('best_model.pt'))
    
    # Evaluate on test set
    test_loss, test_acc = evaluate_model(
        model=model, 
        test_loader=test_loader, 
        criterion=criterion, 
        device=device,
        label_names=label_names
    )
    
    print(f"Final Test Accuracy: {test_acc:.4f}")

In [10]:
main()

Vocabulary size: 5000
Using device: cpu
LSTMClassifier(
  (embedding): Embedding(5000, 100, padding_idx=0)
  (lstm): LSTM(
    (layers_forward): ModuleList(
      (0): LSTMCell(
        (W_x): Linear(in_features=100, out_features=512, bias=True)
        (W_h): Linear(in_features=128, out_features=512, bias=True)
      )
      (1): LSTMCell(
        (W_x): Linear(in_features=128, out_features=512, bias=True)
        (W_h): Linear(in_features=128, out_features=512, bias=True)
      )
    )
    (layers_backward): ModuleList(
      (0): LSTMCell(
        (W_x): Linear(in_features=100, out_features=512, bias=True)
        (W_h): Linear(in_features=128, out_features=512, bias=True)
      )
      (1): LSTMCell(
        (W_x): Linear(in_features=128, out_features=512, bias=True)
        (W_h): Linear(in_features=128, out_features=512, bias=True)
      )
    )
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=3, bias=True)
)
Epoch 1/10, Batch 10/182, Lo

  model.load_state_dict(torch.load('best_model.pt'))


Test Loss: 0.8271, Test Acc: 0.6448

Classification Report:
                 precision    recall  f1-score   support

Certified-Fresh       0.61      0.71      0.65       550
          Fresh       0.34      0.24      0.29       382
         Rotten       0.80      0.81      0.81       684

       accuracy                           0.64      1616
      macro avg       0.58      0.59      0.58      1616
   weighted avg       0.62      0.64      0.63      1616

Final Test Accuracy: 0.6448
