In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from wordcloud import WordCloud
import re
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
df = pd.read_csv('IMDB Dataset.csv')

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
stop_words = set(stopwords.words('english'))
remove_html = re.compile(r'<.*?>')
non_alpha_pattern = re.compile(r'[^a-z\s]')

def clean_text(text):
    text = text.lower()
    text = remove_html.sub(' ', text)
    text = non_alpha_pattern.sub('', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [12]:
def process_data_in_chunks(df, chunk_size = 5000):
    cleaned_chunks = []
    for start in range(0, df.shape[0], chunk_size):
        end = start + chunk_size
        chunk = df.iloc[start:end].copy()
        chunk['cleaned_review'] = chunk['review'].apply(clean_text)
        cleaned_chunks.append(chunk)
    return pd.concat(cleaned_chunks, ignore_index = True)

In [13]:
df_cleaned = process_data_in_chunks(df)

In [14]:
df_cleaned

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movie right good job wasnt creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,catholic taught parochial elementary schools n...
49998,I'm going to have to disagree with the previou...,negative,im going disagree previous comment side maltin...


In [15]:
df_cleaned['tokens'] = df_cleaned['cleaned_review'].apply(word_tokenize)

In [16]:
train_data, val_data = train_test_split(df_cleaned, test_size = 0.2, random_state = 42)

#### Vanilla RNNs and LSTMs with GloVe Embeddings

In [17]:
import gensim.downloader as api
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score

In [18]:
# Download pre-trained GloVe embeddings
glove_vectors = api.load("glove-wiki-gigaword-100")  # 100-dimensional GloVe embeddings

# Create an embedding matrix for the words in our dataset
embedding_dim = 100
word_to_index = {}
embedding_matrix = []

# Add special tokens
word_to_index['<PAD>'] = 0
embedding_matrix.append(np.zeros(embedding_dim))  # Zero vector for padding

word_to_index['<UNK>'] = 1
embedding_matrix.append(np.random.randn(embedding_dim))  # Random vector for unknown words

index = 2  # Start index for actual words
for word in glove_vectors.key_to_index.keys():
    if word not in word_to_index:
        word_to_index[word] = index
        embedding_matrix.append(glove_vectors[word])
        index += 1

embedding_matrix = np.array(embedding_matrix)

# Function to encode tokens to integers
def encode_tokens(tokens):
    return [word_to_index.get(token, word_to_index['<UNK>']) for token in tokens]

# Assuming `train_data` and `val_data` DataFrames are already defined with 'tokens' and 'target' columns.
train_data['encoded'] = train_data['tokens'].apply(encode_tokens)
val_data['encoded'] = val_data['tokens'].apply(encode_tokens)



In [21]:
labels = {'positive': 1, 'negative': 0}
train_data['target'] = train_data['sentiment'].map(labels)
val_data['target'] = val_data['sentiment'].map(labels)

In [22]:
class MovieData(Dataset):
    def __init__(self, encoded_texts, targets):
        self.encoded_texts = encoded_texts
        self.targets = torch.tensor(targets.values, dtype=torch.long)  # Change to long for classification

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        return torch.tensor(self.encoded_texts.iloc[idx], dtype=torch.long), self.targets[idx]

# Pad sequences for batching
def collate_fn(batch):
    texts, targets = zip(*batch)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=0)
    return texts_padded, torch.stack(targets)

# Create DataLoaders
batch_size = 32
train_dataset = MovieData(train_data['encoded'], train_data['target'])
val_dataset = MovieData(val_data['encoded'], val_data['target'])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [29]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [23]:
# Vanilla RNN Model
class RNNModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=128, num_layers=2, dropout=0.3, num_classes=2):
        super(RNNModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = False  # Freeze embedding layer

        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers,
                          batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)  # Output layer for classification
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.dropout(x)
        x = x[:, -1, :]  # Use the output from the last time step
        x = self.fc(x)
        return x

In [24]:
# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=128, num_layers=2, dropout=0.3, num_classes=2):
        super(LSTMModel, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = False  # Freeze embedding layer

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=False, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)  # Output layer for classification
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = x[:, -1, :]  # Use the output from the last time step
        x = self.fc(x)
        return x

In [53]:
# Initialize models, loss function, and optimizer
num_classes = len(train_data['target'].unique())  # Number of classes

rnn_model = RNNModel(embedding_matrix, num_classes=num_classes).to(device)
lstm_model = LSTMModel(embedding_matrix, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=1e-3)
lstm_optimizer = torch.optim.Adam(lstm_model.parameters(), lr=1e-3)

In [54]:
next(rnn_model.parameters()).device

device(type='cuda', index=0)

In [55]:
# Function to train the model
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=device):

    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0

        for texts, targets in train_loader:
            # Sending data to target device
            texts = texts.to(device)
            targets = targets.to(device)

            # Calculating the loss
            outputs = model(texts)
            loss = criterion(outputs, targets)
            total_loss += loss.item()

            # Gradient descent steps
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Calculate training batch accuracy (average accuracy)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)

        train_accuracy = correct / total

        val_loss = 0
        val_correct = 0
        val_total = 0
        # model.eval()
        with torch.inference_mode():
            for texts, targets in val_loader:
                # Sending data to target device
                texts = texts.to(device)
                targets = targets.to(device)

                outputs = model(texts)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

                # Calculate validation accuracy
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == targets).sum().item()
                val_total += targets.size(0)

        val_accuracy = val_correct / val_total

        print(f'Epoch {epoch+1}, Training Loss: {total_loss / len(train_loader):.4f}, Training Accuracy: {train_accuracy:.4f}, '
              f'Validation Loss: {val_loss / len(val_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}')

In [56]:
# Train the RNN model
print("Training Vanilla RNN Model...")
train_model(rnn_model, train_loader, val_loader, criterion, rnn_optimizer, num_epochs=3)

Training Vanilla RNN Model...
Epoch 1, Training Loss: 0.6989, Training Accuracy: 0.5028, Validation Loss: 0.6987, Validation Accuracy: 0.5001
Epoch 2, Training Loss: 0.6980, Training Accuracy: 0.5021, Validation Loss: 0.6946, Validation Accuracy: 0.5050
Epoch 3, Training Loss: 0.6981, Training Accuracy: 0.5040, Validation Loss: 0.6992, Validation Accuracy: 0.5070


In [57]:
# Train the LSTM model
print("\nTraining LSTM Model...")
train_model(lstm_model, train_loader, val_loader, criterion, lstm_optimizer, num_epochs=3)


Training LSTM Model...
Epoch 1, Training Loss: 0.6932, Training Accuracy: 0.5034, Validation Loss: 0.6926, Validation Accuracy: 0.5087
Epoch 2, Training Loss: 0.6915, Training Accuracy: 0.5026, Validation Loss: 0.7149, Validation Accuracy: 0.4990
Epoch 3, Training Loss: 0.6739, Training Accuracy: 0.5484, Validation Loss: 0.6411, Validation Accuracy: 0.6686


In [58]:
# Evaluation function for the models
def evaluate_model(model, val_loader):
    model.to('cpu')
    model.eval()
    all_preds = []
    all_targets = []

    with torch.inference_mode():
        for texts, targets in val_loader:
            outputs = model(texts)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    accuracy = accuracy_score(all_targets, all_preds)
    print(f'Validation Accuracy: {accuracy:.4f}')

In [59]:
# Evaluate the RNN model
print("Evaluating Vanilla RNN Model...")
evaluate_model(rnn_model, val_loader)

Evaluating Vanilla RNN Model...
Validation Accuracy: 0.5039


In [60]:
# Evaluate the LSTM model
print("\nEvaluating LSTM Model...")
evaluate_model(lstm_model, val_loader)


Evaluating LSTM Model...
Validation Accuracy: 0.6706
