In [1]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [2]:
import numpy as np
import pandas as pd

In [3]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
import spacy
import nltk
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split, ParameterGrid

In [5]:
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [6]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
TOKENIZER_CHOICE = 'spacy'

In [8]:
if TOKENIZER_CHOICE == 'spacy':
    spacy_en = spacy.load("en_core_web_sm")
    def tokenizer(text):
        return [tok.text.lower() for tok in spacy_en.tokenizer(text) if tok.text.isalpha() and tok.text not in stop_words]
else:
    def tokenizer(text):
        return [word.lower() for word in nltk.word_tokenize(text) if word.isalpha() and word not in stop_words]

In [9]:
import os

def load_data(csv_file):
    df = pd.read_csv(csv_file)
    df = pd.read_csv(csv_file, usecols=['review', 'sentiment'])
    df.dropna(inplace=True)
    df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
    return df

def preprocess_data(df):
    df['tokens'] = df['review'].apply(tokenizer)
    return df

In [10]:
import gensim.downloader as api
glove = api.load("glove-wiki-gigaword-300")



In [11]:
def create_embedding_matrix(vocab):
    embedding_dim = 300
    embedding_matrix = torch.zeros(len(vocab) + 1, embedding_dim)
    for word, i in vocab.items():
        if word in glove:
            embedding_matrix[i] = torch.tensor(glove[word], dtype=torch.float32)
    return embedding_matrix

In [12]:
class IMDBDataset(Dataset):
    def __init__(self, df, vocab):
        self.data = df
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data.iloc[idx]['tokens']
        label = self.data.iloc[idx]['label']
        indexed = [self.vocab.get(word, 0) for word in tokens]
        return torch.tensor(indexed, dtype=torch.long), torch.tensor(label, dtype=torch.float32)


In [13]:
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    inputs, labels = zip(*batch)
    inputs = [torch.tensor(seq, dtype=torch.long) for seq in inputs]
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    labels = torch.tensor(labels, dtype=torch.float32)
    return inputs_padded, labels

In [14]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, embedding_weights=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if embedding_weights is not None:
            self.embedding.weight.data.copy_(embedding_weights)
            self.embedding.weight.requires_grad = False
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2 if bidirectional else hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.rnn(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1) if self.rnn.bidirectional else hidden[-1,:,:]
        hidden = self.batch_norm(hidden)
        return self.fc(hidden)

In [15]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

In [16]:
def train_and_evaluate(params, embedding_matrix, train_loader, val_loader, test_loader, vocab_size, patience=3):
    model = SentimentRNN(vocab_size, params['embedding_dim'], params['hidden_dim'], 1, params['n_layers'], params['bidirectional'], params['dropout'], embedding_matrix).to(device)
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])
    criterion = nn.BCEWithLogitsLoss().to(device)
    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(5):
        model.train()
        epoch_loss, epoch_acc = 0, 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(inputs).squeeze(1)
            loss = criterion(predictions, labels)
            if torch.isnan(loss):
                continue
            acc = binary_accuracy(predictions, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += acc.item()

        val_loss, val_acc = 0, 0
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                predictions = model(inputs).squeeze(1)
                loss = criterion(predictions, labels)
                if torch.isnan(loss):
                    continue
                acc = binary_accuracy(predictions, labels)
                val_loss += loss.item()
                val_acc += acc.item()

        val_loss /= len(val_loader)
        val_acc /= len(val_loader)
        print(f'Epoch {epoch+1}: Train Loss = {epoch_loss / len(train_loader):.4f}, Train Acc = {epoch_acc / len(train_loader):.4f}, Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}')

        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered!")
                break

    return model

In [17]:
def test_model(model, test_loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            predictions = model(inputs).squeeze(1)
            correct += (torch.round(torch.sigmoid(predictions)) == labels).sum().item()
            total += labels.size(0)

    return correct / total


In [18]:
param_grid = {
    'embedding_dim': [300],
    'hidden_dim': [256],
    'n_layers': [4],
    'bidirectional': [True],
    'dropout': [0.2, 0.5],
    'lr': [0.001, 0.0005]
}

In [19]:
df = load_data('IMDB_Dataset.csv')

In [20]:
df.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1.0
1,A wonderful little production. <br /><br />The...,positive,1.0
2,I thought this was a wonderful way to spend ti...,positive,1.0
3,Basically there's a family where a little boy ...,negative,0.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1.0


In [21]:
df.shape

(49999, 3)

In [22]:
df = preprocess_data(df)
vocab = {word: i for i, word in enumerate(set(word for tokens in df['tokens'] for word in tokens), 1)}
embedding_matrix = create_embedding_matrix(vocab)

train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=SEED)

train_ds = IMDBDataset(train_df, vocab)
val_ds = IMDBDataset(val_df, vocab)
test_ds = IMDBDataset(test_df, vocab)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=64, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=64, collate_fn=collate_fn)

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
best_model = None
best_acc = 0
for params in ParameterGrid(param_grid):
    model = train_and_evaluate(params, embedding_matrix, train_loader, val_loader, test_loader, vocab_size=len(vocab) + 1)
    acc = test_model(model, test_loader)
    if acc > best_acc:
        best_acc = acc
        best_model = model

print(f'Best Test Accuracy: {best_acc * 100:.2f}%')

  inputs = [torch.tensor(seq, dtype=torch.long) for seq in inputs]


Epoch 1: Train Loss = 0.5009, Train Acc = 0.7439, Val Loss = 0.3461, Val Acc = 0.8543
Epoch 2: Train Loss = 0.3385, Train Acc = 0.8522, Val Loss = 0.3381, Val Acc = 0.8566
Epoch 3: Train Loss = 0.2974, Train Acc = 0.8720, Val Loss = 0.2896, Val Acc = 0.8834
Epoch 4: Train Loss = 0.2716, Train Acc = 0.8829, Val Loss = 0.2853, Val Acc = 0.8749
Epoch 5: Train Loss = 0.2794, Train Acc = 0.8833, Val Loss = 0.4726, Val Acc = 0.8389
Epoch 1: Train Loss = 0.5274, Train Acc = 0.7329, Val Loss = 0.7788, Val Acc = 0.7190
Epoch 2: Train Loss = 0.4995, Train Acc = 0.7469, Val Loss = 0.3841, Val Acc = 0.8382
Epoch 3: Train Loss = 0.3434, Train Acc = 0.8500, Val Loss = 0.3990, Val Acc = 0.8242
Epoch 4: Train Loss = 0.3166, Train Acc = 0.8635, Val Loss = 0.5091, Val Acc = 0.7962
Epoch 5: Train Loss = 0.3015, Train Acc = 0.8704, Val Loss = 0.2945, Val Acc = 0.8794
Epoch 1: Train Loss = 0.5278, Train Acc = 0.7348, Val Loss = 0.7327, Val Acc = 0.6564
Epoch 2: Train Loss = 0.3707, Train Acc = 0.8326, Val 