In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from wordcloud import WordCloud
import re
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
df = pd.read_csv("IMDB Dataset.csv")

In [11]:
stop_words = set(stopwords.words('english'))
remove_html = re.compile(r'<.*?>')
non_alpha_pattern = re.compile(r'[^a-z\s]')

def clean_text(text):
    text = text.lower()
    text = remove_html.sub(' ', text)
    text = non_alpha_pattern.sub('', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [12]:
def process_data_in_chunks(df, chunk_size = 5000):
    cleaned_chunks = []
    for start in range(0, df.shape[0], chunk_size):
        end = start + chunk_size
        chunk = df.iloc[start:end].copy()
        chunk['cleaned_review'] = chunk['review'].apply(clean_text)
        cleaned_chunks.append(chunk)
    return pd.concat(cleaned_chunks, ignore_index = True)

In [13]:
df_cleaned = process_data_in_chunks(df)

In [14]:
df_cleaned['tokens'] = df_cleaned['cleaned_review'].apply(word_tokenize)

In [15]:
train_data, val_data = train_test_split(df_cleaned, test_size = 0.2, random_state = 42)

In [16]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score

In [17]:
def build_vocab_and_mapping(tokenized_texts):
    vocab = set(word for tokens in tokenized_texts for word in tokens)
    word_to_index = {word: idx + 2 for idx, word in enumerate(vocab)}  # Start indexing from 2
    word_to_index['<PAD>'] = 0  # Padding token
    word_to_index['<UNK>'] = 1  # Unknown token
    return word_to_index

In [18]:
def encode_tokens(tokens, word_to_index):
    return [word_to_index.get(token, word_to_index['<UNK>']) for token in tokens]

In [19]:
word_to_index = build_vocab_and_mapping(train_data['tokens'])
train_data['encoded'] = train_data['tokens'].apply(lambda tokens: encode_tokens(tokens, word_to_index))
val_data['encoded'] = val_data['tokens'].apply(lambda tokens: encode_tokens(tokens, word_to_index))

In [20]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts.iloc[idx], dtype=torch.long), self.labels[idx]

In [21]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=0)
    return texts_padded, torch.stack(labels)

In [23]:
labels = {'positive': 1, 'negative': 0}
train_data['target'] = train_data['sentiment'].map(labels)
val_data['target'] = val_data['sentiment'].map(labels)

In [24]:
batch_size = 32
train_dataset = TextDataset(train_data['encoded'], train_data['target'])
val_dataset = TextDataset(val_data['encoded'], val_data['target'])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [25]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [26]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128, output_dim=1):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.sigmoid(self.fc(hidden.squeeze(0)))

In [27]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128, output_dim=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, _) = self.lstm(embedded)
        return self.sigmoid(self.fc(hidden[-1]))

In [29]:
vocab_size = len(word_to_index)
rnn_model = RNNModel(vocab_size).to(device)
lstm_model = LSTMModel(vocab_size).to(device)

In [30]:
next(rnn_model.parameters()).device

device(type='cuda', index=0)

In [41]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=3, device=device):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for texts, labels in train_loader:
            texts = texts.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(texts).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation phase
        val_loss = 0
        # model.eval()
        with torch.inference_mode():
            for texts, labels in val_loader:
                texts = texts.to(device)
                labels = labels.to(device)
                outputs = model(texts).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        print(f'Epoch {epoch+1}, Training Loss: {total_loss / len(train_loader)}, Validation Loss: {val_loss / len(val_loader)}')
    return model

In [42]:
# Loss function and optimizer
criterion = nn.BCELoss()
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=1e-3)
lstm_optimizer = torch.optim.Adam(lstm_model.parameters(), lr=1e-3)

In [43]:
# Train RNN model
print("Training RNN Model")
rnn_model = train_model(rnn_model, train_loader, val_loader, criterion, rnn_optimizer, num_epochs=3)

Training RNN Model
Epoch 1, Training Loss: 0.6957648281097412, Validation Loss: 0.6943879144641157
Epoch 2, Training Loss: 0.6953449727058411, Validation Loss: 0.6936638071514166
Epoch 3, Training Loss: 0.6935484541893006, Validation Loss: 0.6933731950890903


In [44]:
# Train LSTM model
print("\nTraining LSTM Model")
lstm_model = train_model(lstm_model, train_loader, val_loader, criterion, lstm_optimizer, num_epochs=3)


Training LSTM Model
Epoch 1, Training Loss: 0.693178857088089, Validation Loss: 0.6927582183584999
Epoch 2, Training Loss: 0.6913622262001038, Validation Loss: 0.6928665883624896
Epoch 3, Training Loss: 0.6883591244220734, Validation Loss: 0.6911223050885307


In [45]:
def evaluate_model(model, val_loader):
    model.to('cpu')
    model.eval()
    all_preds = []
    all_targets = []

    with torch.inference_mode():
        for texts, labels in val_loader:
            outputs = model(texts).squeeze()
            preds = (outputs > 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(labels.cpu().numpy())

    accuracy = np.mean(np.array(all_preds) == np.array(all_targets))
    print(f'Validation Accuracy: {accuracy}')

In [46]:
# Evaluate RNN model
print("Evaluating RNN Model")
evaluate_model(rnn_model, val_loader)

Evaluating RNN Model
Validation Accuracy: 0.5089


In [47]:
# Evaluate LSTM model
print("\nEvaluating LSTM Model")
evaluate_model(lstm_model, val_loader)


Evaluating LSTM Model
Validation Accuracy: 0.5026
