In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokenizer models and stopword list used by the text preprocessing cell.
nltk.download('punkt')
# NLTK >= 3.8.2 loads the Punkt models from the 'punkt_tab' package instead of
# the pickled 'punkt' one, so fetch both to keep word_tokenize working on
# either side of that change.
nltk.download('punkt_tab')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Colab-only step: mount Google Drive so the file paths under /content/drive
# used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [9]:
# Location of the reviews CSV on the mounted Drive; read with pd.read_csv below.
dataset_path = '/content/drive/My Drive/Colab Notebooks/IMDB_Dataset.csv'


In [10]:
# Load the dataset; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv(dataset_path)


In [12]:
# Preprocessing the text
def preprocess_text(text):
    """Clean a raw review string and split it into lowercase word tokens.

    Args:
        text: Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # Replace tags with a space rather than '' so that text like
    # "great.<br /><br />But" does not collapse into the single token "greatbut".
    text = re.sub(r'<.*?>', ' ', text)
    # Drop punctuation (anything that is neither a word character nor whitespace).
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return word_tokenize(text)

df['review'] = df['review'].apply(preprocess_text)

In [13]:
# Encoding labels: 'positive' -> 1, 'negative' -> 0.
# NOTE: Series.map yields NaN for any value missing from the dict, so running
# this cell a second time (on already-encoded labels) turns the column into NaN.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


In [14]:
# Split the data: 80% train / 20% test. random_state fixes the shuffle so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)



In [17]:
# Pre-trained GloVe vectors (text format); used as the default argument of
# load_glove_embeddings below.
glove_file_path = '/content/drive/My Drive/Colab Notebooks/glove.6B.100d.txt'


In [18]:
def load_glove_embeddings(glove_file=glove_file_path):
    """Read a GloVe text file into a ``{word: float32 vector}`` dictionary.

    Every line is split on whitespace: the first field is the word and the
    remaining fields are parsed as its vector components.

    Args:
        glove_file: Path to the UTF-8 encoded GloVe text file.

    Returns:
        dict: Maps each word to a 1-D ``float32`` numpy array.
    """
    word_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as handle:
        # tqdm only wraps the file iterator to show a running line count.
        for row in tqdm(handle):
            fields = row.split()
            word_vectors[fields[0]] = np.array(fields[1:], dtype='float32')
    return word_vectors

glove_embeddings = load_glove_embeddings()


400000it [00:14, 27169.17it/s]


In [19]:
# Prepare embedding matrix and word2idx mapping
vocab = set([word for text in X_train for word in text])
# Enumerate the vocabulary in sorted order: iteration order of a set of strings
# changes between interpreter runs (hash randomisation), which would give every
# word a different index - and a different embedding row - on each run.
# Index 0 is left unassigned; the dataset uses it for padding and unknown words.
word2idx = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}
# Row 0 and the rows of words that have no GloVe vector stay all-zero.
embedding_matrix = np.zeros((len(vocab) + 1, 100))

for word, idx in word2idx.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

In [20]:
# Custom Dataset Class
class IMDBDataset(Dataset):
    """Serves (token-index tensor, label tensor) pairs for tokenised reviews.

    Each review is truncated to ``max_len`` tokens and right-padded with 0.
    Tokens missing from ``word2idx`` are also encoded as 0.

    Args:
        texts: pandas Series of token lists (accessed positionally via ``.iloc``).
        labels: pandas Series of labels aligned with ``texts``.
        word2idx: Mapping from token to integer index.
        max_len: Fixed length of every returned index sequence.
    """

    def __init__(self, texts, labels, word2idx, max_len=100):
        self.texts, self.labels = texts, labels
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts.iloc[idx][:self.max_len]
        token_ids = [self.word2idx.get(token, 0) for token in tokens]
        # Right-pad with zeros up to the fixed sequence length.
        token_ids.extend([0] * (self.max_len - len(token_ids)))
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(self.labels.iloc[idx], dtype=torch.float32)
        return features, target

In [21]:
# Create data loaders
# Both datasets encode tokens with the same word2idx (built from X_train);
# max_len is left at the dataset's default.
train_dataset = IMDBDataset(X_train, y_train, word2idx)
test_dataset = IMDBDataset(X_test, y_test, word2idx)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [22]:
# Vanilla RNN Model
class VanillaRNNModel(nn.Module):
    """Binary classifier: embedding -> single-layer nn.RNN -> linear -> sigmoid.

    Args:
        embedding_matrix: 2-D array of shape (vocab_size, embedding_dim) used to
            initialise the trainable embedding layer, or None to learn the
            embeddings from a random initialisation.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of output units.
        vocab_size: Number of embedding rows. Required only when
            ``embedding_matrix`` is None.
        embedding_dim: Embedding width. Used only when ``embedding_matrix`` is
            None; otherwise the width is taken from the matrix.

    Raises:
        ValueError: If both ``embedding_matrix`` and ``vocab_size`` are None.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, output_dim=1,
                 vocab_size=None, embedding_dim=100):
        super(VanillaRNNModel, self).__init__()
        if embedding_matrix is None:
            # No pretrained vectors: learn the embedding table from scratch.
            if vocab_size is None:
                raise ValueError(
                    "vocab_size is required when embedding_matrix is None"
                )
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        else:
            # torch.tensor copies, so training never writes into the caller's array.
            weights = torch.tensor(embedding_matrix, dtype=torch.float32)
            embedding_dim = weights.shape[1]
            self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token indices to (batch, output_dim) probabilities."""
        embedded = self.embedding(x)
        _, hidden = self.rnn(embedded)
        # hidden is (num_layers, batch, hidden_dim); take the last layer's final state.
        out = self.fc(hidden[-1])
        return self.sigmoid(out)

In [23]:
# LSTM Model
class LSTMModel(nn.Module):
    """Binary classifier: embedding -> single-layer nn.LSTM -> linear -> sigmoid.

    Args:
        embedding_matrix: 2-D array of shape (vocab_size, embedding_dim) used to
            initialise the trainable embedding layer, or None to learn the
            embeddings from a random initialisation.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units.
        vocab_size: Number of embedding rows. Required only when
            ``embedding_matrix`` is None.
        embedding_dim: Embedding width. Used only when ``embedding_matrix`` is
            None; otherwise the width is taken from the matrix.

    Raises:
        ValueError: If both ``embedding_matrix`` and ``vocab_size`` are None.
    """

    def __init__(self, embedding_matrix, hidden_dim=128, output_dim=1,
                 vocab_size=None, embedding_dim=100):
        super(LSTMModel, self).__init__()
        if embedding_matrix is None:
            # No pretrained vectors: learn the embedding table from scratch.
            if vocab_size is None:
                raise ValueError(
                    "vocab_size is required when embedding_matrix is None"
                )
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        else:
            # torch.tensor copies, so training never writes into the caller's array.
            weights = torch.tensor(embedding_matrix, dtype=torch.float32)
            embedding_dim = weights.shape[1]
            self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token indices to (batch, output_dim) probabilities."""
        embedded = self.embedding(x)
        # The LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        _, (hidden, _) = self.lstm(embedded)
        # hidden is (num_layers, batch, hidden_dim); take the last layer's final state.
        out = self.fc(hidden[-1])
        return self.sigmoid(out)

In [24]:
# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: Module mapping a batch of inputs to outputs of shape (batch, 1).
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        epoch_loss = 0
        for texts, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(texts)
            # Drop only the trailing unit dimension. A bare squeeze() would also
            # drop the batch dimension of a size-1 batch, and the loss would then
            # reject the (scalar vs. length-1) shape mismatch.
            loss = criterion(outputs.squeeze(-1), labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        mean_loss = epoch_loss / len(train_loader)
        epoch_losses.append(mean_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {mean_loss}')
    return epoch_losses

In [25]:
# Evaluation function
def evaluate_model(model, test_loader):
    """Print the accuracy of ``model`` over all batches of ``test_loader``.

    Predictions are the model outputs rounded to 0 or 1. The model is switched
    to eval mode and gradients are disabled while predicting.

    Args:
        model: Module mapping a batch of inputs to probabilities of shape (batch, 1).
        test_loader: Iterable of ``(inputs, labels)`` batches.
    """
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for texts, labels in test_loader:
            outputs = model(texts)
            # Drop only the trailing unit dimension. A bare squeeze() turns a
            # size-1 batch into a 0-d tensor, which extend() cannot iterate over.
            preds = outputs.squeeze(-1).round()
            y_true.extend(labels.numpy())
            y_pred.extend(preds.numpy())
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Test Accuracy: {accuracy:.4f}')

In [None]:
# Task 1: Vanilla RNN with GloVe
# The embedding layer starts from the GloVe matrix and is fine-tuned during
# training (the model builds it with freeze=False).
vanilla_rnn_model = VanillaRNNModel(embedding_matrix)
# BCELoss expects probabilities; the model's final sigmoid provides them.
criterion = nn.BCELoss()
optimizer = optim.Adam(vanilla_rnn_model.parameters(), lr=0.001)
train_model(vanilla_rnn_model, train_loader, criterion, optimizer, epochs=5)
evaluate_model(vanilla_rnn_model, test_loader)


Epoch 1/5, Loss: 0.6919156236171723
Epoch 2/5, Loss: 0.6688818548202514


In [None]:
# Task 2: LSTM with GloVe
# Same setup as the vanilla RNN run, with the recurrent layer swapped for an LSTM.
lstm_model = LSTMModel(embedding_matrix)
# BCELoss expects probabilities; the model's final sigmoid provides them.
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
train_model(lstm_model, train_loader, criterion, optimizer, epochs=5)
evaluate_model(lstm_model, test_loader)

In [None]:
# Task 3: Vanilla RNN with on-the-fly embeddings
# The model reads `.shape` from its embedding argument, so passing None raises
# AttributeError. Start instead from a random N(0, 1) matrix with the same shape
# as the GloVe one; the embedding layer is trainable (freeze=False), so these
# vectors are learned from scratch during training. The seed keeps runs repeatable.
vanilla_rnn_onthefly = VanillaRNNModel(
    embedding_matrix=np.random.default_rng(42).standard_normal(embedding_matrix.shape)
)
criterion = nn.BCELoss()
optimizer = optim.Adam(vanilla_rnn_onthefly.parameters(), lr=0.001)
train_model(vanilla_rnn_onthefly, train_loader, criterion, optimizer, epochs=5)
evaluate_model(vanilla_rnn_onthefly, test_loader)

In [None]:
# Task 3: LSTM with on-the-fly embeddings
# The model reads `.shape` from its embedding argument, so passing None raises
# AttributeError. Start instead from a random N(0, 1) matrix with the same shape
# as the GloVe one; the embedding layer is trainable (freeze=False), so these
# vectors are learned from scratch during training. The seed keeps runs repeatable.
lstm_onthefly = LSTMModel(
    embedding_matrix=np.random.default_rng(42).standard_normal(embedding_matrix.shape)
)
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm_onthefly.parameters(), lr=0.001)
train_model(lstm_onthefly, train_loader, criterion, optimizer, epochs=5)
evaluate_model(lstm_onthefly, test_loader)