In [8]:
import pandas as pd
import numpy as np
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
import pandas as pd

df = pd.read_csv("IMDB Dataset.csv", engine='python', on_bad_lines='skip', quotechar='"', escapechar='\\')
print(df.shape)
print(df.head())

(50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [10]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text

df['cleaned'] = df['review'].apply(clean_text)
df['label'] = df['sentiment'].map({'positive':1, 'negative':0})

# Build vocabulary from training text
all_words = ' '.join(df['cleaned']).split()
counts = Counter(all_words)
MAX_VOCAB = 20000
vocab_list = [w for w,_ in counts.most_common(MAX_VOCAB)]
vocab = {w:i+1 for i,w in enumerate(vocab_list)}  # index 0 = padding

def encode(text):
    return [vocab.get(w, 0) for w in text.split()]

df['encoded'] = df['cleaned'].apply(encode)

In [11]:
import numpy as np

MAX_LEN = 200
def pad(seq, max_len=MAX_LEN):
    return seq[:max_len] + [0]*(max_len-len(seq))

df['padded'] = df['encoded'].apply(lambda x: pad(x, MAX_LEN))

X = np.array(df['padded'].tolist())
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class IMDBDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = IMDBDataset(X_train, y_train)
test_ds = IMDBDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=128)

In [13]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, output_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        out, (h, c) = self.lstm(x)
        h = self.dropout(h[-1])  # last layer hidden state
        out = self.fc(h)
        return self.sigmoid(out)

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentLSTM(len(vocab)).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch).squeeze()
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Loss: 0.6929
Epoch 2, Loss: 0.6934
Epoch 3, Loss: 0.6836
Epoch 4, Loss: 0.6654
Epoch 5, Loss: 0.5527
Epoch 6, Loss: 0.3821
Epoch 7, Loss: 0.2980
Epoch 8, Loss: 0.2509
Epoch 9, Loss: 0.2181
Epoch 10, Loss: 0.1867


In [15]:
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        preds = model(X_batch).squeeze()
        preds = (preds.cpu().numpy() > 0.5).astype(int)
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

acc = accuracy_score(all_labels, all_preds)
print("Test Accuracy:", acc)


Test Accuracy: 0.8724


In [16]:
# --------- Manual prediction with custom review ---------

test_sentence = "The service was terrible, the food was cold, and I will never come back here again."

# Same cleaning as training
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    return text

def encode(text, vocab):
    return [vocab.get(w, 0) for w in text.split()]

def pad(seq, max_len=200):
    return seq[:max_len] + [0]*(max_len-len(seq))

In [17]:
# Preprocess this single input
cleaned = clean_text(test_sentence)
encoded = encode(cleaned, vocab)
padded = pad(encoded)

# Convert to tensor and send to same device
tensor = torch.tensor([padded], dtype=torch.long).to(device)

# Predict
model.eval()
with torch.no_grad():
    pred = model(tensor)
    score = pred.item()
    label = "Positive" if score > 0.5 else "Negative"

print("Input review:", test_sentence)
print(f"Prediction: {label} (confidence={score:.4f})")


Input review: The service was terrible, the food was cold, and I will never come back here again.
Prediction: Negative (confidence=0.0229)


In [18]:
torch.save(model.state_dict(), "imdb_lstm_model.pth")
import pickle
pickle.dump(vocab, open("vocab.pkl","wb"))