<a href="https://colab.research.google.com/github/overhaulk/image-classification-ml-model/blob/main/moviereview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Fetch the NLTK resources the notebook relies on:
#   - "stopwords": the word lists read through stopwords.words("english")
#   - "punkt" / "punkt_tab": tokenizer data, presumably what word_tokenize
#     loads (which of the two is needed depends on the installed NLTK
#     release — TODO confirm and drop the unused one).
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Read the review CSV from the working directory, then show the first rows
# followed by the number of reviews per sentiment label.
df = pd.read_csv("IMDB Dataset.csv")
print(df.head(), df["sentiment"].value_counts(), sep="\n")

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [8]:
# Every encoded review is truncated or zero-padded to exactly MAX_LEN ids.
MAX_LEN = 200
# How many of the most frequent tokens are kept when the vocabulary is built
# (the <PAD> and <UNK> entries are added on top of these).
VOCAB_SIZE = 20000

# English stopwords held in a set so membership checks stay cheap while
# filtering token lists.
stop_word = set(stopwords.words("english"))

def clean_text(text):
    """Normalise a raw review string into a list of content tokens.

    Steps, in order: lowercase, replace HTML-like tags (e.g. ``<br />``) with
    a space, delete all ASCII punctuation, tokenise with ``word_tokenize``,
    and drop English stopwords.

    Args:
        text: The raw review as a ``str``.

    Returns:
        A list of token strings, in their original order.
    """
    text = text.lower()
    # Non-greedy match so each tag is removed on its own instead of
    # swallowing everything between the first "<" and the last ">".
    text = re.sub(r"<.*?>", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    # The notebook's stopword set is named `stop_word` (singular). The
    # previous reference to `stop_words` raised NameError on a fresh kernel.
    tokens = [w for w in tokens if w not in stop_word]
    return tokens

# Tokenise every review once and keep the result next to the raw text.
df["tokens"] = df["review"].map(clean_text)

In [9]:
from collections import Counter

# Flatten the per-review token lists into one corpus-wide list and count it.
all_tokens = [word for review_tokens in df["tokens"] for word in review_tokens]
freqs = Counter(all_tokens)

# The VOCAB_SIZE most frequent words get ids 2, 3, ... in frequency order;
# ids 0 and 1 are reserved for the padding and unknown-word markers, which
# are assigned last so they always win over any identically named token.
top_words = [word for word, _ in freqs.most_common(VOCAB_SIZE)]
vocab = dict(zip(top_words, range(2, len(top_words) + 2)))
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

def encode(tokens):
    """Map a token list to exactly MAX_LEN vocabulary ids.

    Tokens beyond MAX_LEN are dropped, words missing from ``vocab`` become
    id 1, and short inputs are right-padded with id 0.
    """
    ids = [vocab.get(word, 1) for word in tokens[:MAX_LEN]]
    # A negative count (input longer than MAX_LEN) yields an empty list,
    # so no padding is appended in that case.
    pad_count = MAX_LEN - len(tokens)
    ids.extend([0] * pad_count)
    return ids

# Fixed-length id sequences for the model input.
df["encoded"] = df["tokens"].map(encode)
# Binary target: 1 for "positive", 0 for every other sentiment value.
df["label"] = df["sentiment"].eq("positive").astype("int64")

In [11]:
class IMDBDataset(Dataset):
    """Pairs encoded reviews with their labels for use with a DataLoader.

    ``reviews`` and ``labels`` are stored as given and indexed with the same
    integer position, so they are expected to be equally long sequences.
    """

    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the reviews sequence."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for the example at ``idx``."""
        # Ids are integer (long) tensors; the label is a float tensor.
        token_ids = torch.tensor(self.reviews[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return token_ids, target

from sklearn.model_selection import train_test_split

# 80/20 hold-out split; the fixed random_state keeps the partition stable
# across runs.
X_train, X_val, y_train, y_val = train_test_split(
    df["encoded"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Convert to plain lists so IMDBDataset indexes by position rather than by
# the shuffled pandas index labels left over from the split.
train_dataset = IMDBDataset(X_train.tolist(), y_train.tolist())
val_dataset = IMDBDataset(X_val.tolist(), y_val.tolist())

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64)

In [12]:
class CNNTextClassifier(nn.Module):
    """1-D convolutional text classifier that outputs sigmoid probabilities.

    Pipeline: embedding -> Conv1d(kernel_size=3, padding=1) -> ReLU ->
    global max-pool over the sequence axis -> linear layer -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table; every input id
            must be smaller than this.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output units (1 for binary sentiment).
    """

    def __init__(self, vocab_size, embed_dim=128, num_classes=1):
        super().__init__()
        # padding_idx=0 keeps the embedding of id 0 (padding) fixed at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(128, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities for a ``(batch, seq_len)`` tensor of ids.

        The result has shape ``(batch,)`` when ``num_classes == 1`` and
        ``(batch, num_classes)`` otherwise.
        """
        x = self.embedding(x)            # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)           # Conv1d wants channels before length
        x = self.relu(self.conv1(x))     # (batch, 128, seq_len)
        x = self.pool(x).squeeze(2)      # (batch, 128)
        x = self.sigmoid(self.fc(x))     # (batch, num_classes)
        # Squeeze only the class axis. A bare .squeeze() also removed the
        # batch axis for a batch of one, producing a 0-d tensor whose shape
        # no longer matched a (1,) target in BCELoss.
        return x.squeeze(1)

In [13]:
# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation and the default DataLoader shuffle order are repeatable
# every time this cell is re-run (GPU kernels may still add small
# nondeterminism). 42 matches the random_state used for the data split.
torch.manual_seed(42)

model = CNNTextClassifier(len(vocab)).to(device)
# The model already applies a sigmoid, so plain BCELoss (not the logits
# variant) is the matching criterion.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 30

for epoch in range(EPOCHS):
    model.train()
    total_loss, correct = 0, 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        # total_loss is the SUM of per-batch losses, so its scale depends on
        # the number of batches; compare it across epochs, not across runs
        # with a different batch size.
        total_loss += loss.item()
        # Threshold probabilities at 0.5 to count correct predictions.
        correct += ((outputs >= 0.5).float() == y).sum().item()

    acc = correct / len(train_dataset)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Train Acc: {acc:.4f}")

Epoch 1, Loss: 268.6170, Train Acc: 0.7981
Epoch 2, Loss: 155.8785, Train Acc: 0.9021
Epoch 3, Loss: 88.0719, Train Acc: 0.9550
Epoch 4, Loss: 39.3363, Train Acc: 0.9886
Epoch 5, Loss: 14.2444, Train Acc: 0.9986
Epoch 6, Loss: 5.4826, Train Acc: 1.0000
Epoch 7, Loss: 2.5505, Train Acc: 1.0000
Epoch 8, Loss: 1.4475, Train Acc: 1.0000
Epoch 9, Loss: 0.8847, Train Acc: 1.0000
Epoch 10, Loss: 0.5647, Train Acc: 1.0000
Epoch 11, Loss: 0.3707, Train Acc: 1.0000
Epoch 12, Loss: 0.2466, Train Acc: 1.0000
Epoch 13, Loss: 0.1662, Train Acc: 1.0000
Epoch 14, Loss: 0.1126, Train Acc: 1.0000
Epoch 15, Loss: 0.0769, Train Acc: 1.0000
Epoch 16, Loss: 0.0529, Train Acc: 1.0000
Epoch 17, Loss: 0.0365, Train Acc: 1.0000
Epoch 18, Loss: 0.0253, Train Acc: 1.0000
Epoch 19, Loss: 0.0176, Train Acc: 1.0000
Epoch 20, Loss: 0.0123, Train Acc: 1.0000
Epoch 21, Loss: 0.0086, Train Acc: 1.0000
Epoch 22, Loss: 0.0060, Train Acc: 1.0000
Epoch 23, Loss: 0.0042, Train Acc: 1.0000
Epoch 24, Loss: 0.0030, Train Acc: 1

In [14]:
# Score the trained model on the held-out split. Evaluation mode plus
# no_grad means no gradients are tracked while predicting.
model.eval()
correct = 0
with torch.no_grad():
    for X, y in val_loader:
        X = X.to(device)
        y = y.to(device)
        outputs = model(X)
        # Same 0.5 decision threshold as used for the training accuracy.
        predicted = (outputs >= 0.5).float()
        correct += (predicted == y).sum().item()

val_accuracy = correct / len(val_dataset)
print("Validation Accuracy:", val_accuracy)


Validation Accuracy: 0.8831


In [15]:
# Run after training finishes: persist only the learned weights
# (the state_dict), not the model object itself.
# NOTE(review): the `vocab` mapping is not saved here, yet predict_review
# needs it — reusing this checkpoint from a fresh kernel requires rebuilding
# or separately storing the same vocabulary.
torch.save(model.state_dict(), "sentiment_cnn.pth")


In [21]:
# ---- Function to predict user review ----
def predict_review(model, review, vocab, max_len=200):
    """Classify a single raw review string with a trained model.

    The text is lowercased, stripped of HTML-like tags and ASCII
    punctuation, tokenised, filtered against the notebook's ``stop_word``
    set, mapped to ids through ``vocab`` (unknown words -> 1), and
    truncated or zero-padded to ``max_len`` before being fed to ``model``
    as a batch of one.

    Args:
        model: Module returning a single probability for a ``(1, max_len)``
            tensor of token ids.
        review: Raw review text.
        vocab: Mapping from token string to integer id.
        max_len: Sequence length fed to the model; should match the length
            used when the model was trained.

    Returns:
        ``(sentiment, score)`` where ``sentiment`` is the display label and
        ``score`` is the model output as a Python float.
    """
    model.eval()

    # Clean the review
    cleaned = review.lower()
    cleaned = re.sub(r"<.*?>", " ", cleaned)
    cleaned = cleaned.translate(str.maketrans("", "", string.punctuation))
    tokens = [w for w in word_tokenize(cleaned) if w not in stop_word]

    # Encode: truncate first, then pad up to max_len with the padding id 0.
    token_ids = [vocab.get(w, 1) for w in tokens[:max_len]]
    token_ids += [0] * (max_len - len(token_ids))

    # Place the input on whatever device the model's weights live on rather
    # than on a notebook-level global, so the function keeps working when a
    # checkpoint is loaded onto a different device.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(target_device)  # batch of 1

    # Prediction
    with torch.no_grad():
        output = model(batch).item()

    sentiment = "Positive 😊" if output >= 0.5 else "Negative 😞"
    return sentiment, output

# ---- Take input from user ----
# Interactive demo: blocks on input(), so this cell cannot complete in an
# unattended "Run All". The printed score is the raw model output; values
# at or above 0.5 are labelled positive by predict_review.
user_review = input("Enter a movie review: ")
sentiment, score = predict_review(model, user_review, vocab)
print(f"Review Sentiment: {sentiment} (score={score:.4f})")


Enter a movie review: it was shit good movie
Review Sentiment: Positive 😊 (score=1.0000)
