In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/lucky/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/lucky/nltk_data...
[nltk_data] Downloading package punkt_tab to /Users/lucky/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
!pip install datasets -q

In [4]:
from datasets import load_dataset
# Load the IMDB dataset
dataset = load_dataset("imdb")

# Peek at the dataset structure
print(dataset)

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /Users/lucky/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /Users/lucky/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


# Data Preprocessing

In [6]:
df = pd.concat([dataset['train'].to_pandas(), dataset['test'].to_pandas()])

In [7]:
df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [8]:
df['label'].value_counts()

label
0    25000
1    25000
Name: count, dtype: int64

In [9]:
# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define a text cleaning function
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]  # Lemmatize tokens (verbs)
    tokens = [word for word in tokens if len(word) > 2]  # Keep words longer than 2 characters
    cleaned_text = ' '.join(tokens)  # Join tokens back to a single string
    return cleaned_text

# Apply the cleaning function to the dataset
df['text'] = df['text'].apply(clean_text)

# Train, Test, and Val Split

In [10]:
from sklearn.model_selection import train_test_split

# First split into train+val and test
train_val_data, test_data = train_test_split(df, test_size=0.15, random_state=42)

# Then split train+val into train and val
train_data, val_data = train_test_split(train_val_data, test_size=0.1765, random_state=42)
# (0.1765 ≈ 0.15/0.85 to keep 15% val)

In [11]:
len(train_data), len(val_data), len(test_data)

(34998, 7502, 7500)

# Text to Numerical Form (Tokenization and Encoding)

##  Tokenization

In [12]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lucky/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
train_data['text'] = train_data['text'].apply(word_tokenize)
test_data['text'] = test_data['text'].apply(word_tokenize)
val_data['text'] = val_data['text'].apply(word_tokenize)

In [14]:
train_data['text']

4615     [true, story, darkest, possible, fiction, deri...
5274     [straight, video, duffer, another, nail, coffi...
7316     [film, take, three, hours, include, commercial...
62       [watch, movie, purely, set, film, old, hotel, ...
18975    [famous, international, conductor, daniel, dar...
                               ...                        
15851    [one, landmark, film, need, situate, context, ...
16792    [milestone, eastern, european, film, make, out...
9689     [boom, garner, something, reputation, heavywei...
20542    [always, concept, korean, movies, comedy, dram...
5744     [pretty, awful, film, amaze, like, derek, jaco...
Name: text, Length: 34998, dtype: object

## Text to Sequence

In [15]:
# Step 1: Build Vocabulary (with <UNK>)
all_words = set(word for sentence in train_data['text'] for word in sentence)
word2idx = {word: idx+2 for idx, word in enumerate(sorted(all_words))}  # Start from 2
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

# Step 2: Function to convert text to sequence
def text_to_sequence(tokens, word2idx):
    return [word2idx.get(word, word2idx['<UNK>']) for word in tokens]

# Convert train texts
train_data['sequence'] = train_data['text'].apply(lambda tokens: text_to_sequence(tokens, word2idx))
# Convert val texts
val_data['sequence'] = val_data['text'].apply(lambda tokens: text_to_sequence(tokens, word2idx))
# Convert test texts
test_data['sequence'] = test_data['text'].apply(lambda tokens: text_to_sequence(tokens, word2idx))

In [16]:
def pad_or_truncate(seq, max_len, pad_value=0, truncate_from='head'):
    if len(seq) > max_len:
        if truncate_from == 'head':
            return seq[:max_len]    # keep from start
        elif truncate_from == 'tail':
            return seq[-max_len:]   # keep from end
        else:
            raise ValueError("truncate_from must be 'head' or 'tail'")
    else:
        return seq + [pad_value] * (max_len - len(seq))  # pad


In [17]:
MAX_SEQUENCE_LENGTH = 100
train_data['sequence'] = train_data['sequence'].apply(lambda tokens: pad_or_truncate(tokens, MAX_SEQUENCE_LENGTH, 0,'tail'))
val_data['sequence'] = val_data['sequence'].apply(lambda tokens: pad_or_truncate(tokens, MAX_SEQUENCE_LENGTH, 0,'tail'))
test_data['sequence'] = test_data['sequence'].apply(lambda tokens: pad_or_truncate(tokens, MAX_SEQUENCE_LENGTH, 0,'tail'))

In [18]:
train_data.head()

Unnamed: 0,text,label,sequence
4615,"[true, story, darkest, possible, fiction, deri...",0,"[66766, 61926, 15453, 50357, 22852, 16583, 212..."
5274,"[straight, video, duffer, another, nail, coffi...",0,"[61977, 69830, 18964, 2523, 43945, 12317, 5451..."
7316,"[film, take, three, hours, include, commercial...",0,"[62579, 10106, 51089, 8277, 46574, 29644, 5061..."
62,"[watch, movie, purely, set, film, old, hotel, ...",0,"[70829, 43277, 51814, 57866, 22972, 46185, 303..."
18975,"[famous, international, conductor, daniel, dar...",1,"[31412, 13431, 69879, 15369, 23068, 43683, 464..."


In [19]:
import torch
from torch import nn

In [20]:
# Convert to PyTorch tensor
X_train = torch.tensor(train_data['sequence'].tolist(), dtype=torch.long)
X_val = torch.tensor(val_data['sequence'].tolist(), dtype=torch.long)
X_test = torch.tensor(test_data['sequence'].tolist(), dtype=torch.long)

In [21]:
y_train = torch.tensor(train_data['label'].tolist(), dtype=torch.float)
y_val = torch.tensor(val_data['label'].tolist(), dtype=torch.float)
y_test = torch.tensor(test_data['label'].tolist(), dtype=torch.float)

In [22]:
X_train.shape, y_train.shape

(torch.Size([34998, 100]), torch.Size([34998]))

## Embedding

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

!unzip glove.6B.zip

In [23]:
!ls

README.md                       glove.6B.100d.txt
classification_using_LSTM.ipynb


In [24]:
GLOVE_EMB = 'glove.6B.100d.txt'
EMBEDDING_DIM = 100

Load GloVe into a Python Dictionary

In [25]:
embeddings_index = {}

with open(GLOVE_EMB, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

print(f'Loaded {len(embeddings_index)} word vectors.')

Loaded 400000 word vectors.


In [26]:
print(embeddings_index['the'])

[-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0.72569  -0.51058  -0.52028  -0.1459
  0.8278    0.27062 ]

Create embedding matrix

In [27]:
vocab_size = len(word2idx)
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))  # initialize all with zeros

for word, idx in word2idx.items():
    if word in embeddings_index:
        embedding_matrix[idx] = np.array(embeddings_index[word])
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(EMBEDDING_DIM,))  # random init for missing words

In [28]:
embedding_matrix.shape

(73534, 100)

### Dataset Class

In [38]:
from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    def __len__(self):
        return len(self.X)

train_dataset = SimpleDataset(X_train, y_train)
val_dataset = SimpleDataset(X_val, y_val)
test_dataset = SimpleDataset(X_test, y_test)

In [44]:
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    # num_workers=2,
    # pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    # num_workers=2,
    # pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    # num_workers=2,
    # pin_memory=True
)

In [45]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=True)
        self.lstm = nn.LSTM(EMBEDDING_DIM, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x) # B * T * C
        output, (hidden, cell) = self.lstm(embedded) # B * T * hidden_dim
        hidden = hidden[-1,:,:]  # take the last layer's hidden state
        out = self.fc(hidden)
        return out

model = LSTMClassifier(embedding_matrix, hidden_dim=64, output_dim=1)

In [46]:
model

LSTMClassifier(
  (embedding): Embedding(73534, 100)
  (lstm): LSTM(100, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

In [47]:
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()  # Binary classification
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [48]:
num_epochs = 3
device = 'cpu'
for epoch in range(num_epochs):
    # ============ Training ============
    model.train()
    train_loss = 0
    total_train_samples = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        preds = model(X_batch)
        preds = preds.squeeze(1)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.size(0)  # multiply by batch size
        total_train_samples += X_batch.size(0)

    avg_train_loss = train_loss / total_train_samples

    # ============ Validation ============
    model.eval()
    val_loss = 0
    total_val_samples = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            preds = preds.squeeze(1)
            loss = criterion(preds, y_batch)

            val_loss += loss.item() * X_batch.size(0)  # multiply by batch size
            total_val_samples += X_batch.size(0)

    avg_val_loss = val_loss / total_val_samples

    # ============ Logging ============
    print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")


Epoch 1: Train Loss = 0.6207, Val Loss = 0.4653
Epoch 2: Train Loss = 0.4211, Val Loss = 0.3763
Epoch 3: Train Loss = 0.3572, Val Loss = 0.3480


In [49]:
import torch.nn.functional as F

def calculate_accuracy_binary(model, X_test, y_test, threshold=0.5):
    """
    Binary classification accuracy calculation.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)  # shape: (batch_size, 1) or (batch_size,)

        # Apply sigmoid
        probs = torch.sigmoid(outputs)

        # Apply threshold
        preds = (probs >= threshold).long().view(-1)  # flatten if needed

        correct = (preds == y_test).sum().item()
        total = y_test.size(0)

        accuracy = correct / total

    return accuracy

In [50]:
calculate_accuracy_binary(model, X_train, y_train), calculate_accuracy_binary(model, X_val, y_val)

(0.8589633693353906, 0.8481738203145828)

In [51]:
calculate_accuracy_binary(model, X_test, y_test)

0.8489333333333333