In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import urllib.request
import zipfile
import os
from collections import Counter
import re
from tqdm import tqdm

In [2]:
# Load the "imdb" dataset and keep handles on its train and test splits.
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
def load_glove_embeddings(glove_file_path, vocab, embedding_dim=300):
    """Build an embedding matrix for ``vocab`` from a GloVe-format text file.

    Each non-blank line of the file is expected to hold a word followed by
    ``embedding_dim`` whitespace-separated floats.

    Args:
        glove_file_path: Path to the UTF-8 encoded vectors file.
        vocab: Dict mapping each word to its row index in the returned matrix.
        embedding_dim: Number of components per vector.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(len(vocab), embedding_dim)``.
        Rows of words found in the file hold the file's vector; all other
        rows keep a random initialisation drawn uniformly from
        [-0.05, 0.05).

    Raises:
        ValueError: If the line of a vocabulary word does not carry exactly
            ``embedding_dim`` numeric components.
    """
    embedding_matrix = np.random.uniform(
        -0.05, 0.05, (len(vocab), embedding_dim)
    ).astype(np.float32)

    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank (e.g. trailing) line: nothing to parse.
                continue
            # Only vocabulary words are needed, so skip everything else
            # before paying for float parsing or holding it in memory.
            idx = vocab.get(values[0])
            if idx is None:
                continue
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"{glove_file_path}:{line_number}: expected {embedding_dim} "
                    f"vector components for {values[0]!r}, found {len(values) - 1}"
                )
            embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')

    return torch.tensor(embedding_matrix)

In [4]:
glove_path = "glove.6B.300d.txt"
glove_zip_path = "glove.6B.zip"
# Download only when neither the extracted vectors nor the archive are already
# on disk, so a re-run does not fetch the large archive again.
if not os.path.exists(glove_path) and not os.path.exists(glove_zip_path):
    # Write to a temporary name first: an interrupted download then never
    # leaves a truncated file at the final archive path.
    partial_zip_path = glove_zip_path + ".part"
    urllib.request.urlretrieve("https://nlp.stanford.edu/data/glove.6B.zip", partial_zip_path)
    os.replace(partial_zip_path, glove_zip_path)

In [5]:
# Unpack the GloVe archive from the working directory (the same relative
# location the download writes to) instead of a hard-coded absolute path.
# Skip the step when the vectors file is already present, so a re-run does not
# require the archive to still exist.
if not os.path.exists("glove.6B.300d.txt"):
    with zipfile.ZipFile("glove.6B.zip", 'r') as zip_ref:
        zip_ref.extractall(".")

In [6]:
def evaluate(model, loader):
    """Compute accuracy and F1 of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    An output strictly greater than 0.5 is counted as the positive class.
    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        model: Module whose output holds one score per sample.
        loader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        Tuple ``(accuracy, f1)`` computed with scikit-learn over all samples.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            # Flatten to one score per sample. A bare squeeze() would also
            # drop the batch dimension of a one-sample batch, producing a 0-d
            # array that list.extend() cannot iterate over.
            outputs = model(x).reshape(-1)
            preds = (outputs > 0.5).int()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return acc, f1

In [7]:
def simple_tokenizer(text):
    """Lower-case ``text`` and return its runs of word characters, in order."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]

In [8]:
def encode(text, vocab, max_len=500):
    """Turn ``text`` into a list of vocabulary ids of length ``max_len``.

    The text is tokenized with ``simple_tokenizer`` and cut to at most
    ``max_len`` tokens. Tokens missing from ``vocab`` map to the ``"<UNK>"``
    id, and short sequences are right-padded with the ``"<PAD>"`` id.
    """
    clipped_tokens = simple_tokenizer(text)[:max_len]
    ids = []
    for token in clipped_tokens:
        ids.append(vocab.get(token, vocab["<UNK>"]))
    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids.extend([vocab["<PAD>"]] * shortfall)
    return ids

In [9]:
class IMDBDataset(Dataset):
    """Dataset yielding ``(token-id tensor, label tensor)`` pairs.

    ``split`` must support ``split["text"]`` and ``split["label"]``; texts are
    encoded on access with ``encode`` and the given ``vocab``.
    """

    def __init__(self, split, vocab):
        self.texts, self.labels = split["text"], split["label"]
        self.vocab = vocab

    def __len__(self):
        """Number of texts in the split."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the text at ``idx`` and return it with its label as tensors."""
        token_ids = encode(self.texts[idx], self.vocab)
        label = self.labels[idx]
        return torch.tensor(token_ids), torch.tensor(label)

In [10]:
class SentimentModel(nn.Module):
    """Conv1d + BiLSTM binary classifier over frozen pretrained embeddings.

    Forward pipeline: embedding lookup -> Conv1d(kernel 5) -> ReLU ->
    MaxPool1d(2) -> BatchNorm1d -> bidirectional LSTM -> max over time ->
    LayerNorm -> Linear -> sigmoid. The output has shape ``(B, 1)``.
    """

    def __init__(self, embedding_matrix):
        super().__init__()
        _, embedding_dim = embedding_matrix.shape
        channels = 128
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1 = nn.Conv1d(embedding_dim, channels, kernel_size=5, padding=2)
        self.relu1 = nn.ReLU()
        self.pool = nn.MaxPool1d(2)
        self.batch_norm = nn.BatchNorm1d(channels)
        # Two directions of 64 units concatenate back to 128 features.
        self.bilstm = nn.LSTM(channels, channels // 2, batch_first=True, bidirectional=True)
        self.layer_norm = nn.LayerNorm(channels)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(channels, 1)

    def forward(self, x):
        """Map a ``(B, T)`` batch of token ids to ``(B, 1)`` probabilities."""
        # Conv1d wants channels first: (B, T, E) -> (B, E, T).
        channels_first = self.embedding(x).transpose(1, 2)
        conv_features = self.relu1(self.conv1(channels_first))        # (B, 128, T)
        conv_features = self.batch_norm(self.pool(conv_features))     # (B, 128, T/2)
        # The LSTM is batch_first, so time goes back to dim 1.
        lstm_out, _ = self.bilstm(conv_features.transpose(1, 2))      # (B, T/2, 128)
        # Max over the time axis leaves one 128-d vector per sample.
        pooled = self.global_pool(lstm_out.transpose(1, 2)).squeeze(-1)  # (B, 128)
        logits = self.fc(self.layer_norm(pooled))                     # (B, 1)
        return torch.sigmoid(logits)

In [14]:
# Tally how often each token occurs across the training texts.
counter = Counter()
for example in train_data:
    review_tokens = simple_tokenizer(example["text"])
    counter.update(review_tokens)

In [15]:
# Ids 0 and 1 are reserved for padding and unknown tokens; every token seen at
# least 5 times gets the next free id, in the counter's iteration order.
vocab = {"<PAD>": 0, "<UNK>": 1}
for word, freq in counter.items():
    if freq < 5:
        continue
    vocab[word] = len(vocab)

In [16]:
# Wrap both splits so texts are encoded with the shared vocabulary on access.
train_dataset = IMDBDataset(split=train_data, vocab=vocab)
test_dataset = IMDBDataset(split=test_data, vocab=vocab)

In [18]:
# Training batches are reshuffled each epoch; test batches keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

In [19]:
# Build the (len(vocab), 300) embedding matrix from the 300-d GloVe vectors.
embedding_dim = 300
glove_path = "glove.6B.300d.txt"  # Ensure this file is in your environment
embedding_matrix = load_glove_embeddings(
    glove_file_path=glove_path,
    vocab=vocab,
    embedding_dim=embedding_dim,
)

In [20]:
# Train for up to 10 epochs with Adam and binary cross-entropy, stopping early
# once F1 has failed to improve for `patience` consecutive epochs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentModel(embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

best_f1 = 0
best_state = None  # copy of the weights from the best-scoring epoch so far
patience = 3
epochs_no_improve = 0

# NOTE(review): early stopping and model selection are driven by the test
# split, so the reported test metrics are optimistic. Consider holding out a
# validation subset of the training data for this purpose.
for epoch in range(10):
    model.train()
    total_loss = 0
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.float().to(device)
        optimizer.zero_grad()
        # Flatten (B, 1) -> (B,). A bare squeeze() would also drop the batch
        # dimension of a one-sample batch and no longer match y's shape.
        outputs = model(x).reshape(-1)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    val_acc, val_f1 = evaluate(model, test_loader)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Test Acc: {val_acc:.4f}, F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        # state_dict() returns references to the live tensors, so clone them;
        # otherwise later epochs would overwrite the snapshot.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping.")
            break

# Leave `model` holding the best-scoring weights rather than those of the last
# (possibly worse) epoch, so anything saved or evaluated afterwards uses them.
if best_state is not None:
    model.load_state_dict(best_state)

100%|██████████| 391/391 [00:11<00:00, 35.24it/s]


Epoch 1, Loss: 143.1875, Test Acc: 0.8527, F1: 0.8657


100%|██████████| 391/391 [00:10<00:00, 38.72it/s]


Epoch 2, Loss: 102.3768, Test Acc: 0.8635, F1: 0.8506


100%|██████████| 391/391 [00:10<00:00, 38.73it/s]


Epoch 3, Loss: 81.2545, Test Acc: 0.8875, F1: 0.8855


100%|██████████| 391/391 [00:10<00:00, 38.64it/s]


Epoch 4, Loss: 61.2925, Test Acc: 0.8857, F1: 0.8821


100%|██████████| 391/391 [00:10<00:00, 38.54it/s]


Epoch 5, Loss: 44.7718, Test Acc: 0.8722, F1: 0.8630


100%|██████████| 391/391 [00:10<00:00, 38.18it/s]


Epoch 6, Loss: 33.2104, Test Acc: 0.8672, F1: 0.8557
Early stopping.


In [21]:
# Write the model's state dict (weights only, not the module object) to disk.
torch.save(obj=model.state_dict(), f="Bi-lstm_model.pth")

In [None]:
# For later use if needed: rebuild the architecture and restore the saved weights.
model = SentimentModel(embedding_matrix).to(device)
# map_location places the stored tensors on the current device, so a checkpoint
# written during a CUDA run can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load("Bi-lstm_model.pth", map_location=device))
model.eval()  # Set to evaluation mode