In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import urllib.request
import zipfile
import os
import re
from tqdm import tqdm

In [5]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Fetch the "imdb" dataset through Hugging Face `datasets` and keep handles to
# its "train" and "test" splits; the "unsupervised" split is not used here.
dataset = load_dataset("imdb")
train_data = dataset["train"]
test_data = dataset["test"]

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
def simple_tokenizer(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of regex word characters (letters, digits and
    underscore); punctuation and whitespace only act as separators.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]

In [8]:
def encode(text, vocab, max_len=500):
    """Turn ``text`` into a list of vocabulary ids.

    The text is tokenized with ``simple_tokenizer`` and only the first
    ``max_len`` tokens are kept. Tokens missing from ``vocab`` map to the id
    stored under ``"<UNK>"``; when fewer than ``max_len`` tokens remain, the
    list is right-padded with the id stored under ``"<PAD>"``.
    """
    window = simple_tokenizer(text)[:max_len]

    ids = []
    for token in window:
        ids.append(vocab.get(token, vocab["<UNK>"]))

    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids.extend([vocab["<PAD>"]] * shortfall)
    return ids

In [9]:
def load_glove_embeddings(glove_file_path, vocab, embedding_dim=300):
    """Build an embedding matrix for ``vocab`` from a GloVe-format text file.

    Every non-blank line of the file is expected to hold a word followed by
    its vector components, separated by whitespace. Blank lines are skipped,
    and vectors are parsed only for words that occur in ``vocab`` so the whole
    file never has to be held in memory.

    Args:
        glove_file_path: Path of the UTF-8 text file to read.
        vocab: Mapping from word to its row index in the returned matrix.
        embedding_dim: Number of components per vector.

    Returns:
        A float32 tensor of shape ``(len(vocab), embedding_dim)``. Rows of
        words found in the file hold the file's vector; every other row keeps
        a random initialisation drawn uniformly from [-0.05, 0.05).

    Raises:
        ValueError: If the vector stored for a vocabulary word does not have
            exactly ``embedding_dim`` components.
    """
    embedding_matrix = np.random.uniform(
        -0.05, 0.05, (len(vocab), embedding_dim)
    ).astype(np.float32)

    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            idx = vocab.get(values[0])
            if idx is None:
                # Word is not in our vocabulary; skip the float parsing.
                continue
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"Vector for {values[0]!r} has {len(values) - 1} components, "
                    f"expected {embedding_dim}"
                )
            # A later duplicate entry for the same word overwrites the earlier one.
            embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')
    return torch.tensor(embedding_matrix)

In [10]:
glove_path = "glove.6B.300d.txt"
glove_zip_path = "glove.6B.zip"
# Download the archive only when neither the extracted embeddings nor the
# archive itself is already on disk, so re-running the cell is cheap.
if not os.path.exists(glove_path) and not os.path.exists(glove_zip_path):
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file at the final archive path.
    partial_zip_path = glove_zip_path + ".part"
    urllib.request.urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", partial_zip_path)
    os.replace(partial_zip_path, glove_zip_path)

In [11]:
# Unpack the archive only when the 300-d embeddings file is not on disk yet.
# The archive path is relative to the working directory, which is where the
# download cell stores "glove.6B.zip", instead of a machine-specific
# absolute path.
if not os.path.exists("glove.6B.300d.txt"):
    with zipfile.ZipFile("glove.6B.zip", 'r') as zip_ref:
        zip_ref.extractall(".")

In [12]:
def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader``.

    The model is switched to eval mode (and left in it) and run without
    gradient tracking. Outputs above 0.5 are predicted as class 1.

    Args:
        model: Module called as ``model(x)`` that yields one score per example.
        loader: Iterable of ``(x, y)`` batches of inputs and integer labels.

    Returns:
        ``(accuracy, f1)`` as computed by ``accuracy_score`` and ``f1_score``
        over all examples seen.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            # reshape(-1) instead of a bare squeeze(): a batch holding a single
            # example must stay 1-D, otherwise it collapses to a 0-d tensor
            # that cannot be iterated by extend().
            outputs = model(x).reshape(-1)
            preds = (outputs > 0.5).int()
            all_preds.extend(preds.cpu().tolist())
            # Labels are only collected for scoring, so they never need to
            # visit the compute device.
            all_labels.extend(y.reshape(-1).tolist())
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return acc, f1

In [13]:
class IMDBDataset(Dataset):
    """Map-style dataset that encodes review text lazily, on item access.

    ``split`` must expose a ``"text"`` and a ``"label"`` column through
    ``split[...]``; ``vocab`` is the token-to-id mapping handed to ``encode``.
    """

    def __init__(self, split, vocab):
        self.texts, self.labels = split["text"], split["label"]
        self.vocab = vocab

    def __len__(self):
        """Number of examples, taken from the text column."""
        n_examples = len(self.texts)
        return n_examples

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for example ``idx`` as two tensors."""
        token_ids = encode(self.texts[idx], self.vocab)
        label = self.labels[idx]
        return torch.tensor(token_ids), torch.tensor(label)

In [14]:
class Chomp1d(nn.Module):
    """Drop the last ``chomp_size`` positions along the third axis of the input.

    Args:
        chomp_size: Number of trailing positions to remove. ``0`` leaves the
            input untouched.
    """

    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        """Return ``x`` without its final ``chomp_size`` steps on axis 2."""
        if self.chomp_size == 0:
            # x[:, :, :-0] is x[:, :, :0], i.e. an EMPTY tensor, so the
            # "remove nothing" case has to be handled explicitly.
            return x
        return x[:, :, :-self.chomp_size]

In [15]:
class TemporalBlock(nn.Module):
    """Residual block made of two dilated ``Conv1d`` stages.

    Each stage runs conv -> ``Chomp1d(padding)`` -> batch norm -> ReLU ->
    dropout. The block input is added back to the result (through a 1x1
    convolution when the channel counts differ) and a final ReLU is applied.

    Args:
        in_channels: Channels of the incoming tensor.
        out_channels: Channels produced by both conv stages.
        kernel_size: Kernel width of both convolutions.
        dilation: Dilation of both convolutions.
        padding: Padding given to each convolution and also the number of
            trailing steps trimmed after it.
        dropout: Dropout probability used after each stage.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation, padding, dropout=0.2):
        super().__init__()
        conv_options = dict(padding=padding, dilation=dilation)

        # Stage 1: in_channels -> out_channels
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, **conv_options)
        self.chomp1 = Chomp1d(padding)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        # Stage 2: out_channels -> out_channels
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, **conv_options)
        self.chomp2 = Chomp1d(padding)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        # The skip path needs a 1x1 projection only when the widths differ.
        if in_channels == out_channels:
            self.downsample = None
        else:
            self.downsample = nn.Conv1d(in_channels, out_channels, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply both stages to ``x`` and add the (projected) input back."""
        stages = (
            self.conv1, self.chomp1, self.bn1, self.relu1, self.dropout1,
            self.conv2, self.chomp2, self.bn2, self.relu2, self.dropout2,
        )
        out = x
        for stage in stages:
            out = stage(out)

        if self.downsample is None:
            res = x
        else:
            res = self.downsample(x)
        return self.relu(out + res)

In [16]:
class TCN(nn.Module):
    """Stack of ``TemporalBlock`` layers whose dilation doubles at each level.

    Args:
        num_inputs: Channels of the tensor fed to the first block.
        num_channels: Output width of each block, one entry per level.
        kernel_size: Kernel width shared by all blocks.
        dropout: Dropout probability shared by all blocks.
    """

    def __init__(self, num_inputs, num_channels, kernel_size=3, dropout=0.2):
        super().__init__()
        blocks = []
        prev_width = num_inputs
        for level, width in enumerate(num_channels):
            dilation = 2 ** level
            block = TemporalBlock(
                prev_width,
                width,
                kernel_size,
                dilation,
                # Matches the receptive-field growth of this level.
                padding=(kernel_size - 1) * dilation,
                dropout=dropout,
            )
            blocks.append(block)
            prev_width = width
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through every block in order."""
        return self.network(x)

In [17]:
class SentimentModel(nn.Module):
    """Binary sentiment classifier: frozen embeddings -> conv stem -> TCN -> linear.

    Args:
        embedding_matrix: 2-D float tensor of pretrained embeddings, one row
            per vocabulary id; it is loaded with ``freeze=True`` and therefore
            not updated during training.
        hidden_channels: Output width of each TCN level; the last entry is
            also the input width of the final linear layer.
    """

    # The default is an immutable tuple rather than a list so that a single
    # default object shared by every call can never be mutated by accident.
    def __init__(self, embedding_matrix, hidden_channels=(128, 128)):
        super().__init__()
        embedding_dim = embedding_matrix.shape[1]
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1 = nn.Conv1d(embedding_dim, 128, kernel_size=5, padding=2)
        self.relu1 = nn.ReLU()
        self.pool = nn.MaxPool1d(2)
        self.batch_norm = nn.BatchNorm1d(128)

        self.tcn = TCN(128, hidden_channels)
        self.layer_norm = nn.LayerNorm(hidden_channels[-1])
        self.global_pool = nn.AdaptiveMaxPool1d(1)

        self.fc = nn.Linear(hidden_channels[-1], 1)

    def forward(self, x):
        """Map a batch of token-id sequences to probabilities.

        Args:
            x: Integer tensor of token ids, one row per example.

        Returns:
            Tensor with one sigmoid output per example in a trailing
            dimension of size 1.
        """
        x = self.embedding(x).permute(0, 2, 1)      # (B, C, T)
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool(x)                            # halves the time axis
        x = self.batch_norm(x)
        x = self.tcn(x)
        x = self.global_pool(x).squeeze(-1)         # max over time -> (B, C)
        x = self.layer_norm(x)
        return torch.sigmoid(self.fc(x))

In [18]:
from collections import Counter
# Count how often each token occurs across all training reviews; the counts
# feed the frequency cut-off used when the vocabulary is built.
counter = Counter()

for example in train_data:
    counter.update(simple_tokenizer(example["text"]))

In [19]:
# Ids 0 and 1 are reserved for the padding and unknown-word markers.
vocab = {"<PAD>": 0, "<UNK>": 1}
for word, freq in counter.items():
    # Keep only words seen at least 5 times in the training split; each kept
    # word receives the next free id.
    if freq >= 5:
        vocab[word] = len(vocab)

In [20]:
# Both splits share the vocabulary that was built from the training split.
train_dataset = IMDBDataset(train_data, vocab)
test_dataset = IMDBDataset(test_data, vocab)

In [21]:
# Training batches are reshuffled every epoch; the test loader does not pass
# shuffle and uses a larger batch size.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128)

In [22]:
# embedding_dim has to match the vector size stored in the GloVe file below.
embedding_dim = 300
glove_path = "glove.6B.300d.txt"  # make sure this file is present
# One row per vocabulary id, filled from GloVe where the word is available.
embedding_matrix = load_glove_embeddings(glove_path, vocab, embedding_dim)

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentModel(embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# The model already applies a sigmoid, so plain BCELoss (not the logits
# variant) is the matching criterion.
criterion = nn.BCELoss()

# Training loop with early stopping on F1.
# NOTE(review): model selection below is driven by test_loader, so the reported
# test scores are optimistically biased; a held-out validation split carved
# from the training data would avoid tuning on the test set.
best_f1 = 0
best_state = None       # copy of the weights that achieved best_f1
patience = 2            # epochs without improvement tolerated before stopping
epochs_no_improve = 0

for epoch in range(10):
    model.train()
    total_loss = 0      # sum of per-batch mean losses over the epoch
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.float().to(device)
        optimizer.zero_grad()
        # reshape(-1) instead of a bare squeeze(): a final batch holding one
        # example must stay 1-D to match the shape of y.
        outputs = model(x).reshape(-1)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    val_acc, val_f1 = evaluate(model, test_loader)
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Test Acc: {val_acc:.4f}, F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        epochs_no_improve = 0
        # Snapshot the weights; state_dict() returns live references, so each
        # tensor has to be cloned to survive further optimizer steps.
        best_state = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping.")
            break

# Leave `model` holding the best-scoring weights rather than those of the last
# epoch, which by construction did not improve when early stopping fired.
if best_state is not None:
    model.load_state_dict(best_state)

100%|██████████| 391/391 [00:14<00:00, 26.83it/s]


Epoch 1, Loss: 144.5528, Test Acc: 0.8567, F1: 0.8676


100%|██████████| 391/391 [00:12<00:00, 31.75it/s]


Epoch 2, Loss: 101.3599, Test Acc: 0.8350, F1: 0.8547


100%|██████████| 391/391 [00:12<00:00, 31.47it/s]


Epoch 3, Loss: 77.2652, Test Acc: 0.8864, F1: 0.8888


100%|██████████| 391/391 [00:12<00:00, 31.07it/s]


Epoch 4, Loss: 55.7087, Test Acc: 0.8774, F1: 0.8714


100%|██████████| 391/391 [00:12<00:00, 30.69it/s]


Epoch 5, Loss: 40.4622, Test Acc: 0.8758, F1: 0.8828
Early stopping.


In [25]:
# Persist only the parameters and buffers (the state_dict) of the current
# `model`; the architecture itself has to be rebuilt in code before loading.
torch.save(model.state_dict(), "TCN_model.pth")

In [None]:
# For later use if needed: rebuild the architecture, then restore the saved
# weights into it.
model = SentimentModel(embedding_matrix).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run also loads on a CPU-only machine.
model.load_state_dict(torch.load("TCN_model.pth", map_location=device))
model.eval()  # Set to evaluation mode