In [2]:
import os
os.chdir("/sfs/gpfs/tardis/home/dpv8cf/toxic-comment-analysis")  # adjust if needed
os.getcwd()


'/sfs/gpfs/tardis/home/dpv8cf/toxic-comment-analysis'

In [3]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import torch
import torch.nn as nn
import sys
sys.path.append("scripts")
import data_cleaning
from torch.utils.data import TensorDataset, DataLoader


In [4]:
data = pd.read_csv("data/train.csv")

In [5]:
data = data_cleaning.clean_data(data)

In [6]:
data.shape, data.columns

((1593229, 15),
 Index(['comment_text', 'funny', 'wow', 'sad', 'likes', 'disagree', 'toxicity',
        'severe_toxicity', 'obscene', 'sexual_explicit', 'identity_attack',
        'insult', 'threat', 'toxicity_annotator_count', 'rating_rejected'],
       dtype='object'))

## Comment Only

In [10]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Using full dataset to create a smaller stratified sample
X_comment_only = data["comment_text"].astype(str)
y = data["rating_rejected"]

sss = StratifiedShuffleSplit(
    n_splits=1,
    train_size=100_000,        # try 50_000 if still too slow
    random_state=42
)

# Get indices for the small, stratified sample
idx_small, _ = next(sss.split(X_comment_only, y))

# Create a smaller DataFrame to run NN + param search on
data_small = data.iloc[idx_small].copy()

#Smaller Sample
X_comment_only = data_small["comment_text"].astype(str)
y = data_small["rating_rejected"]

#train/val split on the SMALL sample
X_train_comment_only, X_val_comment_only, y_train, y_val = train_test_split(
    X_comment_only,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

param_grid = []

activations = ["relu", "tanh"]
hidden_sizes = [128, 256]
lrs = [0.001, 0.01]

for act in activations:
    for h in hidden_sizes:
        for lr in lrs:
            param_grid.append({
                "activation": act,
                "hidden_size": h,
                "lr": lr
            })



In [11]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, classification_report

# ======================================================
# 1. Tokenizer
# ======================================================

def simple_tokenize(text):
    return text.lower().strip().split() #split on space

#Simple tokenizer with lowercase + split

# ======================================================
# 2. Build vocabulary from TRAIN ONLY
# ======================================================
from collections import Counter
#Counting all words appearing in training comments
word_counter = Counter()

for text in X_train_comment_only:
    tokens = simple_tokenize(text) #Convert Strings into Tokens
    word_counter.update(tokens) #Count word frequencies

max_vocab_size = 20000 #Keep only top 20k words
most_common = word_counter.most_common(max_vocab_size)
#Special Token for Padding and Unknown Words
PAD = "<PAD>"
UNK = "<UNK>"
#Creating Vocabulary Lists
idx2word = [PAD, UNK] + [w for w, _ in most_common] #Index -> Words
word2idx = {w: i for i, w in enumerate(idx2word)} #Words -> Index

pad_idx = word2idx[PAD]
unk_idx = word2idx[UNK]
vocab_size = len(word2idx)

print("Vocab size:", vocab_size)

# ======================================================
# 3. Convert text → sequence
# ======================================================

def text_to_seq(text, max_len=100):
    tokens = simple_tokenize(text) #Tokenize
    seq = [word2idx.get(tok, unk_idx) for tok in tokens] #Map to Indices
    if len(seq) < max_len:
        seq = seq + [pad_idx] * (max_len - len(seq)) #Pad at End
    else:
        seq = seq[:max_len] #Cut off Extra Words
    return np.array(seq, dtype=np.int64)
#Convert Train and Validation Into Integer Sequences
max_len = 100

X_train_seq = np.stack([text_to_seq(t) for t in X_train_comment_only])
X_val_seq   = np.stack([text_to_seq(t) for t in X_val_comment_only])

# ======================================================
# 4. Dataset + DataLoader
# ======================================================

class CommentDataset(Dataset):
    def __init__(self, X_seq, y):
        self.X = torch.from_numpy(X_seq).long() #Convert to Tensor
        self.y = torch.from_numpy(y.values).long()  # pandas Series → tensor

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 64

train_dataset = CommentDataset(X_train_seq, y_train)
val_dataset   = CommentDataset(X_val_seq,   y_val)
#DataLoaders for Batching
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)

# ======================================================
# 5. Neural Network
# ======================================================

def get_activation(name):
    if name == "relu":
        return nn.ReLU()
    elif name == "tanh":
        return nn.Tanh()
    else:
        raise ValueError("Invalid activation")
#Neural Network Model
class CommentNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, activation, pad_idx):
        super().__init__() #Embedding Layer: Converts Word Indices -> Dense Vectors
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=pad_idx #Pad Tokens stay zero
        )
        self.net = nn.Sequential(
            nn.Linear(embed_dim, hidden_size),
            get_activation(activation), #ReLU or TanH
            nn.Dropout(0.3), #Regularization
            nn.Linear(hidden_size, 1),
            nn.Sigmoid() #Binary Classification 
        )

    def forward(self, x):
        # x: (batch, seq_len)
        emb = self.embedding(x)        # (batch, seq_len, embed_dim)
        pooled = emb.mean(dim=1)       # (batch, embed_dim)
        return self.net(pooled)

# ======================================================
# 6. Early Stopping Training Loop
# ======================================================

def train_model(model, optimizer, train_loader, val_loader,
                max_epochs=20, patience=3, device="cpu"):

    criterion = nn.BCELoss() #Binary Cross Entropy
    model.to(device)

    best_f1 = 0 #Track Best Validation F1
    best_state = None #Store best model weights
    wait = 0 #Patience Counter

    for epoch in range(1, max_epochs+1):
        # ---- TRAIN ----
        model.train()
        train_losses = []

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device).float().unsqueeze(1)

            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        # ---- VALIDATION ----
        model.eval()
        val_preds = []
        val_targets = []

        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                yb = yb.to(device)

                probs = model(xb)
                preds = (probs > 0.5).long().cpu().numpy().flatten()

                val_preds.extend(preds.tolist())
                val_targets.extend(yb.cpu().numpy().tolist())

        val_f1 = f1_score(val_targets, val_preds)
        print(f"Epoch {epoch} | Train Loss: {np.mean(train_losses):.4f} | Val F1: {val_f1:.4f}")

        # ---- EARLY STOPPING ----
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_state = model.state_dict()
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print("Early stopping.")
                break

    # load best weights
    model.load_state_dict(best_state)
    return best_f1, model

# ======================================================
# 7. Hyperparameter Search (your param_grid)
# ======================================================

results = [] #Store Results 
device = "cuda" if torch.cuda.is_available() else "cpu"
embed_dim = 100  # you can tune this later

for params in param_grid:
    print("="*70)
    print("Testing config:", params)
    print("="*70)
#Build Model with chosen hyperparameters
    model = CommentNN(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        hidden_size=params["hidden_size"],
        activation=params["activation"],
        pad_idx=pad_idx
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
#Train and Evaluate
    best_f1, trained_model = train_model(
        model,
        optimizer,
        train_loader,
        val_loader,
        max_epochs=20,
        patience=3,
        device=device
    )

    print(f"Best Val F1 for config {params}: {best_f1:.4f}\n")
    results.append((params, best_f1, trained_model))

# ======================================================
# 8. Best Hyperparameters
# ======================================================

best_params, best_val_f1, best_model = max(results, key=lambda x: x[1])
print("\nBest Config:", best_params)
print("Best Validation F1:", best_val_f1)


Vocab size: 20002
Testing config: {'activation': 'relu', 'hidden_size': 128, 'lr': 0.001}
Epoch 1 | Train Loss: 0.2529 | Val F1: 0.0000
Epoch 2 | Train Loss: 0.2196 | Val F1: 0.0105
Epoch 3 | Train Loss: 0.1934 | Val F1: 0.0531
Epoch 4 | Train Loss: 0.1726 | Val F1: 0.1032
Epoch 5 | Train Loss: 0.1571 | Val F1: 0.1262
Epoch 6 | Train Loss: 0.1458 | Val F1: 0.1174
Epoch 7 | Train Loss: 0.1361 | Val F1: 0.1362
Epoch 8 | Train Loss: 0.1270 | Val F1: 0.1210
Epoch 9 | Train Loss: 0.1203 | Val F1: 0.1303
Epoch 10 | Train Loss: 0.1135 | Val F1: 0.1312
Early stopping.
Best Val F1 for config {'activation': 'relu', 'hidden_size': 128, 'lr': 0.001}: 0.1362

Testing config: {'activation': 'relu', 'hidden_size': 128, 'lr': 0.01}
Epoch 1 | Train Loss: 0.2386 | Val F1: 0.0076
Epoch 2 | Train Loss: 0.1949 | Val F1: 0.0939
Epoch 3 | Train Loss: 0.1553 | Val F1: 0.0754
Epoch 4 | Train Loss: 0.1144 | Val F1: 0.1050
Epoch 5 | Train Loss: 0.0853 | Val F1: 0.1237
Epoch 6 | Train Loss: 0.0717 | Val F1: 0.139

To build a neural network that can understand raw text comments, the code first creates a smaller, balanced subset of the dataset for efficient training. Each comment is tokenized, meaning the text is converted into lowercase words split by spaces, and a vocabulary is built from the training data that assigns every word an integer ID. Because neural networks require fixed-size inputs, each comment is then transformed into a sequence of exactly 100 word indices: shorter comments are padded by adding special <PAD> tokens at the end, while longer comments are trimmed by cutting off words beyond the maximum length. These fixed-length sequences are fed into an embedding layer, which learns a dense vector representation for each word so that semantically similar words end up with similar numerical vectors. The embeddings for all words in a comment are then averaged to produce a single feature vector summarizing the entire comment. This vector is passed through a small feed-forward neural network with ReLU or Tanh activation and a final sigmoid output to predict whether the comment is toxic or rejected. The model is trained with early stopping and evaluated using validation F1 scores while searching across different hyperparameter combinations to find the best-performing architecture.