# **IMPORTS**

In [None]:
import re
import string
import math
from collections import Counter
import kagglehub


import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet t

**DOWNLOADS**

In [None]:
# NLTK resources used by the notebook. Each resource is requested exactly once
# (the previous version downloaded 'punkt' and 'wordnet' twice).
for resource in (
    'stopwords',
    'wordnet',
    'omw-1.4',
    'words',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# -------- GloVe embeddings (100D / 300D) --------------------------------------
# Alternative 300-dimensional vectors:
# path = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")
# print("Path to dataset files:", path)

# `path` is the local directory that kagglehub downloaded the dataset into.
path = kagglehub.dataset_download("danielwillgeorge/glove6b100dtxt")
print("Path to dataset files:", path)

**LOAD DATA**

In [None]:
# Training CSV, resolved relative to the notebook's working directory.
# Later cells read its 'Discussion' (text) and 'Category' (label) columns.
data_path = "train.csv"
data = pd.read_csv(data_path)

# **DATA PREPROCESSING**

In [None]:
# Drop rows with any missing value, then rows whose discussion is only whitespace.
# `.copy()` makes the filtered frame independent so the column assignment below
# writes to a real frame rather than a view of the original (avoids
# SettingWithCopyWarning and ambiguous writes).
data = data.dropna()
data = data[data['Discussion'].str.strip() != ''].copy()


# Class name -> integer id used as the training target.
categoryMap = {
    "Politics": 0,
    "Sports": 1,
    "Media": 2,
    "Market & Economy": 3,
    "STEM": 4
}

# `.map` turns every label missing from categoryMap into NaN, which would
# silently produce float labels downstream. This also happens if this cell is
# re-run on an already-mapped frame, so fail loudly instead.
mapped_category = data['Category'].map(categoryMap)
unknown_labels = data.loc[mapped_category.isna(), 'Category'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unmapped Category values {list(unknown_labels)}; expected one of "
        f"{list(categoryMap)}. If this cell was already executed, reload the data first."
    )
data['Category'] = mapped_category.astype('int64')


# Load the list of valid English words
english_words = set(words.words())
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Built once at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the NLTK stop-word corpus is read once, not once per row.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalize a raw discussion string into lowercase, stop-word-free words.

    Steps, in order: coerce to ``str``; replace the literal two-character
    sequence backslash + ``n`` with a space; replace backslashes and the
    characters ``? = & _ -`` with spaces; lowercase; delete remaining
    punctuation; replace every non-letter, non-whitespace character with a
    space; collapse whitespace; drop English stop words.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if not isinstance(text, str):
        text = str(text)

    # Replace the escaped newline marker (a literal backslash followed by 'n').
    # Real newline characters are handled later by the whitespace collapse.
    text = text.replace('\\n', ' ')

    # Remove backslashes and other URL characters
    text = re.sub(r'[\\?=&_\-]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation (deleted outright, so "don't" becomes "dont")
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Collapse whitespace and drop stop words in a single pass over the tokens
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

**APPLY PREPROCESSING ON DISCUSSION COLUMN**

In [None]:
# Clean every discussion, then discard rows that became empty after cleaning.
data = data.assign(Discussion=data['Discussion'].apply(preprocess_text))
data = data.loc[data['Discussion'].str.strip().ne('')]

# **TOKENIZATION**

In [None]:
# Pretrained uncased BERT WordPiece tokenizer; it supplies the token ids and the
# vocabulary that the embedding matrix is later indexed by.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 120  # Length to truncate or pad sequences

# Tokenize text using BERT tokenizer
def bert_tokenize(text, max_length):
    """Encode one text with the module-level BERT ``tokenizer``.

    The text is wrapped in [CLS]/[SEP], then truncated or padded to exactly
    ``max_length`` tokens.

    Args:
        text: The string to encode.
        max_length: Fixed output sequence length.

    Returns:
        A ``(input_ids, attention_mask)`` pair of tensors with the leading
        batch dimension squeezed away.
    """
    encode_options = {
        "add_special_tokens": True,  # Add [CLS] and [SEP]
        "max_length": max_length,
        "padding": 'max_length',
        "truncation": True,
        "return_tensors": "pt",
    }
    encoded = tokenizer.encode_plus(text, **encode_options)

    token_ids = encoded['input_ids'].squeeze(0)
    pad_mask = encoded['attention_mask'].squeeze(0)
    return token_ids, pad_mask





# Encode every discussion; each call yields an (input_ids, attention_mask) pair.
encoded_pairs = [bert_tokenize(discussion, max_length) for discussion in data['Discussion']]
encoded_texts = [ids for ids, _ in encoded_pairs]
attention_masks = [mask for _, mask in encoded_pairs]

# Integer class ids as a single tensor, in the same row order as the encodings.
labels = torch.tensor(data['Category'].tolist())

# Positional 80/20 train/validation split.
# NOTE(review): rows are split in file order with no shuffling; if train.csv is
# ordered (e.g. by category) the validation set will not be representative.
train_size = int(0.8 * len(encoded_texts))

train_encoded, val_encoded = encoded_texts[:train_size], encoded_texts[train_size:]
train_masks, val_masks = attention_masks[:train_size], attention_masks[train_size:]
train_labels, val_labels = labels[:train_size], labels[train_size:]

# **EMBEDDING**

In [None]:
def load_glove_embeddings(glove_file_path, vocab, embed_dim=100):
    """Build an embedding matrix for ``vocab`` from a GloVe text file.

    Each non-blank line of the file is expected to be a word followed by
    ``embed_dim`` whitespace-separated floats. Words present in ``vocab`` have
    their vector written to the row given by ``vocab[word]``; vocabulary
    entries that never appear in the file keep an all-zero row.

    Args:
        glove_file_path: Path to the UTF-8 GloVe ``.txt`` file.
        vocab: Mapping from token string to row index in ``[0, len(vocab))``.
        embed_dim: Number of vector components per word in the file.

    Returns:
        A ``torch.float32`` tensor of shape ``(len(vocab), embed_dim)``.

    Raises:
        ValueError: If a non-blank line does not hold exactly ``embed_dim``
            components (wrong file for the requested dimension, or a
            malformed line).
    """
    # Allocate directly in float32: that is the dtype of the returned tensor.
    embeddings = np.zeros((len(vocab), embed_dim), dtype=np.float32)

    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file): nothing to read.
                continue
            if len(values) - 1 != embed_dim:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected {embed_dim} "
                    f"vector components but found {len(values) - 1}"
                )
            row = vocab.get(values[0])
            if row is None:
                # Word is not in the vocabulary; skip the float parsing entirely.
                continue
            embeddings[row] = np.asarray(values[1:], dtype='float32')

    return torch.from_numpy(embeddings)


embed_dim = 100

# Build the file location from the directory returned by
# kagglehub.dataset_download (`path`) instead of hard-coding the root user's
# cache directory and dataset version, which breaks for other users/versions.
glove_file_path = f"{path}/glove.6B.{embed_dim}d.txt"

# Token -> id mapping of the BERT tokenizer; ids index rows of the embedding matrix.
vocab = tokenizer.get_vocab()

# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(glove_file_path, vocab, embed_dim)

# **MODEL**

In [None]:
class TextClassificationDataset(Dataset):
    """Index-aligned collection of token ids, attention masks and labels.

    Item ``i`` is the triple ``(encoded_texts[i], attention_masks[i], labels[i])``.
    The dataset length is the number of encoded texts.
    """

    def __init__(self, encoded_texts, attention_masks, labels):
        self.encoded_texts = encoded_texts
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        return (
            self.encoded_texts[idx],
            self.attention_masks[idx],
            self.labels[idx],
        )

# Wrap the train / validation splits as datasets
train_dataset = TextClassificationDataset(
    encoded_texts=train_encoded,
    attention_masks=train_masks,
    labels=train_labels,
)
val_dataset = TextClassificationDataset(
    encoded_texts=val_encoded,
    attention_masks=val_masks,
    labels=val_labels,
)


batch_size = 32

# Batch iterators: training batches are reshuffled each epoch, validation keeps its order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_length, embed_dim)`` is precomputed: even
    feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is registered as a buffer named
    ``pe``, so it follows the module across devices but is not trained.

    Args:
        embed_dim: Size of the feature dimension. Odd values are supported.
        max_length: Number of positions to precompute.
    """

    def __init__(self, embed_dim, max_length=5000):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # (max_length, ceil(embed_dim / 2))

        pe = torch.zeros(max_length, embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(embed_dim / 2) odd columns. For an odd embed_dim
        # that is one fewer than the even columns, so trim the last frequency
        # (previously this assignment failed with a shape mismatch).
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])

        pe = pe.unsqueeze(0)  # (1, max_length, embed_dim)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, embed_dim)``.
        """
        seq_length = x.size(1)
        x = x + self.pe[:, :seq_length, :]
        return x


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: Feature size of the input and output; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.query_proj = nn.Linear(embed_dim, embed_dim)
        self.key_proj = nn.Linear(embed_dim, embed_dim)
        self.value_proj = nn.Linear(embed_dim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Attend every position of ``x`` to every unmasked position of ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, embed_dim)``.
            mask: Optional tensor of shape ``(batch_size, seq_length)``; key
                positions where it equals 0 are excluded from attention.

        Returns:
            Tensor of shape ``(batch_size, seq_length, embed_dim)``.
        """
        batch_size, seq_length, _ = x.size()

        Q = self.query_proj(x)
        K = self.key_proj(x)
        V = self.value_proj(x)

        # (batch_size, seq_length, embed_dim) -> (batch_size, num_heads, seq_length, head_dim)
        Q = Q.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            # (batch_size, seq_length) -> (batch_size, 1, 1, seq_length) so the
            # mask broadcasts over heads and query positions.
            mask = mask.unsqueeze(1).unsqueeze(2)
            # Fill with the most negative finite value rather than -inf: masked
            # keys still get a weight of exactly 0 after softmax, but a sequence
            # whose keys are all masked no longer turns into NaN
            # (softmax over a row of -inf is 0/0).
            fill_value = torch.finfo(attn_scores.dtype).min
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)

        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context = torch.matmul(attn_weights, V)
        # Merge the heads back: (batch_size, seq_length, embed_dim)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embed_dim)
        out = self.out_proj(context)
        return out

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Computes ``fc2(dropout(relu(fc1(x))))`` where ``fc1`` maps ``embed_dim``
    to ``ff_dim`` and ``fc2`` maps back to ``embed_dim``.
    """

    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, ff_dim)
        self.fc2 = nn.Linear(ff_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        return self.fc2(self.dropout(hidden))

class TransformerEncoderLayer(nn.Module):
    """One post-norm encoder block: self-attention then feed-forward.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.ffn = PositionwiseFeedForward(embed_dim, ff_dim, dropout)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention sub-layer with residual connection and normalization
        attended = self.norm1(x + self.dropout(self.self_attn(x, mask)))
        # Feed-forward sub-layer with residual connection and normalization
        return self.norm2(attended + self.dropout(self.ffn(attended)))

class TransformerEncoder(nn.Module):
    """Stack of ``num_layers`` encoder layers applied one after another.

    The same ``mask`` is handed to every layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, num_layers, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout))

    def forward(self, x, mask=None):
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden

class TransformerClassifier(nn.Module):
    """Sequence classifier: frozen pretrained embeddings -> encoder -> linear head.

    Token ids are embedded with a frozen table initialised from
    ``glove_embeddings``, positional encodings are added, the sequence runs
    through the Transformer encoder, and the hidden state at position 0 is
    classified.

    Args:
        vocab_size: Number of rows in the embedding table.
        glove_embeddings: Tensor copied into the embedding weights; it must
            be copy-compatible with shape ``(vocab_size, embed_dim)``.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        ff_dim: Hidden width of each feed-forward sub-layer.
        num_classes: Number of output logits.
        max_length: Number of positions precomputed by the positional encoding.
        dropout: Dropout probability used in the encoder and before the head.
    """

    def __init__(self, vocab_size, glove_embeddings, embed_dim=300, num_heads=8, num_layers=4, ff_dim=512, num_classes=5, max_length=120, dropout=0.1):
        super().__init__()

        # Embedding table seeded with the pretrained vectors and excluded from training
        embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        with torch.no_grad():
            embedding.weight.copy_(glove_embeddings)
        embedding.weight.requires_grad_(False)
        self.embedding = embedding

        self.pos_encoding = PositionalEncoding(embed_dim, max_length=max_length)

        self.encoder = TransformerEncoder(embed_dim, num_heads, ff_dim, num_layers, dropout)

        self.fc = nn.Linear(embed_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return logits of shape ``(batch_size, num_classes)`` for token ids ``x``."""
        # (batch_size, seq_length) ids -> (batch_size, seq_length, embed_dim)
        hidden = self.pos_encoding(self.embedding(x))

        # The mask is forwarded unchanged to every encoder layer
        hidden = self.encoder(hidden, mask=mask)

        # Position 0 summarises the sequence (the [CLS] slot when inputs come
        # from bert_tokenize, which adds the special tokens)
        pooled = self.dropout(hidden[:, 0, :])  # (batch_size, embed_dim)
        return self.fc(pooled)

# **Training and Evaluation**

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the TransformerClassifier
model = TransformerClassifier(
    vocab_size=tokenizer.vocab_size,  # Using BERT tokenizer's vocabulary size
    glove_embeddings=glove_embeddings,  # Pass the GloVe embeddings
    embed_dim=100,   # must match the dimensionality of glove_embeddings
    num_heads=4,
    num_layers=2,
    ff_dim=512,
    num_classes=5,
    max_length=max_length,  # Ensure this matches the tokenizer's max_length
    dropout=0.2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
# Multiplies the learning rate by 0.9 every 2 epochs; this only takes effect
# because scheduler.step() is called once per epoch at the end of the loop below.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)



epochs = 10  # Number of epochs
total_steps = len(train_loader)

for epoch in range(epochs):
    total_train_loss = 0
    total_val_loss = 0
    correct_train = 0
    total_train = 0
    model.train()

    # Training loop
    for x_batch, mask_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)          # Move input IDs to the device
        mask_batch = mask_batch.to(device)    # Move attention masks to the device
        y_batch = y_batch.to(device)          # Move labels to the device

        optimizer.zero_grad()
        logits = model(x_batch, mask=mask_batch)  # Include attention mask
        loss = criterion(logits, y_batch)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_train_loss += loss.item()

        # Calculate training accuracy
        preds = torch.argmax(logits, dim=1)
        correct_train += (preds == y_batch).sum().item()
        total_train += y_batch.size(0)

    # Mean of the per-batch losses, and accuracy over all training samples
    train_loss = total_train_loss / total_steps
    train_accuracy = correct_train / total_train

    # Validation loop
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for x_batch, mask_batch, y_batch in val_loader:
            x_batch = x_batch.to(device)
            mask_batch = mask_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(x_batch, mask=mask_batch)
            loss = criterion(logits, y_batch)
            total_val_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct_val += (preds == y_batch).sum().item()
            total_val += y_batch.size(0)

    val_loss = total_val_loss / len(val_loader)
    val_accuracy = correct_val / total_val

    # Advance the learning-rate schedule once per epoch, after this epoch's
    # optimizer steps. Without this call the StepLR decay was never applied.
    scheduler.step()

    # Print epoch results
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print("-" * 50)

Epoch 1/10
Train Loss: 1.2174, Train Accuracy: 0.5055
Validation Loss: 0.9446, Validation Accuracy: 0.6485
--------------------------------------------------
Epoch 2/10
Train Loss: 0.9472, Train Accuracy: 0.6422
Validation Loss: 0.8552, Validation Accuracy: 0.6705
--------------------------------------------------
Epoch 3/10
Train Loss: 0.8993, Train Accuracy: 0.6607
Validation Loss: 0.8324, Validation Accuracy: 0.6896
--------------------------------------------------
Epoch 4/10
Train Loss: 0.8755, Train Accuracy: 0.6695
Validation Loss: 0.8330, Validation Accuracy: 0.6818
--------------------------------------------------
Epoch 5/10
Train Loss: 0.8573, Train Accuracy: 0.6735
Validation Loss: 0.8535, Validation Accuracy: 0.6875
--------------------------------------------------
Epoch 6/10
Train Loss: 0.8423, Train Accuracy: 0.6817
Validation Loss: 0.8313, Validation Accuracy: 0.6885
--------------------------------------------------
Epoch 7/10
Train Loss: 0.8301, Train Accuracy: 0.685

# **Test** **Prediction**

In [None]:
# Test CSV, resolved relative to the notebook's working directory.
# NOTE: this rebinds `data_path` and `data`, replacing the training frame; the
# cells below read the 'Discussion' and 'SampleID' columns of the test frame.
data_path = "test.csv"
data = pd.read_csv(data_path)


class TestDataset(Dataset):
    """Unlabelled counterpart of the training dataset.

    Item ``i`` is the pair ``(encoded_texts[i], attention_masks[i])``; the
    dataset length is the number of encoded texts.
    """

    def __init__(self, encoded_texts, attention_masks):
        self.encoded_texts = encoded_texts
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        return self.encoded_texts[idx], self.attention_masks[idx]


# Clean the test discussions with the same preprocessing used for training
data['Discussion'] = data['Discussion'].apply(preprocess_text)

# Encode every discussion; each call yields an (input_ids, attention_mask) pair
encoded_pairs = [bert_tokenize(discussion, max_length) for discussion in data['Discussion']]
encoded_texts = [ids for ids, _ in encoded_pairs]
attention_masks = [mask for _, mask in encoded_pairs]

# Order is preserved (no shuffling) so predictions line up with the test rows
test_dataset = TestDataset(encoded_texts=encoded_texts, attention_masks=attention_masks)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
def evaluate_model(model, test_loader):
    """Predict a class index for every sample yielded by ``test_loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Batches are moved to the device the model's parameters live
    on, so the function does not depend on a global ``device`` variable.

    Args:
        model: Module called as ``model(input_ids, mask=attention_mask)`` and
            returning logits with the classes on the last dimension.
        test_loader: Iterable of ``(input_ids, attention_mask)`` batches.

    Returns:
        A flat list of predicted class indices in loader order.
    """
    model.eval()  # Set the model to evaluation mode

    # Run on whatever device the model is on; a parameter-free model falls back to CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    with torch.no_grad():  # No gradients needed for evaluation
        for x_batch, mask_batch in test_loader:
            x_batch = x_batch.to(target_device)        # Move input IDs to the model's device
            mask_batch = mask_batch.to(target_device)  # Move attention masks to the model's device
            logits = model(x_batch, mask=mask_batch)   # Include attention mask
            preds = torch.argmax(logits, dim=-1)       # Highest-scoring class per sample
            predictions.extend(preds.cpu().tolist())

    return predictions

In [None]:
# Run inference over the test loader built in the previous cells
predictions = evaluate_model(model, test_loader)

# Pair each test row's SampleID with its predicted class and write the Kaggle submission file
submission_df = data[["SampleID"]].assign(Category=predictions)
submission_df.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv


In [None]:
# Sanity check: (rows, columns) of the submission, then a preview of its first 10 rows.
print(submission_df.shape)
submission_df.head(10)

(10557, 2)


Unnamed: 0,SampleID,Category
0,1,3
1,2,0
2,3,1
3,4,4
4,5,3
5,6,0
6,7,3
7,8,0
8,9,2
9,10,2
