# 1. DistilBERT Trained Model

### 1.1 Import Libraries

In [4]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import gc
import warnings

warnings.filterwarnings("ignore")

### 1.2 Configuration (Optimized for Stability)

In [5]:
# Run configuration for the DistilBERT baseline.
MODEL_NAME = "distilbert-base-uncased"  # checkpoint handed to from_pretrained
MAX_LEN = 64          # tokenizer max_length; longer messages are truncated
BATCH_SIZE = 8        # examples per DataLoader batch
EPOCHS = 3            # full passes over the training loader
LEARNING_RATE = 2e-5  # AdamW learning rate
SEED = 42             # same value the train/validation split passes as random_state
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the random number generators so that DataLoader shuffling and the
# freshly initialised classification head are repeatable across
# Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Using device: {DEVICE}")

Using device: cpu


### 1.3 Load and Fix Data

In [None]:
print("Loading and preprocessing data...")

train_df = pd.read_csv('data/train.csv')
# Rename the training columns to the names the rest of the notebook uses.
train_df.columns = ['ID', 'Original_Message', 'Extremism_Label']

test_df = pd.read_csv('data/test.csv')

# Missing messages become empty strings so the tokenizer always receives str.
train_df['Original_Message'] = train_df['Original_Message'].fillna("").astype(str)
test_df['Original_Message'] = test_df['Original_Message'].fillna("").astype(str)

label_map = {'NON_EXTREMIST': 0, 'EXTREMIST': 1}
train_df['labels'] = train_df['Extremism_Label'].map(label_map)

# Series.map leaves NaN for any value missing from label_map, which would turn
# the label column into floats and surface later as an obscure split/loss
# error. Fail here instead, naming the offending values.
unmapped = train_df['labels'].isna()
if unmapped.any():
    bad_values = train_df.loc[unmapped, 'Extremism_Label'].unique().tolist()
    raise ValueError(f"Unexpected Extremism_Label values: {bad_values}")
train_df['labels'] = train_df['labels'].astype('int64')

# Stratified 90/10 split keeps the class ratio the same in both parts.
train_data, val_data = train_test_split(train_df, test_size=0.1, stratify=train_df['labels'], random_state=42)

print("Train size:", len(train_data))
print("Validation size:", len(val_data))
print("Test size:", len(test_df))

print("Data loaded and preprocessed.")

Loading data...


### 1.4 Tokenization

In [7]:
print("Tokenizing...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize a batch of messages, padding/truncating each one to MAX_LEN."""
    messages = examples["Original_Message"]
    return tokenizer(messages, padding="max_length", truncation=True, max_length=MAX_LEN)


# Wrap each pandas frame in a Hugging Face Dataset (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, val_data, test_df)
)

# Drop everything except the tokenizer outputs (and `labels` for train/val).
# "__index_level_0__" is presumably the pandas index column carried over by
# from_pandas for the shuffled split frames -- confirm if the split changes.
cols_to_remove = ["ID", "Original_Message", "Extremism_Label", "__index_level_0__"]
train_tokenized = train_dataset.map(
    tokenize_function, batched=True, remove_columns=cols_to_remove
)
val_tokenized = val_dataset.map(
    tokenize_function, batched=True, remove_columns=cols_to_remove
)
test_tokenized = test_dataset.map(
    tokenize_function, batched=True, remove_columns=["ID", "Original_Message"]
)

# Have every split hand back torch tensors when indexed.
for tokenized_split in (train_tokenized, val_tokenized, test_tokenized):
    tokenized_split.set_format("torch")

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_tokenized, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_tokenized, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_tokenized, batch_size=BATCH_SIZE)

Tokenizing...


Map: 100%|██████████| 2025/2025 [00:00<00:00, 5010.86 examples/s]
Map: 100%|██████████| 225/225 [00:00<00:00, 3054.67 examples/s]
Map: 100%|██████████| 750/750 [00:00<00:00, 2774.58 examples/s]


### 1.5 Model Setup

In [8]:
print("Initializing Model...")
# Load the checkpoint with a 2-class head and move it to the selected device
# in one expression.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(DEVICE)
# AdamW over all model parameters at the configured learning rate.
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

Initializing Model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 1.6 Training Loop

In [9]:
print("Starting Training...")

# Fine-tune for EPOCHS passes, reporting the mean batch loss after each pass.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_loader):
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)  # batch includes `labels`, so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Every 50th step (step 0 included) release cached CUDA memory, if any.
        if step % 50 == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

Starting Training...
Epoch 1 | Loss: 0.4785
Epoch 2 | Loss: 0.2934
Epoch 3 | Loss: 0.1830


### 1.7 Prediction

In [10]:
print("Predicting...")
model.eval()  # inference mode for dropout and similar layers
all_preds = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_loader:
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        outputs = model(**batch)
        # Index of the larger logit is the predicted class id.
        preds = outputs.logits.argmax(dim=-1)
        all_preds += list(preds.cpu().numpy())

Predicting...


### 1.8 Saving Results

In [11]:
import os

# Map class ids back to the label strings used in the training file.
inv_map = {0: 'NON_EXTREMIST', 1: 'EXTREMIST'}
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Extremism_Label': [inv_map[p] for p in all_preds]
})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('results', exist_ok=True)
submission.to_csv('results/submission_distilbert.csv', index=False)
print("Saved submission_distilbert.csv")

Saved submission_distilbert.csv


# 2. Twitter-RoBERTa Trained Model

### 2.1 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc

warnings.filterwarnings("ignore")

### 2.2 Configuration 

In [None]:
# Settings for the Twitter-RoBERTa run. These rebind the names used by the
# DistilBERT section above.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-offensive"  # checkpoint for tokenizer and model
MAX_LEN = 64  # tokenizer max_length; longer messages are truncated
BATCH_SIZE = 32  # batch size shared by the train/val/test DataLoaders
EPOCHS = 6  # full passes over the training loader
LEARNING_RATE = 2e-5  # AdamW learning rate
CONFIDENCE_THRESHOLD = 0.85  # class-1 probability must exceed this to predict EXTREMIST

### 2.3 GPU Detection and Memory Cleanup

In [None]:
# Pick the compute device once and report which one was chosen.
has_cuda = torch.cuda.is_available()
DEVICE = torch.device("cuda" if has_cuda else "cpu")
if has_cuda:
    print(f"GPU Detected: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected. Running on CPU.")

# Free Python garbage first, then cached CUDA memory when a GPU is present.
gc.collect()
if has_cuda:
    torch.cuda.empty_cache()

### 2.4 Custom Dataset Class

In [None]:
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    `encodings` is a mapping from field name to a per-example sequence and
    must contain 'input_ids', which defines the dataset length. When `labels`
    is given, each item also carries a 'labels' tensor.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Build one tensor per encoding field for the requested example.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

### 2.5 Load and Fix Data

In [None]:
print("\nLoading data...")
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Rename the training columns to the names the rest of the notebook uses.
train_df.columns = ['ID', 'Original_Message', 'Extremism_Label']

# Missing messages become empty strings so the tokenizer always receives str.
train_df['Original_Message'] = train_df['Original_Message'].fillna("").astype(str)
test_df['Original_Message'] = test_df['Original_Message'].fillna("").astype(str)

label_map = {'NON_EXTREMIST': 0, 'EXTREMIST': 1}
train_df['labels'] = train_df['Extremism_Label'].map(label_map)

# Series.map leaves NaN for any value missing from label_map, which would turn
# the label column into floats and surface later as an obscure split/loss
# error. Fail here instead, naming the offending values.
unmapped = train_df['labels'].isna()
if unmapped.any():
    bad_values = train_df.loc[unmapped, 'Extremism_Label'].unique().tolist()
    raise ValueError(f"Unexpected Extremism_Label values: {bad_values}")
train_df['labels'] = train_df['labels'].astype('int64')

# Stratified 85/15 split keeps the class ratio the same in both parts.
train_data, val_data = train_test_split(train_df, test_size=0.15, stratify=train_df['labels'], random_state=42)


### 2.6 Tokenization

In [None]:
print(f"Loading Tokenizer ({MODEL_NAME})...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Tokenizing data...")
# The same tokenizer options apply to every split.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=MAX_LEN)
train_encodings = tokenizer(train_data['Original_Message'].tolist(), **tokenizer_kwargs)
val_encodings = tokenizer(val_data['Original_Message'].tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(test_df['Original_Message'].tolist(), **tokenizer_kwargs)

print("Creating datasets and dataloaders...")
# Train and validation carry labels; the test set has none.
train_dataset = SimpleDataset(train_encodings, labels=train_data['labels'].tolist())
val_dataset = SimpleDataset(val_encodings, labels=val_data['labels'].tolist())
test_dataset = SimpleDataset(test_encodings)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

### 2.7 Model Setup

In [None]:
print("Initializing Model...")
# Load the checkpoint with a 2-class head and move it to the selected device
# in one expression.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(DEVICE)
# AdamW over all model parameters at the configured learning rate.
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

### 2.8 Training Loop

In [None]:
print("Starting Training...")

# Fine-tune for EPOCHS passes; log progress every 20 steps and the mean batch
# loss at the end of each pass.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    start_time = time.time()  # per-epoch timer used by the progress line

    for step, batch in enumerate(train_loader):
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)  # batch includes `labels`, so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Skip step 0, then report on every 20th step.
        if step > 0 and step % 20 == 0:
            elapsed = time.time() - start_time
            print(f"Step {step}/{len(train_loader)} | Loss: {loss.item():.4f} | Time: {elapsed:.0f}s")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

### 2.9 Precision Evaluation on Validation Set

In [None]:
print("\nChecking Precision on Validation Set...")
model.eval()  # inference mode for dropout and similar layers
val_probs = []
val_true = []

# Collect the class-1 probability and the true label for every validation row.
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        probs = F.softmax(outputs.logits, dim=-1)
        val_probs.extend(probs[:, 1].cpu().numpy())
        val_true.extend(batch['labels'].cpu().numpy())

# Predict class 1 only when its probability clears the configured threshold.
val_preds_strict = [1 if p > CONFIDENCE_THRESHOLD else 0 for p in val_probs]

target_names = ['NON_EXTREMIST', 'EXTREMIST']
# Pin the class ids so the report and matrix always have both classes, in the
# order of target_names, even if one class never appears in the labels or
# thresholded predictions.
class_ids = [0, 1]
print(f"\nResults with {CONFIDENCE_THRESHOLD*100}% Confidence Threshold")
print(classification_report(val_true, val_preds_strict, labels=class_ids, target_names=target_names))

cm = confusion_matrix(val_true, val_preds_strict, labels=class_ids)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
# The plotted array is a confusion matrix, so the title says so.
plt.title(f'Confusion Matrix (Threshold {CONFIDENCE_THRESHOLD})')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### 2.10 Prediction

In [None]:
import os

print("\nPredicting on Test Set...")
# Set inference mode here rather than relying on the validation cell having
# run first; otherwise dropout would still be active after training.
model.eval()
test_probs = []

# Collect the class-1 probability for every test row.
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        probs = F.softmax(outputs.logits, dim=-1)
        test_probs.extend(probs[:, 1].cpu().numpy())

# Predict class 1 only when its probability clears the configured threshold.
test_preds = [1 if p > CONFIDENCE_THRESHOLD else 0 for p in test_probs]

# Map class ids back to the label strings used in the training file.
inv_map = {0: 'NON_EXTREMIST', 1: 'EXTREMIST'}

submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Extremism_Label': [inv_map[p] for p in test_preds]
})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('results', exist_ok=True)
submission.to_csv('results/submission_roberta_precision.csv', index=False)
print("Saved 'submission_roberta_precision.csv'")