In [2]:
!git remote add origin "https://github.com/mrliu1212/NLP-Project"

In [3]:
!git fetch origin

remote: Enumerating objects: 141, done.[K
remote: Counting objects: 100% (141/141), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 141 (delta 50), reused 112 (delta 31), pack-reused 0 (from 0)[K
Receiving objects: 100% (141/141), 23.98 MiB | 8.00 MiB/s, done.
Resolving deltas: 100% (50/50), done.
From https://github.com/mrliu1212/NLP-Project
 * [new branch]      classification -> origin/classification
 * [new branch]      main           -> origin/main
 * [new branch]      media-bias     -> origin/media-bias


In [4]:
! git checkout classification

Filtering content: 100% (2/2), 951.17 MiB | 45.97 MiB/s, done.
Branch 'classification' set up to track remote branch 'classification' from 'origin'.
Switched to a new branch 'classification'


In [5]:
!git pull

Already up to date.


In [6]:
%cd models

/content/models


In [19]:
# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from tqdm import tqdm
import os
import pickle

# Load data
# Paths are relative to the `models/` directory entered by the earlier `%cd models` cell.
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')
test_df = pd.read_csv('../data/test.csv')

# Preprocessing
# Train and dev are concatenated only to build the label vocabulary and to fit the
# vectorizer; the classifier below is fit on the train rows alone.
full_train_texts = train_df['text'].tolist() + dev_df['text'].tolist()
full_train_labels = train_df['label'].tolist() + dev_df['label'].tolist()

# Sorted, so the label -> id mapping is deterministic across runs.
labels = list(sorted(set(full_train_labels)))
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

y_train = [label2id[label] for label in train_df['label']]
y_dev = [label2id[label] for label in dev_df['label']]

# TF-IDF Vectorization
# Character n-grams (2 to 6 chars, word-boundary aware), capped at 50k features.
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2,6),
    max_features=50000
)

# NOTE(review): the vectorizer is fit on train + dev text, so dev vocabulary / IDF
# statistics are visible to the features that are later scored on dev. Confirm this
# is intended before treating the dev F1 as a clean hold-out number.
tfidf.fit(full_train_texts)

X_train = tfidf.transform(train_df['text'])
X_dev = tfidf.transform(dev_df['text'])
# X_test is computed but not used anywhere in this cell.
X_test = tfidf.transform(test_df['text'])

# Elastic-net logistic regression with the hyper-parameters picked from the
# manual sweep recorded at the bottom of this cell.
best_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
    C=1,
    l1_ratio= 0.3
    # **best_params  (leftover from an earlier grid-search version of this cell)
)
print("Fitting model...")
best_model.fit(X_train, y_train)

# Evaluate on the dev set (macro F1 weights every class equally)
print("Evaluation model")
dev_preds = best_model.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score (retrained best model): {dev_macro_f1:.4f}")
# Manual sweep results (dev macro F1):
# C=1, L1_ratio=0.7 -> 0.4172
# C=1, L1_ratio=0.5 -> 0.4222
# C=0.5, L1_ratio=0.5 -> 0.4024
# C=1, L1_ratio=0.3 -> 0.4261


Fitting model...
Evaluation model
Validation Macro F1 Score (retrained best model): 0.4261


In [31]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import xgboost as xgb
import joblib
import pickle
import os

# Directory (relative to the current working directory) that holds the pickled
# TF-IDF vectorizer, base models, meta-model and label maps.
SAVE_DIR = "saved_models"
# Create it up front so the save step below never fails on a missing directory.
os.makedirs(SAVE_DIR, exist_ok=True)

# ========== Utility: Check if all models exist ==========
def models_exist(save_dir=None):
    """Report whether every artefact of the TF-IDF stacking pipeline is on disk.

    Args:
        save_dir: Directory to inspect. When omitted (the original call style)
            the notebook-level ``SAVE_DIR`` is used.

    Returns:
        bool: True only if all five expected artefacts exist as regular files.
    """
    directory = SAVE_DIR if save_dir is None else save_dir
    required_files = (
        "tfidf_vectorizer.pkl",
        "logreg_model.pkl",
        "xgb_model.pkl",
        "meta_model.pkl",
        "label_maps.pkl",
    )
    # isfile rather than exists: a stray directory that happens to carry an
    # artefact's name must not be mistaken for a saved model.
    return all(os.path.isfile(os.path.join(directory, fname)) for fname in required_files)

# ========== Load Data ==========
train_df = pd.read_csv('../data/train.csv')
dev_df = pd.read_csv('../data/dev.csv')

# ========== Train + Save Models If Not Exist ==========
# Both branches leave the same names behind for later cells: tfidf, logreg, xgb_clf,
# meta_model, label2id, id2label, X_dev, y_dev, X_meta_dev, meta_dev_preds,
# dev_preds_meta_proba and meta_f1. (Previously dev_preds_meta_proba was only
# defined on the load path, so the later stacking cell failed after a fresh training run.)
#
# NOTE: the meta-model is fit on the dev-set meta-features and then scored on those
# same rows, so the printed "Stacked Dev Macro F1" is an in-sample figure.
if not models_exist():
    print("Models not found. Training models...")

    # Label Encoding (sorted so the label -> id mapping is deterministic)
    all_labels = sorted(set(train_df['label']) | set(dev_df['label']))
    label2id = {label: i for i, label in enumerate(all_labels)}
    id2label = {i: label for label, i in label2id.items()}
    y_train = train_df['label'].map(label2id).values
    y_dev = dev_df['label'].map(label2id).values

    # TF-IDF Vectorization (character n-grams, fit on train + dev text)
    print("Fitting TF-IDF vectorizer...")
    tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 6), max_features=50000)
    tfidf.fit(pd.concat([train_df['text'], dev_df['text']]))

    X_train = tfidf.transform(train_df['text'])
    X_dev = tfidf.transform(dev_df['text'])

    # Train base models (both on the train split only)
    print("Training Logistic Regression...")
    logreg = LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        max_iter=1000,
        C=1.0,
        l1_ratio=0.3,
        random_state=42,
        n_jobs=-1
    )
    logreg.fit(X_train, y_train)

    print("Training XGBoost...")
    xgb_clf = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(label2id),
        n_estimators=300,
        learning_rate=0.1,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    xgb_clf.fit(X_train, y_train)

    # Generate meta-features (dev set): class probabilities of both base models side by side
    print("Generating meta-features...")
    dev_preds_logreg = logreg.predict_proba(X_dev)
    dev_preds_xgb = xgb_clf.predict_proba(X_dev)
    X_meta_dev = np.hstack([dev_preds_logreg, dev_preds_xgb])

    # Train meta-model
    print("Training meta-model...")
    meta_model = LogisticRegression(max_iter=1000, random_state=42)
    meta_model.fit(X_meta_dev, y_dev)

    # Evaluate
    meta_dev_preds = meta_model.predict(X_meta_dev)
    # Also expose the probabilities, exactly as the load branch does.
    dev_preds_meta_proba = meta_model.predict_proba(X_meta_dev)
    meta_f1 = f1_score(y_dev, meta_dev_preds, average='macro')
    print(f"Stacked Dev Macro F1: {meta_f1:.4f}")

    # Save models
    print("Saving models...")
    with open(os.path.join(SAVE_DIR, "tfidf_vectorizer.pkl"), "wb") as f:
        pickle.dump(tfidf, f)
    joblib.dump(logreg, os.path.join(SAVE_DIR, "logreg_model.pkl"))
    joblib.dump(xgb_clf, os.path.join(SAVE_DIR, "xgb_model.pkl"))
    joblib.dump(meta_model, os.path.join(SAVE_DIR, "meta_model.pkl"))
    with open(os.path.join(SAVE_DIR, "label_maps.pkl"), "wb") as f:
        pickle.dump({'label2id': label2id, 'id2label': id2label}, f)

    print("Models saved.") # used 38 mins
else:
    print("Saved models already exist. Skipping training.")
    # Load models
    # SECURITY: pickle / joblib execute arbitrary code while loading. Only load
    # artefacts that this notebook (or a trusted collaborator) wrote.
    print("Loading saved models...")
    with open(os.path.join(SAVE_DIR, "tfidf_vectorizer.pkl"), "rb") as f:
        tfidf = pickle.load(f)
    logreg = joblib.load(os.path.join(SAVE_DIR, "logreg_model.pkl"))
    xgb_clf = joblib.load(os.path.join(SAVE_DIR, "xgb_model.pkl"))
    meta_model = joblib.load(os.path.join(SAVE_DIR, "meta_model.pkl"))
    with open(os.path.join(SAVE_DIR, "label_maps.pkl"), "rb") as f:
        label_maps = pickle.load(f)
        label2id = label_maps["label2id"]
        id2label = label_maps["id2label"]

    # Preprocess dev data with the saved vectorizer and label mapping
    print("Vectorizing dev set...")
    X_dev = tfidf.transform(dev_df["text"])
    y_dev = dev_df["label"].map(label2id).values

    # Get base model predictions
    print("Generating base model predictions...")
    dev_preds_logreg = logreg.predict_proba(X_dev)
    dev_preds_xgb = xgb_clf.predict_proba(X_dev)

    # Stack features (same column order as at training time: logreg, then xgb)
    X_meta_dev = np.hstack([dev_preds_logreg, dev_preds_xgb])

    # Predict with meta-model
    print("Evaluating stacked model...")
    meta_dev_preds = meta_model.predict(X_meta_dev)
    dev_preds_meta_proba = meta_model.predict_proba(X_meta_dev)
    meta_f1 = f1_score(y_dev, meta_dev_preds, average="macro")
    print(f"Stacked Dev Macro F1 Score (Loaded Models): {meta_f1:.4f}")


Saved models already exist. Skipping training.
Loading saved models...
Vectorizing dev set...
Generating base model predictions...
Evaluating stacked model...
Stacked Dev Macro F1 Score (Loaded Models): 0.4316


In [21]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # <-- Correct import here!
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import os

# Load Data
# Paths are relative to the current working directory (the notebook `%cd`s into `models/`).
train_df = pd.read_csv('../data/train.csv')  # Replace with your path
dev_df = pd.read_csv('../data/dev.csv')

# PyTorch-ready dataset
class RedditDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for sequence classification.

    Args:
        texts: Sequence of raw comment strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable (e.g. a RobertaTokenizer) returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Every item is truncated / padded to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # DataLoader uses this to know how many items exist.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the model inputs and label for item ``idx``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` as 1-D tensors of length
            ``max_length``, and ``labels`` as a scalar long tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # truncation cuts long texts, padding fills short ones, and
        # return_tensors='pt' asks for PyTorch tensors instead of lists.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the leading per-text batch axis. A bare
            # squeeze() would also drop the token axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


### FUNCTIONS: train, train_MPT, evaluate
from torch.amp import GradScaler, autocast
# Gradient scaler for mixed-precision training: most calculations use smaller
# (half-size) numbers, which is faster and uses less memory without losing much accuracy.
# This single notebook-level instance is the one train_MPT reads.
scaler = GradScaler()

# Mixed Precision Training
def train_MPT(model, loader):
    """Run one mixed-precision training epoch and return the mean loss per batch.

    Uses the notebook-level ``optimizer``, ``scheduler``, ``scaler`` and ``device``.

    Args:
        model: Model whose forward pass accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        loader: Iterable of batches (dicts with those three keys) that supports ``len``.

    Returns:
        float: Sum of batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass under autocast so eligible ops run in reduced precision.
        with autocast(device_type=device.type):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()

        # GradScaler silently skips optimizer.step() when it finds inf/NaN gradients
        # and then lowers its scale in update(). Only advance the LR schedule when
        # the optimizer really stepped, so the two stay in sync.
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation Function
def evaluate(model, loader):
    """Score ``model`` on ``loader`` and return the macro-averaged F1.

    Uses the notebook-level ``device``. Each batch must be a dict providing
    ``input_ids``, ``attention_mask`` and ``labels``.

    Macro averaging computes F1 per class and takes the unweighted mean, so a
    rare class counts as much as a frequent one.
    """
    model.eval()  # evaluation mode
    predictions, references = [], []

    # Gradients are not needed for scoring; disabling them saves memory and time.
    with torch.no_grad():
        for batch in tqdm(loader):
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            # Highest raw score (logit) per row is the predicted class.
            logits = model(input_ids=token_ids, attention_mask=mask).logits
            predicted = logits.argmax(dim=1)

            # scikit-learn cannot consume device tensors, so bring them to CPU / NumPy.
            predictions.extend(predicted.cpu().numpy())
            references.extend(gold.cpu().numpy())

    return f1_score(references, predictions, average='macro')

# -------------------------
# Data preprocessing
# -------------------------
# Encode labels to integers.
# NOTE: ids follow the order in which labels first appear in train.csv (pandas
# `unique()`), not sorted order. Presumably the cached checkpoint was trained with
# this mapping, so do not switch to a different ordering without retraining.
labels = train_df['label'].unique()
label2id = {label: idx for idx, label in enumerate(labels)} # {"label": 'id'}
id2label = {idx: label for label, idx in label2id.items()} # {'id': 'label'}


train_df['label_id'] = train_df['label'].map(label2id)
dev_df['label_id'] = dev_df['label'].map(label2id)

# -------------------
# Prepare DataLoaders
# -------------------
# CONFIG

# Version 1
# MODEL_NAME = 'roberta-base'
# MAX_LENGTH = 128
# BATCH_SIZE = 64
# EPOCHS = 2
# LEARNING_RATE = 2e-5
# 1)Train Loss: 1.4018, Validation Macro F1: 0.4303 2)Train Loss: 1.2615, Validation Macro F1: 0.4485

# Version 2
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 256
BATCH_SIZE = 32
EPOCHS = 4
LEARNING_RATE = 2e-5
# 1)Train Loss: 1.3989, Validation Macro F1: 0.4421 2)Train Loss: 1.2495, Validation Macro F1: 0.4591
# 3)Train Loss: 1.1111, Validation Macro F1: 0.4551 4)Train Loss: 0.9832, Validation Macro F1: 0.4615

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Prepare dataset for PyTorch (processes the raw data)
train_dataset = RedditDataset(train_df['text'].tolist(), train_df['label_id'].tolist(), tokenizer, MAX_LENGTH)
dev_dataset = RedditDataset(dev_df['text'].tolist(), dev_df['label_id'].tolist(), tokenizer, MAX_LENGTH)

# Create small batches (only the training data is shuffled, so dev rows keep file order)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)

# Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # cuda is faster but it requires GPU
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
model.to(device) # Moves all model weights to the selected device

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
# No warm-up: the learning rate decays linearly from LEARNING_RATE over total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


# The checkpoint name encodes the hyper-parameters, so changing any of them trains a new model.
MODEL_SAVE_PATH = f'cache/best_{MODEL_NAME}_model_{MAX_LENGTH}-{BATCH_SIZE}-{EPOCHS}-{LEARNING_RATE}.pt'

# Check if saved model exists
if os.path.exists(MODEL_SAVE_PATH):
    print(f"Loading saved model from {MODEL_SAVE_PATH}...")
    # map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
    model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
    print("\nEvaluating loaded model on the dev set...")
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Named batch_labels so the global `labels` array (the label vocabulary
            # defined at the top of this cell) is not overwritten.
            batch_labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    macro_f1 = f1_score(all_labels, all_preds, average="macro")
    print(f"\n✅ Final Dev Macro F1 Score: {macro_f1:.4f}")
else:
    print("No saved model found. Starting training...")

    # Run Training
    # Start below any reachable F1 so the first epoch is always captured,
    # even if its score is exactly 0.
    best_f1 = -1.0
    best_model_state = None

    for epoch in range(EPOCHS): # One epoch = one full pass through the training dataset.
        print(f"Epoch {epoch+1}/{EPOCHS}")
        train_loss = train_MPT(model, train_loader)
        val_f1 = evaluate(model, dev_loader)
        print(f"Train Loss: {train_loss:.4f}, Validation Macro F1: {val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            # state_dict() hands back references to the live parameter tensors, which
            # later epochs keep updating. Snapshot independent CPU copies so the
            # "best" weights really are the weights of the best epoch.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

    print(f'Best f1 for this model: {best_f1}')

    # After training, save the best model (create the cache directory on first use)
    os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
    torch.save(best_model_state, MODEL_SAVE_PATH)
    print(f"Training finished. Best model saved to {MODEL_SAVE_PATH}")

    # Replace the last-epoch weights with the best-epoch snapshot
    model.load_state_dict(best_model_state)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading saved model from cache/best_roberta-base_model_256-32-4-2e-05.pt...

Evaluating loaded model on the dev set...


Evaluating: 100%|██████████| 125/125 [00:51<00:00,  2.42it/s]


✅ Final Dev Macro F1 Score: 0.4615





In [28]:
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score

# Prepare Inputs for Meta Model
# This cell depends on names created by earlier cells (run them first):
#   best_model, X_dev, y_dev   -> TF-IDF / elastic-net cells
#   dev_preds_meta_proba       -> TF-IDF stacking cell (stacked logreg + XGBoost probabilities)
#   model, dev_loader, device  -> RoBERTa cell
from sklearn.model_selection import cross_val_predict

# Elastic-net probabilities, shape (n_samples, n_classes). Kept for inspection only:
# the stack below uses dev_preds_meta_proba, not this array.
dev_preds_enet_proba = best_model.predict_proba(X_dev)

# Get logits from RoBERTa (dev_loader is not shuffled, so rows stay aligned with dev_df)
model.eval()
all_logits = []
with torch.no_grad():
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu().numpy()
        all_logits.append(logits)

dev_logits_roberta = np.vstack(all_logits)  # shape: (n_samples, n_classes)

# Stack Features for Meta Model: TF-IDF stack probabilities next to RoBERTa logits
X_dev_stack = np.hstack([dev_preds_meta_proba, dev_logits_roberta])
y_dev_array = np.array(y_dev)  # true labels for dev set

# Train Meta Model
# NOTE: this rebinds `meta_model`, replacing the TF-IDF stacking meta-model in the kernel.
# The `multi_class` argument is deprecated in recent scikit-learn; for a problem with
# more than two classes the default already fits a multinomial model.
meta_model = LogisticRegressionCV(cv=5, max_iter=1000)
meta_model.fit(X_dev_stack, y_dev_array)

# Predict and Evaluate (in-sample: scored on the rows the meta-model was fit on)
meta_preds = meta_model.predict(X_dev_stack)
stacked_f1 = f1_score(y_dev_array, meta_preds, average='macro')
print(f"Meta-model Validation Macro F1 Score: {stacked_f1:.4f}")

# Out-of-fold estimate: every dev row is predicted by a meta-model that never saw it,
# which is the number to compare against the single-model dev scores.
oof_preds = cross_val_predict(
    LogisticRegressionCV(cv=5, max_iter=1000), X_dev_stack, y_dev_array, cv=5
)
stacked_oof_f1 = f1_score(y_dev_array, oof_preds, average='macro')
print(f"Meta-model out-of-fold Macro F1 Score (5-fold): {stacked_oof_f1:.4f}")




Meta-model Validation Macro F1 Score: 0.4655


Saved models already exist. Skipping training.
Loading saved models...
Vectorizing dev set...
Generating base model predictions...
Evaluating stacked model...
✅ Stacked Dev Macro F1 Score (Loaded Models): 0.4316


In [30]:
# next

In [10]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # <-- Correct import here!
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import os

# Load Data
# Paths are relative to the current working directory (the notebook `%cd`s into `models/`).
train_df = pd.read_csv('../data/train.csv')  # Replace with your path
dev_df = pd.read_csv('../data/dev.csv')

# PyTorch-ready dataset
class RedditDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for sequence classification.

    Args:
        texts: Sequence of raw comment strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable (e.g. a RobertaTokenizer) returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Every item is truncated / padded to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # DataLoader uses this to know how many items exist.
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Fetches a single sample from the dataset.

        Args:
            idx (int): Index of the item.

        Returns:
            dict: A dictionary containing:
                - 'input_ids': 1-D tensor of token ids (the text converted to numbers,
                  because models cannot read plain words).
                - 'attention_mask': 1-D tensor marking real tokens (1) and padding (0).
                - 'labels': Scalar long tensor holding the class id.

        Example (starred values are padding):
            Raw text:        "eating soap to own the republicans"
            Token ids:       [0, 1553, 4153, 7, 1225, 5, 13815, 2, *1, *1]
            Attention mask:  [1, 1, 1, 1, 1, 1, 1, 1, *0, *0]
            Label:           0

        Every item is padded or truncated to ``max_length`` tokens so that items can
        be batched. Padded positions hold the tokenizer's padding id (1 in the example
        above, not 0) and get an attention-mask value of 0.

        Squeeze example: the tokenizer output for one text has a leading batch axis,
        ``[[0, 1553, 4153, 7, 1225, 5, 13815, 2, 1, 1]]`` (batch size 1, 10 tokens);
        ``squeeze(0)`` turns it into ``[0, 1553, 4153, 7, 1225, 5, 13815, 2, 1, 1]``
        (just 10 tokens).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # truncation cuts long texts, padding fills short ones, and
        # return_tensors='pt' asks for PyTorch tensors instead of lists.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the leading batch axis. A bare squeeze()
            # would also drop the token axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # The model expects labels as tensors during training.
            'labels': torch.tensor(label, dtype=torch.long)
        }


  ### FUNCTIONS: train, train_MPT, evaluate
from torch.amp import GradScaler, autocast
# Gradient scaler for mixed-precision training: most calculations use smaller
# (half-size) numbers, which is faster and uses less memory without losing much accuracy.
# This single notebook-level instance is the one train_MPT reads.
scaler = GradScaler()

# Mixed Precision Training
def train_MPT(model, loader):
    """Run one mixed-precision training epoch and return the mean loss per batch.

    Uses the notebook-level ``optimizer``, ``scheduler``, ``scaler`` and ``device``.

    Args:
        model: Model whose forward pass accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        loader: Iterable of batches (dicts with those three keys) that supports ``len``.

    Returns:
        float: Sum of batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass under autocast so eligible ops run in reduced precision.
        with autocast(device_type=device.type):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()

        # GradScaler silently skips optimizer.step() when it finds inf/NaN gradients
        # and then lowers its scale in update(). Only advance the LR schedule when
        # the optimizer really stepped, so the two stay in sync.
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        total_loss += loss.item()
    return total_loss / len(loader)


# Training Loop
def train(model, loader):
    """Run one full-precision training epoch and return the average loss per batch.

    Uses the notebook-level ``optimizer``, ``scheduler`` and ``device``. Each batch
    must be a dict providing ``input_ids``, ``attention_mask`` and ``labels``.
    """
    model.train()  # training mode must be set before the first forward pass
    running_loss = 0

    for batch in tqdm(loader):
        # Clear gradients left over from the previous update.
        optimizer.zero_grad()

        # Model and data have to live on the same device.
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        # Forward pass; passing labels makes the model return the loss.
        loss = model(**inputs).loss

        # Backward pass, then weight update and learning-rate schedule step.
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


# Evaluation Function
def evaluate(model, loader):
    """Compute the macro F1 of ``model`` over every batch in ``loader``.

    Uses the notebook-level ``device``. Each batch must be a dict providing
    ``input_ids``, ``attention_mask`` and ``labels``.

    Macro F1 is the plain mean of the per-class F1 scores, so class frequency
    does not weight the result.
    """
    model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():  # no gradients needed while scoring
        for batch in tqdm(loader):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            result = model(input_ids=ids, attention_mask=mask)
            # The class with the largest logit is the prediction for each row.
            best_class = torch.argmax(result.logits, dim=1)

            # Move to CPU / NumPy because scikit-learn cannot read device tensors.
            y_pred.extend(best_class.cpu().numpy())
            y_true.extend(targets.cpu().numpy())

    score = f1_score(y_true, y_pred, average='macro')
    return score

In [14]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
import joblib
import pickle
import torch.nn.functional as F
from tqdm import tqdm

# -------------------------------
# CONFIG
# -------------------------------
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 256
BATCH_SIZE = 32
# Checkpoint written by the RoBERTa training cell (4 epochs, learning rate 2e-05).
MODEL_SAVE_PATH = f'cache/best_{MODEL_NAME}_model_{MAX_LENGTH}-{BATCH_SIZE}-4-2e-05.pt'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------------
# Load Dev Data
# -------------------------------
dev_df = pd.read_csv('../data/dev.csv')

# Label <-> id mappings saved by the TF-IDF stacking cell.
with open("saved_models/label_maps.pkl", "rb") as f:
    label_maps = pickle.load(f)
label2id = label_maps["label2id"]
id2label = label_maps["id2label"]

y_dev = dev_df["label"].map(label2id).values

# -------------------------------
# Load TF-IDF and ElasticNet
# -------------------------------
with open("saved_models/tfidf_vectorizer.pkl", "rb") as f:
    tfidf = pickle.load(f)
enet_model = joblib.load("saved_models/logreg_model.pkl")

# Elastic-net class probabilities for every dev row.
X_dev_tfidf = tfidf.transform(dev_df["text"])
enet_dev_proba = enet_model.predict_proba(X_dev_tfidf)

# -------------------------------
# Load RoBERTa and Tokenizer
# -------------------------------
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

class RedditDataset(Dataset):
    """Inference-only dataset: tokenizes one text per item and carries no labels.

    Args:
        texts: Sequence of raw strings.
        tokenizer: Callable (e.g. a RobertaTokenizer) returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Every item is truncated / padded to this many tokens.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids`` and ``attention_mask`` for item ``idx`` as 1-D tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the leading batch axis. A bare squeeze()
            # would also drop the token axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

from sklearn.model_selection import cross_val_predict

dev_dataset = RedditDataset(dev_df["text"].tolist(), tokenizer, MAX_LENGTH)
# Not shuffled, so prediction rows stay aligned with dev_df / y_dev.
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)

roberta_model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label2id))
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
roberta_model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
roberta_model.to(device)
roberta_model.eval()

# Get RoBERTa probabilities on dev set
roberta_dev_proba = []

with torch.no_grad():
    for batch in tqdm(dev_loader, desc="RoBERTa Dev Inference"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = roberta_model(input_ids=input_ids, attention_mask=attention_mask)
        # Softmax turns the raw logits into per-class probabilities.
        probs = F.softmax(outputs.logits, dim=1)
        roberta_dev_proba.append(probs.cpu().numpy())

roberta_dev_proba = np.vstack(roberta_dev_proba)

# -------------------------------
# Stack Dev Features
# -------------------------------
# Elastic-net probabilities next to RoBERTa probabilities, one row per dev example.
X_meta_dev = np.hstack([enet_dev_proba, roberta_dev_proba])

# Train meta-model
meta_model = LogisticRegressionCV(cv=5, max_iter=1000)
meta_model.fit(X_meta_dev, y_dev)

# Predict + Evaluate (in-sample: scored on the rows the meta-model was fit on)
meta_preds = meta_model.predict(X_meta_dev)
meta_f1 = f1_score(y_dev, meta_preds, average='macro')
print(f"✅ Stacked Dev Macro F1: {meta_f1:.4f}")

# Out-of-fold estimate: every dev row is predicted by a meta-model that never saw it,
# which is the number to compare against the single-model dev scores.
meta_oof_preds = cross_val_predict(
    LogisticRegressionCV(cv=5, max_iter=1000), X_meta_dev, y_dev, cv=5
)
meta_oof_f1 = f1_score(y_dev, meta_oof_preds, average='macro')
print(f"Stacked Dev Macro F1 (out-of-fold, 5-fold): {meta_oof_f1:.4f}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
RoBERTa Dev Inference: 100%|██████████| 125/125 [00:50<00:00,  2.46it/s]


✅ Stacked Dev Macro F1: 0.4579


In [16]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # <-- Correct import here!
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import os

# Load Data
# Paths are relative to the current working directory (the notebook `%cd`s into `models/`).
train_df = pd.read_csv('../data/train.csv')  # Replace with your path
dev_df = pd.read_csv('../data/dev.csv')

# PyTorch-ready dataset
class RedditDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for sequence classification.

    Args:
        texts: Sequence of raw comment strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable (e.g. a RobertaTokenizer) returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Every item is truncated / padded to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # DataLoader uses this to know how many items exist.
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Fetches a single sample from the dataset.

        Args:
            idx (int): Index of the item.

        Returns:
            dict: A dictionary containing:
                - 'input_ids': 1-D tensor of token ids (the text converted to numbers,
                  because models cannot read plain words).
                - 'attention_mask': 1-D tensor marking real tokens (1) and padding (0).
                - 'labels': Scalar long tensor holding the class id.

        Example (starred values are padding):
            Raw text:        "eating soap to own the republicans"
            Token ids:       [0, 1553, 4153, 7, 1225, 5, 13815, 2, *1, *1]
            Attention mask:  [1, 1, 1, 1, 1, 1, 1, 1, *0, *0]
            Label:           0

        Every item is padded or truncated to ``max_length`` tokens so that items can
        be batched. Padded positions hold the tokenizer's padding id (1 in the example
        above, not 0) and get an attention-mask value of 0.

        Squeeze example: the tokenizer output for one text has a leading batch axis,
        ``[[0, 1553, 4153, 7, 1225, 5, 13815, 2, 1, 1]]`` (batch size 1, 10 tokens);
        ``squeeze(0)`` turns it into ``[0, 1553, 4153, 7, 1225, 5, 13815, 2, 1, 1]``
        (just 10 tokens).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # truncation cuts long texts, padding fills short ones, and
        # return_tensors='pt' asks for PyTorch tensors instead of lists.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the leading batch axis. A bare squeeze()
            # would also drop the token axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # The model expects labels as tensors during training.
            'labels': torch.tensor(label, dtype=torch.long)
        }





### FUNCTIONS: train, train_MPT, evaluate
from torch.amp import GradScaler, autocast
# Mixed precision: use smaller (half-size) numbers for most calculations, so training is much faster
# and uses less memory — without losing much accuracy.
# `scaler` is the single gradient scaler shared by train_MPT below: the loss goes through
# scaler.scale() before backward(), and scaler.step() / scaler.update() drive the optimizer step.
scaler = GradScaler()

# Mixed Precision Training
def train_MPT(model, loader):
    """Train `model` for one pass over `loader` using automatic mixed precision.

    Uses the module-level `optimizer`, `scheduler`, `scaler` and `device`.

    Args:
        model: Called as model(input_ids=..., attention_mask=..., labels=...);
            its output must expose a `.loss`.
        loader: Sized iterable of batches. Each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' entries supporting `.to(device)`.

    Returns:
        Average loss per batch (sum of the batch losses divided by len(loader)).
    """
    model.train()
    total_loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass runs under autocast so eligible ops use reduced precision.
        with autocast(device_type=device.type):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()

        # scaler.step() silently skips optimizer.step() when the scaled gradients
        # contain inf/NaN, and scaler.update() then lowers the scale. Remember the
        # scale beforehand so a skipped step can be detected.
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()

        # Only advance the learning-rate schedule when the optimizer really stepped;
        # otherwise the LR would decay for updates that never happened.
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        total_loss += loss.item()
    return total_loss / len(loader)


# Training Loop
def train(model, loader):
    """Run one full-precision training pass over `loader`.

    Uses the module-level `optimizer`, `scheduler` and `device`.

    Args:
        model: Called with input_ids / attention_mask / labels keyword arguments;
            its output must expose a `.loss`.
        loader: Sized iterable of batches (mappings holding 'input_ids',
            'attention_mask' and 'labels').

    Returns:
        The mean of the per-batch losses.
    """
    model.train()  # training mode must be enabled before any batch is processed
    batch_losses = []

    for batch in tqdm(loader):
        # Gradients accumulate by default, so clear the previous batch's first.
        optimizer.zero_grad()

        # The model and its inputs have to live on the same device.
        inputs = {
            field: batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        }

        # Forward pass; the loss comes back on the model output because labels are supplied.
        loss = model(**inputs).loss

        # Backward pass, then one optimizer update and one scheduler tick per batch.
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(loader)  # average loss per batch


# Evaluation Function
def evaluate(model, loader):
    """Score `model` on `loader` and return the macro-averaged F1.

    Uses the module-level `device`.

    Args:
        model: Called with input_ids / attention_mask keyword arguments; its
            output must expose `.logits` (raw per-class scores).
        loader: Iterable of batches (mappings holding 'input_ids',
            'attention_mask' and 'labels').

    Returns:
        Macro F1: the per-class F1 scores averaged with equal weight, regardless
        of how many examples each class has.
    """
    model.eval()  # evaluation mode
    predicted, expected = [], []

    # Gradients are not needed for scoring; disabling them saves memory and time.
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # Predicted class = index of the highest score for each example.
            guesses = torch.argmax(logits, dim=1)

            # Bring results back to the CPU as NumPy values, since scikit-learn
            # cannot consume GPU tensors directly.
            predicted.extend(guesses.cpu().numpy())
            expected.extend(gold.cpu().numpy())

    return f1_score(expected, predicted, average='macro')






# -------------------------
# Data preprocessing
# -------------------------
# Encode labels to integers.
# Ids follow the order in which labels first appear in train_df, so a checkpoint cached
# under MODEL_SAVE_PATH is only meaningful while that order stays the same.
labels = train_df['label'].unique()
label2id = {label: idx for idx, label in enumerate(labels)}  # {label: id}
id2label = {idx: label for label, idx in label2id.items()}  # {id: label}

train_df['label_id'] = train_df['label'].map(label2id)
dev_df['label_id'] = dev_df['label'].map(label2id)

# .map() yields NaN for any label missing from label2id. Because label2id is built from
# train_df only, a dev-only label would otherwise slip through as a NaN id and surface
# much later as a confusing tensor/dtype problem, so fail here with a clear message.
unmapped_dev = dev_df['label_id'].isna()
if unmapped_dev.any():
    unknown_labels = sorted(set(dev_df.loc[unmapped_dev, 'label'].astype(str)))
    raise ValueError(f"dev_df contains labels that never occur in train_df: {unknown_labels}")

# -------------------
# Prepare DataLoaders
# -------------------
# CONFIG
# The hyperparameters below are also baked into the checkpoint file name (MODEL_SAVE_PATH),
# so each configuration gets its own cached model.

# Version 1 (earlier experiment, kept as a record of its results)
# MODEL_NAME = 'roberta-base'
# MAX_LENGTH = 128
# BATCH_SIZE = 64
# EPOCHS = 2
# LEARNING_RATE = 2e-5
# Results per epoch:
# 1)Train Loss: 1.4018, Validation Macro F1: 0.4303 2)Train Loss: 1.2615, Validation Macro F1: 0.4485

# Version 2 (active configuration)
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 256
BATCH_SIZE = 32
EPOCHS = 4
LEARNING_RATE = 2e-5
# Results per epoch:
# 1)Train Loss: 1.3989, Validation Macro F1: 0.4421 2)Train Loss: 1.2495, Validation Macro F1: 0.4591
# 3)Train Loss: 1.1111, Validation Macro F1: 0.4551 4)Train Loss: 0.9832, Validation Macro F1: 0.4615

# Tokenizer matching the pretrained checkpoint named by MODEL_NAME
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Prepare datasets for PyTorch: each one pairs the raw texts with their integer label ids
# and tokenizes an example (truncated / padded to MAX_LENGTH) when it is fetched.
train_dataset = RedditDataset(train_df['text'].tolist(), train_df['label_id'].tolist(), tokenizer, MAX_LENGTH)
dev_dataset = RedditDataset(dev_df['text'].tolist(), dev_df['label_id'].tolist(), tokenizer, MAX_LENGTH)

# Create small batches (batches them without changing the data itself).
# Only the training loader asks for shuffling; the dev loader passes no shuffle argument.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)

# Model
# Use the GPU ('cuda') when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One output class per distinct training label.
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
# Move all model weights to the selected device; batches are moved to the same device
# inside train / train_MPT / evaluate.
model.to(device)

# Optimizer and Scheduler
# model.parameters() are all the learnable weights in the model.
# The optimizer updates those weights to minimize the loss.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# The scheduler is stepped once per batch in the training functions, so the total number
# of scheduler steps is (batches per epoch) * (number of epochs).
total_steps = len(train_loader) * EPOCHS
# Controls how the learning rate changes during training: with num_warmup_steps=0 there is
# no warmup phase, and the rate decreases linearly over total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


# Checkpoint file for this exact configuration; a different hyperparameter set gets a different file.
MODEL_SAVE_PATH = f'cache/best_{MODEL_NAME}_model_{MAX_LENGTH}-{BATCH_SIZE}-{EPOCHS}-{LEARNING_RATE}.pt'

# Check if saved model exists
if os.path.exists(MODEL_SAVE_PATH):
    print(f"Loading saved model from {MODEL_SAVE_PATH}...")
    # map_location: a checkpoint written on a GPU machine must still load on a CPU-only
    # runtime (and vice versa). weights_only: the file holds nothing but tensors, so
    # refuse to unpickle anything else.
    model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device, weights_only=True))
else:
    print("No saved model found. Starting training...")

    # Create the checkpoint directory up front so a missing folder is discovered
    # before training, not after it.
    os.makedirs(os.path.dirname(MODEL_SAVE_PATH) or '.', exist_ok=True)

    # Run Training
    best_f1 = 0
    best_model_state = None

    for epoch in range(EPOCHS): # One epoch = One full pass through the entire training dataset.
        print(f"Epoch {epoch+1}/{EPOCHS}")
        train_loss = train_MPT(model, train_loader)
        val_f1 = evaluate(model, dev_loader)
        print(f"Train Loss: {train_loss:.4f}, Validation Macro F1: {val_f1:.4f}")

        # The first epoch always produces a snapshot, even if its F1 is 0.
        if best_model_state is None or val_f1 > best_f1:
            best_f1 = val_f1
            # model.state_dict() hands back references to the live parameters, which keep
            # changing in later epochs. Copy every tensor (to CPU, to spare GPU memory) so
            # the snapshot really is this epoch's weights.
            best_model_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }

    print(f'Best f1 for this model: {best_f1}')

    # best_model_state stays None only when no epoch ran (EPOCHS == 0); there is nothing to save then.
    if best_model_state is not None:
        # After training, save the best model
        torch.save(best_model_state, MODEL_SAVE_PATH)
        print(f"Training finished. Best model saved to {MODEL_SAVE_PATH}")

        # Replace the current (last-epoch) weights with the best snapshot
        model.load_state_dict(best_model_state)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading saved model from cache/best_roberta-base_model_256-32-4-2e-05.pt...
