In [2]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [6]:
!git checkout media-bias

Branch 'media-bias' set up to track remote branch 'media-bias' from 'origin'.
Switched to a new branch 'media-bias'


In [9]:
%cd ./tasks/classification/

/content/tasks/classification


# Data Loading & Processing

In [1]:
import sys
import os
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

import pandas as pd
from useful_tools import UsefulTools

training_file_name = 'train_data_35_5000'

# LOAD DATA
training_data = pd.DataFrame(UsefulTools.JsonCache.load(f'../../data/{training_file_name}.json', expected_type=list))
test_data = pd.DataFrame(UsefulTools.JsonCache.load('../../data/test_data_20_5000.json', expected_type=list))

Loaded cache from '../../data/train_data_35_5000.json' (127631 items).
Loaded cache from '../../data/test_data_20_5000.json' (82494 items).


In [2]:
# PROCESS DATA
# Count sentences per source
source_counts = training_data['source'].value_counts()

# Get the top 4 sources with highest sentence count
top_sources = source_counts.head(10).index

# Filter both datasets to include only these top 4 sources
training_data = training_data[training_data['source'].isin(top_sources)].reset_index(drop=True)
test_data = test_data[test_data['source'].isin(top_sources)].reset_index(drop=True)

print(f"\nSelected top sources: {list(top_sources)}")
print(f"Training data (n sentences):\n{training_data[['id','source']].groupby('source').count().reset_index().sort_values(by='id',ascending=False).values}")
print(f"\nTest data (n sentences):\n{test_data[['id','source']].groupby('source').count().reset_index().sort_values(by='id',ascending=False).values}")
print(f"\nTraining sources: {training_data.source.unique()}")
print(f"Test sources: {test_data.source.unique()}")
print(f"Training samples: {len(training_data)}")
print(f"Test samples: {len(test_data)}")


Selected top sources: ['Forbes', 'SiliconANGLE News', 'Plos.org', 'Business Insider', 'The Verge', 'Dwarkesh.com', 'GlobeNewswire', 'Gizmodo.com', 'Securityaffairs.com', 'Yahoo Entertainment']
Training data (n sentences):
[['Forbes' 11041]
 ['SiliconANGLE News' 5744]
 ['Plos.org' 3967]
 ['Business Insider' 3820]
 ['The Verge' 3685]
 ['Dwarkesh.com' 3562]
 ['GlobeNewswire' 3378]
 ['Gizmodo.com' 3274]
 ['Securityaffairs.com' 3042]
 ['Yahoo Entertainment' 2837]]

Test data (n sentences):
[['SiliconANGLE News' 11097]
 ['Plos.org' 7974]
 ['Forbes' 5307]
 ['GlobeNewswire' 4991]
 ['Dwarkesh.com' 1781]
 ['Business Insider' 1592]
 ['Securityaffairs.com' 1341]
 ['The Verge' 1313]
 ['Yahoo Entertainment' 1171]
 ['Gizmodo.com' 824]]

Training sources: ['Gizmodo.com' 'Yahoo Entertainment' 'The Verge' 'Business Insider'
 'Forbes' 'Securityaffairs.com' 'SiliconANGLE News' 'GlobeNewswire'
 'Dwarkesh.com' 'Plos.org']
Test sources: ['The Verge' 'Gizmodo.com' 'Forbes' 'Yahoo Entertainment' 'GlobeNewswir

In [32]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from useful_tools import UsefulTools

# 1. Preprocessing
full_train_texts = training_data['sentence'].tolist() + test_data['sentence'].tolist()
full_train_labels = training_data['source'].tolist() + test_data['source'].tolist()

labels = list(sorted(set(full_train_labels)))
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

y_train = [label2id[label] for label in training_data['source']]
y_dev = [label2id[label] for label in test_data['source']]

# 2. TF-IDF Vectorization
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 6),
    max_features=50000
)

tfidf.fit(full_train_texts)
X_train = tfidf.transform(training_data['sentence'])
X_dev = tfidf.transform(test_data['sentence'])

# 3. Hyperparameter Tuning with CVGridSearch
param_grid = {
    'C': [0.1, 0.5, 1.0, 1.5, 2.0, 5],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7]
}
fixed_params = {
    'penalty': 'elasticnet',
    'solver': 'saga',
    'max_iter': 5000,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced'
}

model = LogisticRegression(**fixed_params)
halving_search = UsefulTools.HalvingGridSearch(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    factor=2,
    cache_file='./cache/halving_logreg.pkl',
    verbose=1
)

print("Running cross-validation grid search...")
best_model, best_params, best_score = halving_search.search_and_fit(X_train, y_train)


# Final Model Training
enet = LogisticRegression(**fixed_params, **best_params)
print("Fitting best model...")
enet.fit(X_train, y_train)

# Evaluation
print("Evaluating model on dev set...")
dev_preds = enet.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average='macro')
print(f"Validation Macro F1 Score: {dev_macro_f1:.4f}")


Running cross-validation grid search...
Loaded HalvingGridSearchCV from cache: ./cache/halving_logreg.pkl
Best Params: {'C': 2.0, 'l1_ratio': 0.3}
Best Score: 0.5378
Fitting best model...
Evaluating model on dev set...
Validation Macro F1 Score: 0.4743


In [34]:
def predict_custom_sentence(sentence: str, top_k: int = 4):
    # Vectorize the input sentence
    vectorized = tfidf.transform([sentence])

    # Predict class ID and get probabilities
    probas = enet.predict_proba(vectorized)[0]
    pred_id = np.argmax(probas)
    pred_label = id2label[pred_id]

    # Sort and show top_k class probabilities
    top_indices = np.argsort(probas)[::-1][:top_k]
    # print(f"\nInput Sentence: {sentence}")
    print(f"Predicted Source: {pred_label}")
    print("\nTop Predictions:")
    for idx in top_indices:
        print(f"  {id2label[idx]:<15} : {probas[idx]:.4f}")

    return pred_label


# Example usage
sample = "Making sure that a user is who they say they are is at the core of San Francisco-based Persona, which offers ID authentication software to 3,000 companies including OpenAI, LinkedIn, Etsy, Reddit, DoorDash and Robinhood."
predict_custom_sentence(sample)

Predicted Source: The Verge

Top Predictions:
  The Verge       : 0.4787
  Forbes          : 0.3099
  Dwarkesh.com    : 0.1131
  SiliconANGLE News : 0.0983


'The Verge'

# roBERTa

In [36]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # <-- Correct import here!
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import os

# PyTorch-ready dataset
class RedditDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts              # Store the list of comment texts
        self.labels = labels            # Store the corresponding list of integer labels
        self.tokenizer = tokenizer      # Store the tokenizer (e.g., RobertaTokenizer)
        self.max_length = max_length    # Store the maximum sequence length

    def __len__(self):
        return len(self.texts)  # Important: this allows DataLoader to know dataset size

    def __getitem__(self, idx):
        """
        Fetches a single sample from the dataset.

        Args:
            idx (int): Index of the item.

        Returns:
            dict: A dictionary containing:
                - 'input_ids': Raw text must be converted into numbers because models can't understand plain words.
                - 'attention_mask': A list that tells the model which tokens are real (1) and which ones are just padding (0).
                - 'labels': The label

        * Tensor is a multi-dimensional array (like a super-powered NumPy array) that can run on a GPU very efficiently

        ex:
        Raw Text:
        "eating soap to own the republicans"

        Tokenized to IDs:
        [0, 1553, 4153, 7, 1225, 5, 13815, 2, *1, *1]

        Attention Mask:
        [1, 1, 1, 1, 1, 1, 1, 1, *0, *0]

        Label:
        0

        Models usually expect inputs to have the same length (max = max_lenght).
        If your token list is too short, you add padding tokens (which are just zeros).
        """
        # Fetch the text and its corresponding label using the index
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text:
        # - truncation: cuts off texts longer than max_length
        # - padding: adds padding tokens to shorter texts
        # - return_tensors='pt': returns PyTorch tensors instead of lists
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length = self.max_length,
            return_tensors='pt' # stands for PyTorch
        )

        # Return a dictionary with model inputs and label
        return {
            'input_ids': encoding['input_ids'].squeeze(),         # Tensor of input token IDs .squeeze() again just removes the extra dimension.
            'attention_mask': encoding['attention_mask'].squeeze(), # Tensor indicating real tokens vs padding
            'labels': torch.tensor(label, dtype=torch.long)        # Make sure it becomes a PyTorch Tensor, because the model expects labels to be Tensors during training.
        }


from torch.amp import GradScaler, autocast
# Use smaller (half-size) numbers for most calculations, so training is much faster and uses less memory — without losing much accuracy.
scaler = GradScaler()

# Mixed Precision Training
def train_MPT(model, loader):
    model.train()
    total_loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with autocast(device_type=device.type):  # 🔥 autocast for mixed precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation Function
def evaluate(model, loader):
    model.eval() # Set the model to evaluation mode
    all_preds = []
    all_labels = []
    with torch.no_grad(): # Turns off gradient calculations (Saves memory and speeds up evaluation because we don't need gradients)
        for batch in tqdm(loader):
            # Move data to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Get predictions (pick the class with the highest score (probability) for each example in the batch)
            preds = torch.argmax(outputs.logits, dim=1) # outputs.logits = raw scores for each class
            # Save predictions and true labels
            # We Move predictions and labels from GPU to CPU and convert them into numpy arrays
            # because some Python operations (like NumPy or scikit-learn) cannot work directly with GPU Tensors.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Take the F1 score for each label (class) separately, and then average them equally (It does not care if one label has 1000 examples and another class has 100 examples)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    return macro_f1

In [38]:
# 1. Preprocessing
labels = training_data['source'].unique()
label2id = {label: idx for idx, label in enumerate(labels)} # {"label": 'id'}
id2label = {idx: label for label, idx in label2id.items()} # {'id':'label}


training_data['label_id'] = training_data['source'].map(label2id)
test_data['label_id'] = test_data['source'].map(label2id)


MODEL_NAME = 'roberta-base'
MAX_LENGTH = 128
BATCH_SIZE = 64
EPOCHS = 2
LEARNING_RATE = 2e-5

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

training_data['label_id'] = training_data['source'].map(label2id)
test_data['label_id'] = test_data['source'].map(label2id)

train_dataset = RedditDataset(training_data['sentence'].tolist(), training_data['label_id'].tolist(), tokenizer, MAX_LENGTH)
dev_dataset = RedditDataset(test_data['sentence'].tolist(), test_data['label_id'].tolist(), tokenizer, MAX_LENGTH)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)

# Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # cuda is faster but it requires GPU
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
model.to(device) # Moves all model weights and computations to the device you selected (cuda if GPU, otherwise cpu)

# Optimizer and Scheduler
# model.parameters() are all the learnable weights in the model.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE) # A type of optimizer that updates the model's weights to minimize the loss.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps) # It controls how the learning rate changes during training. In our case it linearly decreases


MODEL_SAVE_PATH = f'cache/{training_file_name}_best_{MODEL_NAME}_model_{MAX_LENGTH}-{BATCH_SIZE}-{EPOCHS}-{LEARNING_RATE}.pt'

# Check if saved model exists
if os.path.exists(MODEL_SAVE_PATH):
    print(f"Loading saved model from {MODEL_SAVE_PATH}...")
    model.load_state_dict(torch.load(MODEL_SAVE_PATH))
else:
    print("No saved model found. Starting training...")

    # Run Training
    best_f1 = 0
    best_model_state = None

    for epoch in range(EPOCHS): # One epoch = One full pass through the entire training dataset.
        print(f"Epoch {epoch+1}/{EPOCHS}")
        train_loss = train_MPT(model, train_loader)
        val_f1 = evaluate(model, dev_loader)
        print(f"Train Loss: {train_loss:.4f}, Validation Macro F1: {val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_state = model.state_dict()

    print(f'Best f1 for this model: {best_f1}')

    # After training, save the best model
    torch.save(best_model_state, MODEL_SAVE_PATH)
    print(f"Training finished. Best model saved to {MODEL_SAVE_PATH}")

    # Loads back the best version of the model that you saved during training (Forget your current weights — load these saved weights instead)
    model.load_state_dict(best_model_state)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No saved model found. Starting training...
Epoch 1/2


100%|██████████| 1045/1045 [06:21<00:00,  2.74it/s]
100%|██████████| 210/210 [01:27<00:00,  2.39it/s]


Train Loss: 3.1052, Validation Macro F1: 0.0314
Epoch 2/2


100%|██████████| 1045/1045 [06:17<00:00,  2.77it/s]
100%|██████████| 210/210 [01:27<00:00,  2.39it/s]


Train Loss: 2.2538, Validation Macro F1: 0.0368
Best f1 for this model: 0.03684385149134057
Training finished. Best model saved to cache/train_data_15_50_best_roberta-base_model_128-64-2-2e-05.pt


In [None]:
def predict_sentence_roberta(text):
    model.eval()
    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy().flatten()
        pred_idx = np.argmax(probs)
        print(f"\n Sentence: {text}")
        print(f"Predicted Source: {id2label[pred_idx]}")
        print("\n Top Probabilities:")
        for idx in probs.argsort()[::-1]:
            print(f"  {id2label[idx]:<15} : {probs[idx]:.4f}")

# Stacking

In [25]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score

class StackedNewsClassifier:
    def __init__(self, label2id, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.label2id = label2id
        self.id2label = {v: k for k, v in label2id.items()}
        self.num_classes = len(label2id)

        self.device = torch.device(device)

        # Components
        self.tfidf = None
        self.lr_model = None
        self.roberta_tokenizer = None
        self.roberta_model = None

    def fit_tfidf_logreg(self, texts, labels, **tfidf_kwargs):
        self.tfidf = TfidfVectorizer(
            analyzer='char_wb',
            ngram_range=(2, 6),
            max_features=50000,
            **tfidf_kwargs
        )
        X = self.tfidf.fit_transform(texts)
        y = [self.label2id[label] for label in labels]

        self.lr_model = LogisticRegression(
            penalty='elasticnet',
            solver='saga',
            max_iter=5000,
            random_state=42,
            n_jobs=-1,
            C=1.0,
            l1_ratio=0.3,
            class_weight='balanced'
        )
        self.lr_model.fit(X, y)

    def load_roberta(self, model_path, model_name='roberta-base'):
        self.roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.roberta_model = RobertaForSequenceClassification.from_pretrained(
            model_name, num_labels=self.num_classes
        )
        self.roberta_model.load_state_dict(torch.load(model_path))
        self.roberta_model.to(self.device)
        self.roberta_model.eval()

    def predict_proba(self, sentence, max_length=128, alpha=0.5):
        """
        Combines TF-IDF + LR and RoBERTa predictions (weighted average).
        """
        # TF-IDF + Logistic Regression
        X = self.tfidf.transform([sentence])
        prob_lr = self.lr_model.predict_proba(X)[0]

        # RoBERTa
        inputs = self.roberta_tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            logits = self.roberta_model(**inputs).logits
            prob_roberta = F.softmax(logits, dim=1).cpu().numpy()[0]

        # Combine: weighted average
        combined_prob = alpha * prob_roberta + (1 - alpha) * prob_lr
        return combined_prob

    def predict(self, sentence, max_length=128, alpha=0.5, top_k=3):
        prob = self.predict_proba(sentence, max_length=max_length, alpha=alpha)
        pred_idx = np.argmax(prob)
        print(f"\n📝 Sentence: {sentence}")
        print(f"📢 Predicted Source: {self.id2label[pred_idx]}")

        print(f"\n🔍 Top {top_k} Probabilities:")
        for idx in prob.argsort()[::-1][:top_k]:
            print(f"  {self.id2label[idx]:<15}: {prob[idx]:.4f}")

        return self.id2label[pred_idx]

    def evaluate(self, sentences, true_labels, alpha=0.5, max_length=128):
        """
        Evaluate the stacked model on a list of sentences and true labels.

        Args:
            sentences (list[str]): List of raw input texts.
            true_labels (list[str] or list[int]): Ground-truth labels (string or integer).
            alpha (float): Weight for RoBERTa in the ensemble.
            max_length (int): Token truncation/padding length for RoBERTa.

        Returns:
            float: Macro F1 score
        """
        self.roberta_model.eval()
        y_true = [self.label2id[label] if isinstance(label, str) else label for label in true_labels]
        y_pred = []

        for text in tqdm(sentences, desc="Evaluating"):
            probs = self.predict_proba(text, alpha=alpha, max_length=max_length)
            pred_idx = np.argmax(probs)
            y_pred.append(pred_idx)

        macro_f1 = f1_score(y_true, y_pred, average='macro')
        print(f"\n✅ Macro F1 Score: {macro_f1:.4f}")
        return macro_f1


In [29]:
# 1. Setup
stacker = StackedNewsClassifier(label2id)

# 2. Train TF-IDF + LR model
stacker.fit_tfidf_logreg(
    texts=training_data['sentence'].tolist(),
    labels=training_data['source'].tolist()
)

# 3. Load pre-trained RoBERTa model
stacker.load_roberta(model_path='cache/best_roberta-base_model_128-64-2-2e-05.pt')

stacker.evaluate(
    sentences=test_data['sentence'].tolist(),
    true_labels=test_data['label_id'].tolist(),  # or test_data['source'] if strings
    alpha=0.6  # weight more toward RoBERTa
)


# # 4. Predict
# stacker.predict("The prime minister held a press briefing on new trade policies.")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 438/438 [00:05<00:00, 83.70it/s]


✅ Macro F1 Score: 0.3470





0.34703113350405895

In [24]:
# 4. Predict
stacker.predict("The prime minister held a press briefing on new trade policies.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📝 Sentence: The prime minister held a press briefing on new trade policies.
📢 Predicted Source: The Verge

🔍 Top 3 Probabilities:
  The Verge      : 0.5655
  Business Insider: 0.1762
  Gizmodo.com    : 0.1420


'The Verge'