In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [5]:
# Load CSV
df = pd.read_csv('baseline_dataset.csv')

sentence_data = df[['sentence_text', 'sentence_has_bias', 'article_id']].copy()

print(f"Total sentences: {len(sentence_data)}")
print(f"\nBias distribution:")
print(sentence_data['sentence_has_bias'].value_counts())
print(f"\nTotal articles: {sentence_data['article_id'].nunique()}")



Total sentences: 8846

Bias distribution:
sentence_has_bias
0    7268
1    1578
Name: count, dtype: int64

Total articles: 300


In [6]:
sentence_data.head()

Unnamed: 0,sentence_text,sentence_has_bias,article_id
0,"WASHINGTON — Michael Steele, chairman of the R...",1,nyt_001
1,“This is not something the United States had a...,0,nyt_001
2,“It was the president who was trying to be cut...,1,nyt_001
3,"“Well, if he’s such a student of history, has ...",0,nyt_001
4,"Mr. Steele, seeking to clarify his remarks, is...",0,nyt_001


In [7]:
train_df, temp_df = train_test_split(
    sentence_data, 
    test_size=0.2,
    stratify=sentence_data['sentence_has_bias']
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['sentence_has_bias'],
)

print(f"Train: {len(train_df)} sentences ({len(train_df)/len(sentence_data)*100:.1f}%)")
print(f"Val:   {len(val_df)} sentences ({len(val_df)/len(sentence_data)*100:.1f}%)")
print(f"Test:  {len(test_df)} sentences ({len(test_df)/len(sentence_data)*100:.1f}%)")

Train: 7076 sentences (80.0%)
Val:   885 sentences (10.0%)
Test:  885 sentences (10.0%)


In [8]:
class SentenceDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = row['sentence_text']
        label = row['sentence_has_bias']
        

        # Tokenize
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
            'article_id': row['article_id']
        }

print("✓ Dataset class defined")

✓ Dataset class defined


In [None]:
# Hyperparameters
MODEL_NAME = 'bert-base-uncased'  
BATCH_SIZE = 16
LEARNING_RATE = 0.00001
EPOCHS = 3

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2  
).to(device)


train_dataset = SentenceDataset(train_df, tokenizer)
val_dataset = SentenceDataset(val_df, tokenizer)
test_dataset = SentenceDataset(test_df, tokenizer)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [11]:
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)



In [None]:
def evaluate(model, dataloader, device, verbose=True):
    model.eval()
    
    all_preds = []
    all_labels = []
    all_article_ids = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label']
            article_ids = batch['article_id']
            
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            # Get predictions
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
            all_article_ids.extend(article_ids.numpy())
    
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    
    if verbose:
        print(f"\n=== Sentence-Level Metrics ===")
        print(f"F1:        {f1:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"\nClassification Report:")
        print(classification_report(all_labels, all_preds, target_names=['Neutral', 'Biased']))
        
        # Compute per-article average bias rate
        results_df = pd.DataFrame({
            'article_id': all_article_ids,
            'prediction': all_preds,
            'label': all_labels
        })
        
        article_bias_rate = results_df.groupby('article_id')['prediction'].mean()
        print(f"\n=== Per-Article Average Bias Rate ===")
        print(f"Mean bias rate: {article_bias_rate.mean():.4f}")
        print(f"Std bias rate:  {article_bias_rate.std():.4f}")
        print(f"Min bias rate:  {article_bias_rate.min():.4f}")
        print(f"Max bias rate:  {article_bias_rate.max():.4f}")
    
    return {
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'predictions': all_preds,
        'labels': all_labels,
        'article_ids': all_article_ids
    }



In [14]:
from tqdm.notebook import tqdm  # For progress bars

best_val_f1 = 0

for epoch in range(EPOCHS):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"{'='*50}")
    
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}")
    
    # Validate
    print("\n--- Validation ---")
    val_metrics = evaluate(model, val_loader, device, verbose=True)
    
    # Save best model
    if val_metrics['f1'] > best_val_f1:
        best_val_f1 = val_metrics['f1']
        torch.save(model.state_dict(), 'best_sentence_model.pt')
        print("✓ Saved best model")

print(f"\n{'='*50}")
print("Training complete!")
print(f"Best validation F1: {best_val_f1:.4f}")


Epoch 1/3
Train Loss: 0.4273

--- Validation ---


AttributeError: 'list' object has no attribute 'numpy'

In [None]:
# Load best model
model.load_state_dict(torch.load('best_sentence_model.pt'))

print(f"{'='*50}")
print("FINAL TEST RESULTS")
print(f"{'='*50}")

test_metrics = evaluate(model, test_loader, device, verbose=True)