<a href="https://colab.research.google.com/github/lmethratta/Poliscope/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split

# Load dataset
!wget -O data.csv "https://raw.githubusercontent.com/lmethratta/Poliscope/refs/heads/main/twinviews-13k.csv"

# Load CSV into a DataFrame
df = pd.read_csv("data.csv")


--2025-04-02 21:15:28--  https://raw.githubusercontent.com/lmethratta/Poliscope/refs/heads/main/twinviews-13k.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3063233 (2.9M) [text/plain]
Saving to: ‘data.csv’


2025-04-02 21:15:28 (213 MB/s) - ‘data.csv’ saved [3063233/3063233]



In [None]:
# Build a regression dataset with targets in {-1, +1}: every statement in the
# `l` column is labelled -1 and every statement in the `r` column is labelled +1.
SEED = 42  # one seed for both the shuffle and the split so reruns are identical

df_left = df[['l']].rename(columns={'l': 'text'}).assign(bias_score=-1)   # Left bias
df_right = df[['r']].rename(columns={'r': 'text'}).assign(bias_score=1)   # Right bias

# Combine and shuffle. The shuffle must be seeded: train_test_split below picks
# rows by position, so an unseeded shuffle changed which statements landed in
# the validation set on every run despite the split's own random_state.
df_combined = (
    pd.concat([df_left, df_right], ignore_index=True)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Hold out 10% of the rows for validation
train_df, val_df = train_test_split(df_combined, test_size=0.1, random_state=SEED)

# Checkpoint name; the tokenizer is loaded from it here and the model cell reuses it
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Tokenize function
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over ``examples["text"]``.

    The call requests ``padding="max_length"``, truncation, and
    ``max_length=128``; whatever the tokenizer returns is passed back unchanged.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encode_options)


In [None]:
import numpy as np
from sklearn.metrics import average_precision_score

def average_precision_at_k(relevance_scores, k):
    """Average precision over the first ``k`` entries of a ranked relevance list.

    Precision is evaluated at every rank inside the cutoff; the values are
    weighted by the relevance at that rank, summed, and divided by the total
    relevance found inside the cutoff.

    Args:
        relevance_scores: ranked sequence of relevance flags (1 = relevant,
            0 = non-relevant), best-ranked first.
        k: cutoff; only the first ``k`` entries are considered.

    Returns:
        The average precision, or ``0.0`` when nothing in the top ``k`` is relevant.
    """
    top_k = np.asarray(relevance_scores)[:k]
    hits = np.sum(top_k)
    if hits == 0:
        return 0.0
    # Running count of relevant items divided by the 1-based rank = precision@rank
    ranks = np.arange(1, len(top_k) + 1)
    precision_at_rank = np.cumsum(top_k) / ranks
    return np.sum(precision_at_rank * top_k) / hits

def mean_average_precision(relevance_scores_list, k=10):
    """Mean of ``average_precision_at_k`` across queries (MAP).

    Args:
        relevance_scores_list: one ranked relevance list per query.
        k: cutoff forwarded to ``average_precision_at_k`` (default 10).

    Returns:
        ``np.mean`` of the per-query average precisions.
    """
    per_query_ap = []
    for ranked_relevance in relevance_scores_list:
        per_query_ap.append(average_precision_at_k(ranked_relevance, k))
    return np.mean(per_query_ap)

def dcg_at_k(relevance_scores, k):
    """Discounted cumulative gain of the first ``k`` ranked relevance scores.

    The score at 1-based rank ``i`` is divided by ``log2(i + 1)`` and the
    discounted scores are summed.

    Args:
        relevance_scores: ranked sequence of relevance scores, best-ranked first.
        k: cutoff; only the first ``k`` entries contribute.

    Returns:
        The summed discounted gain.
    """
    gains = np.asarray(relevance_scores)[:k]
    # Offsets 2, 3, ... so that the first rank is discounted by log2(2) == 1
    discount_args = np.arange(len(gains)) + 2
    return np.sum(gains / np.log2(discount_args))

def ndcg_at_k(relevance_scores, k):
    """Normalized DCG: DCG of the given ranking divided by DCG of the ideal ranking.

    The ideal ranking is the same scores sorted in descending order. A small
    constant (1e-10) is added to the denominator so an all-zero list does not
    divide by zero.

    Args:
        relevance_scores: ranked sequence of relevance scores, best-ranked first.
        k: cutoff forwarded to ``dcg_at_k`` for both rankings.

    Returns:
        The ratio ``DCG@k / (ideal DCG@k + 1e-10)``.
    """
    best_ordering = sorted(relevance_scores, reverse=True)
    actual_dcg = dcg_at_k(relevance_scores, k)
    ideal_dcg = dcg_at_k(best_ordering, k)
    return actual_dcg / (ideal_dcg + 1e-10)

# Demo: ranked results for two queries, where 1 = relevant and 0 = non-relevant
relevance_scores_list = [
    [1, 0, 1, 1, 0],
    [1, 1, 0, 0, 1],
]

# Score both queries at a cutoff of 5 and average across queries
map_score = mean_average_precision(relevance_scores_list, k=5)
per_query_ndcg = [ndcg_at_k(scores, k=5) for scores in relevance_scores_list]
ndcg_score = np.mean(per_query_ndcg)

print(f"MAP: {map_score:.4f}")
print(f"NDCG: {ndcg_score:.4f}")


MAP: 0.8361
NDCG: 0.9265


In [None]:
# Create torch datasets
class PoliticalBiasDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized statements with float bias targets.

    The whole frame is tokenized once at construction time, so indexing only
    slices the pre-built tensors.

    Args:
        dataframe: frame with a ``"text"`` column (strings to tokenize) and a
            ``"bias_score"`` column (numeric regression target).
        tokenizer: callable invoked as ``tokenizer(list_of_str, padding=True,
            truncation=True, return_tensors="pt")``; it must return a mapping
            of name -> tensor whose first dimension indexes the rows.
        max_length: optional cap on tokens per sequence, forwarded to the
            tokenizer. ``None`` (default) forwards nothing, which keeps the
            previous behaviour of padding every row to the longest sequence in
            ``dataframe``. A cap bounds the width of the stored tensors.
    """

    def __init__(self, dataframe, tokenizer, max_length=None):
        tokenizer_kwargs = {"padding": True, "truncation": True, "return_tensors": "pt"}
        if max_length is not None:
            # Only forwarded when set, so tokenizers without this keyword still work
            tokenizer_kwargs["max_length"] = max_length
        self.encodings = tokenizer(dataframe["text"].tolist(), **tokenizer_kwargs)
        # float32 targets: the model is trained as a regressor on the bias score
        self.labels = torch.tensor(dataframe["bias_score"].to_numpy(), dtype=torch.float32)

    def __getitem__(self, idx):
        """Return the encoded fields for row ``idx`` plus its target under ``"labels"``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of rows, taken from the label tensor."""
        return len(self.labels)

# Wrap each split in a PoliticalBiasDataset (training split first, then validation)
train_dataset, val_dataset = (
    PoliticalBiasDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

In [None]:
# Load model for regression (1 output for the bias score)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword, so an unpinned install can reject it with a
# TypeError. Use whichever name the installed TrainingArguments declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",          # must match the evaluation schedule below
    load_best_model_at_end=True,
    **{eval_strategy_kwarg: "epoch"},
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
# NOTE(review): the recorded run of this cell stopped with an OSError about an
# undefined symbol in libtorchtext.so. That looks like a torch/torchtext binary
# mismatch in the runtime rather than a problem in this code; confirm.
trainer.train()

OSError: /usr/local/lib/python3.11/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [None]:
def compute_accuracy(model, data_loader):
    """Evaluate a bias regressor: mean per-batch MSE and sign-classification accuracy.

    Each prediction is thresholded at zero (``>= 0`` -> +1, otherwise -1) and
    compared with the label, which is expected to be exactly -1 or +1.

    Args:
        model: a ``torch.nn.Module``. It is switched to eval mode and left there.
            Its output may be a tensor or an object exposing ``.logits``.
        data_loader: iterable of batches. A batch is either an
            ``(inputs, labels)`` pair, in which case ``model(inputs)`` is called,
            or a mapping containing a ``"labels"`` entry, in which case every
            other entry is passed to the model as a keyword argument.

    Returns:
        ``(mean_loss, accuracy)``: the MSE averaged over batches, and the
        percentage (0-100) of samples whose thresholded prediction equals the label.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    # Evaluate on the device that already holds the weights; there is no
    # module-level `device` to rely on.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    criterion = torch.nn.MSELoss()

    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in data_loader:
            if hasattr(batch, "keys"):
                # Dict-style batch (e.g. collated PoliticalBiasDataset items)
                labels = batch["labels"]
                features = {
                    name: value.to(device)
                    for name, value in batch.items()
                    if name != "labels"
                }
                outputs = model(**features)
            else:
                inputs, labels = batch
                outputs = model(inputs.to(device))

            # Unwrap model-output objects that carry the predictions in `.logits`
            outputs = getattr(outputs, "logits", outputs)
            # reshape(-1) rather than squeeze(): a batch of one must stay 1-D
            outputs = outputs.reshape(-1)
            labels = labels.to(device).reshape(-1).to(outputs.dtype)

            # Convert regression output to -1 or 1
            predictions = torch.where(
                outputs >= 0, torch.ones_like(outputs), -torch.ones_like(outputs)
            )

            correct += (predictions == labels).sum().item()
            total += labels.numel()
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

    if total == 0:
        raise ValueError("data_loader produced no samples to evaluate")

    accuracy = correct / total * 100
    return total_loss / num_batches, accuracy


# Build the validation loader in this cell: no earlier cell defines `val_loader`,
# so on a fresh kernel the call below failed with a NameError. The batch size
# matches per_device_eval_batch_size in the training arguments.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16)

# Compute validation loss & accuracy
val_loss, val_accuracy = compute_accuracy(model, val_loader)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.2f}%")

Validation Loss: 0.5307
Validation Accuracy: 50.56%
