<a href="https://colab.research.google.com/github/lmethratta/Poliscope/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install datasets transformers



In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Load dataset
!wget -O data.csv "https://raw.githubusercontent.com/lmethratta/Poliscope/refs/heads/main/twinviews-13k.csv"


# Convert dataset to Pandas
import pandas as pd

# Load CSV into a DataFrame
df = pd.read_csv("data.csv")


--2025-04-02 21:15:28--  https://raw.githubusercontent.com/lmethratta/Poliscope/refs/heads/main/twinviews-13k.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3063233 (2.9M) [text/plain]
Saving to: ‘data.csv’


2025-04-02 21:15:28 (213 MB/s) - ‘data.csv’ saved [3063233/3063233]



In [12]:
# Convert dataset into a regression dataset
df_left = df[['l']].rename(columns={'l': 'text'})
df_left['bias_score'] = -1  # Left bias

df_right = df[['r']].rename(columns={'r': 'text'})
df_right['bias_score'] = 1  # Right bias

# Combine both datasets and shuffle
df_combined = pd.concat([df_left, df_right]).sample(frac=1).reset_index(drop=True)

# Tokenization
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import re

# Simple text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove punctuation
    return text

df_combined["text"] = df_combined["text"].apply(clean_text)

# Build vocabulary
tokenized_texts = [text.split() for text in df_combined["text"]]
word_freq = Counter(word for sentence in tokenized_texts for word in sentence)

# Create word-to-index mapping
vocab = {word: i+1 for i, (word, _) in enumerate(word_freq.most_common())}  # 1-based index
vocab["<PAD>"] = 0  # Padding token

# Convert text to sequences of numbers
def encode_text(text):
    return [vocab.get(word, 0) for word in text.split()]

df_combined["encoded"] = df_combined["text"].apply(encode_text)

# Pad sequences to max length
MAX_LEN = 50  # Set max sequence length
padded_sequences = pad_sequence([torch.tensor(seq[:MAX_LEN]) for seq in df_combined["encoded"]],
                                batch_first=True, padding_value=0)

# Convert labels to tensor
labels = torch.tensor(df_combined["bias_score"].values, dtype=torch.float32)

In [13]:
import numpy as np
from sklearn.metrics import average_precision_score

def average_precision_at_k(relevance_scores, k):
    """Computes Average Precision at K"""
    relevance_scores = np.array(relevance_scores)[:k]
    num_relevant = np.sum(relevance_scores)
    if num_relevant == 0:
        return 0.0
    cumulative_precision = [
        np.sum(relevance_scores[:i+1]) / (i+1) for i in range(len(relevance_scores))
    ]
    return np.sum(cumulative_precision * relevance_scores) / num_relevant

def mean_average_precision(relevance_scores_list, k=10):
    """Computes Mean Average Precision (MAP)"""
    return np.mean([average_precision_at_k(scores, k) for scores in relevance_scores_list])

def dcg_at_k(relevance_scores, k):
    """Computes Discounted Cumulative Gain at K"""
    relevance_scores = np.array(relevance_scores)[:k]
    return np.sum(relevance_scores / np.log2(np.arange(2, len(relevance_scores) + 2)))

def ndcg_at_k(relevance_scores, k):
    """Computes Normalized Discounted Cumulative Gain (NDCG)"""
    ideal_relevance = sorted(relevance_scores, reverse=True)  # Ideal DCG
    return dcg_at_k(relevance_scores, k) / (dcg_at_k(ideal_relevance, k) + 1e-10)

# Example usage:
relevance_scores_list = [
    [1, 0, 1, 1, 0],  # Example query results: 1 = relevant, 0 = non-relevant
    [1, 1, 0, 0, 1],
]

map_score = mean_average_precision(relevance_scores_list, k=5)
ndcg_score = np.mean([ndcg_at_k(scores, k=5) for scores in relevance_scores_list])

print(f"MAP: {map_score:.4f}")
print(f"NDCG: {ndcg_score:.4f}")


MAP: 0.8361
NDCG: 0.9265


In [14]:
class BiasDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

dataset = BiasDataset(padded_sequences, labels)

# Split into training & validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [23]:
!pip install torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.3.0->torchtext)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from 

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader, Dataset

# ======= 🔹 Load GloVe Pretrained Embeddings 🔹 =======
glove = GloVe(name="6B", dim=100)  # Use 100D word embeddings

# ======= 🔹 Define Model 🔹 =======
class LSTMBiasClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
        super(LSTMBiasClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=False)  # Use GloVe
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                            bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)  # Output a single regression value
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Take last output from LSTM
        return self.sigmoid(out).squeeze()  # Regression output

OSError: /usr/local/lib/python3.11/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [21]:
# Hyperparameters
hidden_dim = 256
num_layers = 2
dropout = 0.3
lr = 0.001
batch_size = 16
epochs = 10

# Model Initialization
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMBiasClassifier(len(glove.stoi), 100, hidden_dim, num_layers, dropout).to(device)

# Optimizer & Loss
criterion = nn.BCELoss()  # Binary Cross Entropy for classification
optimizer = optim.AdamW(model.parameters(), lr=lr)

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).float()

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += ((outputs >= 0.5).float() == labels).sum().item()
        total += labels.size(0)

    train_acc = correct / total * 100
    print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.4f} | Train Acc: {train_acc:.2f}%")

NameError: name 'LSTMBiasClassifier' is not defined

In [19]:
model.eval()
total_loss = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs).squeeze()
        loss = criterion(outputs, labels)
        total_loss += loss.item()

print(f"Validation Loss: {total_loss / len(val_loader)}")

Validation Loss: 0.5306617390868986


In [20]:
def compute_accuracy(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    criterion = nn.MSELoss()

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs).squeeze()

            # Convert regression output to -1 or 1
            predictions = torch.where(outputs >= 0, torch.tensor(1.0, device=device), torch.tensor(-1.0, device=device))

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            loss = criterion(outputs, labels)
            total_loss += loss.item()

    accuracy = correct / total * 100
    return total_loss / len(data_loader), accuracy


# Compute validation loss & accuracy
val_loss, val_accuracy = compute_accuracy(model, val_loader)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.2f}%")

Validation Loss: 0.5307
Validation Accuracy: 50.56%
