In [None]:
# STEP 1: Install required libraries
!pip install transformers datasets scikit-learn pandas

# STEP 2: Import libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, precision_recall_curve
from torch.nn import BCEWithLogitsLoss
from torch import nn, optim
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# STEP 3: Upload file
# Colab-only: opens a browser upload dialog; the uploaded file is written to the
# current working directory under its own name.
from google.colab import files
uploaded = files.upload()

# STEP 4: Load the dataset
df = pd.read_csv("CombinedAnnotationsWithConfidenceValues.csv")

# STEP 5: Define columns
# Column holding the raw sentence that is fed to the tokenizer.
text_column = 'Sentence_Text'

# One binary target column per bias type (order defines the model's output order).
bias_columns = [
    'Final_Gender_Bias', 'Final_Religion_Bias',
    'Final_Age_Bias', 'Final_Disability_Bias', 'Final_Sexuality_Bias'
]

# Annotator confidence per bias type, aligned index-by-index with bias_columns.
confidence_columns = [
    'Gender_Bias_Confidence_Value', 'Religion_Bias_Confidence_Value',
    'Age_Bias_Confidence_Value', 'Disability_Bias_Confidence_Value', 'Sexuality_Bias_Confidence_Value'
]

# Drop rows with a missing sentence as well as missing labels/confidences:
# a NaN sentence would otherwise reach the tokenizer as a float and fail there.
df = df.dropna(subset=[text_column] + bias_columns + confidence_columns)

# Parallel containers: texts[i], labels[i] and confidences[i] describe the same row.
texts = df[text_column].astype(str).tolist()
labels = df[bias_columns].values
confidences = df[confidence_columns].values

# STEP 5.1: Report class balance
for col in bias_columns:
    print(f"{col}: {df[col].sum()} positive samples out of {len(df)}")

# STEP 6: Split dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split BEFORE augmenting: if duplicated rows were added first, copies of the
# same sentence could land in both the training and the validation split and
# inflate every validation metric.
train_texts, val_texts, train_labels, val_labels, train_weights, val_weights = train_test_split(
    texts, labels, confidences, test_size=0.2, random_state=42
)

# STEP 6.1: Data augmentation for minority classes (training split only)
# Augmentation for Sexuality_Bias - duplicate existing training examples
sexuality_col = bias_columns.index('Final_Sexuality_Bias')
sexuality_rows = np.flatnonzero(train_labels[:, sexuality_col] == 1)
n_sexuality = len(sexuality_rows)

# The `0 <` bound protects the integer division below when the split holds no
# positive example at all.
if 0 < n_sexuality < 20:
    n_copies = min(5, 50 // n_sexuality)

    extra_texts = [train_texts[row] for row in sexuality_rows]
    extra_labels = train_labels[sexuality_rows]
    extra_weights = train_weights[sexuality_rows]

    # Append the copies to the training data; validation data stays untouched.
    train_texts = list(train_texts) + extra_texts * n_copies
    train_labels = np.vstack([train_labels] + [extra_labels] * n_copies)
    train_weights = np.vstack([train_weights] + [extra_weights] * n_copies)

    print(f"Added {n_sexuality * n_copies} augmented Sexuality_Bias examples")

# Compute class weights (negatives / positives per bias type) from the training
# labels only, so no validation statistics leak into the loss.
label_counts = np.sum(train_labels, axis=0)
# Clamp the denominator so a bias type without positives yields a finite weight.
pos_weight = (len(train_labels) - label_counts) / np.maximum(label_counts, 1)
pos_weight = torch.tensor(pos_weight, dtype=torch.float32).to(device)

# STEP 7: Tokenize
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class BiasDataset(Dataset):
    """Torch dataset pairing tokenized sentences with labels and confidence weights.

    All texts are tokenized once, up front, with the module-level ``tokenizer``
    (truncated to 256 tokens and padded to a common length).

    Args:
        texts: list of sentences to tokenize.
        labels: indexable collection; ``labels[i]`` is the target row for ``texts[i]``.
        weights: indexable collection; ``weights[i]`` is the confidence row for ``texts[i]``.
    """

    def __init__(self, texts, labels, weights):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=256,
        )
        self.labels = labels
        self.weights = weights

    def __len__(self):
        """Return the number of examples (one per label row)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors.

        The dict holds every tokenizer output field plus ``'labels'`` and
        ``'weights'``, both converted to float32 tensors.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        sample['weights'] = torch.tensor(self.weights[idx], dtype=torch.float32)
        return sample

# Wrap each split in a BiasDataset (tokenization happens inside the constructor).
train_dataset, val_dataset = (
    BiasDataset(train_texts, train_labels, train_weights),
    BiasDataset(val_texts, val_labels, val_weights),
)

# STEP 8: Create DataLoaders
# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=8,
)

# STEP 9: Define model and optimizer
from torch.optim import AdamW

# RoBERTa encoder with a 5-output classification head configured for
# multi-label classification, moved to the selected device.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=5,
    problem_type="multi_label_classification",
).to(device)

# Small learning rate plus weight decay for better generalization.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01,
)

# STEP 10: Custom Focal Loss to focus on hard examples
class FocalLoss(nn.Module):
    """Focal loss on multi-label logits, to focus training on hard examples.

    Each element's binary cross-entropy is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the probability the model assigns to the true class, so
    confidently-correct elements contribute little to the loss.

    Args:
        gamma: focusing exponent; ``0`` reduces the loss to (weighted) BCE.
        pos_weight: optional tensor broadcastable to the targets; elements whose
            target is 1 have their loss multiplied by it.

    Note:
        Targets are assumed to be hard 0/1 labels. With soft targets
        ``exp(-BCE)`` is not a class probability.
    """

    def __init__(self, gamma=2.0, pos_weight=None):
        super().__init__()
        self.gamma = gamma
        self.pos_weight = pos_weight

    def forward(self, inputs, targets):
        """Return the mean focal loss of ``inputs`` (logits) against ``targets``."""
        # p_t must come from the UNWEIGHTED cross-entropy: exp(-w * bce) would be
        # p_t ** w, which distorts the focusing term for every positive element.
        bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        pt = torch.exp(-bce)
        focal = (1 - pt) ** self.gamma * bce

        if self.pos_weight is not None:
            # Same weighting rule as BCE's pos_weight: pos_weight where the
            # target is 1, and 1 where the target is 0.
            class_weight = 1 + (self.pos_weight - 1) * targets
            focal = class_weight * focal

        return torch.mean(focal)

# Use focal loss instead of BCE
criterion = FocalLoss(gamma=2.0, pos_weight=pos_weight)

# STEP 11: Train
# NOTE(review): each batch also carries batch['weights'] (annotator confidences),
# but the loss below never reads them - confirm whether that is intended.
num_epochs = 5
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `targets` (not `labels`) so the module-level label matrix is
        # not overwritten by a single batch tensor.
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Mean per-batch loss over the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

# STEP 12: Find optimal thresholds for each bias type
def find_optimal_thresholds():
    """Pick, per bias type, the decision threshold with the best F1 on ``val_loader``.

    Runs the module-level ``model`` in eval mode over every validation batch,
    collects sigmoid probabilities and true labels column by column, and scans
    the precision-recall curve of each column for the threshold with the
    highest F1 score.

    Returns:
        np.ndarray with one threshold per entry of ``bias_columns``; 0.5 is used
        for a column whose precision-recall curve yields no thresholds.
    """
    n_types = len(bias_columns)
    probs_by_type = [[] for _ in range(n_types)]
    truth_by_type = [[] for _ in range(n_types)]

    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_truth = batch['labels'].cpu().numpy()
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            batch_probs = torch.sigmoid(logits).cpu().numpy()

            # Scatter the batch into per-bias-type running lists.
            for col in range(n_types):
                probs_by_type[col].extend(batch_probs[:, col])
                truth_by_type[col].extend(batch_truth[:, col])

    chosen = []
    for truth, scores in zip(truth_by_type, probs_by_type):
        precision, recall, thresholds = precision_recall_curve(truth, scores)

        # F1 at every point of the curve; the epsilon avoids 0/0.
        f1_scores = 2 * precision * recall / (precision + recall + 1e-8)

        if len(thresholds) == 0:
            chosen.append(0.5)
            continue

        # The final curve point has no matching threshold, so it is excluded.
        chosen.append(thresholds[np.argmax(f1_scores[:-1])])

    return np.array(chosen)


optimal_thresholds = find_optimal_thresholds()

# STEP 13: Evaluate with optimal thresholds
# NOTE: the thresholds were selected on this same validation split, so the
# metrics printed below are optimistic; a separate held-out split would be
# needed for an unbiased estimate.
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `targets` so the module-level `labels` matrix is not overwritten.
        targets = batch['labels'].cpu().numpy()
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs = torch.sigmoid(logits).cpu().numpy()

        # Apply each bias type's threshold to its own column in one broadcast
        # comparison (thresholds has one entry per column of probs).
        preds = (probs >= optimal_thresholds).astype(probs.dtype)

        all_preds.extend(preds)
        all_true.extend(targets)


# Stack the per-example rows into (n_examples, n_bias_types) matrices.
all_preds = np.array(all_preds)
all_true = np.array(all_true)

print("\nAccuracy for each bias type:")
for i, bias_type in enumerate(bias_columns):
    y_true = all_true[:, i]
    y_pred = all_preds[:, i]
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 reports 0 (without a warning) when a column has no
    # predicted or no true positives.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0
    )
    print(f"{bias_type:25} Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

# Calculate overall accuracy across all predictions (element-wise over every
# example and bias type).
overall_acc = np.mean(all_preds == all_true)
print(f"\nOverall accuracy: {overall_acc:.2f}")

# STEP 14: Save the model and optimal thresholds
torch.save({
    'model_state_dict': model.state_dict(),
    # Stored as plain Python floats rather than a NumPy array so the checkpoint
    # can be read back with torch.load(..., weights_only=True), which does not
    # unpickle NumPy arrays by default.
    'optimal_thresholds': np.asarray(optimal_thresholds).tolist(),
    # Column order matching the model's output order and the thresholds above.
    'bias_columns': bias_columns
}, 'bias_detection_model.pt')

print("Model and thresholds saved to 'bias_detection_model.pt'")



Saving CombinedAnnotationsWithConfidenceValues.csv to CombinedAnnotationsWithConfidenceValues.csv
Final_Gender_Bias: 165 positive samples out of 3299
Final_Religion_Bias: 200 positive samples out of 3299
Final_Age_Bias: 183 positive samples out of 3299
Final_Disability_Bias: 158 positive samples out of 3299
Final_Sexuality_Bias: 36 positive samples out of 3299


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 330/330 [01:56<00:00,  2.82it/s]


Epoch 1, Loss: 0.4496519203805788


100%|██████████| 330/330 [01:56<00:00,  2.82it/s]


Epoch 2, Loss: 0.18336403691633182


100%|██████████| 330/330 [01:56<00:00,  2.82it/s]


Epoch 3, Loss: 0.13320386845928928


100%|██████████| 330/330 [01:56<00:00,  2.82it/s]


Epoch 4, Loss: 0.10456938680638841


100%|██████████| 330/330 [01:56<00:00,  2.82it/s]


Epoch 5, Loss: 0.08406496264241552

Accuracy for each bias type:
Final_Gender_Bias         Accuracy: 0.99, Precision: 0.81, Recall: 0.97, F1: 0.89
Final_Religion_Bias       Accuracy: 0.98, Precision: 0.90, Recall: 0.82, F1: 0.86
Final_Age_Bias            Accuracy: 0.99, Precision: 0.93, Recall: 0.89, F1: 0.91
Final_Disability_Bias     Accuracy: 0.97, Precision: 0.69, Recall: 0.97, F1: 0.80
Final_Sexuality_Bias      Accuracy: 1.00, Precision: 0.86, Recall: 1.00, F1: 0.92

Overall accuracy: 0.99
Model and thresholds saved to 'bias_detection_model.pt'
