In [1]:
from google.colab import files
uploaded = files.upload()

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import pandas as pd
from Metrics import subgroup_auc, bpsn_auc, bnsp_auc, positive_aeg, negative_aeg

# Evaluation data: fetched a single time and shared by every model below.
dataset = load_dataset("moatazhamza194/gb_test_gendered", split="train")

# Binarise the "label" column at 0.5 (1 when label >= 0.5, otherwise 0).
true_labels = np.greater_equal(np.array(dataset["label"]), 0.5).astype(int)

# Per-example value of the "gender" column, and the subgroup values that the
# fairness metrics are reported for.
genders = np.array(dataset["gender"])
subgroups = ["male", "female"]

# Hub identifiers of the checkpoints to compare.
model_names = [
    "moatazhamza194/tc-bert",
    "moatazhamza194/tc-bert_mask",
    "moatazhamza194/tc-bert_swap",
]

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Result containers, both keyed by model name:
#   all_results -> per-subgroup fairness table
#   all_metrics -> overall metrics
all_results, all_metrics = {}, {}

for model_name in model_names:
    print(f"\n Evaluating model: {model_name}")

    # Fetch the checkpoint with its matching tokenizer, move the network to
    # the selected device and switch it to evaluation mode.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    model.eval()

    def compute_model_scores(batch):
        """Score one batch of comments with the currently loaded model.

        Tokenizes ``batch["comment"]`` with padding and truncation, runs the
        model without gradient tracking, and returns ``{"score": ...}`` with
        the logits (trailing axis squeezed) as a NumPy array.
        """
        encoded = tokenizer(
            batch["comment"],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        encoded = encoded.to(device)
        with torch.no_grad():
            # NOTE(review): squeeze(-1) presumes a single-output (regression)
            # head, as the original comment states — confirm per checkpoint.
            scores = model(**encoded).logits.squeeze(-1)
        return {"score": scores.cpu().numpy()}

    # Run the scorer over the whole dataset in batches of 32.
    eval_dataset = dataset.map(compute_model_scores, batched=True, batch_size=32)
    pred_scores = np.array(eval_dataset["score"], dtype=float)
    # Same 0.5 cut-off that is used to binarise the reference labels.
    pred_labels = (pred_scores >= 0.5).astype(int)

    # ---- Fairness metrics: one row per subgroup, one column per metric ----
    metric_fns = (
        ("subgroup_auc", subgroup_auc),
        ("bpsn_auc", bpsn_auc),
        ("bnsp_auc", bnsp_auc),
        ("positive_aeg", positive_aeg),
        ("negative_aeg", negative_aeg),
    )
    metrics_results = {}
    for subgroup in subgroups:
        mask = genders == subgroup
        metrics_results[subgroup] = {
            metric_name: metric_fn(true_labels, pred_scores, mask)
            for metric_name, metric_fn in metric_fns
        }

    # Transpose so that subgroups become the index and metrics the columns.
    subgroup_df = pd.DataFrame(metrics_results).T
    all_results[model_name] = subgroup_df

    # ---- Accuracy: share of examples whose thresholded score matches ----
    accuracy = np.mean(pred_labels == true_labels)
    all_metrics[model_name] = {"accuracy": accuracy}

# Display comparison: one accuracy table across models, then one fairness
# table per model.
print("\n=== Accuracy Comparison ===")
# all_metrics maps model name -> {"accuracy": ...}; transpose so each model
# becomes a row.
metrics_df = pd.DataFrame(all_metrics).T
display(metrics_df)

print("\n=== Subgroup Fairness Results ===")
# Iterate with `model_name` / `subgroup_df` rather than `model` / `df`:
# binding the loop variable to `model` would overwrite the loaded network
# object with a plain string for any code that runs after this cell.
for model_name, subgroup_df in all_results.items():
    print(f"\nModel: {model_name}")
    display(subgroup_df)


Saving Metrics.py to Metrics.py


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/351 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/96844 [00:00<?, ? examples/s]


 Evaluating model: moatazhamza194/tc-bert


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/96844 [00:00<?, ? examples/s]


 Evaluating model: moatazhamza194/tc-bert_mask


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/96844 [00:00<?, ? examples/s]


 Evaluating model: moatazhamza194/tc-bert_swap


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/96844 [00:00<?, ? examples/s]


=== Accuracy Comparison ===


Unnamed: 0,accuracy
moatazhamza194/tc-bert,0.957065
moatazhamza194/tc-bert_mask,0.956972
moatazhamza194/tc-bert_swap,0.956084



=== Subgroup Fairness Results ===

Model: moatazhamza194/tc-bert


Unnamed: 0,subgroup_auc,bpsn_auc,bnsp_auc,positive_aeg,negative_aeg
male,0.93943,0.939616,0.97497,0.053232,-0.197696
female,0.944094,0.956,0.966763,0.080603,-0.193616



Model: moatazhamza194/tc-bert_mask


Unnamed: 0,subgroup_auc,bpsn_auc,bnsp_auc,positive_aeg,negative_aeg
male,0.93567,0.938573,0.973361,0.063373,-0.199831
female,0.945649,0.952827,0.96864,0.078918,-0.203798



Model: moatazhamza194/tc-bert_swap


Unnamed: 0,subgroup_auc,bpsn_auc,bnsp_auc,positive_aeg,negative_aeg
male,0.934219,0.934999,0.973555,0.052218,-0.195025
female,0.94319,0.955652,0.965035,0.082606,-0.177704
