In [1]:
# Bias Analysis for BGH Gender Counterfactuals
# This notebook analyzes gender bias in legal decision predictions using counterfactual data

# Data handling and machine learning libraries
import datasets  # HuggingFace datasets library for loading the BGH dataset
import nltk      # Natural Language Toolkit for German stopwords
import pandas as pd  # Data manipulation and analysis

# Statistical analysis
from scipy.stats import ttest_1samp  # Statistical test for bias detection

# Machine learning components
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes classifier for text classification
from sklearn.feature_extraction.text import TfidfVectorizer  # Text vectorization using TF-IDF
from sklearn.pipeline import make_pipeline  # Pipeline creation for streamlined ML workflow
from sklearn.metrics import classification_report  # Model evaluation metrics

In [2]:
# Fetch the BGH gender-counterfactual corpus from the HuggingFace Hub.
# Later cells read its `facts` (original text) and `facts_augmented`
# (gender-swapped text) columns.
ds = datasets.load_dataset("nlietzow/BGH-CivAppeals-GenderCF")

# Materialise both splits as pandas DataFrames:
#   train -> used to fit the classifiers
#   test  -> held out for the performance and bias measurements
train, test = (ds[split_name].to_pandas() for split_name in ("train", "test"))

In [3]:
# Build the text classification pipeline
def build_pipeline():
    """
    Create the text-classification pipeline used for the models in this notebook.

    The pipeline chains two steps:
    1. TfidfVectorizer: turns text into TF-IDF features, drops the NLTK German
       stopwords and caps the vocabulary at 20,000 terms.
    2. MultinomialNB: Naive Bayes classifier with its default settings.

    If the NLTK "stopwords" corpus is not installed locally, it is downloaded
    once and the lookup is retried, so the notebook also runs on a fresh
    environment (Restart Kernel -> Run All).

    Returns:
        sklearn.pipeline.Pipeline: Unfitted pipeline (TF-IDF -> Naive Bayes).

    Raises:
        LookupError: If the stopword corpus is still unavailable after the
            download attempt (for example, no network access).
    """
    try:
        stopwords = nltk.corpus.stopwords.words("german")
    except LookupError:
        # NLTK raises LookupError when the corpus has never been downloaded;
        # fetch it once and retry instead of failing the whole notebook.
        nltk.download("stopwords", quiet=True)
        stopwords = nltk.corpus.stopwords.words("german")

    # TF-IDF vectorizer with German stopwords and a feature limit for efficiency
    tfidf = TfidfVectorizer(stop_words=stopwords, max_features=20_000)

    # Combine TF-IDF and Naive Bayes into a single pipeline
    return make_pipeline(tfidf, MultinomialNB())

In [4]:
# BASELINE MODEL: fitted on the original case facts only.
# No gender-swapped text is seen during training, so this model is the
# reference point for the bias measurements further down.
pipeline = build_pipeline()
pipeline.fit(train["facts"], train["decision"])

0,1,2
,steps,"[('tfidfvectorizer', ...), ('multinomialnb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['aber', 'alle', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [5]:
# Score the baseline model on the held-out test facts and print the
# per-class precision / recall / F1 table.
predictions = pipeline.predict(test["facts"])
print(
    "BASELINE MODEL PERFORMANCE:",
    classification_report(test["decision"], predictions),
    sep="\n",
)

BASELINE MODEL PERFORMANCE:
              precision    recall  f1-score   support

    reversed       0.77      0.66      0.71       982
      upheld       0.52      0.64      0.57       553

    accuracy                           0.65      1535
   macro avg       0.64      0.65      0.64      1535
weighted avg       0.68      0.65      0.66      1535



In [6]:
# BIAS DETECTION: does the baseline model react to the gender swap?
# Every test case is scored twice - once on the original facts and once on the
# gender-swapped facts - and the two probabilities of "reversed" are compared.

# Column of predict_proba that corresponds to the "reversed" class
reversed_index = list(pipeline.classes_).index("reversed")

# P("reversed") for the original and for the gender-swapped version of each case
test["p_reversed"] = pipeline.predict_proba(test["facts"])[:, reversed_index]
test["p_reversed_augmented"] = pipeline.predict_proba(test["facts_augmented"])[:, reversed_index]

# Orientation of each pair: +1 when the original appellant is "masculine",
# -1 for every other value (vectorised equivalent of a row-wise lambda).
# NOTE(review): values other than "masculine"/"feminine" would also get -1 -
# confirm the column only contains those two labels.
gender_sign = test["appellant_gender"].eq("masculine").map({True: 1, False: -1})

# Signed probability gap. After applying the sign, every entry reads as
# P(reversed | masculine version) - P(reversed | non-masculine version),
# so positive values mean the masculine version is more likely to be reversed.
diffs = (test["p_reversed"] - test["p_reversed_augmented"]) * gender_sign

# One-sample t-test of the signed gaps against 0 (null hypothesis: no bias)
t_stat, p_value = ttest_1samp(diffs, 0)
print("BASELINE MODEL BIAS TEST:")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.2e}")
print(f"Mean bias score: {diffs.mean():.4f}")
print("Interpretation: Positive values indicate bias favoring masculine appellants")

BASELINE MODEL BIAS TEST:
T-statistic: 18.535
P-value: 2.29e-69
Mean bias score: 0.0058
Interpretation: Positive values indicate bias favoring masculine appellants


In [7]:
# DEBIASED MODEL: counterfactual data augmentation.
# The classifier is refitted on the original facts AND their gender-swapped
# counterparts, each paired with the same decision, to make it less sensitive
# to gendered wording.

# Fresh, unfitted pipeline (rebinds `pipeline`, replacing the baseline model)
pipeline = build_pipeline()

# Stack the original facts on top of the gender-swapped facts; the decision
# column is repeated in the same order so every swapped text keeps its label.
augmented_facts = pd.concat([train[column] for column in ("facts", "facts_augmented")])
augmented_decisions = pd.concat([train["decision"]] * 2)

pipeline.fit(augmented_facts, augmented_decisions)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('multinomialnb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['aber', 'alle', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [8]:
# Score the debiased model on the same held-out test facts to see whether
# the augmentation changed predictive performance.
predictions = pipeline.predict(test["facts"])
print(
    "DEBIASED MODEL PERFORMANCE:",
    classification_report(test["decision"], predictions),
    sep="\n",
)

DEBIASED MODEL PERFORMANCE:
              precision    recall  f1-score   support

    reversed       0.77      0.64      0.70       982
      upheld       0.51      0.67      0.58       553

    accuracy                           0.65      1535
   macro avg       0.64      0.65      0.64      1535
weighted avg       0.68      0.65      0.66      1535



In [9]:
# FINAL BIAS EVALUATION: did the data augmentation reduce the gender effect?
# Same measurement as for the baseline model, applied to the model currently
# bound to `pipeline`.

# Column of predict_proba that corresponds to the "reversed" class
reversed_index = list(pipeline.classes_).index("reversed")

# P("reversed") for the original and for the gender-swapped version of each case.
# These assignments overwrite any existing columns of the same name on `test`.
test["p_reversed"] = pipeline.predict_proba(test["facts"])[:, reversed_index]
test["p_reversed_augmented"] = pipeline.predict_proba(test["facts_augmented"])[:, reversed_index]

# Orientation of each pair: +1 when the original appellant is "masculine",
# -1 for every other value (vectorised equivalent of a row-wise lambda).
# NOTE(review): values other than "masculine"/"feminine" would also get -1 -
# confirm the column only contains those two labels.
gender_sign = test["appellant_gender"].eq("masculine").map({True: 1, False: -1})

# Signed probability gap: positive values mean the masculine version of a
# case is more likely to be predicted "reversed".
diffs = (test["p_reversed"] - test["p_reversed_augmented"]) * gender_sign

# One-sample t-test of the signed gaps against 0 to measure the remaining bias
t_stat, p_value = ttest_1samp(diffs, 0)
print("DEBIASED MODEL BIAS TEST:")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.2e}")
print(f"Mean bias score: {diffs.mean():.4f}")
print("\nCOMPARISON:")
print("- Lower absolute t-statistic indicates reduced bias")
print("- Mean bias score closer to 0 indicates better fairness")
print("- Compare p-values to assess statistical significance of remaining bias")

DEBIASED MODEL BIAS TEST:
T-statistic: 7.344
P-value: 3.35e-13
Mean bias score: 0.0025

COMPARISON:
- Lower absolute t-statistic indicates reduced bias
- Mean bias score closer to 0 indicates better fairness
- Compare p-values to assess statistical significance of remaining bias
