# Real-world case: Credit scoring model
### Author: Martín Anaya

In this second experiment, we move from synthetic data to a real-world scenario by using the German Credit Dataset. This dataset contains information on 1000 individuals, including sensitive attributes such as gender, alongside a label indicating their creditworthiness. More details about the dataset and its full set of attributes can be found at https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data.

### Imports

In [None]:
# Utilities
import pandas as pd
import numpy as np

# Data Loading
from ucimlrepo import fetch_ucirepo 

# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Logistic regression model
from sklearn.linear_model import LogisticRegression

# Fairlearn
from fairlearn.reductions import ExponentiatedGradient, DemographicParity

# Evaluation metrics
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# In order to improve the notebook's readability, we ignore the max_iter warnings.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# Notebook-wide random seed, fixed so that results are reproducible.
seed = 42

## Data Loading

In [None]:
# Download the dataset registered under id 144 in the UCI repository.
statlog_german_credit_data = fetch_ucirepo(id=144)

# The payload exposes features and targets as pandas DataFrames.
credit_tables = statlog_german_credit_data.data
X = credit_tables.features
y = credit_tables.targets

In [None]:
# Place features and target side by side in a single working DataFrame.
df = pd.concat([X, y], axis="columns")

## Data Preprocessing

In [None]:
# Descriptive names for the dataset attributes, keyed by their original
# attribute code. The order matters: names are assigned positionally.
attribute_names = {
    "A1": "checking_status",
    "A2": "duration",
    "A3": "credit_history",
    "A4": "purpose",
    "A5": "credit_amount",
    "A6": "savings_status",
    "A7": "employment",
    "A8": "installment_commitment",
    "A9": "personal_status",
    "A10": "other_parties",
    "A11": "residence_since",
    "A12": "property_magnitude",
    "A13": "age",
    "A14": "other_payment_plans",
    "A15": "housing",
    "A16": "existing_credits",
    "A17": "job",
    "A18": "num_dependents",
    "A19": "own_telephone",
    "A20": "foreign_worker",
    "target": "class",
}

# Rename every column of the DataFrame in one go.
df.columns = list(attribute_names.values())

The column of interest to us is _personal_status_, which encodes both the sex of the individual and their marital status. We decode this attribute into two separate features, one for each dimension.

In [None]:
# Decoding table for personal_status: every code bundles the individual's
# sex together with their marital status.
mapping_personal_status = {
    "A91": {"sex": "male", "marital_status": "divorced/separated"},
    "A92": {"sex": "female", "marital_status": "divorced/separated/married"},
    "A93": {"sex": "male", "marital_status": "single"},
    "A94": {"sex": "male", "marital_status": "married/widowed"},
    "A95": {"sex": "female", "marital_status": "single"},
}

# Transform the target labels from {1, 2} to {1, 0}:
# 1 = Good -> 1 (positive), 2 = Bad -> 0 (negative).
class_recoding = {1: 1, 2: 0}
df["class"] = df["class"].map(class_recoding)

# Decode personal_status into one column per dimension. An unknown code
# raises KeyError.
for decoded_field in ("sex", "marital_status"):
    df[decoded_field] = [
        mapping_personal_status[code][decoded_field]
        for code in df["personal_status"]
    ]

# The encoded column is no longer needed once it has been decoded.
df = df.drop(columns="personal_status")
df

In [None]:
# Number of individuals of each sex in the full dataset.
sex_column = df["sex"]
print("Female count: ", (sex_column == "female").sum())
print("Male count: ", (sex_column == "male").sum())

# Mean of the recoded target within each sex.
print("\n\nOutcome mean by sex (1=credit, 0=no_credit):")
outcome_by_sex = df.groupby("sex")["class"].mean()
print(outcome_by_sex)

In [None]:
# Train/test split: half of the rows go to each side, stratified on the target.
# The split is seeded with the notebook-wide `seed` (instead of a second
# hard-coded literal) so that a single value controls reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["class"]),
    df["class"],
    test_size=0.5,
    random_state=seed,
    stratify=df["class"],
)

# Group sizes of the sensitive attribute inside the training half.
print("Female count in X_train: ", len(X_train[X_train["sex"] == "female"]))
print("Male count in X_train: ", len(X_train[X_train["sex"] == "male"]))

## SPRT Framework

We adapted the code from the previous notebook to the credit scoring dataset.

In [None]:
def sprt_statistical_parity(group1, group2, alpha=0.05, beta=0.2, delta=0.1):
    """
    SPRT for Statistical Parity fairness criterion.

    Compares the positive-prediction rates of two groups with a likelihood
    ratio between H0 (both groups share the rate p0, the average of the two
    observed rates) and H1 (group1 at p0 + delta/2, group2 at p0 - delta/2).
    Note that H1 as coded is directional: group1 is the favoured group.

    The likelihoods are evaluated in log space: the raw products of powers
    underflow to 0 for a few thousand samples, which used to turn the ratio
    into ``inf`` and report a spurious rejection. Probabilities are clipped
    to the open interval (0, 1) so that p0 +/- delta/2 stays valid.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Samples accumulated for each group. The sum of the 'prediction'
        column is used as the number of positive predictions.
    alpha, beta : float
        Error rates defining the thresholds A = (1 - beta) / alpha and
        B = beta / (1 - alpha).
    delta : float
        Gap between the two rates under H1.

    Returns
    -------
    tuple
        (decision, lr) where decision is one of
        "Reject H0 (difference detected)", "Accept H0 (no difference)" or
        "Continue sampling", and lr is the likelihood ratio L1 / L0
        (``nan`` when one of the groups is empty).
    """
    n1 = len(group1)
    n2 = len(group2)

    # A rate cannot be estimated for an empty group.
    if n1 == 0 or n2 == 0:
        return "Continue sampling", np.nan

    s1 = group1['prediction'].sum()
    s2 = group2['prediction'].sum()

    # Common rate under H0: average of the two observed rates.
    pooled = (s1 / n1 + s2 / n2) / 2

    # Keep every probability strictly inside (0, 1) so the logs are finite.
    eps = 1e-12
    p0 = float(np.clip(pooled, eps, 1 - eps))
    p1 = float(np.clip(pooled + delta / 2, eps, 1 - eps))
    p2 = float(np.clip(pooled - delta / 2, eps, 1 - eps))

    def log_likelihood(p, successes, trials):
        # Bernoulli log-likelihood of `successes` positives out of `trials`.
        return successes * np.log(p) + (trials - successes) * np.log1p(-p)

    # Log likelihood ratio
    log_l0 = log_likelihood(p0, s1, n1) + log_likelihood(p0, s2, n2)
    log_l1 = log_likelihood(p1, s1, n1) + log_likelihood(p2, s2, n2)
    log_lr = float(log_l1 - log_l0)

    # The plain ratio is still returned to callers; overflow maps to inf.
    with np.errstate(over="ignore"):
        lr = float(np.exp(log_lr))

    # Thresholds (compared in log space)
    log_A = np.log((1 - beta) / alpha)
    log_B = np.log(beta / (1 - alpha))

    # Decision
    if log_lr >= log_A:
        return "Reject H0 (difference detected)", lr
    elif log_lr <= log_B:
        return "Accept H0 (no difference)", lr
    else:
        return "Continue sampling", lr


def sprt_equal_opportunity(group1, group2, alpha=0.05, beta=0.2, delta=0.1):
    """
    SPRT for Equal Opportunity fairness criterion.
    Compares true positive rates (TPR) between two groups.

    Only rows whose 'true' label equals 1 are considered; within them the
    TPR of a group is the share of positive predictions. H0 states that both
    groups share the rate p0 (the average of the two observed TPRs); H1
    places group1 at p0 + delta/2 and group2 at p0 - delta/2, so H1 as coded
    is directional.

    The likelihoods are evaluated in log space to avoid floating-point
    underflow, and probabilities are clipped to the open interval (0, 1) so
    that p0 +/- delta/2 stays valid.

    Parameters
    ----------
    group1, group2 : pandas.DataFrame
        Samples accumulated for each group, with a 'true' column (ground
        truth) and a 'prediction' column (model output).
    alpha, beta : float
        Error rates defining the thresholds A = (1 - beta) / alpha and
        B = beta / (1 - alpha).
    delta : float
        Gap between the two TPRs under H1.

    Returns
    -------
    tuple
        (decision, lr) where decision is one of
        "Reject H0 (difference detected)", "Accept H0 (no difference)" or
        "Continue sampling", and lr is the likelihood ratio L1 / L0
        (``nan`` when a group has no truly positive rows yet).
    """

    # Filter only the truly positive individuals (Y=1)
    g1_pos = group1[group1['true'] == 1]
    g2_pos = group2[group2['true'] == 1]

    n1 = len(g1_pos)
    n2 = len(g2_pos)

    # We need at least one truly positive sample from each group
    if n1 == 0 or n2 == 0:
        return "Continue sampling", np.nan

    # True positives: positive predictions among the truly positive rows
    s1 = g1_pos['prediction'].sum()
    s2 = g2_pos['prediction'].sum()

    # Common TPR under H0: average of the two observed TPRs
    pooled = (s1 / n1 + s2 / n2) / 2

    # Keep every probability strictly inside (0, 1) so the logs are finite.
    eps = 1e-12
    p0 = float(np.clip(pooled, eps, 1 - eps))
    p1 = float(np.clip(pooled + delta / 2, eps, 1 - eps))
    p2 = float(np.clip(pooled - delta / 2, eps, 1 - eps))

    def log_likelihood(p, successes, trials):
        # Bernoulli log-likelihood of `successes` positives out of `trials`.
        return successes * np.log(p) + (trials - successes) * np.log1p(-p)

    # Log likelihood ratio
    log_l0 = log_likelihood(p0, s1, n1) + log_likelihood(p0, s2, n2)
    log_l1 = log_likelihood(p1, s1, n1) + log_likelihood(p2, s2, n2)
    log_lr = float(log_l1 - log_l0)

    # The plain ratio is still returned to callers; overflow maps to inf.
    with np.errstate(over="ignore"):
        lr = float(np.exp(log_lr))

    # Thresholds (compared in log space)
    log_A = np.log((1 - beta) / alpha)
    log_B = np.log(beta / (1 - alpha))

    # Decision
    if log_lr >= log_A:
        return "Reject H0 (difference detected)", lr
    elif log_lr <= log_B:
        return "Accept H0 (no difference)", lr
    else:
        return "Continue sampling", lr


# SPRT main loop
def run_test(df, batch_size=10, max_steps=1000, alpha = 0.05, beta = 0.2, delta=0.1, criterion="SP", random_state=None):
    """
    Run one sequential audit over a random ordering of ``df``.

    The rows are shuffled, one 'male' and one 'female' row are placed first
    so that both groups are represented from the start, and the evidence set
    then grows by ``batch_size`` rows per step. After each step the selected
    SPRT is evaluated on the evidence gathered so far; the loop stops at the
    first conclusive decision, after ``max_steps`` steps, or when fewer than
    ``batch_size`` unused rows remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Classifier outputs with a 'sex' column plus the columns required by
        the chosen test.
    batch_size : int
        Number of rows added to the evidence set at every step.
    max_steps : int
        Maximum number of steps.
    alpha, beta, delta : float
        Forwarded unchanged to the SPRT function.
    criterion : str
        "SP" (statistical parity) or "EO" (equal opportunity).
    random_state : int, numpy RandomState/Generator or None
        Forwarded to ``DataFrame.sample``; the default ``None`` keeps the
        shuffle non-deterministic, pass an int for a reproducible run.

    Returns
    -------
    list of tuple
        One ``(step, decision, lr)`` entry per evaluated step; empty when not
        even one full batch could be drawn.

    Raises
    ------
    ValueError
        If ``criterion`` is unknown or ``df`` lacks one of the two groups.
    """
    # Validate up front so a bad criterion is reported even when no step runs.
    if criterion == "SP":
        sprt = sprt_statistical_parity
    elif criterion == "EO":
        sprt = sprt_equal_opportunity
    else:
        raise ValueError("criterion must be 'SP' or 'EO'")

    # Shuffle, then reset the index so that labels and positions coincide.
    pool = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    male_rows = pool.index[pool['sex'] == 'male']
    female_rows = pool.index[pool['sex'] == 'female']
    if len(male_rows) == 0 or len(female_rows) == 0:
        raise ValueError("df must contain at least one 'male' and one 'female' row")

    # Seed the evidence set with the first row of each group and keep the
    # remaining rows in shuffled order; the evidence after k steps is then
    # simply a prefix of this frame (no repeated concatenation needed).
    seed_rows = [male_rows[0], female_rows[0]]
    ordered = pd.concat(
        [pool.loc[seed_rows], pool.drop(seed_rows)],
        ignore_index=True,
    )

    results = []
    for step in range(max_steps):
        end = len(seed_rows) + (step + 1) * batch_size
        if end > len(ordered):
            # Not enough unused rows left for a full batch.
            break

        accumulated = ordered.iloc[:end]
        g1 = accumulated[accumulated['sex'] == 'male']
        g2 = accumulated[accumulated['sex'] == 'female']

        decision, lr = sprt(g1, g2, alpha=alpha, beta=beta, delta=delta)
        results.append((step + 1, decision, lr))

        if decision != "Continue sampling":
            break

    return results

## Experiments

The hypotheses for our test are defined as:

$$H_0 : \pi_1 = \pi_2 \qquad \text{(the system is fair)}$$

$$H_1 : |\pi_1 - \pi_2| \geq 0.1  \qquad \text{(the system favours one group over the other)}$$

where:
$$\pi_1 = P(\text{credit approved | A = Male})$$ 
$$\pi_2 = P(\text{credit approved | A = Female})$$


### Baseline model

In [None]:
# Split the feature columns by dtype: object columns are handled as
# categorical, every other column as numeric.
categorical_cols = X_train.select_dtypes(include="object").columns
numeric_cols = X_train.select_dtypes(exclude="object").columns

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and forward the numeric columns untouched.
one_hot_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
passthrough_step = ("num", "passthrough", numeric_cols)
preprocessor = ColumnTransformer(transformers=[one_hot_step, passthrough_step])

# Base model: preprocessing followed by a logistic regression.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000, random_state=seed)),
    ]
)

In [None]:
# Fit the baseline pipeline, then report its accuracy on both halves.
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
print("Train acc:", train_accuracy)

test_accuracy = clf.score(X_test, y_test)
print("Test acc:", test_accuracy)

In [None]:
# Baseline predictions on the test half.
y_pred = clf.predict(X_test)

# Audit table: ground truth, prediction and sex, indexed like X_test.
audit_columns = ["true", "prediction", "sex"]
output_classifier = X_test.assign(
    true=y_test,
    prediction=y_pred,
    sex=df.loc[X_test.index, "sex"],
)[audit_columns]

# Mean prediction within each sex.
output_classifier.groupby("sex")["prediction"].mean()

### Classifier without sensitive attributes

In [None]:
# Drop the explicit sensitive attribute from both halves.
# NOTE(review): "marital_status" is kept, and according to
# mapping_personal_status every label other than "single" belongs to exactly
# one sex, so it remains a strong proxy for the dropped column. Confirm that
# this is the intended experiment.
X_train_no_sensitive = X_train.drop(columns=["sex"])
X_test_no_sensitive = X_test.drop(columns=["sex"])

# Misspelled name used by the original notebook, kept as an alias so that any
# cell still referring to it keeps working.
X_train_no_sentitive = X_train_no_sensitive

categorical_cols_no_sensitive = X_train_no_sensitive.select_dtypes(include="object").columns
numeric_cols_no_sensitive = X_train_no_sensitive.select_dtypes(exclude="object").columns


# One-hot encode the categorical columns and pass the numeric ones through.
preprocessor_no_sensitive = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols_no_sensitive),
        ("num", "passthrough", numeric_cols_no_sensitive)
    ]
)


# Encoded matrices. They are not used by the pipeline below, which fits the
# same transformer again on the raw frames.
X_train_no_sensitive_enc = preprocessor_no_sensitive.fit_transform(X_train_no_sensitive)
X_test_no_sensitive_enc = preprocessor_no_sensitive.transform(X_test_no_sensitive)


# Same architecture as the baseline, trained without the "sex" column.
no_sensitive_clf = Pipeline(steps=[
    ("preprocessor", preprocessor_no_sensitive),
    ("classifier", LogisticRegression(max_iter=10000, random_state=seed))
])

no_sensitive_clf.fit(X_train_no_sensitive, y_train)
print("Train acc:", no_sensitive_clf.score(X_train_no_sensitive, y_train))
print("Test acc:", no_sensitive_clf.score(X_test_no_sensitive, y_test))

y_pred_no_sensitive = no_sensitive_clf.predict(X_test_no_sensitive)

In [None]:
# Audit table for the classifier trained without "sex". The sensitive
# attribute is attached again here only so that the audit can group by it.
output_classifier_no_sensitive = X_test.assign(
    true=y_test,
    prediction=y_pred_no_sensitive,
    sex=df.loc[X_test.index, "sex"],
)
output_classifier_no_sensitive = output_classifier_no_sensitive[["true", "prediction", "sex"]]

# Mean prediction within each sex.
output_classifier_no_sensitive.groupby("sex")["prediction"].mean()

### Fairness-aware classifier

In [None]:
# Fresh, unfitted preprocessing step for the fairness-aware model: one-hot
# encoding for the categorical columns, passthrough for the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

In [None]:
# Encode the features once; the reduction is trained on the encoded matrices.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

base_clf = LogisticRegression(max_iter=5000, random_state=seed)

# Fair classifier: exponentiated-gradient reduction wrapping the logistic
# regression under a demographic-parity constraint.
fair_clf = ExponentiatedGradient(
    estimator=base_clf,
    constraints=DemographicParity()
)

# Train
fair_clf.fit(
    X_train_enc,
    y_train,
    sensitive_features=X_train["sex"]
)

# Predictions. ExponentiatedGradient yields a randomized classifier, so the
# notebook seed is passed to make the predictions reproducible across runs.
y_pred_fair = fair_clf.predict(X_test_enc, random_state=seed)

In [None]:
# Audit table for the fairness-aware classifier: ground truth, prediction and
# sex, indexed like X_test.
fair_audit_columns = ["true", "prediction", "sex"]
output_classifier_fair = X_test.assign(
    true=y_test,
    prediction=y_pred_fair,
    sex=df.loc[X_test.index, "sex"],
)[fair_audit_columns]


# Mean prediction within each sex.
output_classifier_fair.groupby("sex")["prediction"].mean()

### Fairness-accuracy tradeoff

In [None]:
def metrics(y_true, y_pred):
    """Print accuracy, precision, recall and F1-score, each with two decimals."""
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}")


# Report the same metrics for the three classifiers on the test half.
for _title, _predictions in (
    ("\nBaseline model:", y_pred),
    ("\nNo sensitive attributes model:", y_pred_no_sensitive),
    ("\nFair model:", y_pred_fair),
):
    print(_title)
    metrics(y_test, _predictions)

### Audit simulations

In [None]:
# Default audit configuration: batch size, step limit and number of
# simulated audits.
batch_size, max_steps, num_simulations = 5, 2000, 10000

# SPRT error rates and the gap assumed under H1.
alpha, beta, delta = 0.05, 0.2, 0.1

# Fairness criterion to audit.
criterion = 'SP'

# Classifier output audited by default.
test_df = output_classifier_fair



def simulation_loop(num_simulations, test_df, batch_size, max_steps, alpha, beta, delta, criterion):
    """
    Repeat the sequential audit ``num_simulations`` times and summarise it.

    Each simulation calls ``run_test`` and keeps only its last step. The
    number of samples used is ``steps * batch_size + 2`` (the two extra rows
    are the ones seeding the evidence set). A run that ends on
    "Continue sampling" is counted as inconclusive and recorded as "no
    difference detected"; a run for which ``run_test`` returns no step at all
    is treated the same way, with zero steps.

    Rows are collected in a list and the DataFrame is built once at the end,
    instead of concatenating a DataFrame per simulation.

    Parameters
    ----------
    num_simulations : int
        Number of independent audits to run.
    test_df : pandas.DataFrame
        Classifier outputs forwarded to ``run_test``.
    batch_size, max_steps, alpha, beta, delta, criterion
        Forwarded unchanged to ``run_test``.

    Returns
    -------
    pandas.DataFrame
        One row per simulation with the integer columns "Num_samples" and
        "Difference Detected" (1 when H0 was rejected, 0 otherwise).
    """
    rows = []
    count_data_limit_reached = 0

    for _ in range(num_simulations):
        history = run_test(df = test_df, batch_size = batch_size, max_steps = max_steps, alpha = alpha, beta = beta, delta = delta, criterion = criterion)

        if history:
            steps, decision = history[-1][0:2]
        else:
            # No step could be evaluated at all.
            steps, decision = 0, 'Continue sampling'

        if decision == 'Continue sampling':
            count_data_limit_reached += 1

        rows.append({
            "Num_samples": steps * batch_size + 2,
            "Difference Detected": int(decision == 'Reject H0 (difference detected)'),
        })

    resultados_simulaciones = pd.DataFrame(rows, columns=["Num_samples", "Difference Detected"])

    print("Num simulaciones:" , num_simulations, "\nMean sample size: ", resultados_simulaciones['Num_samples'].mean(), "\nBias detected: ", resultados_simulaciones['Difference Detected'].sum(),"\nNot enough data: ", count_data_limit_reached)
    return resultados_simulaciones

In [None]:
# Statistical-parity audits of the three classifiers over a grid of settings:
# each batch size is combined with every (alpha, beta) pair. The result
# variables are overwritten on every pass, so only the printed summaries of
# the earlier configurations are kept.
for batch_size in (15, 5):
    for alpha, beta in ((0.05, 0.2), (0.05, 0.1), (0.1, 0.2), (0.1, 0.1)):
        print("Base classifier:")
        base_res = simulation_loop(num_simulations, output_classifier, batch_size, max_steps, alpha, beta, delta, criterion)
        print("\nClassifier without sensitive attributes:")
        no_sensitive_res = simulation_loop(num_simulations, output_classifier_no_sensitive, batch_size, max_steps, alpha, beta, delta, criterion)
        print("\nFair classifier:")
        fair_res = simulation_loop(num_simulations, output_classifier_fair, batch_size, max_steps, alpha, beta, delta, criterion)

### Audit another fairness criterion

In [None]:
# Equal-opportunity audit of the fair classifier.
# NOTE(review): delta is 0.5 here, versus 0.1 in the statistical-parity
# audits. Confirm that this is intended.
batch_size = 5
delta = 0.5
num_simulations = 1000
criterion = 'EO'

# Same audit repeated for every (alpha, beta) pair; the result variable is
# overwritten on each pass.
for alpha, beta in ((0.05, 0.2), (0.1, 0.2), (0.05, 0.1), (0.1, 0.1)):
    print("\nFair classifier:")
    fair_eop = simulation_loop(num_simulations, output_classifier_fair, batch_size, max_steps, alpha, beta, delta, criterion)