In [None]:
# Sets up essential parameters and constants
# OUTCOME_COL: The name of the target variable we are trying to predict.
# RACE_COL_CANDIDATES and GENDER_COL_CANDIDATES: Lists of possible column names
#      for race and gender, which will be used to automatically detect these sensitive attributes in the dataset.
# TARGET_DP, MAX_AUG_MULT, NOISE_SCALE, RANDOM_STATE:
#      Various numerical parameters for data augmentation and reproducibility

# --- Data location and prediction target -------------------------------------
# Path of the input CSV (absolute path under /content).
CSV_PATH = "/content/nij-challenge2021_full_dataset.csv"
# Name of the column holding the label the models predict.
OUTCOME_COL = "Recidivism_Arrest_Year1"

# --- Sensitive-attribute detection -------------------------------------------
# Candidate names tried when locating the race and gender columns.
RACE_COL_CANDIDATES = ["race", "Race", "RACE"]
GENDER_COL_CANDIDATES = ["gender", "Gender", "sex", "Sex"]

# --- Augmentation and reproducibility knobs ----------------------------------
# NOTE(review): TARGET_DP is not referenced by the cells visible in this review; confirm it is still needed.
TARGET_DP = 0.5
# Upper bound on synthetic rows per group, expressed as a multiple of the group's size.
MAX_AUG_MULT = 1.0
# Default jitter strength passed to synthesize_like (fraction of each column's std).
NOISE_SCALE = 0.05
# Seed shared by the train/test splits and the resampling step.
RANDOM_STATE = 42


In [None]:
# Handles data loading and identification of sensitive attributes.
# The detect_col function programmatically finds a column whose name matches one of a list of candidate names (e.g., 'race', 'gender').

import pandas as pd, numpy as np

# Read the full dataset from CSV_PATH (set in the configuration cell) into a DataFrame.
df = pd.read_csv(CSV_PATH)
# Quick sanity check of the (rows, columns) that were loaded.
print("Loaded dataset shape:", df.shape)

def detect_col(df, candidates):
    """Return the column of ``df`` that best matches one of ``candidates``.

    Matching is case-insensitive. An exact name match is preferred; only when
    no column matches exactly does the search fall back to the first column
    (in ``df.columns`` order) whose name contains a candidate as a substring.

    Args:
        df: Object exposing ``.columns`` (e.g. a pandas DataFrame).
        candidates: Iterable of candidate column names.

    Returns:
        The matching column label exactly as it appears in ``df.columns``,
        or ``None`` when nothing matches.
    """
    wanted = [str(x).lower() for x in candidates]

    # Pass 1: exact match, so "Race" wins over e.g. "Embrace_Score" even when
    # the latter appears earlier in the frame.
    for c in df.columns:
        if str(c).lower() in wanted:
            return c

    # Pass 2: substring fallback for names such as "Offender_Race".
    # str() keeps non-string column labels (ints, tuples) from raising.
    for c in df.columns:
        name = str(c).lower()
        if any(x in name for x in wanted):
            return c
    return None

# Resolve the sensitive-attribute columns once; later cells index X_test with them.
race_col = detect_col(df, RACE_COL_CANDIDATES)
gender_col = detect_col(df, GENDER_COL_CANDIDATES)

# Fail fast with a readable message: detect_col returns None when nothing matches,
# which would otherwise only surface later as an opaque `KeyError: None`.
assert OUTCOME_COL in df.columns, f"Outcome column {OUTCOME_COL!r} not found in dataset"
assert race_col is not None, f"No race column found (tried {RACE_COL_CANDIDATES})"
assert gender_col is not None, f"No gender column found (tried {GENDER_COL_CANDIDATES})"

print(f"SENSITIVE FEATURES → Race: {race_col}, Gender: {gender_col}")
print(f"OUTCOME → {OUTCOME_COL}")


Loaded dataset shape: (25835, 54)
SENSITIVE FEATURES → Race: Race, Gender: Gender
OUTCOME → Recidivism_Arrest_Year1


In [None]:
# Prepares the data for model training by constructing the feature
#  matrix X and the target variable y, and by categorizing features into numerical and categorical types

# Normalise the raw outcome labels to {0, 1}. Labels are compared as stripped,
# lower-cased strings so that booleans, "Yes"/"No" and 0/1 encodings all map.
y_raw = df[OUTCOME_COL]
y_mapped = y_raw.astype(str).str.strip().str.lower().map({
    "yes":1,"y":1,"true":1,"t":1,"1":1,
    "no":0,"n":0,"false":0,"f":0,"0":0
})

# Anything outside the mapping (including missing labels) still falls back to 0,
# but is reported instead of being relabelled silently as the negative class.
unmapped_mask = y_mapped.isna()
if unmapped_mask.any():
    print(
        f"WARNING: {int(unmapped_mask.sum())} outcome label(s) could not be mapped "
        f"and default to 0; examples: {sorted(y_raw[unmapped_mask].astype(str).unique())[:10]}"
    )
y = y_mapped.fillna(0).astype(int)

# Feature matrix: every column except the outcome. copy() keeps later edits to X
# from touching df.
X = df.drop(columns=[OUTCOME_COL], errors='ignore').copy()

# Object/category dtypes are treated as categorical; everything else as numeric.
cat_cols = [c for c in X.columns if X[c].dtype == 'object' or X[c].dtype.name=='category']
num_cols = [c for c in X.columns if c not in cat_cols]

print("Categorical features:", len(cat_cols))
print("Numeric features:", len(num_cols))


Categorical features: 42
Numeric features: 11


In [None]:
# Initiates the training of a baseline logistic regression model

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler # Import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

# Hold out 25% of the rows for evaluation. stratify=y keeps the outcome ratio the
# same in both splits, and RANDOM_STATE makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)

# Add an imputer to handle missing values
# NOTE(review): this imputer is the first pipeline step (see the Pipeline below), so
# 'most_frequent' is applied to every column, numeric ones included. Confirm that is intended.
imputer = SimpleImputer(strategy='most_frequent')

# Numeric columns pass through unchanged; categorical columns are one-hot encoded.
# handle_unknown="ignore" avoids errors on categories not seen during fit.
preprocess = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

# Pipeline helper: wraps array output of a previous step back into a named-column DataFrame
class NumpyToDataFrame(BaseEstimator, TransformerMixin):
    """Stateless transformer that rebuilds a ``pd.DataFrame`` from array-like input.

    Used between the imputer and the ColumnTransformer so that the latter can
    select columns by name.

    Args:
        columns: Column labels to assign to the incoming data, in order.
    """

    def __init__(self, columns):
        # Stored unchanged under the same name as the constructor argument.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame labelled with ``self.columns``."""
        frame = pd.DataFrame(data=X, columns=self.columns)
        return frame


# Baseline pipeline: impute -> rebuild a DataFrame (so columns can be selected by
# name) -> per-type preprocessing -> scaling -> logistic regression.
clf = Pipeline([
    ("imputer", imputer),
    ("to_df", NumpyToDataFrame(columns=X_train.columns)),
    ("prep", preprocess),
    ("scaler", StandardScaler(with_mean=False)), # scale without centering (presumably to keep the one-hot output sparse; confirm)
    ("lr", LogisticRegression(max_iter=1000))
])

clf.fit(X_train, y_train)
# Probability of class 1 (second column), thresholded at 0.5 for hard labels.
proba_test = clf.predict_proba(X_test)[:,1]
y_pred = (proba_test >= 0.5).astype(int)

# NOTE: at this point X still contains the other recidivism columns that a later
# cell drops as leaky, so the scores printed here do not reflect real performance.
print("Baseline Accuracy:", accuracy_score(y_test, y_pred))
print("Baseline AUC:", roc_auc_score(y_test, proba_test))

Baseline Accuracy: 1.0
Baseline AUC: 1.0


In [None]:
# Addresses data leakage - removes 3 columns

# Columns treated as leaking the target (by name, presumably other recidivism outcomes; confirm against the data dictionary).
leaky_categorical_features = ['Recidivism_Within_3years', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3']
# errors='ignore' makes the cell safe to re-run once the columns are already gone.
# X is rebound here, so the split and the cat/num column lists must be recomputed afterwards.
X = X.drop(columns=leaky_categorical_features, errors='ignore')

print(f"Dropped leaky categorical features: {leaky_categorical_features}")
print("New shape of X after dropping features:", X.shape)

Dropped leaky categorical features: ['Recidivism_Within_3years', 'Recidivism_Arrest_Year2', 'Recidivism_Arrest_Year3']
New shape of X after dropping features: (25835, 50)


In [None]:
# Updates the classification of features into numerical and categorical types after the leaky features were dropped

# Single pass over the cleaned X: object/category dtypes are categorical,
# every other dtype is numeric. Column order of X is preserved in both lists.
cat_cols = []
num_cols = []
for col_name in X.columns:
    col_dtype = X[col_name].dtype
    if col_dtype == 'object' or col_dtype.name == 'category':
        cat_cols.append(col_name)
    else:
        num_cols.append(col_name)

print("Updated categorical features count:", len(cat_cols))
print("Updated numerical features count:", len(num_cols))

Updated categorical features count: 39
Updated numerical features count: 11


In [None]:
# Re-training the model after the leaky features have been removed

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Re-split data to reflect the cleaned X
# Same test_size, seed and stratification as the baseline split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)

# Retrain the model with the updated features.
# The earlier `clf` pipeline was built from the pre-cleaning num_cols/cat_cols and
# column list, so a fresh ColumnTransformer and Pipeline are created here from the
# updated lists instead of reusing it.

# Preprocessing step bound to the updated cat_cols and num_cols
preprocess_updated = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

# Recreate the pipeline to use the updated preprocessing
# ("to_df" is given the columns of the re-split X_train, i.e. without the dropped features)
clf_updated = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("to_df", NumpyToDataFrame(columns=X_train.columns)),
    ("prep", preprocess_updated),
    ("scaler", StandardScaler(with_mean=False)),
    ("lr", LogisticRegression(max_iter=1000))
])

clf_updated.fit(X_train, y_train)
# Probability of class 1, thresholded at 0.5 for hard labels.
proba_test_cleaned = clf_updated.predict_proba(X_test)[:,1]
y_pred_cleaned = (proba_test_cleaned >= 0.5).astype(int)

print("Model Accuracy (after leakage removal):", accuracy_score(y_test, y_pred_cleaned))
print("Model AUC (after leakage removal):", roc_auc_score(y_test, proba_test_cleaned))

Model Accuracy (after leakage removal): 0.7652887443876761
Model AUC (after leakage removal): 0.8049488492569624


In [None]:
# Fairness metrics for both race and gender

from sklearn.metrics import confusion_matrix
from IPython.display import display
import pandas as pd

def group_metrics(y_true, y_pred, group):
    """Summarise binary predictions separately for each value of ``group``.

    The three inputs are matched by position (their own indices are discarded),
    so they must be the same length and in the same row order.

    Args:
        y_true: True 0/1 labels.
        y_pred: Predicted 0/1 labels.
        group: Group membership per row; values are compared as strings.

    Returns:
        DataFrame with one row per group and the columns ``group``, ``n``
        (row count), ``sel_rate`` (mean prediction), ``TPR`` and ``FPR``.
        A 1e-9 term in each denominator avoids division by zero, so a group
        without positives (or negatives) reports a rate of 0.
    """
    # Drop whatever index each input carries so boolean masks line up by position.
    truth = pd.Series(y_true).reset_index(drop=True)
    preds = pd.Series(y_pred).reset_index(drop=True)
    membership = pd.Series(group).reset_index(drop=True).astype(str)

    def _summarise(name):
        # Metrics for the rows belonging to one group.
        mask = membership == name
        sub_true = truth[mask]
        sub_pred = preds[mask]
        tn, fp, fn, tp = confusion_matrix(sub_true, sub_pred, labels=[0,1]).ravel()
        return {
            "group": name,
            "n": mask.sum(),
            "sel_rate": sub_pred.mean(),
            "TPR": tp/(tp+fn+1e-9),
            "FPR": fp/(fp+tn+1e-9),
        }

    return pd.DataFrame([_summarise(name) for name in membership.unique()])

# NOTE: detect_col is also defined in the data-loading cell; this later definition
# is the one in effect from here on. Keep the two in sync or delete this copy.
def detect_col(df, candidates):
    """Return the column of ``df`` that best matches one of ``candidates``.

    Matching is case-insensitive. An exact name match is preferred; only when
    no column matches exactly does the search fall back to the first column
    (in ``df.columns`` order) whose name contains a candidate as a substring.

    Args:
        df: Object exposing ``.columns`` (e.g. a pandas DataFrame).
        candidates: Iterable of candidate column names.

    Returns:
        The matching column label exactly as it appears in ``df.columns``,
        or ``None`` when nothing matches.
    """
    wanted = [str(x).lower() for x in candidates]

    # Pass 1: exact match, so "Race" wins over e.g. "Embrace_Score" even when
    # the latter appears earlier in the frame.
    for c in df.columns:
        if str(c).lower() in wanted:
            return c

    # Pass 2: substring fallback for names such as "Offender_Race".
    # str() keeps non-string column labels (ints, tuples) from raising.
    for c in df.columns:
        name = str(c).lower()
        if any(x in name for x in wanted):
            return c
    return None

# Re-resolve the race column; gender_col is reused from the data-loading cell.
race_col = detect_col(df, RACE_COL_CANDIDATES)

# Group membership is read from X_test so it lines up row-for-row with
# y_test and y_pred_cleaned.
print("=== Race Fairness After Leakage Removal ===")
display(group_metrics(y_test, y_pred_cleaned, X_test[race_col]))

print("=== Gender Fairness After Leakage Removal ===")
display(group_metrics(y_test, y_pred_cleaned, X_test[gender_col]))

=== Race Fairness After Leakage Removal ===


Unnamed: 0,group,n,sel_rate,TPR,FPR
0,WHITE,2691,0.206243,0.47348,0.09854
1,BLACK,3768,0.22293,0.465458,0.115326


=== Gender Fairness After Leakage Removal ===


Unnamed: 0,group,n,sel_rate,TPR,FPR
0,M,5657,0.23387,0.481854,0.118986
1,F,802,0.089776,0.3,0.045317


In [None]:
# Defines the synthesize_like function, which is a crucial utility for synthesizing new samples from an existing group of data.
#  This function is designed to augment datasets by creating 'similar' data points.

def synthesize_like(df_group, n_new, noise=NOISE_SCALE):
    """Create ``n_new`` synthetic rows by resampling ``df_group`` and jittering numeric columns.

    Rows are drawn with replacement from ``df_group``. Every column named in the
    notebook-level ``num_cols`` that is present in ``df_group`` then receives
    zero-mean Gaussian noise with standard deviation ``noise * std(column)``,
    where the std is computed on ``df_group``. All other columns are copied
    unchanged.

    Args:
        df_group: DataFrame of rows to imitate.
        n_new: Number of synthetic rows to produce.
        noise: Noise standard deviation as a fraction of each column's std.

    Returns:
        DataFrame with ``n_new`` rows and the columns of ``df_group`` (index
        values are those of the sampled rows). An empty frame with the same
        columns is returned when ``n_new <= 0`` or ``df_group`` is empty.
    """
    if n_new <= 0 or df_group.empty:
        return pd.DataFrame(columns=df_group.columns)

    samp = df_group.sample(n=int(n_new), replace=True, random_state=RANDOM_STATE)

    # Seeded generator: the global np.random state is never seeded in this
    # notebook, so using it made every run produce different synthetic rows.
    rng = np.random.default_rng(RANDOM_STATE)

    for c in num_cols:
        if c not in samp.columns:
            continue  # column not part of this frame; nothing to jitter
        std = df_group[c].std()
        # std is NaN for a single-row group. NaN is truthy, so `std or 0` passed
        # it through and filled the whole column with NaN. Zero spread also
        # needs no noise.
        if pd.isna(std) or std == 0:
            continue
        samp[c] = samp[c] + rng.normal(0, std*noise, len(samp))
    return samp


In [None]:
# synthesizing samples to improve gender parity in the training data

import pandas as pd

# 1. Combine X_train and y_train with the gender column for easier grouping
train_df_for_aug = X_train.copy()
train_df_for_aug["_y"] = y_train
train_df_for_aug["_grp_gender"] = X_train[gender_col]  # grouping key

# 2. Positive-outcome rate (mean of _y) for each gender group
group_positive_rates = train_df_for_aug.groupby("_grp_gender")["_y"].mean()

# 3. The highest group rate is the target that lower-rate groups are raised towards
max_positive_rate = group_positive_rates.max()

augmented_data_parts = [train_df_for_aug]  # start with the original training data

for group_val in group_positive_rates.index:
    current_group_rate = group_positive_rates[group_val]

    original_group_data = train_df_for_aug[train_df_for_aug["_grp_gender"] == group_val]
    n_orig = len(original_group_data)

    # Nothing to do for empty groups or for groups already at the target rate.
    if n_orig == 0 or current_group_rate >= max_positive_rate:
        continue

    # Only positive rows are synthesised. Resampling the whole group would add
    # positives and negatives in their existing proportion and leave the group's
    # positive rate exactly where it was.
    positive_rows = original_group_data[original_group_data["_y"] == 1]
    n_pos = len(positive_rows)
    if n_pos == 0:
        print(f"Skipping group {group_val!r}: no positive rows to synthesize from")
        continue

    # Adding k positives gives rate (n_pos + k) / (n_orig + k). Solving for the
    # target rate r: k = (r * n_orig - n_pos) / (1 - r). A target of 1.0 cannot
    # be reached, so it is left to the cap below.
    if max_positive_rate < 1:
        n_new_calculated = (max_positive_rate * n_orig - n_pos) / (1 - max_positive_rate)
    else:
        n_new_calculated = float("inf")

    # Cap the augmentation at MAX_AUG_MULT times the original group size
    n_new = int(max(0, min(np.ceil(n_new_calculated), n_orig * MAX_AUG_MULT)))

    if n_new > 0:
        # '_grp_gender' is dropped before synthesis and re-attached afterwards so
        # that synthesize_like only sees the features and the '_y' target.
        synthesized_samples_df = synthesize_like(
            positive_rows.drop(columns=["_grp_gender"], errors='ignore'),
            n_new,
            noise=NOISE_SCALE
        )
        synthesized_samples_df["_grp_gender"] = group_val
        augmented_data_parts.append(synthesized_samples_df)

# Concatenate all original and augmented dataframes to form the final augmented training set
train_aug_gender = pd.concat(augmented_data_parts, ignore_index=True)

print(f"Original training data size: {len(X_train)}")
print(f"Augmented training data size (gender): {len(train_aug_gender)}")
# Sanity check: the per-group rates should now be (nearly) equal unless the cap was hit.
print("Positive rate by gender after augmentation:",
      train_aug_gender.groupby("_grp_gender")["_y"].mean().round(4).to_dict())

Original training data size: 19376
Augmented training data size (gender): 20310


In [None]:

# Performance metrics of the logistic regression model after being retrained with the gender-augmented training data

# Split the augmented frame back into features and target. The helper columns
# "_y" and "_grp_gender" added during augmentation are removed from the features.
X_train_gender_aug = train_aug_gender.drop(columns=["_y", "_grp_gender"], errors='ignore')
y_train_gender_aug = train_aug_gender["_y"]

# NOTE(review): this refits clf_updated in place, replacing the model fitted in the
# leakage-removal cell. Its "to_df" step was built from X_train.columns, so the
# augmented features must keep the same columns in the same order.
clf_updated.fit(X_train_gender_aug, y_train_gender_aug)

# Generate probabilities for the test set using the retrained model
proba_test_gender_aug = clf_updated.predict_proba(X_test)[:,1]

# Hard labels at the same 0.5 threshold used for the earlier models.
y_pred_gender_aug = (proba_test_gender_aug >= 0.5).astype(int)

print("Model Accuracy (after gender augmentation with proportional sampling):")
print(accuracy_score(y_test, y_pred_gender_aug))
print("Model AUC (after gender augmentation with proportional sampling):")
print(roc_auc_score(y_test, proba_test_gender_aug))

Model Accuracy (after gender augmentation with proportional sampling):
0.7643598080198173
Model AUC (after gender augmentation with proportional sampling):
0.8043903815925032


In [None]:
# Per-group metrics for the model trained on the gender-augmented data, evaluated
# on the same (un-augmented) test split as the earlier fairness tables.
print("=== Gender Fairness After Augmentation ===")
display(group_metrics(y_test, y_pred_gender_aug, X_test[gender_col]))

# Race is reported too, to show how the gender-targeted augmentation affects it.
print("=== Race Fairness After Augmentation ===")
display(group_metrics(y_test, y_pred_gender_aug, X_test[race_col]))

=== Gender Fairness After Augmentation ===


Unnamed: 0,group,n,sel_rate,TPR,FPR
0,M,5657,0.235991,0.484087,0.121055
1,F,802,0.089776,0.292857,0.046828


=== Race Fairness After Augmentation ===


Unnamed: 0,group,n,sel_rate,TPR,FPR
0,WHITE,2691,0.205871,0.467012,0.100626
1,BLACK,3768,0.22638,0.472366,0.117241
