# Import

In [11]:
import collections
from pathlib import Path

import numpy as np
import pandas as pd
from fairlearn.metrics import (demographic_parity_difference,
                               demographic_parity_ratio,
                               equalized_odds_difference,
                               equalized_odds_ratio,
                               MetricFrame,
                               false_positive_rate,
                               selection_rate,
                               true_positive_rate)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Parameters

In [12]:
# Experiment configuration.

# Income cut-off: PINCP values (and model predictions) above it count as the positive class.
INC_THRESHOLD = 50_000

# Number of decimal places kept when rounding reported scores.
ROUND_DIGITS = 4

# Columns used as model inputs.
# Currently excluded: 'AGEP', 'MAR', 'POBP', 'RELP', 'SEX', 'RAC1P'.
SELECT_FEATURES = ['COW', 'SCHL', 'OCCP', 'WKHP', 'ST']

# Subset of the selected columns that is one-hot encoded before training.
NOMINAL_FEATURES = ["COW", "OCCP", "ST"]

# Train/test split settings.
SPLIT_SEED = 42
TEST_SIZE = 0.3

# Read data

In [13]:
# Prompt-based samples (going by the file names); the "id" column becomes the index.
# NOTE(review): df_4 / df_5 are loaded from the prompt_3 / prompt_4 files — confirm
# the numbering gap is intentional.
df_1, df_2, df_4, df_5 = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in (
        "data/prompt_1_300.csv",
        "data/prompt_2_336.csv",
        "data/prompt_3_410.csv",
        "data/prompt_4_410.csv",
    )
)

# CTGAN / GC / TVAE / ROS samples, read with the default integer index.
df_CTGAN, df_GC, df_TVAE, df_ROS = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/CTGAN_data.csv",
        "data/GC_data.csv",
        "data/TVAE_data.csv",
        "data/ROS_data.csv",
    )
)

# Original ACS sample plus an independent copy of it.
df_og = pd.read_csv("data/ACS_50k.csv", index_col="id")
df_og_50k = df_og.copy()

In [14]:
# Count the distinct values of every nominal feature in the original data,
# then report the per-feature counts and their sum.
distinct_counts = {feat: len(df_og[feat].unique()) for feat in NOMINAL_FEATURES}
total_feature_values = sum(distinct_counts.values())

print("Number of feature values for selected nominal features")
print("----------------------")
for feat, num_feats in distinct_counts.items():
    print(feat, num_feats)

print("----------------------")
print("Total feature values:", total_feature_values)

Number of feature values for selected nominal features
----------------------
COW 8
OCCP 528
ST 51
----------------------
Total feature values: 587


# Pre-processing

In [15]:
# Map a dataset ID to its frame. "0" is the original data; every other entry is a
# synthetic sample that gets merged with the original below.
dfs_dict = {"0": df_og, "1": df_1, "2": df_2, "3": df_4, "4": df_5, "CTGAN": df_CTGAN, "GC": df_GC, "TVAE": df_TVAE, "ROS": df_ROS}

# Build the processed frames in a separate dict instead of rewriting dfs_dict
# while iterating over it.
processed = {}
for dataset_id, frame in dfs_dict.items():  # `dataset_id` avoids shadowing the builtin `id`
    if dataset_id != "0":
        # Append the original rows to the synthetic ones under a fresh RangeIndex.
        frame = pd.concat([frame, df_og], axis=0, ignore_index=True)
    # Cast the two numeric features to int. For dataset "0" this updates df_og
    # itself, exactly as the previous attribute assignment did.
    frame["SCHL"] = frame["SCHL"].astype(int)
    frame["WKHP"] = frame["WKHP"].astype(int)
    # reset_index() returns a new frame; the earlier code discarded that result,
    # so the call had no effect. Keeping the result gives every dataset,
    # including "0", a unique 0..n-1 index, so label-based lookups on the
    # split index match exactly one row each.
    processed[dataset_id] = frame.reset_index(drop=True)

# Keep the datasets ordered by their ID.
dfs_dict = collections.OrderedDict(sorted(processed.items()))

# Data splitting

In [16]:
def split_data(df):
    """Build the model inputs from ``df`` and split them into train and test parts.

    The SELECT_FEATURES columns are taken from ``df`` and the NOMINAL_FEATURES
    among them are one-hot encoded. The target is the ``PINCP`` column.

    Returns:
        (X_train, X_test, y_train, y_test). ``y_train`` keeps the raw PINCP
        values, whereas ``y_test`` is converted to 0/1 flags marking values
        above INC_THRESHOLD.
    """
    encoded_features = pd.get_dummies(df[SELECT_FEATURES], columns=NOMINAL_FEATURES)
    income = df['PINCP']

    X_train, X_test, y_train, y_test_raw = train_test_split(
        encoded_features,
        income,
        test_size=TEST_SIZE,
        random_state=SPLIT_SEED,
    )

    # Only the test target is binarized; the training target stays continuous.
    y_test_flags = (y_test_raw > INC_THRESHOLD).astype(int)
    return X_train, X_test, y_train, y_test_flags

# Training

In [17]:
def train_models(X_train, y_train):
    """Fit each configured estimator on the training data.

    Returns:
        dict mapping the estimator's class name to the fitted estimator.
    """
    forest = RandomForestRegressor(
        n_estimators=200,
        bootstrap=True,
        max_samples=0.8,
        random_state=42,
        n_jobs=-1,
    )

    fitted_by_name = {}
    for estimator in (forest,):
        estimator.fit(X_train, y_train)
        fitted_by_name[type(estimator).__name__] = estimator
    return fitted_by_name

# Metrics

In [18]:
def _make_predictions(model_dict, X_test):
    """Predict with every model and binarize the outputs at INC_THRESHOLD.

    Returns:
        dict mapping each key of ``model_dict`` to an integer array holding 1
        where the model's prediction exceeds INC_THRESHOLD and 0 elsewhere.
    """
    return {
        name: (fitted.predict(X_test) > INC_THRESHOLD).astype(int)
        for name, fitted in model_dict.items()
    }


def get_ml_metrics(model_dict, X_test, y_test):
    """Compute accuracy and F1 of the thresholded predictions for every model.

    Returns:
        dict of the form ``{model name: {'Accuracy': ..., 'F1': ...}}`` with
        both scores rounded to ROUND_DIGITS decimal places.
    """
    scores = {}
    for name, y_pred in _make_predictions(model_dict, X_test).items():
        scores[name] = {
            'Accuracy': round(accuracy_score(y_test, y_pred), ndigits=ROUND_DIGITS),
            'F1': round(f1_score(y_test, y_pred), ndigits=ROUND_DIGITS),
        }
    return scores


def _get_parity_frame(model_dict, y_true, X_test, sensitive_features):
    """Build one selection-rate MetricFrame per model, grouped by ``sensitive_features``.

    Returns:
        dict mapping each key of ``model_dict`` to its MetricFrame.
    """
    return {
        name: MetricFrame(
            metrics=selection_rate,
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=sensitive_features,
        )
        for name, y_pred in _make_predictions(model_dict, X_test).items()
    }


def _get_equalized_odds_frame(model_dict, y_true, X_test, sensitive_features):
    """Build one MetricFrame per model with the "tpr" and "fpr" metrics,
    grouped by ``sensitive_features``.

    Returns:
        dict mapping each key of ``model_dict`` to its MetricFrame.
    """
    return {
        name: MetricFrame(
            metrics={"tpr": true_positive_rate, "fpr": false_positive_rate},
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=sensitive_features,
        )
        for name, y_pred in _make_predictions(model_dict, X_test).items()
    }


def get_fairness_frames(model_dict, y_true, X_test, sensitive_features):
    """Collect the demographic-parity and equalized-odds frames of every model.

    Returns:
        dict of the form ``{model name: {'Demographic parity frame': MetricFrame,
        'Equalized odds frame': MetricFrame}}``.
    """
    frames_by_model = {}
    for name in model_dict:
        # The helpers take a dict of models, so wrap the current one on its own.
        single_model = {name: model_dict[name]}
        parity_frame = _get_parity_frame(single_model, y_true, X_test, sensitive_features)[name]
        odds_frame = _get_equalized_odds_frame(single_model, y_true, X_test, sensitive_features)[name]
        frames_by_model[name] = {
            'Demographic parity frame': parity_frame,
            'Equalized odds frame': odds_frame,
        }
    return frames_by_model
     


def get_fairness_metrics(model_dict, X_test, y_test, sensitive_features):
    """Compute the aggregate fairness metrics of every model's thresholded predictions.

    Returns:
        dict of the form ``{model name: {metric label: value}}`` holding the
        demographic parity difference/ratio and the equalized odds
        difference/ratio with respect to ``sensitive_features``.
    """
    predictions = _make_predictions(model_dict, X_test)

    # (label, metric function) pairs, evaluated in this order for each model.
    labelled_metrics = (
        ('Demographic parity difference', demographic_parity_difference),
        ('Demographic parity ratio', demographic_parity_ratio),
        ('Equalized odds difference', equalized_odds_difference),
        ('Equalized odds ratio', equalized_odds_ratio),
    )

    return {
        name: {
            label: metric_fn(y_test, y_pred, sensitive_features=sensitive_features)
            for label, metric_fn in labelled_metrics
        }
        for name, y_pred in predictions.items()
    }

# Main code

First, we analyze the full 50k dataset by computing fairness metrics across all race groups.

Second, we analyze the synthetic datasets after merging each of them with the original data.

In [19]:
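# Analyze fairness metrics on the whole 50k dataset (currently disabled).
# NOTE: the calls below omit the `sensitive_features` argument that
# get_fairness_metrics and get_fairness_frames require; add it before re-enabling.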

# X_train, X_test, y_train, y_test = split_data(df_og_50k)
# model_dict = train_models(X_train, y_train)
# fairness_metrics = get_fairness_metrics(model_dict, X_test, y_test)
# frames = get_fairness_frames(model_dict, y_test, X_test)
# for algo, frame_dict in frames.items():
#     print(algo)
#     print()
#     for name, frame in frame_dict.items():
#         print(name)
#         display(frame.by_group)

In [20]:
# Analyze every dataset in dfs_dict: train a model per dataset, then export the
# per-group TPR/FPR, their absolute gap, and the accuracy/F1 scores.

# RAC1P values of the two groups that are compared.
# NOTE(review): presumably ACS PUMS race codes — confirm which groups 3 and 6 denote.
GROUP_A, GROUP_B = 3, 6

# Create the output folder before the loop, so a missing directory cannot make
# the CSV export fail after all models have already been trained.
Path("results").mkdir(parents=True, exist_ok=True)

abs_rates = []
diff_rates = []
metrics = []

for dataset_id, df in dfs_dict.items():  # `dataset_id` avoids shadowing the builtin `id`
    X_train, X_test, y_train, y_test = split_data(df)

    model_dict = train_models(X_train, y_train)

    # Sensitive attribute of the test rows, looked up once and reused below.
    sensitive_test = df['RAC1P'].loc[X_test.index]

    # Aggregate fairness metrics; they are not written to any of the CSVs below.
    fairness_metrics = get_fairness_metrics(model_dict, X_test, y_test, sensitive_test)
    fairness_frames = get_fairness_frames(model_dict, y_test, X_test, sensitive_test)
    metric_frames = get_ml_metrics(model_dict, X_test, y_test)
    df_fair = fairness_frames['RandomForestRegressor']['Equalized odds frame'].by_group
    df_metrics = metric_frames['RandomForestRegressor']

    # Absolute TPR/FPR of each compared group (raises KeyError if a group is
    # absent from the test split).
    for group in (GROUP_A, GROUP_B):
        abs_rates.append({'Dataset_ID': dataset_id,
                          'Class': group,
                          'TPR': df_fair.at[group, 'tpr'],
                          'FPR': df_fair.at[group, 'fpr']})

    # Absolute gap between the two groups.
    diff_rates.append({'Dataset_ID': dataset_id,
                       'TPR_Diff': abs(df_fair.at[GROUP_A, 'tpr'] - df_fair.at[GROUP_B, 'tpr']),
                       'FPR_Diff': abs(df_fair.at[GROUP_A, 'fpr'] - df_fair.at[GROUP_B, 'fpr'])})

    metrics.append({'Dataset_ID': dataset_id,
                    'Accuracy': df_metrics['Accuracy'],
                    'F1': df_metrics['F1']})

pd.DataFrame(abs_rates).to_csv("results/tpr_fpr_abs.csv", index=False)
pd.DataFrame(diff_rates).to_csv("results/tpr_fpr_diff.csv", index=False)
pd.DataFrame(metrics).to_csv("results/metrics.csv", index=False)