In [1]:
import pandas as pd
import os
# Read simulation.csv from the current working directory and preview the first rows.
csv_path = os.path.join(os.getcwd(), "simulation.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Label,Gender,Race,Usage,Income
0,0,MALE,Asian,3035,20657
1,0,FEMALE,White,1065,30909
2,0,MALE,White,1056,10901
3,0,MALE,White,6003,60390
4,0,MALE,Hispanic,5092,10612


In [2]:

def get_demographic_parity(p_pos, p_feature, p_pos_feature):
    """Return the positive rate inside one group.

    Args:
        p_pos: Overall fraction of positive rows. Accepted but not used by
            this measure (presumably so all measures can share one call
            signature -- confirm against the other measure functions).
        p_feature: Fraction of all rows that belong to the group.
        p_pos_feature: Fraction of all rows that are positive and in the group.

    Returns:
        The quotient ``p_pos_feature / p_feature``.
    """
    joint, marginal = p_pos_feature, p_feature
    return joint / marginal

In [4]:
# Worked example: demographic parity for every value of the "Race" column.
label_col = "Label"
sensitive_col = "Race"
num_rows = len(df)
positive_rows = df[df[label_col] == 1]  # rows whose label equals 1
num_pos = len(positive_rows)
feature_values = df[sensitive_col].unique()
# Fraction of all rows per group, and fraction of all rows that are both
# positive and in the group (both divided by the total row count).
p_feature_col = (df[sensitive_col].value_counts() / num_rows).rename("p_feature")
p_pos_feature_col = (positive_rows[sensitive_col].value_counts() / num_rows).rename("p_pos_feature")
new_df = pd.concat([p_feature_col, p_pos_feature_col], axis=1)
new_df["p_pos"] = num_pos / num_rows
new_df.apply(
    lambda row: get_demographic_parity(row["p_pos"], row["p_feature"], row["p_pos_feature"]),
    axis=1,
)

White       0.090526
Asian       0.062827
Black       0.110526
Hispanic    0.166667
Other       0.203704
dtype: float64

In [30]:
from constants import feature_measures_to_func
import itertools
def get_feature_metrics(df, sensitive_col, label_col):
    """Compute per-group probabilities and every registered feature measure.

    Args:
        df: DataFrame with one row per observation.
        sensitive_col: Name of the column whose distinct values define the groups.
        label_col: Name of the binary outcome column; rows equal to 1 are positive.

    Returns:
        DataFrame indexed by group value with the columns ``p_feature``
        (group rows / all rows), ``p_pos_feature`` (positive group rows /
        all rows) and ``p_pos`` (positive rows / all rows), plus one column
        per key of ``feature_measures_to_func``, named by the key's ``.value``.

    Raises:
        ValueError: If ``label_col`` holds a non-missing value other than 0 or 1.
        ZeroDivisionError: If ``df`` has no rows.
    """
    # Everything below treats `== 1` as "positive"; any other encoding would
    # silently be counted as negative, so reject it up front.
    observed_labels = set(df[label_col].dropna().unique())
    if not observed_labels <= {0, 1}:
        raise ValueError(
            f"{label_col!r} must contain only 0 or 1, "
            f"found {sorted(observed_labels, key=repr)}"
        )

    num_rows = df.shape[0]
    positives = df[df[label_col] == 1]  # filtered once, reused for both counts

    p_feature_col = df[sensitive_col].value_counts().rename("p_feature") / num_rows
    p_pos_feature_col = positives[sensitive_col].value_counts().rename("p_pos_feature") / num_rows
    # NOTE(review): for a non-categorical column, a group with no positive rows
    # is missing from the positive counts, so the concat leaves its
    # p_pos_feature as NaN rather than 0. Confirm the measure functions can
    # handle 0 before filling it here.
    new_df = pd.concat([p_feature_col, p_pos_feature_col], axis=1)
    new_df["p_pos"] = positives.shape[0] / num_rows
    for metric, func in feature_measures_to_func.items():
        new_df[metric.value] = new_df.apply(
            lambda row: func(row["p_pos"], row["p_feature"], row["p_pos_feature"]),
            axis=1,
        )
    return new_df

def get_gaps(df, sensitive_col, label_col):
    """Compute the difference of every feature measure for each pair of groups.

    Args:
        df: DataFrame with one row per observation.
        sensitive_col: Name of the column whose distinct values define the groups.
        label_col: Name of the binary outcome column.

    Returns:
        DataFrame with one row per unordered pair of groups: columns
        ``classA`` and ``classB`` name the pair, and for each key of
        ``feature_measures_to_func`` a ``<measure>_gap`` column holds the
        classA value minus the classB value. With fewer than two groups the
        frame has no rows.
    """
    metrics_df = get_feature_metrics(df, sensitive_col, label_col)
    # get_feature_metrics builds its index with value_counts(), which skips
    # missing values; drop them here too so every class has a metrics row.
    # unique() keeps first-appearance order, which fixes who is classA/classB.
    unique_vals = df[sensitive_col].dropna().unique()
    pairs = list(itertools.combinations(unique_vals, 2))  # all unordered class pairings
    gap_df = pd.DataFrame(pairs, columns=["classA", "classB"])
    for measure in feature_measures_to_func:
        # Look the measure up per class by index label; map() also works when
        # there are no pairs, where a row-wise apply followed by a column
        # lookup would fail.
        measure_by_class = metrics_df[measure.value]
        classA_metric = gap_df["classA"].map(measure_by_class)
        classB_metric = gap_df["classB"].map(measure_by_class)
        gap_df[measure.value + "_gap"] = classA_metric - classB_metric
    return gap_df

In [10]:
# Per-group probabilities and measures for the "Gender" column.
get_feature_metrics(df, sensitive_col="Gender", label_col="Label")

Unnamed: 0,p_feature,p_pos_feature,p_pos,demographic_parity,pointwise_mutual_info,sd_coef,jaccard_index,log_likelihood,ttest_pvalue
MALE,0.734,0.081,0.102,0.110354,-2.20406,0.09689,0.107285,-0.230524,0.099159
FEMALE,0.183,0.016,0.102,0.087432,-2.436897,0.05614,0.05948,-1.852384,0.609953
UNKNOWN,0.083,0.005,0.102,0.060241,-2.809403,0.027027,0.027778,-3.015535,1.016554


In [31]:
# Pairwise measure gaps between the values of the "Gender" column.
get_gaps(df, sensitive_col="Gender", label_col="Label")

Unnamed: 0,classA,classB,demographic_parity_gap,pointwise_mutual_info_gap,sd_coef_gap,jaccard_index_gap,log_likelihood_gap,ttest_pvalue_gap
0,MALE,FEMALE,0.022923,0.232838,0.04075,0.047805,1.62186,-0.510794
1,MALE,UNKNOWN,0.050113,0.605343,0.069863,0.079507,2.785011,-0.917394
2,FEMALE,UNKNOWN,0.027191,0.372505,0.029113,0.031702,1.163151,-0.4066
