In [1]:
# ML Unfairness

In [2]:
import pandas as pd
import numpy as np

In [3]:
##############################
# We have a dataframe with all the features, actual values, predicted values of all years
##############################

In [4]:
############################################################################
# get the dataframe only contains the information we want
# return a new dataframe?

# e.g.: if we want to analyze gender in year0
# we then have return dataframe that contains: 
# ids, columns of gender(dummies), corresponding actual values and predicted values
############################################################################

In [5]:
# parameters: original df(dataframe), year(string), feature(string)
def new_df (df, year, feature):
    # values
    year_cols = [x for x in df.columns[df.columns.str.contains(year)]]
    # feature
    feature_cols = [x for x in df.columns[df.columns.str.contains(feature)]]
    # ID
    ID_col = ['ID']
    # attach 
    cols = year_cols + feature_cols + ID_col
    # the new dataframe
    data = df[cols]
    # drop the nan values
    data = data.dropna(axis = 0)
    
    return data

In [6]:
df = pd.read_csv('full_data.csv')

In [7]:
new_df (df, 'year0', 'REGION')

Unnamed: 0,year0,year0_real,year0_pred,year0_prob,REGION_MIDWEST,REGION_NOTH_EAST,REGION_SOUTH_EAST,ID
2,1.0,2.0,1.0,0.418155,0,0,1,60076
4,1.0,2.0,1.0,0.493894,1,0,0,60082
14,1.0,2.0,1.0,0.254679,1,0,0,60493
24,1.0,2.0,2.0,0.648194,0,0,1,60517
27,1.0,2.0,2.0,0.570695,0,1,0,60525
...,...,...,...,...,...,...,...,...
48329,1.0,2.0,2.0,0.607721,0,1,0,159285
48331,1.0,2.0,2.0,0.537648,0,0,1,159288
48339,1.0,2.0,2.0,0.850086,0,0,1,159299
48343,0.0,1.0,2.0,0.541452,0,1,0,159307


In [8]:
############################################################################
# get the groups of the feature
# return a list

# e.g.: if we want to analyze gender
# we then return: ['M','F']
############################################################################

In [9]:
# parameters df(dataframe), feature(string)
def groups(df,feature):
    cols = [x for x in df.columns[df.columns.str.contains(feature)]]
    return cols

In [10]:
groups (df, 'REGION')

['REGION_MIDWEST', 'REGION_NOTH_EAST', 'REGION_SOUTH_EAST']

In [11]:
##################################################################
# get the actual and predicted values of different groups as 2-D arrays
# e.g. gender contains (M,F)
# return [[1,0,0...], [1,0,1...],[prob for M], [1,0,0...], [0,0,0...], [probn for F]]
#       actual M,  predicted M,                actual F, predicted F


##################################################################

In [12]:
# the function that seperate the results of defferent groups

def actual_pred_values(df, year, feature):
    # get the new dataframe contains the useful data
    data = new_df(df, year, feature)
    # get the number of groups
    num_groups = len(groups(data,feature))
    # get the columns names (groups)
    groups_name = groups(data,feature) 
        
    
    # the 2-D list to store the values
    # for the columns
    lst_values = []
    for i in range(0, num_groups):
        # create an empty array for this 
        # get the pred and actual for every group
        year_actual = year + '_real'
        year_pred = year + '_pred'
        year_prob = year + '_prob'
        # actual values
        lst_actual = data[data[groups_name[i]] == 1][year_actual].values.tolist()
        lst_values.append(list(lst_actual))
        # predicted values
        lst_pred = data.loc[data[groups_name[i]]== 1, year_pred].values.tolist()
        lst_values.append(list(lst_pred))
        # probabilities
        lst_prob = data.loc[data[groups_name[i]]== 1, year_prob].values.tolist()
        lst_values.append(list(lst_prob))
    
    return lst_values
     
    

In [13]:
actual_pred_values (df, 'year0','REGION')

[[2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  1.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  2.0,
  1.0,
  1.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,

In [14]:
# import dataframe
data = pd.read_csv('full_data.csv')

In [15]:
dataa = actual_pred_values(data, 'year8', 'GENDER')

In [16]:
dataa[2]

[0.11480452,
 0.23956259800000002,
 0.307035587,
 0.17793520000000002,
 0.148609302,
 0.21941434199999998,
 0.279356022,
 0.142140087,
 0.094178386,
 0.22036700399999998,
 0.36259500299999997,
 0.085214629,
 0.135339333,
 0.273387698,
 0.19656005399999998,
 0.151647216,
 0.37210097200000003,
 0.204476363,
 0.190829238,
 0.114764492,
 0.35598182700000003,
 0.14705163,
 0.164532368,
 0.154109518,
 0.162437083,
 0.07011687900000001,
 0.199877633,
 0.063841754,
 0.26112686399999996,
 0.236936316,
 0.18843871399999998,
 0.183167005,
 0.102350988,
 0.015329124,
 0.403621041,
 0.17656304399999997,
 0.21378983399999998,
 0.205214122,
 0.097177915,
 0.18349460199999998,
 0.185040398,
 0.280763532,
 0.100529921,
 0.236358806,
 0.16897085399999998,
 0.309241831,
 0.150011637,
 0.271883092,
 0.335357476,
 0.349945929,
 0.13936934,
 0.21572701,
 0.39694118899999997,
 0.187918406,
 0.134626444,
 0.119640358,
 0.081830932,
 0.223445195,
 0.373909372,
 0.230339011,
 0.18347011300000002,
 0.330951307,


In [17]:
data.head()

Unnamed: 0.1,Unnamed: 0,AGE,AGE_DON,BMI_CALC,BMI_DON_CALC,BUN_DON,CREAT_DON,CREAT_TRR,DAYS_STAT1,DAYS_STAT1A,...,year7_prob,year8_real,year8_pred,year8_prob,year9_real,year9_pred,year9_prob,year10_real,year10_pred,year10_prob
0,0,52,24,21.434609,26.631409,13.0,1.0,1.2,0,0,...,,,,,,,,,,
1,1,48,22,28.398718,37.787058,13.0,1.0,1.2,0,0,...,,,,,,,,,,
2,2,44,25,20.619254,23.344121,13.0,1.0,1.2,0,0,...,0.470412,1.0,2.0,0.512121,1.0,2.0,0.509773,1.0,2.0,0.505782
3,3,59,27,29.407788,29.158955,13.0,1.0,1.2,0,0,...,,,,,,,,,,
4,4,56,28,21.484375,27.124606,13.0,1.0,1.2,0,0,...,0.507765,2.0,2.0,0.524879,2.0,2.0,0.524476,2.0,2.0,0.545338


In [18]:
for i in range(0,len(data.columns)):
    print(data.columns[i])

Unnamed: 0
AGE
AGE_DON
BMI_CALC
BMI_DON_CALC
BUN_DON
CREAT_DON
CREAT_TRR
DAYS_STAT1
DAYS_STAT1A
DAYS_STAT1B
DAYS_STAT2
DAYSWAIT_CHRON
HEMO_CO_TCR
HEMO_CO_TRR
HEMO_PA_DIA_TCR
HEMO_PA_DIA_TRR
HEMO_PA_MN_TCR
HEMO_PA_MN_TRR
HEMO_PCW_TCR
HEMO_PCW_TRR
HEMO_SYS_TCR
HEMO_SYS_TRR
HGT_CM_CALC
HGT_CM_DON_CALC
HGT_CM_TCR
INIT_AGE
INIT_BMI_CALC
INIT_HGT_CM_CALC
INIT_WGT_KG_CALC
ISCHTIME
SGOT_DON
SGPT_DON
TBILI
TBILI_DON
WGT_KG_CALC
WGT_KG_DON_CALC
WGT_KG_TCR
BMI_CHNG
WGT_CHNG
HGT_CHNG
AGE_MAT
BMI_MAT
PVR
year1
year2
year3
year4
year5
year6
year7
year8
year9
year10
year0
ID
ABO_A
ABO_AB
ABO_B
ABO_DON_A
ABO_DON_AB
ABO_DON_B
ABO_MAT_IDENTICAL
AMIS_NO_MISMATCH
AMIS_ONE_MISMATCHED
AMIS_TWO_MISMATCHED
ANTIHYPE_DON_N
ANTIHYPE_DON_UNKNOWN
BMIS_NO_MISMATCH
BMIS_ONE_MISMATCHED
BMIS_TWO_MISMATCHED
BRONCHO_LT_DON_ABNORMAL
BRONCHO_LT_DON_N
BRONCHO_LT_DON_NORMAL
BRONCHO_LT_DON_OTHER
BRONCHO_RT_DON_ABNORMAL
BRONCHO_RT_DON_N
BRONCHO_RT_DON_NORMAL
BRONCHO_RT_DON_OTHER
CHEST_XRAY_DON_ABNORMAL_BOTH
CHEST_XRAY_DON_ABN

Statistical test

(1) Statistical signiﬁcance does not mean practical signiﬁcance; statistical tests do not show the magnitude of the the differences between the groups, which can be huge, or can be minor. 


(2) If the null hypothesis is rejected then discrimination is present, but if null hypothesis cannot be rejected, this does not prove that there is no discrimination. It maybe that the data sample is too small to declare discrimination.


In [19]:
    file_name = 'features_'+'year0'+'.csv'
    feature_data = pd.read_csv(file_name)
    features = list(feature_data['variables'])
    features

['AGE',
 'AGE_DON',
 'BMI_CALC',
 'CREAT_TRR',
 'DAYS_STAT1',
 'DAYSWAIT_CHRON',
 'HEMO_SYS_TCR',
 'HEMO_SYS_TRR',
 'HGT_CM_CALC',
 'HGT_CM_DON_CALC',
 'HGT_CM_TCR',
 'INIT_HGT_CM_CALC',
 'ISCHTIME',
 'SGOT_DON',
 'TBILI',
 'TBILI_DON',
 'PVR',
 'AMIS_ONE_MISMATCHED',
 'ANTIHYPE_DON_N',
 'BMIS_ONE_MISMATCHED',
 'COD_CAD_DON_CEREBROVASCULAR_STROKE',
 'CONTIN_CIG_DON_UNKNOWN',
 'CORONARY_ANGIO_UNKNOWN',
 'DIAG_DILATED_MYOPATHY_OTH',
 'DIAG_OTHER',
 'FUNC_STAT_TRR_ABLE',
 'FUNC_STAT_TRR_ASSISTED',
 'FUNC_STAT_TRR_OTHER',
 'GENDER_DON_F',
 'HIST_OTH_DRUG_DON_N',
 'HTLV2_OLD_DON_OTHER',
 'INOTROP_VASO_CO_TRR_UNKNOWN',
 'INOTROP_VASO_PCW_TRR_UNKNOWN',
 'INOTROP_VASO_SYS_TCR_UNKNOWN',
 'INOTROP_VASO_SYS_TRR_UNKNOWN',
 'INOTROPES_TRR_N',
 'INOTROPIC_N',
 'LIFE_SUP_TRR_N',
 'LIFE_SUP_TRR_UNKNOWN',
 'MED_COND_TRR_ICU_HOSPITALIZED',
 'PRIOR_CARD_SURG_TCR_N',
 'PRIOR_CARD_SURG_TRR_UNKNOWN',
 'PROC_TY_HR_BICAVAL',
 'PROC_TY_HR_OTHER',
 'PT_T4_DON_N',
 'REGION_NOTH_EAST',
 'REGION_SOUTH_EAST',
 'SHA

1. Regression slope test

In [20]:
from scipy.stats import linregress

def regression_slope_test(df, year, feature):
    # get the features we used for the year
    file_name = 'features_'+year+'.csv'
    feature_data = pd.read_csv(file_name)
    features = list(feature_data['variables'])
    
    # if the feature we want to check is not included in this year
    for x in features:
        if feature in x:
            print('applicable')
            break;
    
    #regression may include only the protected variable s as a predictor, but it may also include variables from X that may explain some of the observed difference in decisions.
    # first try regression include only the protected variable
    # define X and y
    
    # get the groups of feature
    group = groups(df, feature)
    
    # get the regression one by one?
    for x in group:
        pred = year + '_pred'
        prob = year + '_prob'
        
        
        data = df[[x, pred, prob]]
        data = data.dropna(axis = 0)
        X = data[x]

        y_pred = data[pred]
        y_prob = data[prob]
        
        slope, intercept, r_value, p_value, std_err = linregress(X, y_pred)
        print('prediction value ', x , 'p_value' , str(p_value) )
        slope, intercept, r_value, p_value, std_err = linregress(X, y_pred)
        print('probbility ' , x,  'p_value', str(p_value) )

        # more measures such as RMSE, R^2, MSE...


In [21]:
regression_slope_test(df, 'year0', 'GENDER')

applicable
prediction value  GENDER_M p_value 2.047124441310111e-05
probbility  GENDER_M p_value 2.047124441310111e-05
prediction value  GENDER_F p_value 2.047124441310111e-05
probbility  GENDER_F p_value 2.047124441310111e-05
prediction value  GENDER_DON_M p_value 1.7288832952971992e-79
probbility  GENDER_DON_M p_value 1.7288832952971992e-79
prediction value  GENDER_DON_F p_value 1.7288832952967557e-79
probbility  GENDER_DON_F p_value 1.7288832952967557e-79
prediction value  GENDER_MAT_N p_value 2.2467804144903932e-36
probbility  GENDER_MAT_N p_value 2.2467804144903932e-36


2. difference of means test

Calculate the T-test for the means of two independent samples of scores.
This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values. This test assumes that the populations have identical variances by default.

In [22]:
from scipy.stats import ttest_ind
from itertools import combinations 


def difference_of_means_test(df, year, feature):
    # get the features we used for the year
    file_name = 'features_'+year+'.csv'
    feature_data = pd.read_csv(file_name)
    features = list(feature_data['variables'])
    
    # if the feature we want to check is not included in this year
    for x in features:
        if feature in x:
            print('applicable')
            break;
    
    # get the groups of feature
    group = groups(df, feature)
    # get the list of values
    values = actual_pred_values(df, year, feature)
    
    # get the predicted values as a list
    predvals = []
    for i in range(0, len(group)):
        predvals.append(list(values[i*3+1]))
    
    # get the combination of the predvals
    pred_comb = list(combinations(predvals,2))
    # get the combination of the group names
    group_comb = list(combinations(group,2))
    
    # get the result of means test 
    for i in range(0,len(pred_comb)):
        group_name = group_comb[i]
        print('groups: ', group_name)
        # sigma
        p_1 = pred_comb[i][0]
        count = len(pred_comb[i][0])
        statistic, p = ttest_ind(pred_comb[i][0],pred_comb[i][1])
        print('statistic:', statistic, 'p-value: ', p)

In [23]:
difference_of_means_test(df, 'year0', 'GENDER')

applicable
groups:  ('GENDER_M', 'GENDER_F')
statistic: 4.261827395631901 p-value:  2.047124441310169e-05
groups:  ('GENDER_M', 'GENDER_DON_M')
statistic: -6.059581437501714 p-value:  1.3998704783410631e-09
groups:  ('GENDER_M', 'GENDER_DON_F')
statistic: 14.400720113644237 p-value:  1.4713429790865425e-46
groups:  ('GENDER_M', 'GENDER_MAT_N')
statistic: 10.12080918885036 p-value:  5.8233834705638726e-24
groups:  ('GENDER_F', 'GENDER_DON_M')
statistic: -8.50961667933534 p-value:  2.027864792080694e-17
groups:  ('GENDER_F', 'GENDER_DON_F')
statistic: 7.570454494374112 p-value:  4.386999686490028e-14
groups:  ('GENDER_F', 'GENDER_MAT_N')
statistic: 4.326710577515286 p-value:  1.543178279699763e-05
groups:  ('GENDER_DON_M', 'GENDER_DON_F')
statistic: 19.05634258013374 p-value:  1.7288832952954745e-79
groups:  ('GENDER_DON_M', 'GENDER_MAT_N')
statistic: 14.681091408607776 p-value:  2.9085620937702656e-48
groups:  ('GENDER_DON_F', 'GENDER_MAT_N')
statistic: -3.343982878062749 p-value:  0.00

3. Difference in proportions for two groups

The null hypothesis is that the rates of positive outcomes within the two groups are equal. 
Only applicable for two groups

In [45]:
from scipy import stats
import math 

def diff_in_prop_test (df, year, feature):
    # get the features we used for the year
    file_name = 'features_'+year+'.csv'
    feature_data = pd.read_csv(file_name)
    features = list(feature_data['variables'])
    # if the feature we want to check is not included in this year
    for x in features:
        if feature in x:
            print('applicable')
            break;
    
    # get the groups of feature
    group = groups(df, feature)
    # get the list of values
    values = actual_pred_values(df, year, feature)
    
    # get the actual values as a list
    predvals = []
    for i in range(0, len(group)):
        predvals.append(list(values[i*3+1]))
    
    # get the combination of the predvals
    pred_comb = list(combinations(predvals,2))
    # get the combination of the group names
    group_comb = list(combinations(group,2))
    
    # get the result of means test 
    for i in range(0,len(pred_comb)):
        group_name = group_comb[i]
        print('groups: ', group_name)
        
        # get sigma
        
        # get p_1_pos (proportion of positive in group 1) and 
        # p_1_neg (proportion of negative in group 1)
        group_1 = pred_comb[i][0]
        group_1_pos = [1 for x in group_1 if x == 1]
        group_1_neg = [0 for x in group_1 if x == 2]
        p_1_pos = len(group_1_pos)/len(group_1)
        p_1_neg = len(group_1_neg)/len(group_1)
        n_1 = len(group_1)
        
        # get p_2_pos (proportion of positive in group 2) and 
        # p_2_neg (proportion of negative in group 2)
        group_2 = pred_comb[i][1]
        group_2_pos = [1 for x in group_2 if x == 1]
        group_2_neg = [0 for x in group_2 if x == 2]
        p_2_pos = len(group_2_pos)/len(group_2)
        p_2_neg = len(group_2_neg)/len(group_2)
        n_2 = len(group_2)
        
        sigma = math.sqrt( p_1_pos * p_1_neg /n_1 + p_2_pos * p_2_neg /n_2)
        #get the z-statistics (typo in the reference?????)
        z = (p_1_pos - p_2_pos)/sigma
        
        # two sided? 
        p = stats.norm.cdf(z)
        print('statistic:', z, 'p-value: ', p)

In [46]:
diff_in_prop_test(df, 'year0', 'GENDER')

applicable
groups:  ('GENDER_M', 'GENDER_F')
statistic: -4.235612101341684 p-value:  1.139649778191816e-05
groups:  ('GENDER_M', 'GENDER_DON_M')
statistic: 6.066067120000043 p-value:  0.9999999993445982
groups:  ('GENDER_M', 'GENDER_DON_F')
statistic: -14.384070547519597 p-value:  3.2573713402031213e-47
groups:  ('GENDER_M', 'GENDER_MAT_N')
statistic: -10.065864680986007 p-value:  3.9098566733805826e-24
groups:  ('GENDER_F', 'GENDER_DON_M')
statistic: 8.365714260881798 p-value:  1.0
groups:  ('GENDER_F', 'GENDER_DON_F')
statistic: -7.5637690464622676 p-value:  1.9577734810031156e-14
groups:  ('GENDER_F', 'GENDER_MAT_N')
statistic: -4.32771212538398 p-value:  7.533310697088889e-06
groups:  ('GENDER_DON_M', 'GENDER_DON_F')
statistic: -18.864105045632478 p-value:  1.1252470942629294e-79
groups:  ('GENDER_DON_M', 'GENDER_MAT_N')
statistic: -14.46925715210761 p-value:  9.475388217353156e-48
groups:  ('GENDER_DON_F', 'GENDER_MAT_N')
statistic: 3.3438572220031113 p-value:  0.9995868887625873
