## 1. Imports


In [217]:
import os
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns;

In [218]:
data_path = '../results/syn_raw/bi/'
folders= ['dt','gnb','lgr','gbt']
#folders= ['dt']

## 2. CSVs

In [219]:
def impact_csvs(data_path= 'data/results/',b_or_w = 'Black', folders= ['dt','gnb','lgr','gbt']):

    col_names_df = []

    for i,f in enumerate(folders):
        if b_or_w == 'Black':
            path = f'{data_path}{f}/{f}_black_results.csv'
        else:
            path = f'{data_path}{f}/{f}_white_results.csv'

        df = pd.read_csv(path,index_col=0)
        df = df.reset_index()

        col_names_df.append(f'{f.upper()}')

        if i == 0:
            joined_df = df.iloc[:,-1]
        else:
            joined_df = pd.concat([joined_df, df.iloc[:,-1]], axis=1)

    joined_df.set_axis(folders, axis=1)


    # split dataframe after the two reduction algorithms
    df = joined_df.iloc[:6,:]

    # set new index
    df['Constraint'] = ['Unmitigated', 'DP', 'EO', 'EOO','FPER','ERP']
    df.set_index('Constraint',inplace=True)
    
    df.columns = col_names_df

    print('Group: ',b_or_w,'\n DataFrame: \n',df)
    print('A')
    df.to_csv(f'{data_path}/{b_or_w}_DI.csv')
    print('B')


In [220]:
impact_csvs(data_path,'Black', folders= ['dt','gnb','lgr','gbt'])
impact_csvs(data_path,'White', folders= ['dt','gnb','lgr','gbt'])

Group:  Black 
 DataFrame: 
                 DT    GNB    LGR    GBT
Constraint                             
Unmitigated  30.99  31.74  27.82  30.99
DP           25.61  23.90  25.07  26.49
EO           30.41  27.63  26.23  30.39
EOO          28.43  26.70  26.29  28.79
FPER         30.79  27.83  27.82  30.55
ERP          30.99  31.20  27.82  30.99
A
B
Group:  White 
 DataFrame: 
                 DT    GNB    LGR    GBT
Constraint                             
Unmitigated  38.64  35.58  38.01  38.33
DP           39.30  38.48  39.15  39.47
EO           37.75  36.15  35.47  37.86
EOO          39.52  36.80  38.85  39.36
FPER         39.09  37.52  38.01  38.99
ERP          35.94  33.99  35.50  35.93
A
B


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Constraint'] = ['Unmitigated', 'DP', 'EO', 'EOO','FPER','ERP']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Constraint'] = ['Unmitigated', 'DP', 'EO', 'EOO','FPER','ERP']


## 3. FP/TP/TN/FN Ratios

In [221]:
# Ratios
dfs = {} # list for pandas dfs
pd.set_option('display.max_columns', None)
for i,f in enumerate(folders):
    path = f'{data_path}{f}/{f}_all_types.csv'
    df = pd.read_csv(path)
    df = df.reset_index(drop=True)
    df = df.melt(var_name="ID",value_name="Category")
    df = df.groupby('ID').value_counts(normalize=True)
    df = df.reset_index()
    df = df.rename(columns= {0:'Ratio'})
    
    df = df.pivot(index='Category', columns='ID')['Ratio']
    df = df.fillna(0)
    df = df.round(decimals = 3)
    print('Classifier: ',f,'\n DataFrame: \n',df)
    df.to_csv(f'{data_path}{f}/{f}_type_ratios.csv')

Classifier:  dt 
 DataFrame: 
 ID          dpB    dpW    eoB    eoW   erpB   erpW  fprpB  fprpW  testB  \
Category                                                                  
FN        0.057  0.061  0.081  0.079  0.088  0.075  0.085  0.051  0.000   
FP        0.123  0.057  0.078  0.058  0.071  0.076  0.074  0.063  0.000   
TN        0.215  0.185  0.259  0.184  0.266  0.166  0.263  0.179  0.337   
TP        0.606  0.697  0.581  0.679  0.575  0.684  0.578  0.707  0.663   

ID        testW  tprpB  tprpW  unmitB  unmitW  
Category                                       
FN        0.000  0.070  0.064   0.088   0.044  
FP        0.000  0.097  0.054   0.071   0.069  
TN        0.242  0.240  0.188   0.266   0.172  
TP        0.758  0.592  0.694   0.575   0.714  
Classifier:  gnb 
 DataFrame: 
 ID          dpB    dpW    eoB    eoW   erpB   erpW  fprpB  fprpW  testB  \
Category                                                                  
FN        0.047  0.048  0.066  0.064  0.103  0.0

In [222]:
# Absolute numbers
dfs = {} # list for pandas dfs
pd.set_option('display.max_columns', None)
for i,f in enumerate(folders):
    path = f'{data_path}{f}/{f}_all_types.csv'
    df = pd.read_csv(path)
    df = df.reset_index(drop=True)
    df = df.melt(var_name="ID",value_name="Category")
    df = df.groupby('ID').value_counts()
    df = df.reset_index()
    df = df.rename(columns= {0:'Number'})
    
    df = df.pivot(index='Category', columns='ID')['Number']
    df = df.fillna(0)
    print('Classifier: ',f,'\n DataFrame: \n',df)
    df.to_csv(f'{data_path}{f}/{f}_type_absolute.csv')

Classifier:  dt 
 DataFrame: 
 ID           dpB      dpW     eoB      eoW    erpB     erpW   fprpB    fprpW  \
Category                                                                       
FN         848.0    923.0  1212.0   1195.0  1303.0   1130.0  1260.0    770.0   
FP        1824.0    856.0  1165.0    876.0  1062.0   1144.0  1104.0    953.0   
TN        3198.0   2798.0  3857.0   2778.0  3960.0   2510.0  3918.0   2701.0   
TP        9014.0  10539.0  8650.0  10267.0  8559.0  10332.0  8602.0  10692.0   

ID         testB    testW   tprpB    tprpW  unmitB   unmitW  
Category                                                     
FN           0.0      0.0  1046.0    964.0  1303.0    667.0  
FP           0.0      0.0  1445.0    813.0  1062.0   1050.0  
TN        5022.0   3654.0  3577.0   2841.0  3960.0   2604.0  
TP        9862.0  11462.0  8816.0  10498.0  8559.0  10795.0  
Classifier:  gnb 
 DataFrame: 
 ID           dpB      dpW     eoB      eoW    erpB     erpW   fprpB    fprpW  \
Cate

# Analyzing Scores

#### Extractig Scores from csv into dataframes

In [223]:
# Scores Data Frames
classifier_dfs = {}
dfs_b = {}
dfs_w = {}

for f in folders:
    path = f'{data_path}{f}/{f}_all_scores.csv'
    df = pd.read_csv(path)
    df = df.reset_index(drop=True)
    df = df.round(0)

    df_black = df.filter(like='B')
    df_white = df.filter(like='W')
    
    classifier_dfs[f] = df
    dfs_b[f] = df_black
    dfs_w[f] = df_white

print(classifier_dfs['dt'])
print(dfs_b['dt'])

       testB  testW  unmitB  unmitW    dpB  dpW    eoB  eoW  tprpB  tprpW  \
0      475.0    782   475.0     850  475.0  850  475.0  850  475.0    850   
1      624.0    608   699.0     683  699.0  683  699.0  683  699.0    683   
2      691.0    657   766.0     732  766.0  732  766.0  732  766.0    732   
3      361.0    788   361.0     850  361.0  850  361.0  850  361.0    850   
4      763.0    573   838.0     648  838.0  648  838.0  648  838.0    573   
...      ...    ...     ...     ...    ...  ...    ...  ...    ...    ...   
15111    NaN    576     NaN     426    NaN  426    NaN  576    NaN    576   
15112    NaN    675     NaN     750    NaN  750    NaN  750    NaN    750   
15113    NaN    782     NaN     850    NaN  850    NaN  850    NaN    850   
15114    NaN    758     NaN     833    NaN  833    NaN  833    NaN    833   
15115    NaN    537     NaN     537    NaN  537    NaN  537    NaN    537   

       fprpB  fprpW   erpB  erpW  
0      475.0    850  475.0   850  
1    

### Checking if normal distributions:

if p < 0.01 (or < 0.05) then the distribution is significantly different from a normal distribution

In [224]:
for c,df in classifier_dfs.items():
    print('Classifier:',c)
    for col in df:
        data=df[col].dropna(axis=0)
        _,p = stats.kstest(data, "norm")
        if p > 0.01:
            print(col,',p:',p)
    print('Check for norm Distributions done')

Classifier: dt
Check for norm Distributions done
Classifier: gnb
Check for norm Distributions done
Classifier: lgr
Check for norm Distributions done
Classifier: gbt
Check for norm Distributions done


### Mann Whitney U test:

“a two-sample rank test for the difference between two population medians . . . It assumes that the data are independent random samples from two populations that have the same shape.”

In [225]:
mwu_path = f'{data_path}mwu/'
os.makedirs(mwu_path,exist_ok = True)

#### Variance of Distributions Each Model against each other
if p < 0.001 (or < 0.05) then the distributions are significantly different from each other

In [226]:
def p_values_mwu_1model(dfs,model=''):
    p_vals = pd.DataFrame(data={'Classifier': []})
    p_signi = pd.DataFrame(data={'Classifier': []})
    
    for c1,df1 in dfs.items():
        col_signi = []
        col_vals = []
        idx = []
        
        data1 = df1[model].dropna(axis=0)
        
        for c2,df2 in dfs.items():
            idx.append(c2)
            
            data2 =df2[model].dropna(axis=0)
            
            _,p = stats.mannwhitneyu(data1, data2)
            
            col_vals.append(p)
            if p< 0.05:
                col_signi.append('s')
            else:
                col_signi.append(' ')
                
        p_signi[c1] = col_signi
        p_vals[c1] = col_vals
        
    p_vals['Classifier'] = idx
    p_vals = p_vals.set_index('Classifier')
    p_signi['Classifier'] = idx
    p_signi = p_signi.set_index('Classifier')
    
    p_vals = p_vals.round(decimals=3)
    print(p_signi)
    
    p_vals.to_csv(f'{mwu_path}p_{model}.csv')
    p_signi.to_csv(f'{mwu_path}significanz_{model}.csv')

In [227]:
for col in classifier_dfs['dt']:
    print('\nC:',col)
    p_values_mwu_1model(classifier_dfs,col)


C: testB
           dt gnb lgr gbt
Classifier               
dt                       
gnb                      
lgr                      
gbt                      

C: testW
           dt gnb lgr gbt
Classifier               
dt                       
gnb                      
lgr                      
gbt                      

C: unmitB
           dt gnb lgr gbt
Classifier               
dt                  s    
gnb                 s    
lgr         s   s       s
gbt                 s    

C: unmitW
           dt gnb lgr gbt
Classifier               
dt                       
gnb                      
lgr                      
gbt                      

C: dpB
           dt gnb lgr gbt
Classifier               
dt                       
gnb                     s
lgr                      
gbt             s        

C: dpW
           dt gnb lgr gbt
Classifier               
dt                       
gnb                      
lgr                      
gbt                      

C: eo

#### Variance of Distributions unmitigated v Mitigated for each race

if p < 0.001 (or < 0.0005) then the distributions are significantly different from each other

In [228]:
def p_race_mwu(dfs, b_or_w = 'B'):
    p_vals = pd.DataFrame(data={'Constraints': []})
    p_signi = pd.DataFrame(data={'Constraints': []})
    
    for c,df in dfs.items():
        
        c = f'{c}{b_or_w}'
        col_signi = []
        col_vals = []
        idx = []
        
        data_unmiti = df[f'unmit{b_or_w}'].dropna(axis=0)
        
        for col in df:
            
            idx.append(col[:-1])
            
            data_miti=df[col].dropna(axis=0)
            
            _,p = stats.mannwhitneyu(data_unmiti, data_miti)
            
            col_vals.append(p)
            if p< 0.05:
                col_signi.append('s')
            else:
                col_signi.append(' ')
                
        p_signi[c] = col_signi
        p_vals[c] = col_vals
        
    p_vals['Constraints'] = idx
    p_vals = p_vals.set_index('Constraints')
    
    
    
    p_signi['Constraints'] = idx
    p_signi = p_signi.set_index('Constraints')
    
    p_vals = p_vals.round(decimals=3)
    print(p_signi)
    
    p_vals.to_csv(f'{mwu_path}p_un_vs_miti_{b_or_w}.csv')
    p_signi.to_csv(f'{mwu_path}significanz_un_vs_miti_{b_or_w}.csv')
   

In [229]:
print('Black:')
p_race_mwu(dfs_b,'B')

print('\nWhite:')
p_race_mwu(dfs_w,'W')

Black:
            dtB gnbB lgrB gbtB
Constraints                   
test          s    s    s    s
unmit                         
dp            s    s    s    s
eo                 s          
tprp          s    s         s
fprp               s          
erp                           

White:
            dtW gnbW lgrW gbtW
Constraints                   
test          s    s    s    s
unmit                         
dp                            
eo                            
tprp                          
fprp                          
erp                     s     


Fairness Metrics 

In [230]:
# Absolute numbers
dfs = {} # list for pandas dfs
#datasets = ['00', '0b', '0i', 'b0','bb','bi','i0','ib','ii']
#datasets = ['00', '0b']
folders = ['dt','gnb','lgr','gbt']
df2 = pd.Series()
#pd.set_option('display.max_columns', None)
for _,f in enumerate(folders):
    print(f)
    #for _,ds in enumerate(datasets):
        #print(ds)
    path = f'{data_path}{f}/{f}_overall_results.csv'
    df = pd.read_csv(path)
    df = df.set_index('Run')
    df = df.iloc[:,12:]
    
  #  df2 = df.iloc[:,:0]
  #  df2['FairDiff'] = df.iloc[:,0]
   # print(df,df2)
   # for i in range(1,5):
      #  df2.iloc[i,0] = df.iloc[i,i]

    #print(df2)
    df = df.set_index(pd.Index(['Unmitigated','DP','EO','TPRP','FPRP','ERP']))
    print(df)
    #dfs[f] = df2
    #print('\nClassifier: ',f,'\n DataFrame: \n',df)
    df.to_csv(f'{data_path}/{f}/{f}_fair_performance.csv')

dt
             DP Diff  EO Diff  TPR Diff  FPR Diff  ER Diff
Unmitigated    13.72     7.59      7.39      7.59     4.53
DP              2.57    12.89      0.55     12.89     6.18
EO              7.77     1.86      1.86      0.78     2.27
TPRP            5.89     6.52      2.20      6.52     4.98
FPRP           11.83     6.06      6.06      4.10     4.48
ERP            11.28    10.16      3.35     10.16     0.85
gnb
             DP Diff  EO Diff  TPR Diff  FPR Diff  ER Diff
Unmitigated    22.29    24.69     13.07     24.69     4.01
DP              2.38    12.93      0.75     12.93     6.96
EO              6.87     1.59      1.59      0.40     3.19
TPRP            5.86     2.53      1.31      2.53     3.98
FPRP           11.24     6.10      6.10      3.83     5.52
ERP            17.44    21.05      7.96     21.05     1.22
lgr
             DP Diff  EO Diff  TPR Diff  FPR Diff  ER Diff
Unmitigated    10.15     5.35      5.35      1.70     5.60
DP              2.29    13.29      0.44     1

  df2 = pd.Series()


In [231]:
# Absolute numbers
dfs = {} # list for pandas dfs
#datasets = ['00', '0b', '0i', 'b0','bb','bi','i0','ib','ii']
#datasets = ['00', '0b']
folders = ['dt','gnb','lgr','gbt']
df2 = pd.Series()
#pd.set_option('display.max_columns', None)
for _,f in enumerate(folders):
    print(f)
    #for _,ds in enumerate(datasets):
        #print(ds)
    path = f'{data_path}{f}/{f}_overall_results.csv'
    df = pd.read_csv(path)
    df = df.set_index('Run')
    df = pd.concat([df.iloc[:,:1],df.iloc[:,3:4]],axis=1)
    
  #  df2 = df.iloc[:,:0]
  #  df2['FairDiff'] = df.iloc[:,0]
   # print(df,df2)
   # for i in range(1,5):
      #  df2.iloc[i,0] = df.iloc[i,i]

    #print(df2)
    df = df.set_index(pd.Index(['Unmitigated','DP','EO','TPRP','FPRP','ERP']))
    print(df)
    #dfs[f] = df2
    #print('\nClassifier: ',f,'\n DataFrame: \n',df)
    df.to_csv(f'{data_path}/{f}/{f}_model_performance.csv')

dt
               Acc  F1weighted
Unmitigated  86.39       86.36
DP           85.16       84.91
EO           85.17       85.26
TPRP         85.77       85.71
FPRP         86.38       86.37
ERP          84.54       84.60
gnb
               Acc  F1weighted
Unmitigated  85.67       85.56
DP           84.88       84.35
EO           84.25       84.01
TPRP         84.66       84.34
FPRP         85.60       85.28
ERP          84.09       84.10
lgr
               Acc  F1weighted
Unmitigated  85.86       85.53
DP           85.18       84.83
EO           83.61       83.17
TPRP         85.37       85.09
FPRP         85.86       85.53
ERP          83.83       83.72
gbt
               Acc  F1weighted
Unmitigated  86.41       86.35
DP           85.20       85.06
EO           85.18       85.28
TPRP         85.66       85.64
FPRP         86.36       86.33
ERP          84.66       84.70


  df2 = pd.Series()


In [232]:
#stats.pearsonr(data_unmiti, data_miti)