## 1. Imports


In [50]:
import os
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns;

In [52]:
data_path = '../data/results/testset_balanced_200_1/'
folders= ['dt','gnb','lgr','gbt']
#folders= ['dt','gnb']

## 2. CSVs

In [53]:
def impact_csvs(data_path= 'data/results/',b_or_w = 'Black', folders= ['dt','gnb','lgr','gbt']):

    col_names_eg = []
    col_names_gs = []

    for i,f in enumerate(folders):
        if b_or_w == 'Black':
            path = f'{data_path}{f}/{f}_black_results.csv'
        else:
            path = f'{data_path}{f}/{f}_white_results.csv'

        df = pd.read_csv(path,index_col=0)
        df = df.reset_index()

        col_names_eg.append(f'EG+{f.upper()}')
        col_names_gs.append(f'GS+{f.upper()}')

        if i == 0:
            joined_df = df.iloc[:,-1]
        else:
            joined_df = pd.concat([joined_df, df.iloc[:,-1]], axis=1)

    joined_df.set_axis(folders, axis=1)


    # split dataframe after the two reduction algorithms
    df_eg = joined_df.iloc[:6,:]
    df_gs = pd.concat([joined_df.iloc[0:1,:],joined_df.iloc[6:,:]])

    # set new index
    df_eg['Constraint'] = ['Unmitigated', 'DP', 'EO', 'EOO','FPER','ERP']
    df_eg.set_index('Constraint',inplace=True)
    df_gs['Constraint'] = ['Unmitigated', 'DP', 'EO', 'EOO','FPER','ERP']
    df_gs.set_index('Constraint',inplace=True)
    df_eg.columns = col_names_eg
    df_gs.columns = col_names_gs

    df_final = pd.concat([df_eg, df_gs], axis=1)
    print('Group: ',b_or_w,'\n DataFrame: \n',df_final)
    print('A')
    df_final.to_csv(f'{data_path}/{b_or_w}_DI.csv')
    print('B')


In [54]:
impact_csvs(data_path,'Black', folders= ['dt','gnb','lgr','gbt'])
impact_csvs(data_path,'White', folders= ['dt','gnb','lgr','gbt'])

Group:  Black 
 DataFrame: 
              EG+DT  EG+GNB  EG+LGR  EG+GBT  GS+DT  GS+GNB  GS+LGR  GS+GBT
Constraint                                                               
Unmitigated  19.05   17.87   19.19   19.16  19.05   17.87   19.19   19.16
DP            2.99    1.92    3.00    3.72  17.16   19.07   16.05   17.16
EO           16.89   15.54   14.94   17.16  19.05   17.87   19.19   19.16
EOO          16.76   16.08   15.58   17.05  16.76   17.67   15.58   17.16
FPER         17.44   16.37   16.78   17.35  19.05   17.87   19.19   19.16
ERP          19.05   17.92   19.20   19.16  19.05   18.15   19.19   19.16
A
B
Group:  White 
 DataFrame: 
              EG+DT  EG+GNB  EG+LGR  EG+GBT  GS+DT  GS+GNB  GS+LGR  GS+GBT
Constraint                                                               
Unmitigated   9.75    3.57   10.02   10.42   9.75    3.57   10.02   10.42
DP           18.29   18.66   18.60   18.64  16.33   11.42   16.52   16.64
EO           16.58   15.72   14.59   16.75   9.75 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eg['Constraint'] = ['Unmitigated', 'DP', 'EO', 'EOO','FPER','ERP']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eg['Constraint'] = ['Unmitigated', 'DP', 'EO', 'EOO','FPER','ERP']


## 3. FP/TP/TN/FN Ratios

In [55]:
# Ratios
dfs = {} # list for pandas dfs
pd.set_option('display.max_columns', None)
for i,f in enumerate(folders):
    path = f'{data_path}{f}/{f}_all_types.csv'
    df = pd.read_csv(path)
    df = df.reset_index(drop=True)
    df = df.melt(var_name="ID",value_name="Category")
    df = df.groupby('ID').value_counts(normalize=True)
    df = df.reset_index()
    df = df.rename(columns= {0:'Ratio'})
    
    df = df.pivot(index='Category', columns='ID')['Ratio']
    df = df.fillna(0)
    df = df.round(decimals = 3)
    print('Classifier: ',f,'\n DataFrame: \n',df)
    df.to_csv(f'{data_path}{f}/{f}_type_ratios.csv')

Classifier:  dt 
 DataFrame: 
 ID        egdpB  egdpW  egeoB  egeoW  egerpB  egerpW  egfprpB  egfprpW  \
Category                                                                 
FN        0.039  0.124  0.080  0.075   0.134   0.035    0.086    0.059   
FP        0.204  0.046  0.091  0.082   0.049   0.159    0.084    0.090   
TN        0.296  0.454  0.409  0.418   0.451   0.341    0.416    0.410   
TP        0.461  0.376  0.420  0.425   0.366   0.465    0.414    0.441   

ID        egtprpB  egtprpW  gsdpB  gsdpW  gseoB  gseoW  gserpB  gserpW  \
Category                                                                 
FN          0.080    0.076  0.082  0.057  0.134  0.025   0.134   0.025   
FP          0.092    0.074  0.088  0.093  0.049  0.153   0.049   0.153   
TN          0.408    0.426  0.412  0.407  0.451  0.347   0.451   0.347   
TP          0.420    0.424  0.418  0.443  0.366  0.475   0.366   0.475   

ID        gsfprpB  gsfprpW  gstprpB  gstprpW  testB  testW  unmitB  unmitW  
Ca

In [56]:
# Absolute numbers
dfs = {} # list for pandas dfs
pd.set_option('display.max_columns', None)
for i,f in enumerate(folders):
    path = f'{data_path}{f}/{f}_all_types.csv'
    df = pd.read_csv(path)
    df = df.reset_index(drop=True)
    df = df.melt(var_name="ID",value_name="Category")
    df = df.groupby('ID').value_counts()
    df = df.reset_index()
    df = df.rename(columns= {0:'Number'})
    
    df = df.pivot(index='Category', columns='ID')['Number']
    df = df.fillna(0)
    print('Classifier: ',f,'\n DataFrame: \n',df)
    df.to_csv(f'{data_path}{f}/{f}_type_absolute.csv')

Classifier:  dt 
 DataFrame: 
 ID         egdpB   egdpW   egeoB   egeoW  egerpB  egerpW  egfprpB  egfprpW  \
Category                                                                     
FN         786.0  1793.0  1639.0  1090.0  2739.0   508.0   1749.0    856.0   
FP        4168.0   666.0  1852.0  1189.0  1007.0  2290.0   1721.0   1294.0   
TN        6033.0  6554.0  8349.0  6031.0  9194.0  4930.0   8480.0   5926.0   
TP        9415.0  5427.0  8562.0  6130.0  7462.0  6712.0   8452.0   6364.0   

ID        egtprpB  egtprpW   gsdpB   gsdpW   gseoB   gseoW  gserpB  gserpW  \
Category                                                                     
FN         1625.0   1103.0  1681.0   820.0  2739.0   365.0  2739.0   365.0   
FP         1876.0   1074.0  1794.0  1342.0  1007.0  2203.0  1007.0  2203.0   
TN         8325.0   6146.0  8407.0  5878.0  9194.0  5017.0  9194.0  5017.0   
TP         8576.0   6117.0  8520.0  6400.0  7462.0  6855.0  7462.0  6855.0   

ID        gsfprpB  gsfprpW  gst

# Analyzing Scores

#### Extractig Scores from csv into dataframes

In [57]:
# Scores Data Frames
classifier_dfs = {}
dfs_b = {}
dfs_w = {}
dfs_eg = {}
dfs_gs = {}
for f in folders:
    path = f'{data_path}{f}/{f}_all_scores.csv'
    df = pd.read_csv(path)
    df = df.reset_index(drop=True)
    

    df_black = df.filter(like='B')
    df_white = df.filter(like='W')
    df_eg = pd.concat([df.iloc[:,:4],df.filter(like='eg')],axis=1)
    df_gs = pd.concat([df.iloc[:,:4],df.filter(like='gs')],axis=1)
    
    classifier_dfs[f] = df
    dfs_b[f] = df_black
    dfs_w[f] = df_white
    dfs_eg[f] = df_eg
    dfs_gs[f] = df_gs
print(classifier_dfs['dt'])

       testB  testW  unmitB  unmitW  egdpB  egdpW  egeoB  egeoW  egtprpB  \
0        728  746.0     803   821.0    803  821.0    803  821.0      803   
1        576  355.0     651   355.0    651  355.0    651  355.0      651   
2        664  758.0     739   833.0    739  833.0    739  833.0      739   
3        626  552.0     701   627.0    701  552.0    701  552.0      701   
4        406  767.0     406   842.0    406  842.0    406  842.0      406   
...      ...    ...     ...     ...    ...    ...    ...    ...      ...   
20397    676    NaN     751     NaN    751    NaN    751    NaN      751   
20398    664    NaN     739     NaN    739    NaN    739    NaN      739   
20399    560    NaN     560     NaN    635    NaN    635    NaN      635   
20400    608    NaN     683     NaN    683    NaN    683    NaN      683   
20401    468    NaN     468     NaN    543    NaN    468    NaN      468   

       egtprpW  egfprpB  egfprpW  egerpB  egerpW  gsdpB  gsdpW  gseoB  gseoW  \
0      

### Checking if normal distributions:

if p < 0.01 (or < 0.05) then the distribution is significantly different from a normal distribution

In [35]:
for c,df in classifier_dfs.items():
    print('Classifier:',c)
    for col in df:
        data=df[col].dropna(axis=0)
        _,p = stats.kstest(data, "norm")
        if p > 0.01:
            print(col,',p:',p)
    print('Check for norm Distributions done')

Classifier: dt
Check for norm Distributions done
Classifier: gnb
Check for norm Distributions done
Classifier: lgr
Check for norm Distributions done
Classifier: gbt
Check for norm Distributions done


### Mann Whitney U test:

“a two-sample rank test for the difference between two population medians . . . It assumes that the data are independent random samples from two populations that have the same shape.”

In [36]:
mwu_path = f'{data_path}mwu/'
os.makedirs(mwu_path,exist_ok = True)

#### Variance of Distributions EG vs. GS
if p < 0.001 (or < 0.05) then the distributions are significantly different from each other

In [37]:
p_vals = pd.DataFrame(data={'Constraints': []})
p_signi = pd.DataFrame(data={'Constraints': []})
    
for (c1,df_eg),(c2,df_gs) in zip(dfs_eg.items(),dfs_gs.items()):
    col_signi = []
    col_vals = []
    idx = []
    
    for col_eg,col_gs in zip(df_eg,df_gs):
        
        idx.append(col_eg[2:])
        
        data_eg=df_eg[col_eg].dropna(axis=0)
        data_gs=df_gs[col_gs].dropna(axis=0)
        
        _,p = stats.mannwhitneyu(data_eg, data_gs)
        
        col_vals.append(p)
        if p< 0.05:
            col_signi.append('s')
        else:
            col_signi.append(' ')
            
    p_signi[c1] = col_signi
    p_vals[c1] = col_vals
    
p_vals['Constraints'] = idx
p_vals = p_vals.set_index('Constraints')
p_signi['Constraints'] = idx
p_signi = p_signi.set_index('Constraints')

p_vals = p_vals.round(decimals=3)
print(p_signi)
    
p_vals.to_csv(f'{mwu_path}p_eg_gs.csv')
p_signi.to_csv(f'{mwu_path}significanz_eg_gs.csv')


            dt gnb lgr gbt
Constraints               
stB                       
stW                       
mitB                      
mitW                      
dpB          s   s   s   s
dpW          s   s   s   s
eoB          s   s   s   s
eoW          s   s   s   s
tprpB            s        
tprpW            s        
fprpB        s   s   s   s
fprpW        s   s   s   s
erpB                      
erpW             s        


#### Variance of Distributions Each Model against each other
if p < 0.001 (or < 0.05) then the distributions are significantly different from each other

In [38]:
def p_values_mwu_1model(dfs,model=''):
    p_vals = pd.DataFrame(data={'Classifier': []})
    p_signi = pd.DataFrame(data={'Classifier': []})
    
    for c1,df1 in dfs.items():
        col_signi = []
        col_vals = []
        idx = []
        
        data1 = df1[model].dropna(axis=0)
        
        for c2,df2 in dfs.items():
            idx.append(c2)
            
            data2 =df2[model].dropna(axis=0)
            
            _,p = stats.mannwhitneyu(data1, data2)
            
            col_vals.append(p)
            if p< 0.05:
                col_signi.append('s')
            else:
                col_signi.append(' ')
                
        p_signi[c1] = col_signi
        p_vals[c1] = col_vals
        
    p_vals['Classifier'] = idx
    p_vals = p_vals.set_index('Classifier')
    p_signi['Classifier'] = idx
    p_signi = p_signi.set_index('Classifier')
    
    p_vals = p_vals.round(decimals=3)
    print(p_signi)
    
    p_vals.to_csv(f'{mwu_path}p_{model}.csv')
    p_signi.to_csv(f'{mwu_path}significanz_{model}.csv')

In [39]:
for col in classifier_dfs['dt']:
    print('\nC:',col)
    p_values_mwu_1model(classifier_dfs,col)


C: testB
           dt gnb lgr gbt
Classifier               
dt                       
gnb                      
lgr                      
gbt                      

C: testW
           dt gnb lgr gbt
Classifier               
dt                       
gnb                      
lgr                      
gbt                      

C: unmitB
           dt gnb lgr gbt
Classifier               
dt              s        
gnb         s           s
lgr                      
gbt             s        

C: unmitW
           dt gnb lgr gbt
Classifier               
dt              s        
gnb         s       s   s
lgr             s        
gbt             s        

C: egdpB
           dt gnb lgr gbt
Classifier               
dt                       
gnb                     s
lgr                      
gbt             s        

C: egdpW
           dt gnb lgr gbt
Classifier               
dt                       
gnb                      
lgr                      
gbt                      

C

#### Variance of Distributions unmitigated v Mitigated for each race

if p < 0.001 (or < 0.0005) then the distributions are significantly different from each other

In [40]:
def p_race_mwu(dfs, b_or_w = 'B'):
    p_vals = pd.DataFrame(data={'Constraints': []})
    p_signi = pd.DataFrame(data={'Constraints': []})
    
    for c,df in dfs.items():
        
        c = f'{c}{b_or_w}'
        col_signi = []
        col_vals = []
        idx = []
        
        data_unmiti = df[f'unmit{b_or_w}'].dropna(axis=0)
        
        for col in df:
            
            idx.append(col[:-1])
            
            data_miti=df[col].dropna(axis=0)
            
            _,p = stats.mannwhitneyu(data_unmiti, data_miti)
            
            col_vals.append(p)
            if p< 0.05:
                col_signi.append('s')
            else:
                col_signi.append(' ')
                
        p_signi[c] = col_signi
        p_vals[c] = col_vals
        
    p_vals['Constraints'] = idx
    p_vals = p_vals.set_index('Constraints')
    
    
    
    p_signi['Constraints'] = idx
    p_signi = p_signi.set_index('Constraints')
    
    p_vals = p_vals.round(decimals=3)
    print(p_signi)
    
    p_vals.to_csv(f'{mwu_path}p_un_vs_miti_{b_or_w}.csv')
    p_signi.to_csv(f'{mwu_path}significanz_un_vs_miti_{b_or_w}.csv')
   

In [41]:
print('Black:')
p_race_mwu(dfs_b,'B')

print('\nWhite:')
p_race_mwu(dfs_w,'W')

Black:
            dtB gnbB lgrB gbtB
Constraints                   
test          s    s    s    s
unmit                         
egdp          s    s    s    s
egeo          s    s    s    s
egtprp        s    s    s    s
egfprp        s    s    s    s
egerp                         
gsdp          s    s    s    s
gseo                          
gstprp        s    s    s    s
gsfprp                        
gserp                         

White:
            dtW gnbW lgrW gbtW
Constraints                   
test          s    s    s    s
unmit                         
egdp          s    s    s    s
egeo          s    s    s    s
egtprp        s    s    s    s
egfprp        s    s    s    s
egerp                         
gsdp          s    s    s    s
gseo                          
gstprp        s    s    s    s
gsfprp                        
gserp              s          


In [None]:
stats.pearsonr(data_unmiti, data_miti)