In [40]:
import numpy as np
import pandas as pd
import sys, os
sys.path.append("../")
from dataloaders.data_preprocessing import *
from general.utils import mkdir

%load_ext autoreload
%autoreload 2
%matplotlib inline

## Data loading adapted from the AIF360 `fetch_compas` example: https://aif360.readthedocs.io/en/latest/modules/generated/aif360.sklearn.datasets.fetch_compas.html

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import aif360
from aif360.sklearn.datasets.compas_dataset import fetch_compas

# Covariates requested from the AIF360 COMPAS loader.
compas_columns = ['sex', 'age', 'age_cat', 'race', 'juv_fel_count',
                  'juv_misd_count', 'juv_other_count', 'priors_count',
                  'c_charge_degree', 'c_charge_desc']
dataset = fetch_compas(data_home=None, binary_race=False, usecols=compas_columns,
                       dropcols=[], numeric_only=False, dropna=True)
df = dataset.X

# Binary race attribute: every value other than 'Caucasian' is relabelled
# 'nonCaucasian'; 'Caucasian' entries are kept unchanged.
race_values = np.asarray(df['race'], dtype=object)
df['race_bin'] = np.where(race_values == 'Caucasian', race_values, 'nonCaucasian')

# Integer target: 0 when the label is 'Survived', 1 for any other label.
df['two_year_recid'] = [0 if outcome == 'Survived' else 1
                        for outcome in dataset.y.values]

# Positional row id, assigned before the index is reset.
df['sample'] = np.arange(len(df))

# Drop the index that fetch_compas attaches and fall back to a plain RangeIndex.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,sex,age,age_cat,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,c_charge_desc,race_bin,two_year_recid,sample
0,Male,69,Greater than 45,Other,0,0,0,0,F,Aggravated Assault w/Firearm,nonCaucasian,0,0
1,Male,34,25 - 45,African-American,0,0,0,0,F,Felony Battery w/Prior Convict,nonCaucasian,1,1
2,Male,24,Less than 25,African-American,0,0,1,4,F,Possession of Cocaine,nonCaucasian,1,2
3,Male,44,25 - 45,Other,0,0,0,0,M,Battery,nonCaucasian,0,3
4,Male,41,25 - 45,Caucasian,0,0,0,14,F,Possession Burglary Tools,Caucasian,1,4


In [42]:
# Convert the categorical covariates to one-hot columns. The sensitive/target
# columns listed in s_columns are handed to extract_sensitive_columns first and
# re-attached at the end of the cell.
# NOTE(review): the previous comment said "exclude FAMILY INCOME"; no such column
# is requested from fetch_compas in this notebook, so the remark looks stale.
s_columns = ['race','race_bin','sex','two_year_recid','sample']  # extract sensitive columns
df_cat, s_dic = extract_sensitive_columns(s_columns, df)
# Pin the dummy dtype: pandas >= 2.0 produces bool dummies by default, which the
# integer-dtype check below would skip, silently disabling the rare-feature filter.
df_cat = pd.get_dummies(df_cat, dtype=np.uint8)

# A feature that is active (> 0) in fewer rows than this is removed, together
# with the rows in which it is active.
MIN_CATEGORY_COUNT = 20

print('Categories')
samples = df['sample'].values
# sorted() materialises the column list, so dropping columns inside the loop is safe.
# Counts are taken on the already-reduced frame, so the outcome depends on this order.
for cat in sorted(df_cat.columns):
    print(cat,df_cat[cat].dtype)
    if pd.api.types.is_integer_dtype(df_cat[cat]):
        # One mask drives both the count and the row removal. The original counted
        # with `> 0` but removed rows with `!= 1`, which disagree for count-valued
        # columns (identical for 0/1 dummies).
        active = df_cat[cat].values > 0
        n_active = np.sum(active)
        print(n_active)
        if n_active < MIN_CATEGORY_COUNT:
            samples = samples[~active] #remove samples with this feature on
            df_cat = df_cat[~active] #remove samples with this feature on
            df_cat = df_cat.drop(columns = [cat])
            print('Too small dropped')
print()

# `samples` is filtered in lock-step with df_cat, so the lengths must still agree.
assert len(samples) == len(df_cat), "sample ids and filtered rows are out of sync"

#incorporate sensitive_columns
# `samples` holds the surviving positional row ids and is used to index the
# value arrays in s_dic (assumes s_dic[s] keeps the original row order of df —
# TODO confirm against extract_sensitive_columns).
print('Sensitive columns')
for s in s_columns:
    print(s)
    df_cat[s] = s_dic[s].values[samples]
print()

df_cat.groupby(['sex','race_bin','two_year_recid']).count()

Categories
age int64
6167
age_cat_25 - 45 uint8
3528
age_cat_Greater than 45 uint8
1292
age_cat_Less than 25 uint8
1347
c_charge_degree_F uint8
3966
c_charge_degree_M uint8
2201
c_charge_desc_Abuse Without Great Harm uint8
1
Too small dropped
c_charge_desc_Agg Abuse Elderlly/Disabled Adult uint8
1
Too small dropped
c_charge_desc_Agg Assault W/int Com Fel Dome uint8
7
Too small dropped
c_charge_desc_Agg Battery Grt/Bod/Harm uint8
31
c_charge_desc_Agg Fleeing and Eluding uint8
6
Too small dropped
c_charge_desc_Agg Fleeing/Eluding High Speed uint8
2
Too small dropped
c_charge_desc_Aggr Child Abuse-Torture,Punish uint8
1
Too small dropped
c_charge_desc_Aggrav Battery w/Deadly Weapon uint8
57
c_charge_desc_Aggrav Child Abuse-Agg Battery uint8
3
Too small dropped
c_charge_desc_Aggrav Child Abuse-Causes Harm uint8
1
Too small dropped
c_charge_desc_Aggrav Stalking After Injunctn uint8
6
Too small dropped
c_charge_desc_Aggravated Assault uint8
5
Too small dropped
c_charge_desc_Aggravated Assaul

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M,...,c_charge_desc_Resist Officer w/Violence,c_charge_desc_Resist/Obstruct W/O Violence,c_charge_desc_Susp Drivers Lic 1st Offense,c_charge_desc_Tamper With Witness/Victim/CI,c_charge_desc_Tampering With Physical Evidence,c_charge_desc_Uttering a Forged Instrument,c_charge_desc_Viol Injunct Domestic Violence,c_charge_desc_arrest case no charge,race,sample
sex,race_bin,two_year_recid,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Female,Caucasian,0,262,262,262,262,262,262,262,262,262,262,...,262,262,262,262,262,262,262,262,262,262
Female,Caucasian,1,143,143,143,143,143,143,143,143,143,143,...,143,143,143,143,143,143,143,143,143,143
Female,nonCaucasian,0,373,373,373,373,373,373,373,373,373,373,...,373,373,373,373,373,373,373,373,373,373
Female,nonCaucasian,1,197,197,197,197,197,197,197,197,197,197,...,197,197,197,197,197,197,197,197,197,197
Male,Caucasian,0,785,785,785,785,785,785,785,785,785,785,...,785,785,785,785,785,785,785,785,785,785
Male,Caucasian,1,541,541,541,541,541,541,541,541,541,541,...,541,541,541,541,541,541,541,541,541,541
Male,nonCaucasian,0,1334,1334,1334,1334,1334,1334,1334,1334,1334,1334,...,1334,1334,1334,1334,1334,1334,1334,1334,1334,1334
Male,nonCaucasian,1,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,...,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460


In [46]:
# Write the processed table to disk without the DataFrame index.
# mkdir comes from general.utils (presumably creates the directory if missing — confirm).
save_dir = '/data/MLTdata/compas/dataset_processed/'
mkdir(save_dir)
csv_path = os.path.join(save_dir, 'bpf_dataset_cat.csv')
df_cat.to_csv(csv_path, index=False)

### Checking dataset splitting

In [52]:
from dataloaders.datasets import Compas_pandas

# Load split 1 of the processed COMPAS data, grouped by race_bin and sex, with
# two_year_recid as the utility column and norm_std enabled.
train_df, test_df, cov_tags = Compas_pandas(
    groups_list=['race_bin', 'sex'],
    utility='two_year_recid',
    norm_std=True,
    split=1,
)

 Split :  1  is loaded...
['train'] ['test']
------- Compas Dataset processing ---------- 
utility :  two_year_recid ; stratification_tags :  ['race_bin', 'sex', 'two_year_recid'] ; standarization :  True ; len(cov_tags) :  63
ntrain :  4076  ; ntest :  1019
check: overlap train and test ?  False



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_train[c] = (pd_train[c].values - mean) / std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_test[c] = (pd_test[c].values - mean) / std


In [53]:
# For each split, report its size and the share of rows in every 'strat' group,
# so the train and test proportions can be compared side by side.
for split_name, split_df in (('train', train_df), ('test', test_df)):
    n_rows = len(split_df)
    print(split_name + ' : ', n_rows, ' samples')
    print(split_df.groupby(['strat'])['utility'].count() / n_rows)
    print()

train_df.head()

train :  4076  samples
strat
Caucasian,Female,0,       0.051276
Caucasian,Female,1,       0.028214
Caucasian,Male,0,         0.154073
Caucasian,Male,1,         0.106232
nonCaucasian,Female,0,    0.073111
nonCaucasian,Female,1,    0.038763
nonCaucasian,Male,0,      0.261776
nonCaucasian,Male,1,      0.286555
Name: utility, dtype: float64

test :  1019  samples
strat
Caucasian,Female,0,       0.052012
Caucasian,Female,1,       0.027478
Caucasian,Male,0,         0.154073
Caucasian,Male,1,         0.105986
nonCaucasian,Female,0,    0.073602
nonCaucasian,Female,1,    0.038273
nonCaucasian,Male,0,      0.262022
nonCaucasian,Male,1,      0.286555
Name: utility, dtype: float64



Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M,...,c_charge_desc_arrest case no charge,race,race_bin,sex,two_year_recid,sample,utility,sample_index,strat,dataset
0,2.957248,-0.128684,-0.183274,-0.232776,-0.691628,-1.172141,1.953187,-0.521029,0.765587,-0.765587,...,-0.426409,Other,nonCaucasian,Male,0,0,0,0,"nonCaucasian,Male,0,",train
1,-0.046439,-0.128684,-0.183274,-0.232776,-0.691628,0.852972,-0.511883,-0.521029,0.765587,-0.765587,...,-0.426409,African-American,nonCaucasian,Male,1,1,1,1,"nonCaucasian,Male,1,",train
2,-0.904635,-0.128684,-0.183274,1.888855,0.140105,-1.172141,-0.511883,1.918901,0.765587,-0.765587,...,-0.426409,African-American,nonCaucasian,Male,1,2,1,2,"nonCaucasian,Male,1,",train
3,0.811758,-0.128684,-0.183274,-0.232776,-0.691628,0.852972,-0.511883,-0.521029,-1.30593,1.30593,...,-0.426409,Other,nonCaucasian,Male,0,3,0,3,"nonCaucasian,Male,0,",train
4,0.554299,-0.128684,-0.183274,-0.232776,2.219437,0.852972,-0.511883,-0.521029,0.765587,-0.765587,...,-0.426409,Caucasian,Caucasian,Male,1,4,1,4,"Caucasian,Male,1,",train
