In [1]:
import numpy as np
import pandas as pd
import sys, os
sys.path.append("../")
from dataloaders.data_preprocessing import *
from general.utils import mkdir

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Load the raw LSAC law-school table, keep the modelling columns, drop rows
# with missing scores / sex / race, and derive the grouping columns used
# downstream (merged family-income levels, binary race, sample index).
file_dir = '/data/MLTdata/law_school/'
file_path = os.path.join(file_dir, 'lsac.sas7bdat')


def _as_str_labels(column):
    """Cast a DataFrame column to a NumPy string array (missing values become 'nan')."""
    # The NumPy cast (not pandas' .astype(str)) is kept on purpose: the gender
    # column is read as bytes (b'female', b'male') and the labels compared
    # against below are plain strings such as 'white'.
    return column.values.astype('str')


def _relabel(labels, old, new):
    """Return a copy of `labels` in which every entry equal to `old` is replaced by `new`."""
    relabelled = np.array(labels)
    relabelled[relabelled == old] = new
    return relabelled


# gender column: b'female', b'male'
df = pd.read_sas(file_path)
print('original attributes :', df.columns)
df = df[['parttime', 'lsat', 'ugpa', 'pass_bar',
         'zfygpa', 'fam_inc', 'gender', 'race1']].copy()

renameColumns = {'gender': 'sex',
                 'race1': 'race'}

# Renaming columns
df = df.rename(columns=renameColumns)
print('final attributes :', df.columns)
print('original nsamples' ,len(df))

print()

### Clean ###
# Every filter is followed by .copy() so the column assignments below write to
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df[np.isfinite(df['lsat']) & np.isfinite(df['ugpa'])].copy()

# Drop rows whose sex or race is missing (missing entries stringify to 'nan').
for label_col in ('sex', 'race'):
    df[label_col] = _as_str_labels(df[label_col])
    df = df[df[label_col].values != 'nan'].copy()

# Missing part-time / family-income entries are kept as their own '?' level.
values_parttime = _relabel(_as_str_labels(df['parttime']), 'nan', '?')
values_faminc = _relabel(_as_str_labels(df['fam_inc']), 'nan', '?')

df['parttime'] = values_parttime
df['fam_inc'] = values_faminc

# Coarser family-income groupings: fold level 1 into 2, level 5 into 4, or both.
values_faminc_m12 = _relabel(values_faminc, '1.0', '2.0')
print(np.unique(values_faminc_m12))
df['fam_inc_m12'] = values_faminc_m12

values_faminc_m45 = _relabel(values_faminc, '5.0', '4.0')
print(np.unique(values_faminc_m45))
df['fam_inc_m45'] = values_faminc_m45

values_faminc_m12m45 = _relabel(values_faminc_m45, '1.0', '2.0')
print(np.unique(values_faminc_m12m45))
df['fam_inc_m12m45'] = values_faminc_m12m45


## incorporate binary race white nonwhite
race_bin_values = np.where(df['race'].values == 'white', 'white', 'nonwhite')
df['race_bin'] = race_bin_values

## index sample column
df['sample'] = np.arange(len(df))

print('total final nsamples', len(df))
print('final attributes : ')
for col in df.columns:
    print(col)

# Rows with a missing pass_bar are NOT removed above; they are simply absent
# from this table because groupby skips NaN keys by default.
df.groupby(['race', 'sex', 'pass_bar']).count()

original attributes : Index(['decile1b', 'decile3', 'ID', 'decile1', 'sex', 'race', 'cluster',
       'lsat', 'ugpa', 'zfygpa', 'DOB_yr', 'grad', 'zgpa', 'bar1', 'bar1_yr',
       'bar2', 'bar2_yr', 'fulltime', 'fam_inc', 'age', 'gender', 'parttime',
       'male', 'race1', 'race2', 'Dropout', 'other', 'asian', 'black', 'hisp',
       'pass_bar', 'bar', 'tier', 'index6040', 'indxgrp', 'indxgrp2'],
      dtype='object')
final attributes : Index(['parttime', 'lsat', 'ugpa', 'pass_bar', 'zfygpa', 'fam_inc', 'sex',
       'race'],
      dtype='object')
original nsamples 27478

['2.0' '3.0' '4.0' '5.0' '?']
['1.0' '2.0' '3.0' '4.0' '?']
['2.0' '3.0' '4.0' '?']
total final nsamples 27085
final attributes : 
parttime
lsat
ugpa
pass_bar
zfygpa
fam_inc
sex
race
fam_inc_m12
fam_inc_m45
fam_inc_m12m45
race_bin
sample


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,parttime,lsat,ugpa,zfygpa,fam_inc,fam_inc_m12,fam_inc_m45,fam_inc_m12m45,race_bin,sample
race,sex,pass_bar,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
asian,female,0.0,28,28,28,28,28,28,28,28,28,28
asian,female,1.0,422,422,422,389,422,422,422,422,422,422
asian,male,0.0,42,42,42,37,42,42,42,42,42,42
asian,male,1.0,413,413,413,391,413,413,413,413,413,413
black,female,0.0,185,185,185,180,185,185,185,185,185,185
black,female,1.0,648,648,648,620,648,648,648,648,648,648
black,male,0.0,113,113,113,105,113,113,113,113,113,113
black,male,1.0,399,399,399,377,399,399,399,399,399,399
hisp,female,0.0,59,59,59,55,59,59,59,59,59,59
hisp,female,1.0,419,419,419,395,419,419,419,419,419,419


In [3]:
# One-hot encode the categorical feature columns. The columns listed in
# s_columns (every family-income variant included) are pulled out first so
# they are NOT one-hot encoded, then re-attached unchanged afterwards.
s_columns = ['race', 'race_bin', 'sex', 'pass_bar', 'sample',
             'fam_inc', 'fam_inc_m12', 'fam_inc_m45', 'fam_inc_m12m45']

# extract_sensitive_columns is a project helper; from its use here it returns
# the remaining feature frame plus a mapping column-name -> extracted column.
df_cat, s_dic = extract_sensitive_columns(s_columns, df)
df_cat = pd.get_dummies(df_cat)

# For each encoded column, report how many rows hold a strictly positive value.
print('Categories')
for feature_name, feature_values in df_cat.items():
    print(feature_name, np.sum(feature_values.values > 0))
print()

# Re-attach the held-out columns in their original (non-encoded) form.
print('Sensitive columns')
for held_out in s_columns:
    print(held_out)
    df_cat[held_out] = s_dic[held_out].values
print()

df_cat.head()

Categories
lsat 27085
ugpa 27083
zfygpa 12582
parttime_0.0 24512
parttime_1.0 2530
parttime_? 43

Sensitive columns
race
race_bin
sex
pass_bar
sample
fam_inc
fam_inc_m12
fam_inc_m45
fam_inc_m12m45



  if __name__ == '__main__':


Unnamed: 0,lsat,ugpa,zfygpa,parttime_0.0,parttime_1.0,parttime_?,race,race_bin,sex,pass_bar,sample,fam_inc,fam_inc_m12,fam_inc_m45,fam_inc_m12m45
0,30.0,3.1,-1.79,1,0,0,white,white,female,,0,4.0,4.0,4.0,4.0
1,44.0,3.5,1.33,1,0,0,white,white,female,1.0,1,5.0,5.0,4.0,4.0
2,29.0,3.5,-0.11,1,0,0,white,white,female,1.0,2,4.0,4.0,4.0,4.0
3,35.0,3.0,1.22,0,1,0,white,white,female,,3,5.0,5.0,4.0,4.0
4,39.0,2.9,0.88,0,1,0,white,white,female,,4,4.0,4.0,4.0,4.0


In [4]:
# Persist the processed table as CSV without the row index.
save_dir = '/data/MLTdata/law_school/dataset_processed/'
mkdir(save_dir)  # project helper; presumably creates the directory if missing - confirm
df_cat.to_csv(os.path.join(save_dir, 'dataset_cat.csv'), index=False)

### Checking dataset splitting

In [5]:
from dataloaders.datasets import lawschool_pandas

# Load split 1 of the processed dataset, grouped by merged family income and
# binary race, with pass_bar as the utility (target) column.
# NOTE(review): norm_std=True appears to standardise numeric covariates (the
# captured log shows `(values - mean) / std`) - confirm in lawschool_pandas.
train_df, test_df, cov_tags = lawschool_pandas(
    groups_list=['fam_inc_m12m45', 'race_bin'],
    utility='pass_bar',
    norm_std=True,
    split=1,
)

['fam_inc_m12m45', 'race_bin'] dict_keys(['race', 'race_bin', 'sex', 'fam_inc', 'zfygpa', 'pass_bar', 'sample', 'parttime', 'fam_inc_m12', 'fam_inc_m45', 'fam_inc_m12m45'])
*** split:  1
TRAIN: [    1     2     3 ... 22519 22520 22521] TEST: [    0     8     9 ... 22514 22517 22518]
% train  0.7999733593819377
 Split :  1  is loaded...
['train'] ['test']

*** split:  2
TRAIN: [    0     1     2 ... 22519 22520 22521] TEST: [    5    31    40 ... 22503 22508 22516]
% train  0.7999733593819377

*** split:  3
TRAIN: [    0     2     4 ... 22518 22520 22521] TEST: [    1     3     6 ... 22507 22513 22519]
% train  0.8000177604120415

*** split:  4
TRAIN: [    0     1     3 ... 22518 22519 22520] TEST: [    2     4    13 ... 22509 22515 22521]
% train  0.8000177604120415

*** split:  5
TRAIN: [    0     1     2 ... 22518 22519 22521] TEST: [   11    12    15 ... 22510 22512 22520]
% train  0.8000177604120415

------- Law school admission Dataset processing ---------- 
utility :  pass_bar ; 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_train[c] = (pd_train[c].values - mean) / std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_test[c] = (pd_test[c].values - mean) / std


In [6]:
# Compare the share of each stratification group in the train and test splits;
# matching proportions indicate the split was stratified as intended.
# NOTE(review): the saved output lists family-income levels 1.0 and 5.0 in
# `strat`, although the split was requested on 'fam_inc_m12m45' (which merges
# those levels away) - re-run and check how lawschool_pandas builds `strat`.
for split_name, split_df in (('train', train_df), ('test', test_df)):
    n_rows = len(split_df)
    print(split_name + ' : ', n_rows, ' samples')
    print(split_df.groupby(['strat'])['utility'].count() / n_rows)
    print()

train_df.head()

train :  18017  samples
strat
1.0,nonwhite,0.0,    0.002109
1.0,nonwhite,1.0,    0.008159
1.0,white,0.0,       0.000666
1.0,white,1.0,       0.009547
2.0,nonwhite,0.0,    0.005883
2.0,nonwhite,1.0,    0.025309
2.0,white,0.0,       0.002664
2.0,white,1.0,       0.065771
3.0,nonwhite,0.0,    0.009269
3.0,nonwhite,1.0,    0.055170
3.0,white,0.0,       0.010490
3.0,white,1.0,       0.282844
4.0,nonwhite,0.0,    0.005217
4.0,nonwhite,1.0,    0.042793
4.0,white,0.0,       0.011545
4.0,white,1.0,       0.381196
5.0,nonwhite,0.0,    0.000944
5.0,nonwhite,1.0,    0.006771
5.0,white,0.0,       0.002054
5.0,white,1.0,       0.071599
Name: utility, dtype: float64

test :  4505  samples
strat
1.0,nonwhite,0.0,    0.002220
1.0,nonwhite,1.0,    0.007991
1.0,white,0.0,       0.000666
1.0,white,1.0,       0.009545
2.0,nonwhite,0.0,    0.005771
2.0,nonwhite,1.0,    0.025083
2.0,white,0.0,       0.002664
2.0,white,1.0,       0.065927
3.0,nonwhite,0.0,    0.009323
3.0,nonwhite,1.0,    0.055050
3.0,white,0

Unnamed: 0,lsat,ugpa,race,race_bin,sex,fam_inc,zfygpa,pass_bar,sample,fam_inc_m12,fam_inc_m45,fam_inc_m12m45,utility,sample_index,strat,dataset
2,-1.437321,0.652408,white,white,female,4.0,-0.11,1.0,2,4.0,4.0,4.0,1.0,2,"4.0,white,1.0,",train
5,0.029596,0.411123,white,white,male,3.0,0.63,1.0,5,3.0,3.0,3.0,1.0,5,"3.0,white,1.0,",train
6,1.129784,0.169838,white,white,female,4.0,0.67,1.0,6,4.0,4.0,4.0,1.0,6,"4.0,white,1.0,",train
8,0.763055,0.169838,white,white,female,4.0,-0.67,1.0,8,4.0,4.0,4.0,1.0,8,"4.0,white,1.0,",train
9,-2.262462,-2.484297,white,white,male,4.0,-1.73,1.0,9,4.0,4.0,4.0,1.0,9,"4.0,white,1.0,",train
