In [13]:
# Notebook setup: third-party imports, project imports and IPython configuration.
import numpy as np
import pandas as pd
import sys, os
# Make the parent directory importable so the project packages
# (`dataloaders`, `general`) referenced below resolve from this notebook.
sys.path.append("../")
# NOTE(review): the star import hides which names come from data_preprocessing;
# consider importing the needed names explicitly.
from dataloaders.data_preprocessing import *
from general.utils import mkdir

# autoreload (mode 2) re-imports modules before each cell execution, so edits
# to the project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Load the raw LSAC law-school table, keep the columns used downstream,
# clean missing values and add the derived `race_bin` / `sample` columns.
file_dir = '/data/MLTdata/law_school/'
file_path = os.path.join(file_dir, 'lsac.sas7bdat')

# Original note: gender column holds b'female', b'male' in the raw file.
df = pd.read_sas(file_path)
print('original attributes :', df.columns)

# Restrict to the attributes of interest; .copy() gives an independent frame.
df = df[['lsat', 'ugpa', 'pass_bar',
         'zfygpa', 'fam_inc', 'gender', 'race1']].copy()

renameColumns = {'gender': 'sex',
                 'race1': 'race'}

# Renaming columns
df = df.rename(columns=renameColumns)
print('final attributes :', df.columns)
print('original nsamples', len(df))

print()

### Clean ###
# Every filter below ends with .copy(): a boolean-mask selection is otherwise
# tracked by pandas as derived from the pre-filter frame, and the column
# assignments that follow would be flagged as chained assignment
# (SettingWithCopyWarning). Copying makes each stage an independent frame.

# Drop rows whose lsat or ugpa is NaN / infinite.
df = df[np.isfinite(df['lsat']) & np.isfinite(df['ugpa'])].copy()

# Stringify sex; missing entries turn into the literal 'nan' and are dropped.
df['sex'] = df['sex'].values.astype('str')
df = df[df['sex'].values != 'nan'].copy()

# Same treatment for race.
df['race'] = df['race'].values.astype('str')
df = df[df['race'].values != 'nan'].copy()

# fam_inc is kept as a string category; missing values are marked '?'
# instead of being dropped.
values_faminc = df['fam_inc'].values.astype('str')
values_faminc[values_faminc == 'nan'] = '?'
df['fam_inc'] = values_faminc

# Disabled experiments: variants of fam_inc with adjacent brackets merged.
# values_faminc_m12 = np.array(values_faminc)
# values_faminc_m12[values_faminc_m12 == '1.0'] = '2.0'
# print(np.unique(values_faminc_m12))
# df['fam_inc_m12'] = values_faminc_m12

# values_faminc_m45 = np.array(values_faminc)
# values_faminc_m45[values_faminc_m45 == '5.0'] = '4.0'
# print(np.unique(values_faminc_m45))
# df['fam_inc_m45'] = values_faminc_m45

# values_faminc_m12m45 = np.array(values_faminc_m45)
# values_faminc_m12m45[values_faminc_m12m45 == '1.0'] = '2.0'
# print(np.unique(values_faminc_m12m45))
# df['fam_inc_m12m45'] = values_faminc_m12m45


## incorporate binary race white nonwhite
# np.array(...) copies the column so the in-place relabelling below does not
# touch df['race'] itself.
aux = np.array(df['race'].values)
aux[aux != 'white'] = 'nonwhite'
df['race_bin'] = aux

## index sample column
# Sequential id over the cleaned rows (0 .. len(df) - 1).
df['sample'] = np.arange(len(df))

print('total final nsamples', len(df))
print('final attributes : ')
for col in df.columns:
    print(col)

# Non-null counts per (race, sex, pass_bar) group, shown as the cell output.
df.groupby(['race', 'sex', 'pass_bar']).count()

original attributes : Index(['decile1b', 'decile3', 'ID', 'decile1', 'sex', 'race', 'cluster',
       'lsat', 'ugpa', 'zfygpa', 'DOB_yr', 'grad', 'zgpa', 'bar1', 'bar1_yr',
       'bar2', 'bar2_yr', 'fulltime', 'fam_inc', 'age', 'gender', 'parttime',
       'male', 'race1', 'race2', 'Dropout', 'other', 'asian', 'black', 'hisp',
       'pass_bar', 'bar', 'tier', 'index6040', 'indxgrp', 'indxgrp2'],
      dtype='object')
final attributes : Index(['lsat', 'ugpa', 'pass_bar', 'zfygpa', 'fam_inc', 'sex', 'race'], dtype='object')
original nsamples 27478

total final nsamples 27085
final attributes : 
lsat
ugpa
pass_bar
zfygpa
fam_inc
sex
race
race_bin
sample


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lsat,ugpa,zfygpa,fam_inc,race_bin,sample
race,sex,pass_bar,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
asian,female,0.0,28,28,28,28,28,28
asian,female,1.0,422,422,389,422,422,422
asian,male,0.0,42,42,37,42,42,42
asian,male,1.0,413,413,391,413,413,413
black,female,0.0,185,185,180,185,185,185
black,female,1.0,648,648,620,648,648,648
black,male,0.0,113,113,105,113,113,113
black,male,1.0,399,399,377,399,399,399
hisp,female,0.0,59,59,55,59,59,59
hisp,female,1.0,419,419,395,419,419,419


In [32]:
# Persist the cleaned table as CSV for the dataset loaders.
save_dir = '/data/MLTdata/law_school/dataset_processed/'
# Project helper from general.utils; presumably creates the directory if it
# is missing -- confirm against its implementation.
mkdir(save_dir)
# os.path.join does not depend on save_dir ending with a separator, and
# index=False states explicitly that the row index is not written
# (the previous `index=0` only worked because 0 is falsy).
df.to_csv(os.path.join(save_dir, 'bpf_dataset_cat.csv'), index=False)
df.head()

Unnamed: 0,lsat,ugpa,pass_bar,zfygpa,fam_inc,sex,race,race_bin,sample
0,30.0,3.1,,-1.79,4.0,female,white,white,0
1,44.0,3.5,1.0,1.33,5.0,female,white,white,1
2,29.0,3.5,1.0,-0.11,4.0,female,white,white,2
3,35.0,3.0,,1.22,5.0,female,white,white,3
4,39.0,2.9,,0.88,4.0,female,white,white,4


### Checking dataset splitting

In [37]:
from dataloaders.datasets import lawschool_pandas

# Load split 1 of the processed law-school data through the project loader.
# The call returns the train frame, the test frame and the covariate tags;
# groups are defined by sex and binary race, the target is pass_bar, and
# norm_std=True requests standardization (per the loader's recorded log).
train_df, test_df, cov_tags = lawschool_pandas(
    groups_list=['sex', 'race_bin'],
    utility='pass_bar',
    norm_std=True,
    split=1,
)

['sex', 'race_bin'] dict_keys(['race', 'race_bin', 'sex', 'zfygpa', 'pass_bar', 'sample'])
 Split :  1  is loaded...
['train'] ['test']
------- Law school admission Dataset processing ---------- 
utility :  pass_bar ; stratification_tags :  ['pass_bar', 'sex', 'race_bin'] ; standarization :  True ; len(cov_tags) :  3
cov_tags :  ['lsat' 'ugpa' 'fam_inc']
ntrain :  18017  ; ntest :  4505
check: overlap train and test ?  False



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_train[c] = (pd_train[c].values - mean) / std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_test[c] = (pd_test[c].values - mean) / std


In [38]:
# Display the training split returned by lawschool_pandas (notebook renders
# the DataFrame as the cell output).
train_df

Unnamed: 0,lsat,ugpa,fam_inc,race,race_bin,sex,zfygpa,pass_bar,sample,utility,sample_index,strat,dataset
1,1.313149,0.652408,1.794524,white,white,female,1.33,1.0,1,1.0,1,"1.0,female,white,",train
2,-1.437321,0.652408,0.626930,white,white,female,-0.11,1.0,2,1.0,2,"1.0,female,white,",train
6,1.129784,0.169838,0.626930,white,white,female,0.67,1.0,6,1.0,6,"1.0,female,white,",train
8,0.763055,0.169838,0.626930,white,white,female,-0.67,1.0,8,1.0,8,"1.0,female,white,",train
9,-2.262462,-2.484297,0.626930,white,white,male,-1.73,1.0,9,1.0,9,"1.0,male,white,",train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27076,-0.978909,-0.312732,-0.540664,white,white,male,-0.65,1.0,27076,1.0,27076,"1.0,male,white,",train
27077,0.763055,-0.071447,1.794524,white,white,male,0.74,1.0,27077,1.0,27077,"1.0,male,white,",train
27080,-1.345639,0.652408,0.626930,white,white,male,-0.45,1.0,27080,1.0,27080,"1.0,male,white,",train
27081,-0.703862,-0.312732,-0.540664,black,nonwhite,male,-1.92,0.0,27081,0.0,27081,"0.0,male,nonwhite,",train


In [39]:
# For each split, report its size and the fraction of its samples that fall
# in every stratification group ('strat'), so train and test proportions can
# be compared side by side.
for split_label, split_df in (('train : ', train_df), ('test : ', test_df)):
    print(split_label, len(split_df), ' samples')
    print(split_df.groupby(['strat'])['utility'].count() / len(split_df))
    print()

# Preview of the training split as the cell output.
train_df.head()

train :  18017  samples
strat
0.0,female,nonwhite,    0.012599
0.0,female,white,       0.012044
0.0,male,nonwhite,      0.010823
0.0,male,white,         0.015374
1.0,female,nonwhite,    0.071821
1.0,female,white,       0.344175
1.0,male,nonwhite,      0.066326
1.0,male,white,         0.466837
Name: utility, dtype: float64

test :  4505  samples
strat
0.0,female,nonwhite,    0.012653
0.0,female,white,       0.011987
0.0,male,nonwhite,      0.010877
0.0,male,white,         0.015538
1.0,female,nonwhite,    0.071698
1.0,female,white,       0.344284
1.0,male,nonwhite,      0.066371
1.0,male,white,         0.466593
Name: utility, dtype: float64



Unnamed: 0,lsat,ugpa,fam_inc,race,race_bin,sex,zfygpa,pass_bar,sample,utility,sample_index,strat,dataset
1,1.313149,0.652408,1.794524,white,white,female,1.33,1.0,1,1.0,1,"1.0,female,white,",train
2,-1.437321,0.652408,0.62693,white,white,female,-0.11,1.0,2,1.0,2,"1.0,female,white,",train
6,1.129784,0.169838,0.62693,white,white,female,0.67,1.0,6,1.0,6,"1.0,female,white,",train
8,0.763055,0.169838,0.62693,white,white,female,-0.67,1.0,8,1.0,8,"1.0,female,white,",train
9,-2.262462,-2.484297,0.62693,white,white,male,-1.73,1.0,9,1.0,9,"1.0,male,white,",train
