**Copyright 2020 Google LLC.**

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

-------------
**Important note**


This notebook has been adapted to function properly with most recent versions of pandas, sklearn and seaborn on January 2021 for the ML challange.

-------------
J. Mohazzab, C.A. Wortmann, L.R. Weytingh, B. Brocades Zaalberg\
Involved master students AI \
Faculty of Science\
University of Amsterdam

In [12]:
from __future__ import division
import pandas as pd
import numpy as np
import json
import os,sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

## Overview
### Pre-process Law School Admissions Council Dataset (LSAC) 

Download the Law School dataset from: (http://www.seaphe.org/databases.php), convert SAS file to CSV, and save it in the `./data/law_school` folder.

Input: ./data/law_school/lsac.csv

Outputs: train.csv, test.csv, mean_std.json, vocabulary.json, IPS_exampleweights_with_label.json, IPS_exampleweights_without_label.json, dataset_stats.json

In [13]:
pd.options.display.float_format = '{:,.2f}'.format
dataset_base_dir = '../datasets/law_school/'
dataset_file_name = 'lsac.csv'

# Read SAS file
df = pd.read_sas(dataset_base_dir + 'lsac.sas7bdat', format="sas7bdat")

# Set bytes to string
str_df = df.select_dtypes([np.object])
str_df = str_df.stack().str.decode('utf-8').unstack()
for col in str_df:
    df[col] = str_df[col]

# Save df as csv.
df.to_csv('../datasets/law_school/' + dataset_file_name, index=False, encoding='utf-8')


### Processing original dataset

In [14]:
file_path = os.path.join(dataset_base_dir,dataset_file_name)
with open(file_path, "r") as file_name:
    temp_df = pd.read_csv(file_name)

# Columns of interest  
df = temp_df[['zfygpa','zgpa','DOB_yr','parttime','gender','race','tier','fam_inc','lsat','ugpa','pass_bar','index6040']].copy()
renameColumns={'gender':'sex',
               'index6040':'weighted_lsat_ugpa',
               'fam_inc':'family_income',
               'tier':'cluster_tier',
               'parttime':'isPartTime'}
target_variable = 'pass_bar'
target_value = 'Passed'

# Renaming columns
df = df.rename(columns = renameColumns)
# Reorder the columns
df = df[["zfygpa", "zgpa", "DOB_yr", "weighted_lsat_ugpa", "cluster_tier", "family_income", "lsat", "ugpa", "isPartTime", "sex", "race", "pass_bar"]].copy()

columns = df.columns

# NaN in 'pass_bar' refer to dropouts. Considering NaN as failing the bar.
df['pass_bar'] = df['pass_bar'].fillna(value=0.0)
df['pass_bar'] = df.apply(lambda x: 'Passed' if x['pass_bar']==1.0 else 'Failed_or_not_attempted', axis=1).astype('category')

df['zfygpa'] = df['zfygpa'].fillna(value=0.0)
df['zgpa'] = df['zgpa'].fillna(value=0.0)
df['DOB_yr'] = df['DOB_yr'].fillna(value=0.0)
df = df.dropna()

# Binarize target_variable
df['isPartTime'] = df.apply(lambda x: 'Yes' if x['isPartTime']==1.0 else 'No', axis=1).astype('category')

# Process protected-column values
race_dict = {3.0:'Black',7.0:'White'}
sex_dict = {'female':'Female','male':'Male'}
df['race'] = df.apply(lambda x: race_dict[x['race']] if x['race'] in race_dict.keys() else 'Other', axis=1).astype('category')
df['sex'] = df.apply(lambda x: sex_dict[x['sex']] if x['sex'] in sex_dict.keys() else 'Other', axis=1).astype('category')

In [15]:
df.head()

Unnamed: 0,zfygpa,zgpa,DOB_yr,weighted_lsat_ugpa,cluster_tier,family_income,lsat,ugpa,isPartTime,sex,race,pass_bar
0,-1.79,0.0,68.0,625.79,2.0,4.0,30.0,3.1,No,Female,White,Failed_or_not_attempted
1,1.33,1.88,69.0,886.84,4.0,5.0,44.0,3.5,No,Female,White,Passed
2,-0.11,-0.57,69.0,650.0,2.0,4.0,29.0,3.5,No,Female,White,Passed
3,1.22,0.95,58.0,694.74,3.0,5.0,35.0,3.0,Yes,Female,White,Failed_or_not_attempted
4,0.88,0.0,51.0,747.89,2.0,4.0,39.0,2.9,Yes,Female,White,Failed_or_not_attempted


### Shuffle and Split into Train (70%) and Test set (30%)

In [16]:
train_df, test_df = train_test_split(df, test_size=0.30, random_state=42)

output_file_path = os.path.join(dataset_base_dir,'train.csv')
with open(output_file_path, mode="w") as output_file:
    train_df.to_csv(output_file,index=False,columns=columns,header=False)
    output_file.close()

output_file_path = os.path.join(dataset_base_dir,'test.csv')
with open(output_file_path, mode="w") as output_file:
    test_df.to_csv(output_file,index=False,columns=columns,header=False)
    output_file.close()

In [17]:
train_df.head()

Unnamed: 0,zfygpa,zgpa,DOB_yr,weighted_lsat_ugpa,cluster_tier,family_income,lsat,ugpa,isPartTime,sex,race,pass_bar
25670,0.14,-0.73,63.0,676.32,3.0,4.0,37.0,2.5,No,Male,White,Passed
8432,-1.48,0.0,69.0,681.58,3.0,2.0,31.0,3.5,No,Male,White,Failed_or_not_attempted
1278,0.61,-0.09,69.0,718.95,4.0,2.0,34.0,3.4,No,Male,White,Passed
20663,0.14,-0.5,66.0,942.63,5.0,3.0,45.0,3.9,No,Male,White,Passed
17260,0.99,0.94,67.0,667.37,2.0,3.0,32.0,3.2,No,Male,White,Failed_or_not_attempted


### Computing Invese propensity weights for each subgroup, and writes to directory.

IPS_example_weights_with_label.json: json dictionary of the format
        {subgroup_id : inverse_propensity_score,...}. Used by IPS_reweighting_model approach.

In [18]:
IPS_example_weights_without_label = {
  0: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex != 'Female')])), # 00: White Male
  1: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex == 'Female')])), # 01: White Female
  2: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex != 'Female')])), # 10: Black Male
  3: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex == 'Female')]))  # 11: Black Female
}
  
output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_without_label.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_without_label))
    output_file.close()

print(IPS_example_weights_without_label)

{0: 1.8843151171043293, 1: 2.488618103910016, 2: 36.36986301369863, 3: 25.013458950201883}


In [19]:
IPS_example_weights_with_label = {
0: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 000: Negative White Male
1: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 001: Negative White Female
2: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 010: Negative Black Male
3: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 011: Negative Black Female
4: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 100: Positive White Male
5: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 101: Positive White Female
6: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 110: Positive Black Male
7: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 111: Positive Black Female
}
  
output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_with_label.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_with_label))
    output_file.close()

print(IPS_example_weights_with_label)

{0: 10.194733955019199, 1: 13.545918367346939, 2: 82.23451327433628, 3: 63.214285714285715, 4: 2.3115671641791047, 5: 3.048720472440945, 6: 65.21052631578948, 7: 41.39198218262806}


### Construct vocabulary.json, and write to directory.

vocabulary.json: json dictionary of the format {feature_name:      [feature_vocabulary]}, containing vocabulary for categorical features.

In [20]:
cat_cols = train_df.select_dtypes(include='category').columns
vocab_dict = {}
for col in cat_cols:
    vocab_dict[col] = list(set(train_df[col].cat.categories))

output_file_path = os.path.join(dataset_base_dir,'vocabulary.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(vocab_dict))
    output_file.close()
print(vocab_dict)

{'isPartTime': ['Yes', 'No'], 'sex': ['Male', 'Female'], 'race': ['White', 'Black', 'Other'], 'pass_bar': ['Passed', 'Failed_or_not_attempted']}


### Construct mean_std.json, and write to directory

mean_std.json: json dictionary of the format feature_name: [mean, std]},
containing mean and std for numerical features. 

In [21]:
temp_dict = train_df.describe().to_dict()
mean_std_dict = {}
for key, value in temp_dict.items():
    mean_std_dict[key] = [value['mean'],value['std']]

output_file_path = os.path.join(dataset_base_dir,'mean_std.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(mean_std_dict))
    output_file.close()
print(mean_std_dict)

{'zfygpa': [0.007156308851224107, 0.956269325542025], 'zgpa': [0.005135324186171643, 0.9203686714713514], 'DOB_yr': [64.9954802259887, 6.374190672837983], 'weighted_lsat_ugpa': [741.9962436595317, 107.69097610619035], 'cluster_tier': [3.7390906645143933, 1.183449020338574], 'family_income': [3.4257734732311005, 0.8794618881913022], 'lsat': [36.57297820823245, 5.629890085895137], 'ugpa': [3.2242292171105733, 0.41846631192390027]}


### Construct datasets_stats.json


dataset_stats.json: json dictionary that contains whith information that is hardcoded in the original TensorFlow implementation by Lahoti et al.

In [22]:
stats = {}
stats["feature_names"] = list(train_df.columns)
stats["mean_std"] = mean_std_dict
stats["sensitive_column_names"] = ["sex", "race"]
stats["sensitive_column_values"] = ["Female", "Black"]
stats["target_column_name"] = "pass_bar"
stats["target_column_positive_value"] = "Passed"
stats["vocabulary"] = vocab_dict

output_file_path = os.path.join(dataset_base_dir, 'dataset_stats.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(stats, indent=4, sort_keys=True))
    output_file.close()
