**Copyright 2020 Google LLC.**

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

In [10]:
from __future__ import division
import pandas as pd
import numpy as np
import json
import os,sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

## Overview

### Pre-processes UCI Adult (Census Income) dataset:

Download the Adult train and test data files can be downloaded from:
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
and save them in the `./group_agnostic_fairness/data/uci_adult` folder.

Input: 

*   ./group_agnostic_fairness/data/uci_adult/adult.data 
*   ./group_agnostic_fairness/data/uci_adult/adult.test



Outputs: train.csv, test.csv, mean_std.json, vocabulary.json, IPS_exampleweights_with_label.json, IPS_exampleweights_without_label.json

In [11]:
pd.options.display.float_format = '{:,.2f}'.format
dataset_base_dir = '../data/uci_adult/'

### Load original dataset

In [12]:
def convert_object_type_to_category(df):
    """Converts columns of type object to category."""
    df = pd.concat([df.select_dtypes(include=[], exclude=['object']),
                  df.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
                  ], axis=1).reindex(df.columns, axis=1)
    return df

In [13]:
TRAIN_FILE = os.path.join(dataset_base_dir,'adult.data')
TEST_FILE = os.path.join(dataset_base_dir,'adult.test')

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

target_variable = "income"
target_value = ">50K"

with open(TRAIN_FILE, "r") as TRAIN_FILE:
    train_df = pd.read_csv(TRAIN_FILE,sep=',',names=columns)

with open(TEST_FILE, "r") as TEST_FILE:
    test_df = pd.read_csv(TEST_FILE,sep=',',names=columns)

In [14]:
# Convert columns of type ``object`` to ``category`` 
train_df = convert_object_type_to_category(train_df)
test_df = convert_object_type_to_category(test_df)

In [15]:
# repl_list = {r'\.':'', ' ':'', r'\.': ''}
col_list_train = train_df.columns.values
col_list_test = test_df.columns.values


dtype = test_df['income']
test_df['income'] = test_df['income'].str.replace('50K.', '50K', regex=False)
test_df['income'] = test_df['income'].astype('category')
  
test_df = test_df.iloc[1:]
test_df = test_df[:-1]
train_df = train_df[:-1]



for column in col_list_test:
    test_df[column] = test_df[column].astype(str).str.replace(' ', '', regex=False)
    if column in ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income']:
        test_df[column] = test_df[column].astype('category')
    else:
        test_df[column] = test_df[column].astype('float64').astype('int64')

        
for column in col_list_train:
    train_df[column] = train_df[column].astype(str).str.replace(' ', '', regex=False)    
    if column in ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income']:
        train_df[column] = train_df[column].astype('category')
    else:
        train_df[column] = train_df[column].astype('float64').astype('int64')

      


AttributeError: Can only use .str accessor with string values!

### Computing Invese propensity weights for each subgroup, and writes to directory.

IPS_example_weights_with_label.json: json dictionary of the format
        {subgroup_id : inverse_propensity_score,...}. Used by IPS_reweighting_model approach.

In [16]:
IPS_example_weights_without_label = {
  0: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex != 'Female')])), # 00: White Male
  1: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex == 'Female')])), # 01: White Female
  2: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex != 'Female')])), # 10: Black Male
  3: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex == 'Female')]))  # 11: Black Female
}

output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_without_label.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_without_label))
    output_file.close()

print(IPS_example_weights_without_label)

ZeroDivisionError: division by zero

In [17]:
IPS_example_weights_with_label = {
0: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 000: Negative White Male
1: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 001: Negative White Female
2: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 010: Negative Black Male
3: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 011: Negative Black Female
4: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 100: Positive White Male
5: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 101: Positive White Female
6: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 110: Positive Black Male
7: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 111: Positive Black Female
}

output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_with_label.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_with_label))
    output_file.close()

print(IPS_example_weights_with_label)

ZeroDivisionError: division by zero

### Construct vocabulary.json, and write to directory.

vocabulary.json: json dictionary of the format {feature_name:      [feature_vocabulary]}, containing vocabulary for categorical features.

In [18]:
cat_cols = train_df.select_dtypes(include='category').columns
print(cat_cols)
vocab_dict = {}
for col in cat_cols:
    vocab_dict[col] = list(set(train_df[col].cat.categories)-{"?"})

output_file_path = os.path.join(dataset_base_dir,'vocabulary.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(vocab_dict))
    output_file.close()



Index(['age'], dtype='object')


### Construct mean_std.json, and write to directory

mean_std.json: json dictionary of the format feature_name: [mean, std]},
containing mean and std for numerical features. 

In [19]:
temp_dict = train_df.describe().to_dict()
mean_std_dict = {}
for key, value in temp_dict.items():
    print(value)
    mean_std_dict[key] = [value['mean'],value['std']]

output_file_path = os.path.join(dataset_base_dir,'mean_std.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(mean_std_dict))
    output_file.close()
print(mean_std_dict)

{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}
{'count': 0.0, 'mean': nan, 'std': nan, 'min': nan, '25%': nan, '50%': nan, '75%': nan, 'max': nan}


In [20]:
# TRAIN_FILE = os.path.join(dataset_base_dir,'adult.data')
# TEST_FILE = os.path.join(dataset_base_dir,'adult.test')
TRAIN_FILE = os.path.join(dataset_base_dir,'train.csv')
TEST_FILE = os.path.join(dataset_base_dir,'test.csv')

# with open(TRAIN_FILE, "w") as TRAIN_FILE:
train_df.to_csv(TRAIN_FILE, sep='\t', encoding='utf-8')


# with open(TEST_FILE, "w") as TEST_FILE:
test_df.to_csv(TEST_FILE, sep='\t', encoding='utf-8')


