In [28]:
import sys
sys.path.insert(0, '../scripts')
import pipeline
import pandas as pd

import seaborn as sns
from aequitas.preprocessing import preprocess_input_df
from aequitas.group import Group
#from aequitas.bias import Bias
from aequitas.fairness import Fairness
#from aequitas.plotting import Plot

import warnings; warnings.simplefilter('ignore')

%matplotlib inline

### Aequitas Formatting

Aequitas have format requirements for inputs into the program. We will use the following acceptable format requirements:

- Necessary columns (binary)
    - *score*: model's predictions
    - *label_value*: ground truth


- Attributes (continuous or categorical)
    - *majority_race*: (categorical) what race makes up 50% or more of the block group's population
    - *poverty_rate*: (continuous) % of the block group's population with income below poverty line for the past year


- Reserve Names
    - *entity_id*: will be used to replace "GEOID" in our data


For continuous attributes, Aequitas will bin the values into quartiles.

For categorical attributes (that are manually discretized), columns must be type 'str' to use Aequitas get_crosstabs().

### Generate Dataframe Input

Let us use the best model from our analysis and pass it through 2015 training data to and then generate predictions for 2016.

In [4]:
# Grab best model from evaluation table
et_file = '../outputs/evaluation_table.csv'
eval_table = pipeline.read(et_file)
preferred_metric = 'precision_at_0.1'

model, _ = pipeline.model_best_average(eval_table, preferred_metric)

In [5]:
# Retrieve data and define label
data_file = '../data/work/block-groups_2012-2016_with-acs_with-gen-features.csv'
df = pipeline.read(data_file)
label = 'upper10th_by_year'

In [6]:
df.head()

Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,prev-yr_evictions-effectiveness_4quantiles_1.0,prev-yr_evictions-effectiveness_4quantiles_2.0,prev-yr_evictions-effectiveness_4quantiles_3.0,prev-yr_evictions-effectiveness_4quantiles_4.0,prev-yr_evictions-effectiveness_4quantiles_nan,prev-yr_pct-am-ind_4quantiles_1.0,prev-yr_pct-am-ind_4quantiles_2.0,prev-yr_pct-am-ind_4quantiles_3.0,prev-yr_pct-am-ind_4quantiles_4.0,prev-yr_pct-am-ind_4quantiles_nan
0,170310101001,2012,101.1,"Cook County, Illinois",435.0,18.92,156.0,67.23,785.0,,...,0,0,0,0,1,0,0,0,0,1
1,170310101001,2013,101.1,"Cook County, Illinois",435.0,18.92,159.0,67.23,785.0,,...,0,0,1,0,0,1,0,0,0,0
2,170310101001,2014,101.1,"Cook County, Illinois",435.0,18.92,161.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0
3,170310101001,2015,101.1,"Cook County, Illinois",435.0,18.92,164.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0
4,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0


In [7]:
# Determine features to use for training: previous year Eviction Lab features and current year ACS features
features = ['prev-yr_population_4quantiles_1.0', 'prev-yr_population_4quantiles_2.0', 'prev-yr_population_4quantiles_3.0', \
            'prev-yr_population_4quantiles_4.0', 'prev-yr_poverty-rate_4quantiles_1.0', 'prev-yr_poverty-rate_4quantiles_2.0', \
            'prev-yr_poverty-rate_4quantiles_3.0', 'prev-yr_poverty-rate_4quantiles_4.0', \
            'prev-yr_renter-occupied-households_4quantiles_1.0', 'prev-yr_renter-occupied-households_4quantiles_2.0', \
            'prev-yr_renter-occupied-households_4quantiles_3.0', 'prev-yr_renter-occupied-households_4quantiles_4.0', \
            'prev-yr_pct-renter-occupied_4quantiles_1.0', 'prev-yr_pct-renter-occupied_4quantiles_2.0', \
            'prev-yr_pct-renter-occupied_4quantiles_3.0', 'prev-yr_pct-renter-occupied_4quantiles_4.0', \
            'prev-yr_median-gross-rent_4quantiles_1.0', 'prev-yr_median-gross-rent_4quantiles_2.0', \
            'prev-yr_median-gross-rent_4quantiles_3.0', 'prev-yr_median-gross-rent_4quantiles_4.0', \
            'prev-yr_median-gross-rent_4quantiles_nan', 'prev-yr_median-household-income_4quantiles_1.0', \
            'prev-yr_median-household-income_4quantiles_2.0', 'prev-yr_median-household-income_4quantiles_3.0', \
            'prev-yr_median-household-income_4quantiles_4.0', 'prev-yr_median-household-income_4quantiles_nan', \
            'prev-yr_median-property-value_4quantiles_1.0', 'prev-yr_median-property-value_4quantiles_2.0', \
            'prev-yr_median-property-value_4quantiles_3.0', 'prev-yr_median-property-value_4quantiles_4.0', \
            'prev-yr_median-property-value_4quantiles_nan', 'prev-yr_rent-burden_4quantiles_1.0', \
            'prev-yr_rent-burden_4quantiles_2.0', 'prev-yr_rent-burden_4quantiles_3.0', 'prev-yr_rent-burden_4quantiles_4.0', \
            'prev-yr_rent-burden_4quantiles_nan', 'prev-yr_pct-white_4quantiles_1.0', 'prev-yr_pct-white_4quantiles_2.0', \
            'prev-yr_pct-white_4quantiles_3.0', 'prev-yr_pct-white_4quantiles_4.0', 'prev-yr_pct-af-am_4quantiles_1.0', \
            'prev-yr_pct-af-am_4quantiles_2.0', 'prev-yr_pct-af-am_4quantiles_3.0', 'prev-yr_pct-af-am_4quantiles_4.0', \
            'prev-yr_pct-hispanic_4quantiles_1.0', 'prev-yr_pct-hispanic_4quantiles_2.0', 'prev-yr_pct-hispanic_4quantiles_3.0', \
            'prev-yr_pct-hispanic_4quantiles_4.0', 'prev-yr_pct-am-ind_4quantiles_1.0', 'prev-yr_pct-am-ind_4quantiles_2.0', \
            'prev-yr_pct-am-ind_4quantiles_3.0', 'prev-yr_pct-am-ind_4quantiles_4.0', 'prev-yr_pct-asian_4quantiles_1.0', \
            'prev-yr_pct-asian_4quantiles_2.0', 'prev-yr_pct-asian_4quantiles_3.0', 'prev-yr_pct-asian_4quantiles_4.0', \
            'prev-yr_pct-nh-pi_4quantiles_1.0', 'prev-yr_pct-nh-pi_4quantiles_2.0', 'prev-yr_pct-nh-pi_4quantiles_3.0', \
            'prev-yr_pct-nh-pi_4quantiles_4.0', 'prev-yr_pct-multiple_4quantiles_1.0', 'prev-yr_pct-multiple_4quantiles_2.0', \
            'prev-yr_pct-multiple_4quantiles_3.0', 'prev-yr_pct-multiple_4quantiles_4.0', 'prev-yr_pct-other_4quantiles_1.0', \
            'prev-yr_pct-other_4quantiles_2.0', 'prev-yr_pct-other_4quantiles_3.0', 'prev-yr_pct-other_4quantiles_4.0', \
            'prev-yr_eviction-filings_4quantiles_1.0', 'prev-yr_eviction-filings_4quantiles_2.0', \
            'prev-yr_eviction-filings_4quantiles_3.0', 'prev-yr_eviction-filings_4quantiles_4.0', \
            'prev-yr_eviction-filing-rate_4quantiles_1.0', 'prev-yr_eviction-filing-rate_4quantiles_2.0', \
            'prev-yr_eviction-filing-rate_4quantiles_3.0', 'prev-yr_eviction-filing-rate_4quantiles_4.0', \
            'prev-yr_evictions_4quantiles_1.0', 'prev-yr_evictions_4quantiles_2.0', \
            'prev-yr_evictions_4quantiles_3.0', 'prev-yr_evictions_4quantiles_4.0', \
            'prev-yr_eviction-rate_4quantiles_1.0', 'prev-yr_eviction-rate_4quantiles_2.0', \
            'prev-yr_eviction-rate_4quantiles_3.0', 'prev-yr_eviction-rate_4quantiles_4.0', \
            'prev-yr_evictions-effectiveness_4quantiles_1.0', 'prev-yr_evictions-effectiveness_4quantiles_2.0', \
            'prev-yr_evictions-effectiveness_4quantiles_3.0', 'prev-yr_evictions-effectiveness_4quantiles_4.0', \
            'total_for_public_assistance_income_4quantiles_1.0', \
            'total_for_public_assistance_income_4quantiles_2.0', \
            'total_for_public_assistance_income_4quantiles_3.0', \
            'total_for_public_assistance_income_4quantiles_4.0', \
            'with_public_assistance_income_4quantiles_1.0', 'with_public_assistance_income_4quantiles_2.0', \
            'with_public_assistance_income_4quantiles_3.0', 'with_public_assistance_income_4quantiles_4.0', \
            'estimate_total_in_labor_force_4quantiles_1.0', 'estimate_total_in_labor_force_4quantiles_2.0', \
            'estimate_total_in_labor_force_4quantiles_3.0', 'estimate_total_in_labor_force_4quantiles_4.0', \
            'estimate_civilian_unemployed_4quantiles_1.0', 'estimate_civilian_unemployed_4quantiles_2.0', \
            'estimate_civilian_unemployed_4quantiles_3.0', 'estimate_civilian_unemployed_4quantiles_4.0', \
            'total_for_householder_tenure_4quantiles_1.0', 'total_for_householder_tenure_4quantiles_2.0', \
            'total_for_householder_tenure_4quantiles_3.0', 'total_for_householder_tenure_4quantiles_4.0', \
            'renter_occupied_4quantiles_1.0', 'renter_occupied_4quantiles_2.0', \
            'renter_occupied_4quantiles_3.0', 'renter_occupied_4quantiles_4.0', \
            'renter_moved_2015/2010_later_4quantiles_1.0', 'renter_moved_2015/2010_later_4quantiles_2.0', \
            'renter_moved_2015/2010_later_4quantiles_3.0', 'renter_moved_2015/2010_later_4quantiles_4.0', \
            'renter_moved_2010-2014/2000-2009_4quantiles_1.0', 'renter_moved_2010-2014/2000-2009_4quantiles_2.0', \
            'renter_moved_2010-2014/2000-2009_4quantiles_3.0', 'renter_moved_2010-2014/2000-2009_4quantiles_4.0', \
            'renter_moved_2000-2009/1990-1999_4quantiles_1.0', 'renter_moved_2000-2009/1990-1999_4quantiles_2.0', \
            'renter_moved_2000-2009/1990-1999_4quantiles_3.0', 'renter_moved_2000-2009/1990-1999_4quantiles_4.0', \
            'renter_moved_1990-1999/1980-1989_4quantiles_1.0', 'renter_moved_1990-1999/1980-1989_4quantiles_2.0', \
            'renter_moved_1990-1999/1980-1989_4quantiles_3.0', 'renter_moved_1990-1999/1980-1989_4quantiles_4.0', \
            'renter_moved_1980-1989/1970-1979_4quantiles_1.0', 'renter_moved_1980-1989/1970-1979_4quantiles_2.0', \
            'renter_moved_1980-1989/1970-1979_4quantiles_3.0', 'renter_moved_1980-1989/1970-1979_4quantiles_4.0', \
            'renter_moved_1979/1969_earlier_4quantiles_1.0', 'renter_moved_1979/1969_earlier_4quantiles_2.0', \
            'renter_moved_1979/1969_earlier_4quantiles_3.0', 'renter_moved_1979/1969_earlier_4quantiles_4.0']

In [8]:
# Grab training data and train the model
year_train = 2015
train_X = df[df['year']==year_train][features]
train_y = df[df['year']==year_train][label]
final_model = model.fit(train_X, train_y)

In [9]:
# Grab testing data to generate predictions
year_test = 2016
df_test = df[df['year']==year_test]

In [10]:
# Make sure the number of rows of df_test makes sense (# of block groups)
print('df_test has {} rows and {} columns'.format(df_test.shape[0], df_test.shape[1]))
df_test.head()

df_test has 3993 rows and 417 columns


Unnamed: 0,GEOID,year,name,parent-location,population,poverty-rate,renter-occupied-households,pct-renter-occupied,median-gross-rent,median-household-income,...,prev-yr_evictions-effectiveness_4quantiles_1.0,prev-yr_evictions-effectiveness_4quantiles_2.0,prev-yr_evictions-effectiveness_4quantiles_3.0,prev-yr_evictions-effectiveness_4quantiles_4.0,prev-yr_evictions-effectiveness_4quantiles_nan,prev-yr_pct-am-ind_4quantiles_1.0,prev-yr_pct-am-ind_4quantiles_2.0,prev-yr_pct-am-ind_4quantiles_3.0,prev-yr_pct-am-ind_4quantiles_4.0,prev-yr_pct-am-ind_4quantiles_nan
4,170310101001,2016,101.1,"Cook County, Illinois",435.0,18.92,167.0,67.23,785.0,,...,0,1,0,0,0,1,0,0,0,0
9,170310101002,2016,101.2,"Cook County, Illinois",1496.0,28.28,829.0,92.78,762.0,22537.0,...,0,0,1,0,0,1,0,0,0,0
14,170310101003,2016,101.3,"Cook County, Illinois",2175.0,24.55,817.0,81.44,1008.0,49432.0,...,0,0,1,0,0,1,0,0,0,0
19,170310102011,2016,102.01.1,"Cook County, Illinois",1785.0,37.69,354.0,74.55,1075.0,41941.0,...,0,0,1,0,0,1,0,0,0,0
24,170310102012,2016,102.01.2,"Cook County, Illinois",4339.0,24.81,1041.0,79.99,1023.0,36377.0,...,0,1,0,0,0,1,0,0,0,0


In [11]:
# Generate predictions for the next year
predict_X = df_test[features]
predictions = pipeline.get_predictions(final_model, predict_X)
df_test['final_predictions'] = predictions

Now add columns that we will eventually use for the Aequitas dataframe input.

Let's start with adding the **score** column, which will be our predicted labels given the model's output/predictions.

In [12]:
# Add "score" (predictions of 'upper 10th by year' based on model's results)
# Based on: final_predictions
col = 'final_predictions'
df_test['decile'] = pd.qcut(df_test[col].rank(method='first'), 10, labels=False)
df_test.loc[(df_test['decile'] == 9), 'score'] = 1
df_test.loc[(df_test['decile'] < 9), 'score'] = 0

In [13]:
# Eyeball / Check
df_test[['year','upper10th_by_year', 'final_predictions', 'score']].sort_values(by=['final_predictions'], ascending=False).head()

Unnamed: 0,year,upper10th_by_year,final_predictions,score
5549,2016,1.0,0.975541,1.0
4934,2016,1.0,0.975373,1.0
5679,2016,1.0,0.973905,1.0
6099,2016,1.0,0.971675,1.0
6059,2016,1.0,0.968235,1.0


Now let us generate the **majority_race** column.

In [14]:
# List the relevant raw attributes from data
race_attributes = ['pct-white', 'pct-af-am', 'pct-hispanic', 'pct-am-ind', 'pct-asian', 'pct-nh-pi', 'pct-multiple', 'pct-other']

# Create a column that grabs the max value for each row in our test dataframe
df_test['race_max_perc'] = df[race_attributes].max(axis=1)

# Create a column that grabs the column name with that max value (race that makes up the highest percentage of block's population)
df_test['race_max'] = df[race_attributes].idxmax(axis=1)

In [15]:
# Now let us create the majority_race column by only grabbing the race name if they make up 50% or more of the block
df_test.loc[(df_test['race_max_perc'] > 50), 'majority_race'] = df_test['race_max']
df_test.loc[(df_test['race_max_perc'] <= 50), 'majority_race'] = 'None'

In [16]:
# Rename the majority_race values to only include the race name (i.e. remove 'pct-')
race_names = ['White', 'African-American', 'Hispanic', 'American-Indian', 'Asian', 'NativeHawaiian-PacificIslander', 'Multiple', 'Other']
df_test['majority_race'].replace(race_attributes, race_names, inplace=True)

In [17]:
# Eyeball / Check race attributes
df_test[['pct-white', 'pct-af-am', 'pct-hispanic', 'pct-am-ind', 'pct-asian', 'pct-nh-pi', 'pct-multiple', 'pct-other', 'race_max', 'race_max_perc', 'majority_race']].head()

Unnamed: 0,pct-white,pct-af-am,pct-hispanic,pct-am-ind,pct-asian,pct-nh-pi,pct-multiple,pct-other,race_max,race_max_perc,majority_race
4,49.89,40.23,6.67,0.0,0.0,0.0,3.22,0.0,pct-white,49.89,
9,26.6,55.08,14.3,0.0,0.94,0.0,3.07,0.0,pct-af-am,55.08,African-American
14,55.86,23.91,10.67,0.0,6.48,0.0,2.44,0.64,pct-white,55.86,White
19,22.13,36.25,37.82,0.0,2.52,0.0,1.29,0.0,pct-hispanic,37.82,
24,18.6,51.33,23.14,0.0,4.1,0.0,2.83,0.0,pct-af-am,51.33,African-American


Create and preprocess dataframe to pass to Aequitas.

In [18]:
aequitas_df = df_test[['GEOID', 'score', 'upper10th_by_year', 'majority_race', 'poverty-rate']]

# Rename columns to appropriate names recognized by Aequitas
aequitas_df.rename({'GEOID': 'entity_id', 'upper10th_by_year': 'label_value'}, axis=1, inplace=True)
aequitas_df.head()

Unnamed: 0,entity_id,score,label_value,majority_race,poverty-rate
4,170310101001,0.0,0.0,,18.92
9,170310101002,1.0,0.0,African-American,28.28
14,170310101003,1.0,1.0,White,24.55
19,170310102011,1.0,1.0,,37.69
24,170310102012,1.0,1.0,African-American,24.81


In [19]:
# Double-check that categorical columns are of type 'string'
aequitas_df['majority_race'] = aequitas_df['majority_race'].astype(str)

# Cut continuous attribute columns into quartiles
aequitas_df, _ = preprocess_input_df(aequitas_df)

In [20]:
print('dataframe has {} rows and {} columns'.format(aequitas_df.shape[0], aequitas_df.shape[1]))
aequitas_df.head()

dataframe has 3993 rows and 5 columns


Unnamed: 0,entity_id,score,label_value,majority_race,poverty-rate
4,170310101001,0.0,0.0,,9.58-22.42
9,170310101002,1.0,0.0,African-American,22.42-100.00
14,170310101003,1.0,1.0,White,22.42-100.00
19,170310102011,1.0,1.0,,22.42-100.00
24,170310102012,1.0,1.0,African-American,22.42-100.00


In [21]:
# Save Aequitas input dataframe to file
aequitas_df.to_csv('../outputs/Aequitas_input.csv')

### Do Bias and Fairness Analysis

In [22]:
g = Group()
xtab, _ = g.get_crosstabs(aequitas_df)

model_id, score_thresholds 1 {'rank_abs': [400]}


In [23]:
absolute_metrics = g.list_absolute_metrics(xtab)
xtab[[col for col in xtab.columns if col not in absolute_metrics]]

Unnamed: 0,attribute_name,attribute_value,fp,k,model_id,score_threshold,pp,pn,tp,fn,tn,group_label_neg,group_label_pos,group_size,total_entities
0,majority_race,African-American,94,400,1,binary 0/1,309,703,215,74,629,723,289,1012,3993
1,majority_race,Asian,0,400,1,binary 0/1,0,27,0,1,26,26,1,27,3993
2,majority_race,Hispanic,10,400,1,binary 0/1,22,673,12,17,656,666,29,695,3993
3,majority_race,,17,400,1,binary 0/1,31,428,14,29,399,416,43,459,3993
4,majority_race,White,19,400,1,binary 0/1,38,1762,19,19,1743,1762,38,1800,3993
5,poverty-rate,0.00-1.80,7,400,1,binary 0/1,12,987,5,16,971,978,21,999,3993
6,poverty-rate,1.80-9.58,18,400,1,binary 0/1,45,953,27,24,929,947,51,998,3993
7,poverty-rate,22.42-100.00,85,400,1,binary 0/1,254,744,169,62,682,767,231,998,3993
8,poverty-rate,9.58-22.42,30,400,1,binary 0/1,89,909,59,38,871,901,97,998,3993


In [24]:
xtab[['attribute_name', 'attribute_value'] + absolute_metrics].round(2)

Unnamed: 0,attribute_name,attribute_value,npv,tpr,ppr,precision,pprev,fnr,fdr,for,fpr,tnr,prev
0,majority_race,African-American,0.89,0.74,0.77,0.7,0.31,0.26,0.3,0.11,0.13,0.87,0.29
1,majority_race,Asian,0.96,0.0,0.0,,0.0,1.0,,0.04,0.0,1.0,0.04
2,majority_race,Hispanic,0.97,0.41,0.06,0.55,0.03,0.59,0.45,0.03,0.02,0.98,0.04
3,majority_race,,0.93,0.33,0.08,0.45,0.07,0.67,0.55,0.07,0.04,0.96,0.09
4,majority_race,White,0.99,0.5,0.1,0.5,0.02,0.5,0.5,0.01,0.01,0.99,0.02
5,poverty-rate,0.00-1.80,0.98,0.24,0.03,0.42,0.01,0.76,0.58,0.02,0.01,0.99,0.02
6,poverty-rate,1.80-9.58,0.97,0.53,0.11,0.6,0.05,0.47,0.4,0.03,0.02,0.98,0.05
7,poverty-rate,22.42-100.00,0.92,0.73,0.64,0.67,0.25,0.27,0.33,0.08,0.11,0.89,0.23
8,poverty-rate,9.58-22.42,0.96,0.61,0.22,0.66,0.09,0.39,0.34,0.04,0.03,0.97,0.1


In [25]:
aqp = Plot()

NameError: name 'Plot' is not defined