# 01 Data Preparation

This notebook cleans the gender and race predictions made by NamSor so that their labels match the ones used in the COMPAS data set.

In [2]:
# >>> Import Libraries

print("Importing necessary libraries... ")

from aequitas.group import Group # Aequitas, see https://github.com/dssg/aequitas/blob/master/docs/source/examples/compas_demo.ipynb
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot

import pandas as pd

print("Libraries imported.")

Importing necessary libraries... 
Libraries imported.


In [14]:
# >>> Import COMPAS data set (COMPAS records plus NamSor's sex/race predictions)

print("Importing COMPAS data set... ")

df = pd.read_csv("data/compas_with_predictions.csv")

# Message grammar fixed ("It is has" -> "It has"); df.shape[0] is the row count.
print("Data set imported. It has {} entries and looks like this:".format(df.shape[0]))
df.head()

Importing COMPAS data set... 
Data set imported. It is has 7214 entries and looks like this:


Unnamed: 0,entity_id,level_0,index,first,last,score,label_value,race,sex,age_cat,sex_pred,sex_pred_prob,race_pred,race_pred_prob
0,1,0,0,miguel,hernandez,0.0,0,Other,Male,Greater than 45,male,0.999286,HL,0.975499
1,3,1,1,kevon,dixon,0.0,1,African-American,Male,25 - 45,male,0.95672,B_NL,0.857965
2,4,2,2,ed,philo,0.0,1,African-American,Male,Less than 25,male,0.968813,A,0.611053
3,5,3,3,marcu,brown,1.0,0,African-American,Male,Less than 25,male,0.622665,B_NL,0.764072
4,6,4,4,bouthy,pierrelouis,0.0,0,Other,Male,25 - 45,male,0.509131,B_NL,0.800832


In [None]:
# >>> Sanity check: show every row that contains at least one missing value.
# An empty result means no data is missing.
df.loc[df.isna().any(axis="columns")]

In [None]:
# >>> Compare values for sex as given by COMPAS and NamSor

In [18]:
# Distinct labels in the `sex` column of the data set
df["sex"].unique()

array(['Male', 'Female'], dtype=object)

In [19]:
# Distinct labels in the predicted-sex column.
# NOTE(review): in file order this cell runs before `sex_pred` is capitalised,
# so on a fresh top-to-bottom run it shows the labels exactly as loaded.
df["sex_pred"].unique()

array(['Male', 'Female'], dtype=object)

In [17]:
# Give `sex_pred` the same spelling as `sex` by upper-casing the first letter
# of each label (e.g. 'male' -> 'Male'), then list the resulting labels.
# Reference: https://www.geeksforgeeks.org/capitalize-first-letter-of-a-column-in-pandas-dataframe/
df["sex_pred"] = df["sex_pred"].str.capitalize()
df["sex_pred"].unique()

Unnamed: 0,entity_id,level_0,index,first,last,score,label_value,race,sex,age_cat,sex_pred,sex_pred_prob,race_pred,race_pred_prob
0,1,0,0,miguel,hernandez,0.0,0,Other,Male,Greater than 45,Male,0.999286,HL,0.975499
1,3,1,1,kevon,dixon,0.0,1,African-American,Male,25 - 45,Male,0.95672,B_NL,0.857965
2,4,2,2,ed,philo,0.0,1,African-American,Male,Less than 25,Male,0.968813,A,0.611053
3,5,3,3,marcu,brown,1.0,0,African-American,Male,Less than 25,Male,0.622665,B_NL,0.764072
4,6,4,4,bouthy,pierrelouis,0.0,0,Other,Male,25 - 45,Male,0.509131,B_NL,0.800832


In [None]:
# >>> Compare values for race as given by COMPAS and NamSor

In [23]:
# Distinct labels in the `race` column of the data set
df["race"].unique()

array(['Other', 'African-American', 'Caucasian', 'Hispanic',
       'Native American', 'Asian'], dtype=object)

In [21]:
# Distinct values currently stored in the predicted-race column
df["race_pred"].unique()

array(['HL', 'B_NL', 'A', 'W_NL'], dtype=object)

In [None]:
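# The two columns use different class labels: `race` has six categories
# (Other, African-American, Caucasian, Hispanic, Native American, Asian),
# while `race_pred` only contains the four codes HL, B_NL, A and W_NL.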

In [23]:
def mapToCompas(namsor):
    """Translate a NamSor race code into the matching COMPAS ``race`` label.

    Parameters
    ----------
    namsor : str or missing value
        One of NamSor's codes ('A', 'B_NL', 'HL', 'W_NL'), or a missing
        value (None / NaN) when there is no prediction.

    Returns
    -------
    The COMPAS label for a known code, 'Other' for a missing value, and the
    input unchanged for any other value. Labels that were already translated
    therefore pass through untouched, so applying the mapping twice is safe.
    """
    # Missing predictions go to the 'Other' category, as the mapping notes for
    # this step specify (nan -> Other). Previously NaN was returned unchanged.
    if pd.isna(namsor):
        return 'Other'

    correspondingValues = {
        'A': 'Asian',
        'B_NL': 'African-American',
        'HL': 'Hispanic',
        'W_NL': 'Caucasian'
    }
    # Unknown, non-missing values are deliberately left as they are.
    return correspondingValues.get(namsor, namsor)

In [27]:
# Translate every value of `race_pred` into the label set used by `race`:
#   W_NL -> Caucasian
#   HL   -> Hispanic
#   B_NL -> African-American
#   A    -> Asian
# How missing or unrecognised codes are treated is decided inside mapToCompas.
df["race_pred"] = df["race_pred"].map(mapToCompas)

In [28]:
# Distinct values of `race_pred` after the mapping step
df["race_pred"].unique()

array(['Hispanic', 'African-American', 'Asian', 'Caucasian'], dtype=object)

In [None]:
# TODO: add to the report:
# "Output is W_NL (white, non latino), HL (hispano latino), A (asian, non latino), B_NL (black, non latino)."
# Missing from NamSor's output compared to COMPAS: Native American, Other
# Open question: when does NamSor return nan?
# Open question: in the COMPAS set, do African-American / Caucasian exclude Hispanic as well?
# Note: the US census distinguishes between race and ethnicity, see https://de.wikipedia.org/wiki/Race_(United_States_Census)

In [21]:
# Save the cleaned frame to 'data/compas_with_predictions_cleaned.csv'
print("Saving compas dataframe with predictions for gender and ethnicity to CSV... ")
# index=False: do not write the row index as an extra unnamed column. The
# loaded file already carries leftover index columns ('Unnamed: 0', 'level_0',
# 'index'); writing the index again would add one more on every round-trip.
df.to_csv("data/compas_with_predictions_cleaned.csv", index=False)
print("CSV saved!")

Saving compas dataframe with predictions for gender and ethnicity to CSV... 
CSV saved!
