# 01 Data Preparation

To the pre-prepared COMPAS data we add predictions for name gender and origin made as made by NamSor API.

In [130]:
# >>> Import Libraries

print("Importing necessary libraries... ")

import openapi_client #NamSor, see https://github.com/namsor/namsor-python-sdk2
from openapi_client.rest import ApiException

from aequitas.group import Group # Aequitas, see https://github.com/dssg/aequitas/blob/master/docs/source/examples/compas_demo.ipynb
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot

import pandas as pd

print("Libraries imported.")

Importing necessary libraries... 
Libraries imported.


In [180]:
# >>> Import COMPAS data set

print("Importing COMPAS data set... ")

df = pd.read_csv("data/compas_for_namsor.csv")

print("Data set imported. It is has {} entries and looks like this:".format(df.shape[0]))
df.head()

Importing COMPAS data set... 
Data set imported. It is has 7214 entries and looks like this:


Unnamed: 0,entity_id,first,last,score,label_value,race,sex,age_cat
0,1,miguel,hernandez,0.0,0,Other,Male,Greater than 45
1,3,kevon,dixon,0.0,1,African-American,Male,25 - 45
2,4,ed,philo,0.0,1,African-American,Male,Less than 25
3,5,marcu,brown,1.0,0,African-American,Male,Less than 25
4,6,bouthy,pierrelouis,0.0,0,Other,Male,25 - 45


In [3]:
# >>> Preparing for API use

# Get private API Key for NamSor API v2 (contained in txt file)
print("Getting private key... ")

key = ''

try:
    with open("key.txt", "r") as file:
        key = file.read()
    if(len(key) < 0):
        raise FileNotFoundError()
except (FileNotFoundError):
    print("Could not find private key. Please make sure you have an API key that you stored as key.txt in the root folder.")

print("Got private key.")

Getting private key... 
Got private key.


In [4]:
print("Setting up NamSor API v2 connection settings...")

# Configure API key authorization: api_key
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key

# create an instance of the personal API class
pers_api_instance = openapi_client.PersonalApi(openapi_client.ApiClient(configuration))

print("Connection set.")

Setting up NamSor API v2 connection settings...
Connection set.


In [5]:
# >>> Classifying names with NamSor API

# Formatting a df of names and iso2 country codes, which is always 'us' in the COMPAS data set
print('Formatting names dataframe...')

names_df = df[['entity_id', 'first', 'last']]

print('Names dataframe formatted. It looks like this: ')
print(names_df.head())

Formatting names dataframe...
Names dataframe formatted. It looks like this: 
   entity_id   first         last
0          1  miguel    hernandez
1          3   kevon        dixon
2          4      ed        philo
3          5   marcu        brown
4          6  bouthy  pierrelouis


In [71]:
def to_first_last_name_geo_in(row) :
    ''' This function turns a tuple of values [id, first_name, last_name] into a to_first_last_name_geo_in object'''
    # https://github.com/namsor/namsor-python-sdk2/blob/master/docs/FirstLastNameGeoIn.md
    if(not row[0] or not row[1] or not row[2]):
        print("Entered invalid data to be turned into to_first_last_name_geo_in")
        return
    return openapi_client.FirstLastNameGeoIn(id=row[0], 
                                         first_name=row[1], 
                                         last_name=row[2], 
                                         country_iso2='us') # http://www.vas.com/Tnotes/Country%20Codes.htm

In [72]:
# Formatting a list of batches from the names df so names can be fed to the API batch-wise
print('Creating list of name-batches...')

names_stack = list() # this will be a list of name-batches generated from the df

limit = len(names_df.index)
start = 0
end = -1
batch_size = 1000 #1000 is the API limit given by NamSor

while(end < limit):
    start = end + 1
    
    try_end = end + batch_size
    if(try_end <= limit):
        end = try_end
    else:
        end = limit
    
    # each list item will fit openapi_client.BatchFirstLastNameGeoIn
    current_df_batch = names_df[start:end]
    
    # https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas/55557758#55557758
    list_first_last_name_geo_in = [to_first_last_name_geo_in(row) for row in current_df_batch[['entity_id', 'first', 'last']].to_numpy()]
    names_stack.append(list_first_last_name_geo_in)
    
print('List of batches created.')

print('Will need to make {} calls.'.format(len(names_stack)))

Creating list of name-batches...
[1 'miguel' 'hernandez']
[3 'kevon' 'dixon']
[4 'ed' 'philo']
[5 'marcu' 'brown']
[6 'bouthy' 'pierrelouis']
[7 'marsha' 'miles']
[8 'edward' 'riddle']
[9 'steven' 'stewart']
[10 'elizabeth' 'thieme']
[13 'bo' 'bradac']
[14 'benjamin' 'franc']
[15 'ellyaher' 'lanza']
[16 'kortney' 'coleman']
[18 'jarrod' 'turbe']
[19 'craig' 'gilbert']
[20 'samuel' 'seraphin']
[21 'mario' 'hernandez']
[22 'darrious' 'davis']
[23 'neil' 'heckart']
[24 'michael' 'lux']
[25 'columbus' 'wilson']
[26 'vandivuiet' 'williams']
[27 'nelson' 'avalo']
[28 'janel' 'denicola']
[30 'dominic' 'pabon']
[32 'russell' 'sottile']
[33 'andre' 'ashley']
[37 'deandrae' 'counts']
[38 'victoria' 'soltau']
[39 'najee' 'sapp']
[40 'victor' 'moreno']
[41 'william' 'pirkle']
[42 'maslin' 'brutus']
[45 'mark' 'friedland']
[50 'maurice' 'watson']
[51 'kurt' 'fowks']
[52 'michael' 'tritsch']
[53 'brooks' 'nunez']
[54 'walter' 'atwell']
[55 'darling' 'madrano']
[56 'kiante' 'slocum']
[57 'porfirio' '

[3458 'jose' 'chavez']
[3460 'brice' 'hawes']
[3462 'vidal' 'cervantes']
[3463 'john' 'grant']
[3464 'christopher' 'mcghie']
[3465 'herold' 'delinois']
[3468 'oneil' 'kerone']
[3470 'jamal' 'khan']
[3474 'sean' 'hanrahan']
[3475 'donald' 'duke']
[3476 'eliasin' 'reyes']
[3479 'shalika' 'scott']
[3480 'willie' 'wiggins']
[3481 'shantel' 'smith']
[3483 'jason' 'herring']
[3484 'samuel' 'bellamy']
[3485 'lemariani' 'camel']
[3487 'raul' 'calvet']
[3488 'james' 'conroy']
[3489 'wilnika' 'wilson']
[3491 'rogrigo' 'penaranda']
[3492 'roderick' 'thomas']
[3493 'gymmy' 'justin']
[3498 'nakiea' 'maywa']
[3499 'johnny' 'masses']
[3500 'justin' 'darrall']
[3501 'jamal' 'jackson']
[3502 'sammy' 'garcia']
[3504 'jacolby' 'floyd']
[3506 'timothy' 'newson']
[3507 'paulo' 'lapa']
[3508 'elexus' 'shell']
[3510 'michael' 'ridgley']
[3511 'angel' 'navarro']
[3512 'christopher' 'stewart']
[3515 'james' 'true']
[3516 'onique' 'williams']
[3517 'edson' 'ferrari']
[3519 'anthony' 'irby']
[3521 'nickson' 'mar

[5858 'juan' 'hidalgo']
[5861 'sharon' 'abel']
[5863 'stephen' 'maxwell']
[5864 'jose' 'alfonso']
[5865 'james' 'hawkins']
[5867 'kenroy' 'whitley']
[5869 'javier' 'gonzalez']
[5871 'david' 'wilkinson']
[5873 'eric' 'mobley']
[5875 'daniel' 'campbell']
[5877 'joseph' 'pryor']
[5878 'terryann' 'mcdonald']
[5880 'addison' 'oliver']
[5881 'ken' 'delva']
[5882 'philo' 'little']
[5883 'courtney' 'walraven']
[5885 'raymond' 'platt']
[5886 'glenn' 'bard']
[5887 'ricky' 'joseph']
[5889 'aeritta' 'covington']
[5890 'romone' 'gray']
[5891 'david' 'gutman']
[5892 'rochelle' 'rochester']
[5895 'kevin' 'sasnett']
[5896 'carlos' 'pastrana']
[5898 'luis' 'gonzalez']
[5899 'freddy' 'hall']
[5900 'matthew' 'logiudice']
[5901 'joshua' 'perez']
[5902 'shanard' 'twensey']
[5904 'kevin' 'edwards']
[5905 'paul' 'demus']
[5906 'jerline' 'jean']
[5907 'anthony' 'williams']
[5908 'john' 'wiborg']
[5912 'julio' 'esquiagola']
[5914 'donald' 'perdue']
[5915 'abrahiem' 'darwish']
[5916 'devin' 'cooper']
[5917 'cli

[9061 'shanard' 'roland']
[9062 'april' 'hood']
[9064 'tyrel' 'fairclough']
[9067 'jacob' 'domkoski']
[9070 'bruce' 'sinert']
[9072 'james' 'travis']
[9074 'wendell' 'thomas']
[9075 'sussan' 'cabrera']
[9076 'shameka' 'lowery']
[9077 'joshua' 'mercado']
[9079 'jerko' 'delacruz']
[9080 'juan' 'guevara']
[9081 'lansberth' 'blackwood']
[9083 'ed' 'hughes']
[9085 'joseph' 'brock']
[9086 'nicholas' 'graham']
[9087 'robert' 'grady']
[9088 'william' 'rodman']
[9089 'michael' 'preston']
[9091 'mark' 'ribeiro']
[9092 'jesus' 'roa']
[9093 'jerel' 'dean']
[9094 'mark' 'simon']
[9096 'janoi' 'turner']
[9097 'terrance' 'rogers']
[9099 'vanessa' 'byrd']
[9102 'patrick' 'shirley']
[9104 'alfonso' 'glenn']
[9106 'pedro' 'melo']
[9109 'shane' 'hall']
[9110 'james' 'wilson']
[9111 'andrew' 'thornton']
[9112 'jason' 'mutrux']
[9115 'robert' 'bochini']
[9116 'ramon' 'matute']
[9119 'jabaris' 'gibson']
[9120 'rodquez' 'lovett']
[9123 'jason' 'franco']
[9125 'nicanor' 'durand']
[9126 'deangelo' 'ash']
[9127

In [80]:
# TODO: Test & Update for names separated 
def predict_batch(list_first_last_name_geo_in):
    batch_first_last_name_geo_in = openapi_client.BatchFirstLastNameGeoIn(personal_names=list_first_last_name_geo_in)
    api_response = pers_api_instance.gender_geo_batch(batch_first_last_name_geo_in=batch_first_last_name_geo_in)# call api
    return api_response.personal_names # return result

In [88]:
# Sending in one batch at a time and saving the result answer by answer.

print("Sending batches to the API...")

result = []
current = 0
limit = 2 #len(names_stack)

while(len(result) < limit):
    try:
        result.extend(predict_batch(names_stack[current]))
        current += 1
    except ApiException as e:
        print("Exception when calling PersonalApi: gender_geo_batch: {}".format(e))
        if(len(result) == (batch_size * current + len(names_stack[current]))):
            print("No names got lost. Trying again with stack size {}...".format(len(names_stack)))
        else:
            print("Some names got lost when the exception at stack {} occurred. Please try again.".format(current))

print("All batches analyzed.")
print(result)

[{'first_name': 'miguel',
 'gender_scale': -0.9918105205926329,
 'id': '1',
 'last_name': 'hernandez',
 'likely_gender': 'male',
 'probability_calibrated': 0.9959052602963164,
 'score': 40.48141974289901}, {'first_name': 'kevon',
 'gender_scale': -0.9515955909827281,
 'id': '3',
 'last_name': 'dixon',
 'likely_gender': 'male',
 'probability_calibrated': 0.975797795491364,
 'score': 23.930992858410622}, {'first_name': 'ed',
 'gender_scale': -0.9609662625502784,
 'id': '4',
 'last_name': 'philo',
 'likely_gender': 'male',
 'probability_calibrated': 0.9804831312751392,
 'score': 25.219659863806516}, {'first_name': 'marcu',
 'gender_scale': -0.5838491405761623,
 'id': '5',
 'last_name': 'brown',
 'likely_gender': 'male',
 'probability_calibrated': 0.7919245702880812,
 'score': 9.503246398640567}, {'first_name': 'bouthy',
 'gender_scale': -0.28456843167061563,
 'id': '6',
 'last_name': 'pierrelouis',
 'likely_gender': 'male',
 'probability_calibrated': 0.6422842158353078,
 'score': 3.636442

In [167]:
df['entity_id'] = df['entity_id'].astype('str')
#print(type(df['entity_id'][0]))
df.reset_index()
df.set_index('entity_id', inplace=True)
#df.index
df.loc['3'].at['first']

'kevon'

In [182]:
# >>> TODO: Save results to dataframe
'''
list of results looks like this:
[{'first_name': 'miguel',
 'gender_scale': -0.9918105205926329,
 'id': '1',
 'last_name': 'hernandez',
 'likely_gender': 'male',
 'probability_calibrated': 0.9959052602963164,
 'score': 40.48141974289901}, {'first_name': 'kevon',
 'gender_scale': -0.9515955909827281,
 'id': '3',
 'last_name': 'dixon',
 'likely_gender': 'male',
 'probability_calibrated': 0.975797795491364,
 'score': 23.930992858410622}]
'''

df.reset_index(inplace=True)
df.set_index('entity_id', inplace=True)

# Convert results (list of openapi_client.models.personal_name_gendered_out.PersonalNameGenderedOut) to (list of dictionaries)
print('Filling the results into the names dataframe...')
for oapi_el in result:
    current_id = int(oapi_el.id)
    df.loc[current_id, 'sex_p'] = oapi_el.likely_gender
    df.loc[current_id, 'sex_p_prob'] = oapi_el.probability_calibrated
    
print('Dataframe completed with API results. Here are some results: {}'.format(df.head()))

Filling the results into the names dataframe...
Dataframe completed with API results. Here are some results:            index   first         last  score  label_value              race  \
entity_id                                                                     
1              0  miguel    hernandez    0.0            0             Other   
3              1   kevon        dixon    0.0            1  African-American   
4              2      ed        philo    0.0            1  African-American   
5              3   marcu        brown    1.0            0  African-American   
6              4  bouthy  pierrelouis    0.0            0             Other   

            sex          age_cat sex_p  sex_p_prob  
entity_id                                           
1          Male  Greater than 45  male    0.995905  
3          Male          25 - 45  male    0.975798  
4          Male     Less than 25  male    0.980483  
5          Male     Less than 25  male    0.791925  
6          Male    