# 01 Data Preparation

To the pre-prepared COMPAS data we add predictions of gender and origin, as inferred from each person's name by the NamSor API.

In [None]:
# >>> Import Libraries

print("Importing necessary libraries... ")

import openapi_client #NamSor, see https://github.com/namsor/namsor-python-sdk2
from openapi_client.rest import ApiException

import pandas as pd

print("Libraries imported.")

In [None]:
# >>> Load the COMPAS data set

print("Importing COMPAS data set... ")

# Read the CSV into the dataframe `df` that the following cells work on.
df = pd.read_csv("data/compas_for_namsor.csv")

# Report the number of rows, then show the first few of them.
entry_count = len(df.index)
print("Data set imported. It is has {} entries and looks like this:".format(entry_count))
df.head()

In [None]:
# >>> Preparing for API use

# Get private API Key for NamSor API v2 (contained in txt file)
def read_api_key(path="key.txt"):
    """Return the API key stored in the text file at *path*.

    Surrounding whitespace (such as the trailing newline most editors add)
    is stripped so that only the key itself is returned.

    Returns an empty string when the file does not exist or contains
    nothing but whitespace.
    """
    try:
        with open(path, "r") as file:
            return file.read().strip()
    except FileNotFoundError:
        return ''


print("Getting private key... ")

key = read_api_key()

# Only report success when a non-empty key was actually read; a missing or
# empty file leaves `key` as '' and is reported instead.
if key:
    print("Got private key.")
else:
    print("Could not find private key. Please make sure you have an API key that you stored as key.txt in the root folder.")

In [None]:
print("Setting up NamSor API v2 connection settings...")

# Store the private key under the 'X-API-KEY' entry of a new configuration.
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key

# Wrap the configuration in an API client and build the PersonalApi instance
# from it.
api_client = openapi_client.ApiClient(configuration)
pers_api_instance = openapi_client.PersonalApi(api_client)

print("Connection set.")

In [None]:
# >>> Classifying names with NamSor API

# Reduce the data set to the identifier and the two name columns.
print('Formatting names dataframe...')

name_columns = ['entity_id', 'first', 'last']
names_df = df[name_columns]

print('Names dataframe formatted. It looks like this: ')
preview = names_df.head()
print(preview)

In [None]:
def to_first_last_name_geo_in(row):
    """Turn ``[id, first_name, last_name]`` into a ``FirstLastNameGeoIn`` object.

    Args:
        row: indexable with at least three items: id, first name, last name.

    Returns:
        An ``openapi_client.FirstLastNameGeoIn`` with ``country_iso2='us'``,
        or ``None`` (after printing a message) when any of the three values
        is missing: ``None``, NaN or an empty/blank string.
    """
    # https://github.com/namsor/namsor-python-sdk2/blob/master/docs/FirstLastNameGeoIn.md

    def _is_missing(value):
        """Return True for None, NaN and empty or whitespace-only strings."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        # NaN is the only value that is not equal to itself. It is truthy, so
        # a plain `not value` check lets it through, while the same check
        # wrongly rejects a legitimate id of 0.
        return bool(value != value)

    if len(row) < 3 or any(_is_missing(value) for value in row[:3]):
        print("Entered invalid data to be turned into to_first_last_name_geo_in")
        return None
    return openapi_client.FirstLastNameGeoIn(id=row[0],
                                             first_name=row[1],
                                             last_name=row[2],
                                             country_iso2='us')  # http://www.vas.com/Tnotes/Country%20Codes.htm

In [None]:
# Formatting a list of batches from the names df so names can be fed to the API batch-wise
print('Creating list of name-batches...')

names_stack = []  # this will be a list of name-batches generated from the df

limit = len(names_df.index)
batch_size = 1000  # 1000 is the API limit given by NamSor

# Walk over the rows in contiguous, non-overlapping windows of `batch_size`.
# Each window starts exactly where the previous one ended (the slice end is
# exclusive), so no row is skipped at a batch boundary.
for start in range(0, limit, batch_size):
    end = min(start + batch_size, limit)

    # each list item will fit openapi_client.BatchFirstLastNameGeoIn
    current_df_batch = names_df.iloc[start:end]

    # https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas/55557758#55557758
    rows = current_df_batch[['entity_id', 'first', 'last']].to_numpy()

    # to_first_last_name_geo_in returns None for rows it reports as invalid;
    # those are left out so that a batch only holds real request objects.
    list_first_last_name_geo_in = [
        name_in
        for name_in in map(to_first_last_name_geo_in, rows)
        if name_in is not None
    ]

    # A window made up entirely of invalid rows produces nothing to send.
    if list_first_last_name_geo_in:
        names_stack.append(list_first_last_name_geo_in)

print('List of batches created.')

print('Will need to make {} calls.'.format(len(names_stack)))

In [None]:
def get_batch(list_first_last_name_geo_in):
    """Wrap a list of name inputs in a ``BatchFirstLastNameGeoIn`` object."""
    batch_request = openapi_client.BatchFirstLastNameGeoIn(
        personal_names=list_first_last_name_geo_in,
    )
    return batch_request

In [None]:
def predict_gender_batch(list_first_last_name_geo_in):
    """Request gender predictions for one batch of names.

    Args:
        list_first_last_name_geo_in: the batch to send. It is forwarded to
            ``gender_geo_batch`` as its ``batch_first_last_name_geo_in``
            keyword argument.

    Returns:
        The ``personal_names`` attribute of the API response.
    """
    # Send the argument that was passed in, not whatever module-level
    # variable happens to carry the batch at call time.
    api_response = pers_api_instance.gender_geo_batch(
        batch_first_last_name_geo_in=list_first_last_name_geo_in)  # call api
    return api_response.personal_names

In [None]:
def predict_ethnicity_batch(list_first_last_name_geo_in):
    """Request US race/ethnicity predictions for one batch of names.

    Args:
        list_first_last_name_geo_in: the batch to send. It is forwarded to
            ``us_race_ethnicity_batch`` as its ``batch_first_last_name_geo_in``
            keyword argument.

    Returns:
        The ``personal_names`` attribute of the API response.
    """
    # "Output is W_NL (white, non latino), HL (hispano latino), A (asian, non latino), B_NL (black, non latino)."
    # Send the argument that was passed in, not whatever module-level
    # variable happens to carry the batch at call time.
    api_response = pers_api_instance.us_race_ethnicity_batch(
        batch_first_last_name_geo_in=list_first_last_name_geo_in)  # call api
    return api_response.personal_names

In [None]:
# Sending in one batch at a time and saving the result answer by answer.

print("Sending batches to the API...")

result_gender = []
result_ethnicity = []
max_attempts = 3  # how often one batch is sent before giving up
all_batches_done = True

for current, names_batch in enumerate(names_stack):
    print(current)

    # Nothing to classify in an empty batch, so no request is made for it.
    if not names_batch:
        continue

    batch_first_last_name_geo_in = get_batch(names_batch)

    for attempt in range(1, max_attempts + 1):
        try:
            # Fetch both predictions before storing either of them, so a
            # failure in the second call cannot leave the two result lists
            # with different lengths.
            gender_batch = predict_gender_batch(batch_first_last_name_geo_in)
            ethnicity_batch = predict_ethnicity_batch(batch_first_last_name_geo_in)
        except ApiException as e:
            print("Exception when calling PersonalApi: {}".format(e))
            print("Nothing from stack {} was stored (attempt {} of {}).".format(
                current, attempt, max_attempts))
            continue
        result_gender.extend(gender_batch)
        result_ethnicity.extend(ethnicity_batch)
        break
    else:
        # Every attempt for this batch raised: stop here instead of moving on
        # with a gap in the results.
        print("Giving up on stack {} after {} attempts. Please try again.".format(
            current, max_attempts))
        all_batches_done = False
        break

if all_batches_done:
    print("All batches analyzed.")
else:
    print("Stopped early; the results are incomplete.")
print(result_gender[:5])
print(result_ethnicity[:5])

In [None]:
# >>> Save results to dataframe

# NOTE(review): reset_index() without drop=True keeps the previous index as a
# regular column, which then also ends up in the saved data - confirm that
# this is wanted.
df.reset_index(inplace=True)
df.set_index('entity_id', inplace=True)

# Copy the predictions from the result objects into new dataframe columns.
# Every result is written to the row named by its OWN id, so the two result
# lists need neither the same length nor the same order.
# NOTE(review): the id is cast to int to look up the 'entity_id' index -
# presumably entity_id is an integer column; confirm.
print('Filling the results into the names dataframe...')
for oapi_el in result_gender:
    current_id = int(oapi_el.id)
    df.loc[current_id, 'sex_pred'] = oapi_el.likely_gender
    df.loc[current_id, 'sex_pred_prob'] = oapi_el.probability_calibrated

for oapi_el in result_ethnicity:
    current_id = int(oapi_el.id)
    df.loc[current_id, 'race_pred'] = oapi_el.race_ethnicity
    df.loc[current_id, 'race_pred_prob'] = oapi_el.probability_calibrated

print('Dataframe completed with API results. Here are some results: {}'.format(df.head()))

In [None]:
# Write the dataframe, including the added prediction columns, to
# 'data/compas_with_predictions.csv'
print("Saving compas dataframe with predictions for gender and ethnicity to CSV... ")
output_path = "data/compas_with_predictions.csv"
df.to_csv(output_path)
print("CSV saved!")