# 01 Data Preparation

To the pre-prepared COMPAS data we add predictions of name gender and origin made by the NamSor API.

In [None]:
# >>> Import Libraries

print("Importing necessary libraries... ")

import openapi_client #NamSor, see https://github.com/namsor/namsor-python-sdk2
from openapi_client.rest import ApiException

from aequitas.group import Group # Aequitas, see https://github.com/dssg/aequitas/blob/master/docs/source/examples/compas_demo.ipynb
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot

import pandas as pd

print("Libraries imported.")

In [None]:
# >>> Load the COMPAS data set

print("Importing COMPAS data set... ")

# The path is relative to the working directory the notebook runs in.
df = pd.read_csv("data/compas_for_namsor.csv")

# Report the number of rows, then let the notebook render the first rows.
print(
    "Data set imported. It is has {} entries and looks like this:".format(
        len(df.index)
    )
)
df.head()

In [None]:
# >>> Preparing for API use

# Get private API Key for NamSor API v2 (contained in txt file)
print("Getting private key... ")

# `key` stays empty when no usable key could be read.
key = ''

try:
    with open("key.txt", "r") as file:
        # Strip surrounding whitespace so a trailing newline in the file
        # does not become part of the key.
        key = file.read().strip()
except FileNotFoundError:
    # Handled below together with the "file exists but is empty" case.
    pass

# Report success only when a non-empty key was actually read.
if key:
    print("Got private key.")
else:
    print("Could not find private key. Please make sure you have an API key that you stored as key.txt in the root folder.")

In [None]:
print("Setting up NamSor API v2 connection settings...")

# Store the private key under the 'X-API-KEY' entry of the client configuration.
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = key

# Build the API client first, then the PersonalApi wrapper on top of it.
api_client = openapi_client.ApiClient(configuration)
pers_api_instance = openapi_client.PersonalApi(api_client)

print("Connection set.")

In [None]:
# TODO: Test & Update for names separated 
def predict_batch(li):
    """Classify a list of unformatted names with the NamSor API.

    Each name in ``li`` is wrapped in a ``PersonalNameIn`` whose ``id`` and
    ``name`` are both the raw name. The wrapped names are sent as one
    ``BatchPersonalNameIn`` through the module-level ``pers_api_instance``.

    Args:
        li: List of raw name strings to classify.

    Returns:
        The ``personal_names`` list of the API response, or an empty list
        when ``li`` is empty (no API call is made in that case).

    Errors raised by the API client are not handled here and propagate to
    the caller.
    """
    if not li:
        return []

    # Build one request object per name; the name doubles as its own id.
    personal_names = [
        openapi_client.PersonalNameIn(id=name, name=name) for name in li
    ]
    batch_personal_name_in = openapi_client.BatchPersonalNameIn(
        personal_names=personal_names
    )
    api_response = pers_api_instance.gender_full_batch(
        batch_personal_name_in=batch_personal_name_in
    )
    return api_response.personal_names

In [None]:
# >>> Classifying names with NamSor API

# Sending in one batch at a time and saving the result answer by answer.

batch_size = 1000  # 1000 is the API limit given by NamSor
start = 0
end = batch_size
result = []

names_stack = []  # TODO: create list of first and last names

# Round up: a final, partially filled batch still costs one API call.
calls_needed = (len(names_stack) + batch_size - 1) // batch_size
print('Will need to make {} calls.'.format(calls_needed))

In [None]:
# Work through the stack one batch at a time until it is empty. A batch is
# removed from the stack only after its answers were stored, so a failed call
# can simply be retried with the same names.

# Invariant used to detect lost names: pending + classified stays constant.
expected_total = len(names_stack) + len(result)

# Stop after this many failed calls in a row instead of retrying forever
# (e.g. when the API key is invalid).
max_consecutive_failures = 5
consecutive_failures = 0

while names_stack:
    # The last batch may be shorter than batch_size; slicing handles that.
    batch = names_stack[:batch_size]
    try:
        batch_result = predict_batch(batch)
    except ApiException as e:  # Sometimes with a big batch of batches, the API calling gets interrupted (don't panic!)
        print("Exception when calling PersonalApi: gender_full_batch: {}".format(e))
        consecutive_failures += 1

        if len(names_stack) + len(result) != expected_total:  # check that no names got lost
            print("Some names got lost when the exception occurred. Please try again.")
            break
        if consecutive_failures >= max_consecutive_failures:
            print("Giving up after {} failed calls in a row. {} names left.".format(consecutive_failures, len(names_stack)))
            break

        print("No names got lost. Trying again with stack size {}...".format(len(names_stack)))
        continue

    result.extend(batch_result)
    del names_stack[:len(batch)]  # delete the names that have already been categorized from the stack
    consecutive_failures = 0
    print("Batch of names analyzed. {} names left.".format(len(names_stack)))

# Only claim success when nothing is left on the stack.
if not names_stack:
    print("All batches analyzed.")

In [None]:
# >>> TODO: Save results to dataframe

# Copy each API answer (openapi_client.models.personal_name_gendered_out.PersonalNameGenderedOut)
# into the row of `names` whose index label equals the answer's id.
# NOTE(review): `names` is not created in any visible cell. It must exist
# before this cell runs, presumably as a dataframe indexed by the ids that
# were sent to the API - TODO confirm.
print('Filling the results into the names dataframe...')
for oapi_el in result:
    names.at[oapi_el.id, 'likely_gender'] = oapi_el.likely_gender
    names.at[oapi_el.id, 'score'] = oapi_el.score
print('Dataframe completed with API results. Here are some results: {}'.format(names[:10]))