# Set Up

In [None]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
import itertools
import matplotlib.pyplot as plt
import seaborn as sns

# SET PATH
# Directory (relative to the working directory) that the input CSVs are read
# from and that the generated frequency matrices are written back to.
PATH = "../data"

# Load Data

In [None]:
# Load gender-occupation template
# (index_col = 0: the first CSV column is used as the row index, not as data)
df_cats = pd.read_csv(f"{PATH}/gender_occupations_template.csv", index_col = 0)
# Load name-occupation template (read the same way: first CSV column is the row index)
df_names = pd.read_csv(f"{PATH}/names_occupations_template.csv", index_col = 0)


In [None]:
# Sanity check: report the (rows, columns) shape of each raw template dataset
print(f" Shape of Categories Dataset: {df_cats.shape}")
print(f" Shape of Names Dataset: {df_names.shape}")

 Shape of Categories Dataset: (252000, 2)
 Shape of Names Dataset: (200000, 2)


## Missing Titles

In [None]:
# Categories that are not part of the ICML paper; rows carrying any of these
# 'Name' labels are excluded from the missing-title inspection below
removes = [
  'Catholic man', 'Catholic woman',
  'Sikh woman', 'Sikh man',
  'Indian woman', 'Indian man',
  'Native American woman', 'Native American man',
]
df_keeps = df_cats.loc[~df_cats['Name'].isin(removes)]

In [None]:
# Frequency of each raw 'Title' value among the kept categories
# (values are still string-encoded lists here; '[]' means no title was recorded)
df_keeps['Title'].value_counts()

[]                                    20689
['waitress']                          15086
['nurse']                              9205
['maid']                               8441
['waiter']                             7386
                                      ...  
['courier', 'bodyguard']                  1
['teacher', 'assistant secretary']        1
['priest', 'usher']                       1
['courier', 'escort']                     1
['director', 'writer']                    1
Name: Title, Length: 2546, dtype: int64

In [None]:
# Frequency of each raw 'Title' value in the names dataset
# (values are still string-encoded lists here; '[]' means no title was recorded)
df_names['Title'].value_counts()

[]                                           37197
['consultant']                               14544
['writer']                                    7731
['lawyer']                                    7552
['journalist']                                6551
                                             ...  
['journalist', 'journalist', 'filmmaker']        1
['judge', 'founder']                             1
['messiah']                                      1
['nurse', 'nurse examiner']                      1
['journalist', 'research associate']             1
Name: Title, Length: 5142, dtype: int64

## Clean Categories Data

In [None]:
def clean_cats(i):
  '''Normalise a raw category label to a two-word "<category> <gender>" form.

  Bare 'man'/'woman' get a 'Base' prefix, the two-word 'Native American'
  prefix is hyphenated, and 'lesbian woman' is relabelled 'gay woman'.
  Any other value is returned unchanged.
  '''
  relabels = (
    ('man', 'Base man'),
    ('woman', 'Base woman'),
    ('Native American woman', 'Native-American woman'),
    ('Native American man', 'Native-American man'),
    ('lesbian woman', 'gay woman'),
  )
  for raw_label, cleaned_label in relabels:
    if i == raw_label:
      return cleaned_label
  return i

def clean_genders(i):
  '''Translate a gender word into its one-letter code.

  'man' maps to 'M' and 'woman' maps to 'F'; every other value yields None.
  '''
  for word, code in (('man', 'M'), ('woman', 'F')):
    if i == word:
      return code
  return None

def make_freq_matrix(input_df, names = True):
  '''Convert raw string-encoded title lists into a one-hot encoded matrix.

  Each row's 'Title' (the string form of a Python list) is parsed and
  exploded to one row per title, and every distinct title becomes an
  indicator column. A row with k titles yields exactly k output rows; a row
  with an empty list yields one output row with every indicator at zero.

  Parameters:
    input_df: DataFrame with at least the columns 'Name' and 'Title'.
      It is not modified.
    names: if truthy, the frame is first passed through clean_names, which
      supplies the Gender and Category columns. If falsy, 'Name' is treated
      as a "<category> <gender>" label and split into Category and Gender.

  Returns:
    DataFrame whose leading columns describe the row (Name/Gender/Category
    for names data, Category/Gender for categories data), followed by one
    indicator column per distinct title. The index labels of input_df are
    kept, repeated once per exploded title.
  '''
  df = clean_names(input_df) if names else input_df.copy()
  # 'Title' holds the string form of a list; parse it back into a real list
  df['Title'] = df['Title'].apply(literal_eval)
  df = df.explode('Title')
  # explode() repeats the original index label for every title of a row.
  # Joining a separately built dummies frame back on that index would be a
  # many-to-many join (k*k rows for a row with k titles) and would inflate
  # the counts, so the column is encoded in place instead. The empty
  # prefix/separator keeps the bare title as the column name.
  hot_df = pd.get_dummies(df, columns = ['Title'], prefix = '', prefix_sep = '')
  if not names:
    # Expand categories to (Gender, Intersection) Pairs
    hot_df['Name_expanded'] = hot_df['Name'].map(clean_cats)
    # Whitespace split: relies on clean_cats having reduced labels to two words
    hot_df[['Category', 'Gender']] = hot_df['Name_expanded'].str.split(expand = True)
    hot_df['Category'] = hot_df['Category'].str.lower()
    # Replace gender label
    hot_df['Gender'] = hot_df['Gender'].map(clean_genders)
    # Reorder columns: Category and Gender first, title indicators after
    hot_df = hot_df.drop(['Name', 'Name_expanded'], axis = 1)
    leading = ['Category', 'Gender']
    trailing = [col for col in hot_df.columns if col not in leading]
    hot_df = hot_df.loc[:, leading + trailing]
  return hot_df

In [None]:
# Create frequency matrix
# (names = False: skip clean_names and derive Category/Gender by splitting the 'Name' label)
freq_matrix_cats = make_freq_matrix(df_cats, names = False)

In [None]:
# Keep only the categories covered in the ICML paper; 'Category' values were
# lower-cased when the matrix was built, hence the lower-case labels here
drop_cats = ['native-american', 'indian', 'catholic', 'sikh']
freq_matrix_cats = freq_matrix_cats.loc[~freq_matrix_cats['Category'].isin(drop_cats)]

In [None]:
# Convert columns to lower case
# (applies to every column label, so 'Category'/'Gender' become 'category'/'gender' as well)
freq_matrix_cats.columns = freq_matrix_cats.columns.str.lower()

In [None]:
# Read the job replacement table, discard rows with any missing value and
# renumber the surviving rows 0..n-1
job_replacements = (
  pd.read_csv(f"{PATH}/job_replacements.csv")
  .dropna()
  .reset_index(drop = True)
)

In [None]:
# Create column renaming dictionary: raw job title -> replacement title.
# Zipping the two columns replaces the positional index loop; as before, a
# later row with a repeated 'job' value overrides an earlier one.
replacements_dict = dict(zip(job_replacements['job'], job_replacements['update_match']))
# Rename columns
freq_matrix_cats = freq_matrix_cats.rename(columns = replacements_dict)
# Aggregate duplicate columns: several raw titles can be renamed to the same
# label, so identically named columns are summed into one.
# NOTE(review): groupby(axis=1) is deprecated in recent pandas releases -
# confirm the pinned pandas version before upgrading.
freq_matrix_cats = freq_matrix_cats.groupby(axis=1, level=0).sum()
# Save as csv
freq_matrix_cats.to_csv(f"{PATH}/freq_matrix_cats.csv")

## Clean Names Data

In [None]:
def clean_names(input_df, block_size = 20000):
  '''Clean the names data and attach Gender and continent (Category) labels.

  Labels are derived from each row's integer index label. The labels are
  read as consecutive blocks of `block_size`: the first five blocks are
  labelled 'F' and the next five 'M', and within each group the blocks are
  labelled Africa, Americas, Asia, Europe, Oceania in that order. Rows whose
  index label falls outside those ten blocks keep empty-string labels.
  Rows whose Name is 'Princess' are dropped.

  Only rows present in input_df are labelled; no rows are ever added, so
  inputs shorter than ten full blocks are handled correctly.

  Parameters:
    input_df: DataFrame with 'Name' and 'Title' columns and an integer
      index. It is not modified.
    block_size: number of consecutive index labels per (gender, continent)
      block. Defaults to 20000.

  Returns:
    A new DataFrame with the columns Name, Gender, Category, Title.
  '''
  continents = np.array(['Africa', 'Americas', 'Asia', 'Europe', 'Oceania'], dtype = object)
  n_continents = len(continents)

  # Block number of every existing row, computed from its index label
  block = (input_df.index.to_numpy() // block_size).astype(np.int64)
  # Labels below 0 or beyond the ten defined blocks are left unlabelled
  labelled = (block >= 0) & (block < 2 * n_continents)

  category = np.full(len(input_df), '', dtype = object)
  category[labelled] = continents[block[labelled] % n_continents]
  gender = np.full(len(input_df), '', dtype = object)
  gender[labelled] = np.where(block[labelled] < n_continents, 'F', 'M')

  df = input_df.copy()
  # Arrays are assigned positionally, so repeated index labels are harmless
  df['Gender'] = gender
  df['Category'] = category
  df = df[['Name', 'Gender', 'Category', 'Title']]
  return df[df['Name'] != 'Princess']

In [None]:
# Build the one-hot title matrix for the names data (names = True routes the
# frame through clean_names first) and lower-case every column label
freq_matrix_names = make_freq_matrix(df_names, names = True)
freq_matrix_names.columns = freq_matrix_names.columns.str.lower()
# Apply the job-title replacements, then sum columns that ended up sharing a label
freq_matrix_names = (
  freq_matrix_names
  .rename(columns = replacements_dict)
  .groupby(axis=1, level=0)
  .sum()
)
# Write the result next to the input data
freq_matrix_names.to_csv(f"{PATH}/freq_matrix_names.csv")