### Imports

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from IPython.display import display
import json
import numpy as np
import pandas as pd
import os
import random
import re
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import tensorflow as tf

### Read scored test data

In [2]:
standard_data_path = 'gs://conversationai-models/biosbias/scored_data/test_data_0327_v2.csv'
# scrubbed_data_path = 'gs://conversationai-models/biosbias/scored_data/scrubbed_test.csv'

# Read the scored test set, keeping one row per distinct 'tokens' value.
# The file is opened in a `with` block so the handle is closed as soon as
# pandas has finished parsing it.
with tf.gfile.Open(standard_data_path) as scored_file:
    perf_df = pd.read_csv(scored_file).drop_duplicates(subset=['tokens'])
# scrubbed_df = pd.read_csv(tf.gfile.Open(scrubbed_data_path)).drop_duplicates(subset=['tokens'])

In [3]:
print(perf_df.shape)
#print(scrubbed_df.shape)

(59824, 267)


In [4]:
df = perf_df
#df = perf_df.join(scrubbed_df, rsuffix = '_scrubbed')

In [5]:
df.head()

Unnamed: 0,tokens,gender,label,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_0,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_1,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_2,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_3,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_4,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_5,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_6,...,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_23,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_24,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_25,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_26,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_27,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_28,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_29,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_30,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_31,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_32
0,"[u'ms.', u'grandal', u'practices', u'medicine'...",F,23,5.727137e-10,2.916685e-28,8.39932e-09,1.294314e-08,2.742447e-08,5.902747e-15,5.34621e-16,...,0.999982,3.449516e-12,5.492861e-09,1e-06,1.2114110000000001e-27,4.068439e-31,1.713995e-11,1.462761e-05,6.660692e-12,8.769331000000001e-17
1,"[u'her', u'clinical', u'practice', u',', u'tea...",F,25,3.56306e-06,4.176983e-14,1.370124e-05,3.733084e-05,0.0001216295,4.37787e-09,3.357444e-08,...,0.053336,1.462687e-07,0.936171,0.005295,1.777507e-14,1.653589e-15,2.987385e-06,0.000230399,0.0008814193,2.817673e-06
2,"[u'dr.', u'pankaj', u'savla', u'practices', u'...",M,7,5.004311e-06,2.232138e-16,1.417693e-06,8.938866e-07,4.047489e-06,2.755533e-09,1.622926e-08,...,0.000131,2.076018e-05,6.177096e-05,3e-06,2.947059e-08,3.274288e-17,3.224972e-06,0.05307177,6.932695e-05,1.332975e-07
3,"[u'watch', u'her', u'discuss', u'her', u'resea...",F,25,0.02305727,7.231186e-12,0.0009480168,0.002633551,8.78855e-06,2.570692e-06,1.277006e-05,...,0.000157,0.0002135647,0.7258582,0.006123,1.487029e-10,2.164053e-13,0.002287528,9.199022e-05,0.02494066,9.204572e-08
4,"[u'she', u'graduated', u'with', u'honors', u'f...",F,17,4.612723e-07,5.236891e-25,8.497399e-09,3.84757e-07,1.115638e-07,1.102317e-14,4.010708e-14,...,9e-06,4.729366e-11,1.814765e-05,4e-06,1.538144e-26,2.235718e-32,1.864666e-13,1.228733e-09,8.040961e-08,8.570274e-12


In [6]:
df.shape

(59824, 267)

In [7]:
df = df.dropna()
print(df.shape)

(59824, 267)


### Preprocessing

In [8]:
def get_class_from_col_name(col_name):
    """Return the integer class id encoded at the end of a column name.

    The column name must end in ``_<digits>``; those digits are parsed and
    returned as an int. A name without such a suffix raises AttributeError
    (the regex match is None).
    """
    suffix_match = re.search(r'^.*_(\d+)$', col_name)
    class_digits = suffix_match.group(1)
    return int(class_digits)

In [9]:
def find_best_class(df, model_name, class_names):
    """Add a '<model_name>_class' column holding the arg-max class per row.

    Args:
        df: DataFrame containing one score column per class, named
            '<model_name>_<class_name>'. Modified in place.
        model_name: prefix shared by the model's score columns.
        class_names: iterable of class identifiers used as column suffixes.
    """
    score_columns = []
    for class_name in class_names:
        score_columns.append('{}_{}'.format(model_name, class_name))
    # idxmax gives the *name* of the highest-scoring column in each row;
    # the class id is then parsed back out of that name.
    best_columns = df[score_columns].idxmax(axis=1)
    df['{}_class'.format(model_name)] = best_columns.apply(get_class_from_col_name)

In [12]:
# Maps the column prefix of each scored model (score columns are named
# '<prefix>_<class index>') to the short name used when building result
# column names and printing accuracies.
MODEL_NAMES = {
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117': 'glove_baseline',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103152': 'debiased',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103209': 'debiased_biosbias',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103232': 'strong_biosbias',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103245': 'strong_biosbias_2',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103254': 'strong_biosbias_3', 
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103300': 'strong_biosbias_4',
    'tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329': 'strong_biosbias_noequalize',
}

In [13]:
CLASS_NAMES = range(33)

In [14]:
# Add a '<model>_class' column (arg-max predicted class) for every scored model.
for _model in MODEL_NAMES:
    find_best_class(df, _model, CLASS_NAMES)

In [15]:
# Labels with either gender having too few examples
MIN_EXAMPLES_PER_GENDER = 5

# Pivot to one row per label and one column per gender. A (label, gender)
# pair with no examples at all would otherwise have no row and slip through
# the check, so missing pairs are filled with 0 and both 'F' and 'M' columns
# are forced to exist.
label_gender_counts = df.groupby('label').gender.value_counts().unstack(fill_value=0)
label_gender_counts = label_gender_counts.reindex(
    columns=label_gender_counts.columns.union(['F', 'M']), fill_value=0)

too_few = (label_gender_counts < MIN_EXAMPLES_PER_GENDER).any(axis=1)
bad_labels = label_gender_counts[too_few].index.values
assert len(bad_labels) == 0, 'Labels with too few examples for a gender: {}'.format(list(bad_labels))

### Accuracy Calculation

In [16]:
# Overall accuracy of each model: share of rows whose arg-max class
# ('<model>_class') equals the true label.
accuracy_list = []
for _model, _model_type in MODEL_NAMES.items():
    is_correct = (df['{}_class'.format(_model)] == df['label'])
    # Series.mean() on booleans is the fraction of True values.
    _acc = is_correct.mean()
    accuracy_list.append(_acc)
    # Format the percentage directly: 100*round(x, 3) can print float
    # artifacts such as 81.10000000000001.
    print('Accuracy for model {}: {:.1f}'.format(_model_type, 100 * _acc))

Accuracy for model strong_biosbias_noequalize: 81.1
Accuracy for model glove_baseline: 81.8
Accuracy for model strong_biosbias_4: 81.5
Accuracy for model debiased_biosbias: 81.1
Accuracy for model strong_biosbias_3: 81.4
Accuracy for model debiased: 82.0
Accuracy for model strong_biosbias: 81.4
Accuracy for model strong_biosbias_2: 81.1


### Fairness Metrics

In [17]:
# One boolean indicator column per class: 'label_<k>' is True where label == k.
for _class in CLASS_NAMES:
    indicator_col = 'label_{}'.format(_class)
    df[indicator_col] = df['label'].eq(_class)

In [18]:
# Number of examples per (label, gender) pair, as a flat table with
# columns 'label', 'gender' and 'count'.
gender_counts = (
    df.groupby('label')['gender']
      .value_counts()
      .reset_index(name = 'count')
)

In [19]:
def frac_female(df):
    """Compute the female share of examples for a single label.

    Args:
        df: rows of a per-(label, gender) count table for one label, with
            columns 'label', 'gender' ("M" / "F") and 'count'.

    Returns:
        dict with 'label' (taken from the first row) and
        'frac_female' = F / (M + F). A gender with no row contributes 0
        instead of raising IndexError; the fraction is nan when neither
        gender has any examples.
    """
    m_count = df.loc[df['gender'] == "M", 'count'].sum()
    f_count = df.loc[df['gender'] == "F", 'count'].sum()
    total = m_count + f_count
    frac = f_count/total if total > 0 else float('nan')
    return {'label': df['label'].values[0], 'frac_female': frac}

In [20]:
# One row per label with the fraction of its examples that are female.
# Iterating the groups explicitly keeps the 'label' column available to
# frac_female and does not depend on how groupby.apply treats dict results
# or grouping columns, which differs across pandas versions.
frac_female_df = pd.DataFrame(
    [frac_female(label_counts) for _, label_counts in gender_counts.groupby('label')])

In [21]:
def compute_tpr(df, _class, _model, threshold = 0.5):
    """True-positive rate (recall) of one model for one class.

    A row counts as predicted positive when the model's score column
    '<_model>_<_class>' is strictly above `threshold`; the ground truth is
    the 'label_<_class>' column.
    """
    y_true = df['label_{}'.format(_class)]
    y_pred = df['{}_{}'.format(_model,_class)] > threshold
    return metrics.recall_score(y_true, y_pred)
    
def compute_tpr_by_gender(df, _class, _model, threshold = 0.5):
    """Return the class TPR separately for male and female rows.

    Returns:
        dict {'M': tpr on rows with gender "M", 'F': tpr on rows with gender "F"}.
    """
    male_rows = df.query('gender == "M"')
    female_rows = df.query('gender == "F"')
    return {
        'M': compute_tpr(male_rows, _class, _model, threshold),
        'F': compute_tpr(female_rows, _class, _model, threshold),
    }

In [22]:
def compute_tpr_tnr(df, _class, _model, threshold = 0.5):
    """True-positive and true-negative rate of one model for one class.

    Args:
        df: DataFrame with a boolean 'label_<_class>' column and a score
            column '<_model>_<_class>'.
        _class: class identifier used in both column names.
        _model: model column prefix.
        threshold: a row is predicted positive when its score is strictly
            greater than this value.

    Returns:
        (tpr, tnr). When there are no positive examples tpr is 0, and when
        there are no negative examples tnr is 1.
    """
    actual = df['label_{}'.format(_class)].astype(bool)
    predicted = df['{}_{}'.format(_model,_class)] > threshold

    # Count the four confusion-matrix cells directly from boolean masks.
    # Unlike positional indexing into a crosstab, this stays correct when a
    # row or column is missing (e.g. no positives, or everything predicted
    # positive).
    tp = (actual & predicted).sum()
    fn = (actual & ~predicted).sum()
    tn = (~actual & ~predicted).sum()
    fp = (~actual & predicted).sum()

    tpr = tp/(tp+fn) if (tp+fn) > 0 else 0
    tnr = tn/(tn+fp) if (tn+fp) > 0 else 1
    return tpr, tnr

def compute_tr_by_gender(df, _class, _model, threshold = 0.5):
    """Return TPR and TNR for one class/model, split by gender.

    Returns:
        dict with keys 'TPR_m', 'TPR_f', 'TNR_m', 'TNR_f', computed on the
        rows with gender "M" and gender "F" respectively.
    """
    male_rates = compute_tpr_tnr(df.query('gender == "M"'), _class, _model, threshold)
    female_rates = compute_tpr_tnr(df.query('gender == "F"'), _class, _model, threshold)
    return {
        'TPR_m': male_rates[0],
        'TPR_f': female_rates[0],
        'TNR_m': male_rates[1],
        'TNR_f': female_rates[1],
    }

In [23]:
# Sanity check: the TPR from compute_tpr_tnr must agree with the
# sklearn-recall-based compute_tpr for every class and model.
for _class in CLASS_NAMES:
    for _model in MODEL_NAMES:
        tpr_1 = compute_tpr(df, _class, _model)
        tpr_2, _ = compute_tpr_tnr(df, _class, _model)
        # Compare with a tolerance: the two values come from independent
        # floating-point computations, so exact equality is brittle.
        assert np.isclose(tpr_1, tpr_2), '{} != {}'.format(tpr_1, tpr_2)

In [24]:
# Build one row per label with, for every model: overall TPR/TNR, the
# per-gender TPR/TNR, and the female-minus-male gap of each.
# Rows are collected in a list and converted once; appending to a DataFrame
# inside the loop copies the frame on every iteration, and DataFrame.append
# no longer exists in pandas >= 2.0.
tpr_rows = []
for _class in frac_female_df.label:
    row = {'label': _class}
    for _model, _model_type in MODEL_NAMES.items():
        tpr, tnr = compute_tpr_tnr(df, _class, _model)
        row['{}_tpr'.format(_model_type)] = tpr
        row['{}_tnr'.format(_model_type)] = tnr
        gender_trs = compute_tr_by_gender(df, _class, _model)
        row['{}_tpr_F'.format(_model_type)] = gender_trs['TPR_f']
        row['{}_tpr_M'.format(_model_type)] = gender_trs['TPR_m']
        row['{}_tpr_gender_gap'.format(_model_type)] = gender_trs['TPR_f'] - gender_trs['TPR_m']
        row['{}_tnr_F'.format(_model_type)] = gender_trs['TNR_f']
        row['{}_tnr_M'.format(_model_type)] = gender_trs['TNR_m']
        row['{}_tnr_gender_gap'.format(_model_type)] = gender_trs['TNR_f'] - gender_trs['TNR_m']
    tpr_rows.append(row)
tpr_df = pd.DataFrame(tpr_rows)

In [25]:
# Attach each label's female fraction to its per-model rate columns.
results_df = tpr_df.merge(frac_female_df, on = 'label')

In [53]:
results_df['glove_baseline_tpr']

0     0.659929
1     0.577793
2     0.861430
3     0.531616
4     0.726862
5     0.734069
6     0.935018
7     0.820872
8     0.564103
9     0.793103
10    0.471154
11    0.599217
12    0.672872
13    0.754683
14    0.693709
15    0.422594
16    0.548209
17    0.609756
18    0.859062
19    0.823558
20    0.667293
21    0.892070
22    0.659278
23    0.712121
24    0.662605
25    0.612370
26    0.405372
27    0.719424
Name: glove_baseline_tpr, dtype: float64

In [6]:
# Profession names, indexed by the integer value of the 'label' column.
TITLE_LABELS = [
    'accountant',           # 0
    'acupuncturist',        # 1
    'architect',            # 2
    'attorney',             # 3
    'chiropractor',         # 4
    'comedian',             # 5
    'composer',             # 6
    'dentist',              # 7
    'dietitian',            # 8
    'dj',                   # 9
    'filmmaker',            # 10
    'interior_designer',    # 11
    'journalist',           # 12
    'landscape_architect',  # 13
    'magician',             # 14
    'massage_therapist',    # 15
    'model',                # 16
    'nurse',                # 17
    'painter',              # 18
    'paralegal',            # 19
    'pastor',               # 20
    'personal_trainer',     # 21
    'photographer',         # 22
    'physician',            # 23
    'poet',                 # 24
    'professor',            # 25
    'psychologist',         # 26
    'rapper',               # 27
    'real_estate_broker',   # 28
    'software_engineer',    # 29
    'surgeon',              # 30
    'teacher',              # 31
    'yoga_teacher',         # 32
]

In [27]:
# Translate each numeric label into its profession name.
results_df['label_profession'] = [TITLE_LABELS[int(label_id)] for label_id in results_df['label']]

In [28]:
# Correlation of each model's TPR gender gap with the female fraction of the label.
gap_columns = []
for _model_type in MODEL_NAMES.values():
    gap_columns.append('{}_tpr_gender_gap'.format(_model_type))
results_df[['frac_female'] + gap_columns].corr()[['frac_female']]
    

Unnamed: 0,frac_female
frac_female,1.0
strong_biosbias_noequalize_tpr_gender_gap,0.83812
glove_baseline_tpr_gender_gap,0.790874
strong_biosbias_4_tpr_gender_gap,0.645819
debiased_biosbias_tpr_gender_gap,0.810301
strong_biosbias_3_tpr_gender_gap,0.706904
debiased_tpr_gender_gap,0.828508
strong_biosbias_tpr_gender_gap,0.613817
strong_biosbias_2_tpr_gender_gap,0.614299


In [29]:
tpr_gender_gap_cols = ['{}_tpr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]
tnr_gender_gap_cols = ['{}_tnr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]

In [61]:
gender_gap_df = results_df[['label_profession', 'frac_female']+tpr_gender_gap_cols+tnr_gender_gap_cols + ['glove_baseline_tpr']]
#gender_gap_df.columns = ['label_profession', 'frac_female']+['{}'.format(_model) for _model in MODEL_NAMES.values()]

In [62]:
gender_gap_df.sort_values('frac_female', ascending = False)

Unnamed: 0,label_profession,frac_female,strong_biosbias_noequalize_tpr_gender_gap,glove_baseline_tpr_gender_gap,strong_biosbias_4_tpr_gender_gap,debiased_biosbias_tpr_gender_gap,strong_biosbias_3_tpr_gender_gap,debiased_tpr_gender_gap,strong_biosbias_tpr_gender_gap,strong_biosbias_2_tpr_gender_gap,strong_biosbias_noequalize_tnr_gender_gap,glove_baseline_tnr_gender_gap,strong_biosbias_4_tnr_gender_gap,debiased_biosbias_tnr_gender_gap,strong_biosbias_3_tnr_gender_gap,debiased_tnr_gender_gap,strong_biosbias_tnr_gender_gap,strong_biosbias_2_tnr_gender_gap,glove_baseline_tpr
7,dietitian,0.920561,0.270927,0.231412,0.195581,0.259381,0.255001,0.284463,0.226933,0.237086,-0.001813,-0.002806,-0.002032,-0.001763,-0.001348,-0.002296,-0.001484,-0.001342,0.820872
13,nurse,0.914625,0.070693,0.082392,0.014937,0.117445,0.033787,0.088529,0.023204,0.029622,-0.006702,-0.005857,-0.004133,-0.010711,-0.005267,-0.009405,-0.002279,-0.00501,0.754683
15,paralegal,0.866109,0.343901,0.271437,0.327144,0.305254,0.315217,0.428593,0.298158,0.160628,-0.000323,-0.000164,2.1e-05,-0.000238,1.6e-05,-0.000854,0.000101,5.6e-05,0.422594
27,yoga_teacher,0.859712,0.178092,0.210492,0.061903,0.169724,0.147195,0.223581,0.061367,0.07231,-0.001051,-0.00124,-0.000817,-0.001033,-0.001026,-0.001173,-0.001337,-0.001654,0.719424
12,model,0.819149,0.525847,0.504265,0.439649,0.520563,0.472816,0.524191,0.445251,0.420614,-0.000248,-0.000975,0.000476,-0.000248,0.000334,-0.00105,0.000374,0.000132,0.672872
10,interior_designer,0.783654,0.193456,0.260941,0.100068,0.417996,0.086299,0.257941,0.045535,0.047171,-0.000231,-0.0002,0.000196,-0.000476,-0.000182,-0.000542,-8.2e-05,4.6e-05,0.471154
22,psychologist,0.620665,0.016101,0.018972,0.045098,0.044894,0.03877,0.040708,0.024681,0.029187,-0.006085,-0.002273,-0.002029,-0.003809,-0.004257,-0.00566,-0.003184,-0.00434,0.659278
26,teacher,0.604545,0.135975,0.138175,0.104611,0.149527,0.115657,0.151547,0.111391,0.108555,-0.00588,-0.002659,-0.001973,-0.006841,-0.003217,-0.004298,-0.003318,-0.001492,0.405372
11,journalist,0.492167,0.064489,0.057953,0.008818,0.034302,0.019679,0.055824,0.018545,-0.008931,-0.000311,-8.2e-05,0.001904,0.000406,0.001425,-0.000559,0.000991,0.002358,0.599217
19,physician,0.491782,0.061496,0.024822,0.052208,0.003196,0.034958,0.027148,0.063337,0.066835,0.00493,0.007442,0.003874,0.008639,0.006641,0.006818,0.002999,0.00267,0.823558


In [65]:
list(gender_gap_df['glove_baseline_tpr'])

[0.6599286563614745,
 0.5777925531914894,
 0.8614300100704935,
 0.531615925058548,
 0.7268623024830699,
 0.7340686274509803,
 0.9350180505415162,
 0.8208722741433022,
 0.5641025641025641,
 0.7931034482758621,
 0.47115384615384615,
 0.5992167101827677,
 0.6728723404255319,
 0.7546829971181557,
 0.6937086092715232,
 0.4225941422594142,
 0.5482093663911846,
 0.6097560975609756,
 0.8590622429394023,
 0.823557847244602,
 0.6672932330827067,
 0.8920704845814978,
 0.6592777976403289,
 0.7121212121212122,
 0.662605435801312,
 0.6123701605288008,
 0.4053719008264463,
 0.7194244604316546]

In [49]:
gender_gap_df.sort_values('frac_female', ascending = False)[[
    'label_profession',
    'frac_female',
    'glove_baseline_tpr_gender_gap',
    'strong_biosbias_tpr_gender_gap']]

Unnamed: 0,label_profession,frac_female,glove_baseline_tpr_gender_gap,strong_biosbias_tpr_gender_gap
7,dietitian,0.920561,0.231412,0.226933
13,nurse,0.914625,0.082392,0.023204
15,paralegal,0.866109,0.271437,0.298158
27,yoga_teacher,0.859712,0.210492,0.061367
12,model,0.819149,0.504265,0.445251
10,interior_designer,0.783654,0.260941,0.045535
22,psychologist,0.620665,0.018972,0.024681
26,teacher,0.604545,0.138175,0.111391
11,journalist,0.492167,0.057953,0.018545
19,physician,0.491782,0.024822,0.063337


In [50]:
to_export = gender_gap_df.sort_values('frac_female', ascending = False)[[
    'label_profession',
    'frac_female',
    'glove_baseline_tpr_gender_gap',
    'strong_biosbias_tpr_gender_gap']]

In [51]:
to_export.to_csv('bias.csv')

In [32]:
# Fraction of rows where the new model's gap is no larger in magnitude
# than the baseline's gap (ties count as improved).

def compute_fraction_improved(df, baseline_model, improved_model):
    """Share of rows with |df[improved_model]| <= |df[baseline_model]|.

    Args:
        df: table holding one gap column per model.
        baseline_model: name of the baseline gap column.
        improved_model: name of the gap column being compared against it.
    """
    baseline_gap = np.abs(df[baseline_model])
    candidate_gap = np.abs(df[improved_model])
    return np.mean(candidate_gap <= baseline_gap)

In [33]:
# for _model in MODEL_NAMES.values():
#     print(_model)
#     print(compute_fraction_improved(gender_gap_df, 'glove_untuned_tpr_gender_gap', '{}_tpr_gender_gap'.format(_model)))

In [34]:
tpr_cols = ['{}_tpr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]
tnr_cols = ['{}_tnr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]
gender_gap_cols = tpr_cols + tnr_cols

In [35]:
# Mean squared gender gap across labels, one value per gap column.
(gender_gap_df[gender_gap_cols] ** 2).mean()

strong_biosbias_noequalize_tpr_gender_gap    0.027226
glove_baseline_tpr_gender_gap                0.022468
strong_biosbias_4_tpr_gender_gap             0.016390
debiased_biosbias_tpr_gender_gap             0.031339
strong_biosbias_3_tpr_gender_gap             0.017389
debiased_tpr_gender_gap                      0.030300
strong_biosbias_tpr_gender_gap               0.014632
strong_biosbias_2_tpr_gender_gap             0.012508
strong_biosbias_noequalize_tnr_gender_gap    0.000008
glove_baseline_tnr_gender_gap                0.000008
strong_biosbias_4_tnr_gender_gap             0.000004
debiased_biosbias_tnr_gender_gap             0.000012
strong_biosbias_3_tnr_gender_gap             0.000007
debiased_tnr_gender_gap                      0.000011
strong_biosbias_tnr_gender_gap               0.000004
strong_biosbias_2_tnr_gender_gap             0.000006
dtype: float64

In [36]:
# Mean absolute gender gap across labels, one value per gap column.
gender_gap_df[gender_gap_cols].abs().mean()

strong_biosbias_noequalize_tpr_gender_gap    0.111566
glove_baseline_tpr_gender_gap                0.096138
strong_biosbias_4_tpr_gender_gap             0.081080
debiased_biosbias_tpr_gender_gap             0.120339
strong_biosbias_3_tpr_gender_gap             0.078722
debiased_tpr_gender_gap                      0.113238
strong_biosbias_tpr_gender_gap               0.070427
strong_biosbias_2_tpr_gender_gap             0.072298
strong_biosbias_noequalize_tnr_gender_gap    0.001840
glove_baseline_tnr_gender_gap                0.001991
strong_biosbias_4_tnr_gender_gap             0.001388
debiased_biosbias_tnr_gender_gap             0.002164
strong_biosbias_3_tnr_gender_gap             0.001767
debiased_tnr_gender_gap                      0.002266
strong_biosbias_tnr_gender_gap               0.001364
strong_biosbias_2_tnr_gender_gap             0.001738
dtype: float64

In [37]:
def plot_tpr_gap(df, _model):
    """Scatter + regression of a model's TPR gender gap against frac_female.

    Args:
        df: table with columns 'frac_female', '<_model>_tpr_gender_gap' and
            'label_profession'; one row per label.
        _model: short model name used as the gap-column prefix and plot title.

    Every point is annotated with its profession name, slightly to the right
    of the marker.
    """
    fig, ax = plt.subplots(figsize=(15, 6))
    x = 'frac_female'
    y = '{}_tpr_gender_gap'.format(_model)
    # Draw on the axes created above rather than on whatever is current.
    sns.regplot(x = x, y = y, data = df, ax = ax)
    ax.set(xlabel = "% Female", ylabel = "TPR Gender Gap", title = _model)

    # Read every coordinate from the `df` argument (not a notebook global)
    # and iterate the columns directly so any index on `df` works.
    for x_val, y_val, profession in zip(df[x], df[y], df['label_profession']):
        ax.text(x_val+0.01, y_val, profession, horizontalalignment='left', size='medium', color='black')
    plt.show()

In [38]:
# Plot the TPR gender gap against the female fraction for the baseline model.
# The previous filter ('untuned') matched none of the names in MODEL_NAMES,
# so this cell never drew anything.
# NOTE(review): 'baseline' is assumed to be the intended selection - confirm.
for _model in MODEL_NAMES.values():
    if 'baseline' in _model:
        plot_tpr_gap(results_df, _model)

In [39]:
# Correlation of each model's TPR gender gap with the female fraction of the label.
tpr_gap_names = ['{}_tpr_gender_gap'.format(_model) for _model in MODEL_NAMES.values()]
results_df[['frac_female'] + tpr_gap_names].corr()['frac_female'].to_frame()

Unnamed: 0,frac_female
frac_female,1.0
strong_biosbias_noequalize_tpr_gender_gap,0.83812
glove_baseline_tpr_gender_gap,0.790874
strong_biosbias_4_tpr_gender_gap,0.645819
debiased_biosbias_tpr_gender_gap,0.810301
strong_biosbias_3_tpr_gender_gap,0.706904
debiased_tpr_gender_gap,0.828508
strong_biosbias_tpr_gender_gap,0.613817
strong_biosbias_2_tpr_gender_gap,0.614299


In [66]:
# Read other data

In [3]:
standard_data_path = 'gs://conversationai-models/biosbias/scored_data/test_data_0327_v2.csv'#'gs://conversationai-models/biosbias/scored_data/test_data_gender.csv'
# NOTE(review): a later cell reads the column
# 'tf_gru_attention_multiclass_gender_biosbias_glove:v_20190405_142640' from
# perf_df - confirm that this file contains it (the commented-out
# test_data_gender.csv path may be the intended source).
# Open in a `with` block so the file handle is closed after parsing.
with tf.gfile.Open(standard_data_path) as scored_file:
    perf_df = pd.read_csv(scored_file).drop_duplicates(subset=['tokens'])

In [4]:
perf_df.head()

Unnamed: 0,tokens,gender,label,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_0,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_1,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_2,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_3,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_4,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_5,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103117_6,...,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_23,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_24,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_25,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_26,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_27,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_28,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_29,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_30,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_31,tf_trainer_tf_gru_attention_multiclass_biosbias_glove:v_20190328_103329_32
0,"[u'ms.', u'grandal', u'practices', u'medicine'...",F,23,5.727137e-10,2.916685e-28,8.39932e-09,1.294314e-08,2.742447e-08,5.902747e-15,5.34621e-16,...,0.999982,3.449516e-12,5.492861e-09,1e-06,1.2114110000000001e-27,4.068439e-31,1.713995e-11,1.462761e-05,6.660692e-12,8.769331000000001e-17
1,"[u'her', u'clinical', u'practice', u',', u'tea...",F,25,3.56306e-06,4.176983e-14,1.370124e-05,3.733084e-05,0.0001216295,4.37787e-09,3.357444e-08,...,0.053336,1.462687e-07,0.936171,0.005295,1.777507e-14,1.653589e-15,2.987385e-06,0.000230399,0.0008814193,2.817673e-06
2,"[u'dr.', u'pankaj', u'savla', u'practices', u'...",M,7,5.004311e-06,2.232138e-16,1.417693e-06,8.938866e-07,4.047489e-06,2.755533e-09,1.622926e-08,...,0.000131,2.076018e-05,6.177096e-05,3e-06,2.947059e-08,3.274288e-17,3.224972e-06,0.05307177,6.932695e-05,1.332975e-07
3,"[u'watch', u'her', u'discuss', u'her', u'resea...",F,25,0.02305727,7.231186e-12,0.0009480168,0.002633551,8.78855e-06,2.570692e-06,1.277006e-05,...,0.000157,0.0002135647,0.7258582,0.006123,1.487029e-10,2.164053e-13,0.002287528,9.199022e-05,0.02494066,9.204572e-08
4,"[u'she', u'graduated', u'with', u'honors', u'f...",F,17,4.612723e-07,5.236891e-25,8.497399e-09,3.84757e-07,1.115638e-07,1.102317e-14,4.010708e-14,...,9e-06,4.729366e-11,1.814765e-05,4e-06,1.538144e-26,2.235718e-32,1.864666e-13,1.228733e-09,8.040961e-08,8.570274e-12


In [4]:
# Decode the gender model's output column: 0 -> 'F', any other value -> 'M'.
gender_outputs = perf_df['tf_gru_attention_multiclass_gender_biosbias_glove:v_20190405_142640']
perf_df['predicted_gender'] = ['F' if output == 0 else 'M' for output in gender_outputs]

In [5]:
np.mean(perf_df['predicted_gender'] == perf_df['gender'])

0.8423208077025943

In [9]:
# Gender-prediction accuracy within each profession; a profession with no
# rows prints nan.
gender_is_correct = perf_df['predicted_gender'] == perf_df['gender']
for label_id, title in enumerate(TITLE_LABELS):
    in_title = perf_df['label'] == label_id
    print ('{}:{}'.format(title, np.mean(gender_is_correct[in_title])))

accountant:0.809750297265
acupuncturist:nan
architect:0.840425531915
attorney:0.810876132931
chiropractor:0.758215962441
comedian:0.860045146727
composer:0.871323529412
dentist:0.735227785295
dietitian:0.945482866044
dj:0.884615384615
filmmaker:0.838122605364
interior_designer:0.831730769231
journalist:0.860639686684
landscape_architect:nan
magician:nan
massage_therapist:nan
model:0.905141843972
nurse:0.927233429395
painter:0.816225165563
paralegal:0.887029288703
pastor:0.870523415978
personal_trainer:0.839024390244
photographer:0.866191390184
physician:0.776667740896
poet:0.843045112782
professor:0.85389133627
psychologist:0.82946013586
rapper:0.79797979798
real_estate_broker:nan
software_engineer:0.925960637301
surgeon:0.846081208687
teacher:0.865702479339
yoga_teacher:0.888489208633
