In [None]:
import os
import pickle

import numpy as np
import scipy
from scipy.stats.stats import spearmanr
from scipy.stats.stats import pearsonr
from sklearn.decomposition import PCA


## Loading ...

### Word embedding model

Make sure that you have downloaded the provided GloVe word embeddings. If you want to create a new set of vectors, you can use the GloVe tool originally provided by Stanford NLP.

In [3]:
we_model_path = 'word_embeddings/GloVe/vectors.txt'
we_vector_size = 300

# Layout of one line, as implied by the slicing below:
#   <word> <word vector (we_vector_size values)> <word bias>
#          <context vector (we_vector_size values)> <context bias>
_we_fields_per_line = 3 + 2 * we_vector_size

we_vocabs = []
we_vectors = []
we_biases = []
we_ctx_vectors = []
we_ctx_biases = []
# The encoding is given explicitly so decoding does not depend on the platform
# default (assumes the vectors file is UTF-8 -- TODO confirm for custom models).
with open(we_model_path, encoding='utf-8') as fr:
    for _line_no, line in enumerate(fr, start=1):
        vals = line.strip().split()
        if not vals:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        if len(vals) < _we_fields_per_line:
            # Fail with a clear message instead of an IndexError or ragged arrays.
            raise ValueError(
                "%s, line %d: expected at least %d fields, found %d"
                % (we_model_path, _line_no, _we_fields_per_line, len(vals))
            )
        # Parse the numeric fields once, then slice the parsed values.
        _numbers = [float(x) for x in vals[1:_we_fields_per_line]]
        we_vocabs.append(vals[0])
        we_vectors.append(_numbers[:we_vector_size])
        we_biases.append(_numbers[we_vector_size])
        we_ctx_vectors.append(_numbers[we_vector_size + 1:2 * we_vector_size + 1])
        we_ctx_biases.append(_numbers[2 * we_vector_size + 1])


we_vocabs = np.array(we_vocabs)
we_vectors = np.array(we_vectors)
we_biases = np.array(we_biases)
we_ctx_vectors = np.array(we_ctx_vectors)
we_ctx_biases = np.array(we_ctx_biases)

print ('Loading completed!')
print ('Number of word vectors:', len(we_vocabs))

Loading completed!
Number of word vectors: 197549


In [4]:
# Lookup table from each vocabulary word to its row index in the embedding
# arrays. Note that, despite the name, the mapping goes word -> index.
# If a word occurs more than once, the last occurrence wins.
we_vocabs_indextowords = {_word: _row for _row, _word in enumerate(we_vocabs)}

### Word lists

In [6]:
# Occupation words: the first comma-separated field of every line, kept only
# when the word has an embedding.
jobs = []
with open('resources/wordlist_occupations.txt') as fr:
    for l in fr:
        _job_word = l.strip('\n').split(',')[0]
        # Membership is tested against the word -> index dict (constant time)
        # rather than the numpy vocabulary array, which would be scanned in
        # full for every word. The dict is also what later cells index with.
        if _job_word in we_vocabs_indextowords:
            jobs.append(_job_word)

print ("Number of jobs:", len(jobs))

Number of jobs: 497


In [8]:
# Gender-specific words, read from lines of the form "<word>,<f|m>".
# Words without an embedding are reported and skipped.
representative_words_feml = []
representative_words_male = []

with open('resources/wordlist_genderspecific.txt') as fr:
    for l in fr:
        _fields = l.strip('\n').split(',')
        _word = _fields[0]
        # A line without a gender column is treated as "neither" instead of
        # raising an IndexError.
        _gender = _fields[1] if len(_fields) > 1 else ''
        # Dict lookup (constant time) instead of scanning the numpy vocabulary
        # array for every word.
        if _word in we_vocabs_indextowords:
            if _gender == 'f':
                representative_words_feml.append(_word)
            elif _gender == 'm':
                representative_words_male.append(_word)
        else:
            print ("Missing", _word)
print ("Number of female specific words:", len(representative_words_feml))
print ("Number of male specific words:", len(representative_words_male))

Number of female specific words: 28
Number of male specific words: 28


In [9]:
# Word pairs used to define the gender direction: one comma-separated pair per line.
with open('resources/wordpairs_direction.txt') as fr:
    gender_words_direction = [_row.strip().split(',') for _row in fr]
print("Number of pairs of gendered words:", len(gender_words_direction))


Number of pairs of gendered words: 28


## Calculating gender bias

In [10]:
def Cosine(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    Computed as ``1 - cosine distance`` using SciPy.

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        Cosine similarity of ``vec1`` and ``vec2``.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.spatial.distance` is loaded, so the attribute lookup would
    # only work if some other import happened to load it first.
    from scipy.spatial.distance import cosine as _cosine_distance

    return 1 - _cosine_distance(vec1, vec2)


### Smoothed First Order - Average

In [12]:
# Context-side embeddings of the gender-specific words, stacked one row per word.
context_vectors_feml = np.array(
    [we_ctx_vectors[we_vocabs_indextowords[_gender_word]] for _gender_word in representative_words_feml]
)
context_vectors_male = np.array(
    [we_ctx_vectors[we_vocabs_indextowords[_gender_word]] for _gender_word in representative_words_male]
)


In [16]:
def get_firstorder_average(word, gender_context_vectors):
    """Average dot product between a word's vector and a set of context vectors.

    The word vector of ``word`` is looked up in ``we_vectors`` and multiplied
    with every row of ``gender_context_vectors``; the mean of those dot
    products is returned.
    """
    _row = we_vocabs_indextowords[word]
    _per_context_scores = np.dot(gender_context_vectors, we_vectors[_row])
    return np.mean(_per_context_scores)

# Example: first-order relation of a single word to each gender.
word = 'nurse'
_relation_to_feml = get_firstorder_average(word, context_vectors_feml)
_relation_to_male = get_firstorder_average(word, context_vectors_male)
print("Smoothed first-order relation of '%s' to female: %f" % (word, _relation_to_feml))
print("Smoothed first-order relation of '%s' to male: %f" % (word, _relation_to_male))


Smoothed first-order relation of 'nurse' to female: 2.580334
Smoothed first-order relation of 'nurse' to male: 1.507453


In [17]:
# Per-occupation bias: relation to the female words minus relation to the male words.
bias_firstorder_average = {
    _occupation: (
        get_firstorder_average(_occupation, context_vectors_feml)
        - get_firstorder_average(_occupation, context_vectors_male)
    )
    for _occupation in jobs
}
    


### High Order - Directional

In [18]:
# For every word pair, add the half-difference of the two word vectors in both
# orientations, then take the first principal component of those rows as the
# gender direction.
_matrix = []
for _word_a, _word_b in gender_words_direction:
    _vec_a = we_vectors[we_vocabs_indextowords[_word_a]]
    _vec_b = we_vectors[we_vocabs_indextowords[_word_b]]
    _matrix.extend([(_vec_a - _vec_b) / 2.0, (_vec_b - _vec_a) / 2.0])

_matrix = np.array(_matrix)
pca = PCA(n_components=10)
pca.fit(_matrix)
gender_direction_vec = pca.components_[0]


In [19]:
def get_highorder_directional(word, gender_direction_vec):
    """Project the word vector of ``word`` onto ``gender_direction_vec``.

    Returns the dot product of the word's vector (looked up in ``we_vectors``)
    with the given direction vector.
    """
    _row = we_vocabs_indextowords[word]
    return np.dot(we_vectors[_row], gender_direction_vec)

# Example: projection of a single word onto the gender direction.
word = 'nurse'
_directional_bias = get_highorder_directional(word, gender_direction_vec)
print("High-order bias of '%s' in respect to female-male direction: %f" % (word, _directional_bias))


High-order bias of 'nurse' in respect to female-male direction: 1.026388


In [20]:
# Per-occupation projection onto the gender direction.
bias_highorder_directional = {
    _occupation: get_highorder_directional(_occupation, gender_direction_vec)
    for _occupation in jobs
}



### High Order - Centroid

In [21]:
# Centroid (element-wise mean) of the word vectors of each gender word list.
_feml_word_rows = np.array([we_vectors[we_vocabs_indextowords[_gender_word]] for _gender_word in representative_words_feml])
_male_word_rows = np.array([we_vectors[we_vocabs_indextowords[_gender_word]] for _gender_word in representative_words_male])

word_vector_centroid_feml = np.mean(_feml_word_rows, axis=0)
word_vector_centroid_male = np.mean(_male_word_rows, axis=0)


In [22]:
def get_highorder_centroid(word, vector_centroid):
    """Cosine similarity between ``vector_centroid`` and the word vector of ``word``."""
    _row = we_vocabs_indextowords[word]
    return Cosine(vector_centroid, we_vectors[_row])

# Example: similarity of a single word to each gender centroid.
word = 'nurse'
_similarity_to_feml = get_highorder_centroid(word, word_vector_centroid_feml)
_similarity_to_male = get_highorder_centroid(word, word_vector_centroid_male)
print("High-order relation of '%s' to female: %f" % (word, _similarity_to_feml))
print("High-order relation of '%s' to male: %f" % (word, _similarity_to_male))



High-order relation of 'nurse' to female: 0.505788
High-order relation of 'nurse' to male: 0.266729


In [23]:
# Per-occupation bias: similarity to the female centroid minus similarity to the male centroid.
bias_highorder_centroid = {
    _occupation: (
        get_highorder_centroid(_occupation, word_vector_centroid_feml)
        - get_highorder_centroid(_occupation, word_vector_centroid_male)
    )
    for _occupation in jobs
}


### High Order - Average

In [24]:
# Word-side embeddings of the gender-specific words, stacked one row per word.
word_vectors_feml = np.array(
    [we_vectors[we_vocabs_indextowords[_gender_word]] for _gender_word in representative_words_feml]
)
word_vectors_male = np.array(
    [we_vectors[we_vocabs_indextowords[_gender_word]] for _gender_word in representative_words_male]
)


In [26]:
def get_highorder_average(word, gender_word_vectors):
    """Mean cosine similarity between ``word`` and each vector in ``gender_word_vectors``.

    The word vector of ``word`` is looked up in ``we_vectors``; ``Cosine`` is
    evaluated against every given vector and the similarities are averaged.
    """
    _target_vec = we_vectors[we_vocabs_indextowords[word]]

    _similarities = []
    for _gender_vec in gender_word_vectors:
        _similarities.append(Cosine(_gender_vec, _target_vec))

    return np.mean(np.array(_similarities))

# Example: average similarity of a single word to each gender word list.
word = 'nurse'
_avg_similarity_feml = get_highorder_average(word, word_vectors_feml)
_avg_similarity_male = get_highorder_average(word, word_vectors_male)
print("High-order relation of '%s' to female: %f" % (word, _avg_similarity_feml))
print("High-order relation of '%s' to male: %f" % (word, _avg_similarity_male))


High-order relation of 'nurse' to female: 0.273086
High-order relation of 'nurse' to male: 0.147788


In [27]:
# Per-occupation bias: average similarity to the female words minus average
# similarity to the male words.
bias_highorder_average = {
    _occupation: (
        get_highorder_average(_occupation, word_vectors_feml)
        - get_highorder_average(_occupation, word_vectors_male)
    )
    for _occupation in jobs
}


## Correlation to Occupation Statistics

In [28]:

def _load_occupation_stats(path):
    """Read ``<occupation>,<value>`` lines from ``path`` into a dict of floats."""
    _stats = {}
    with open(path) as fr:
        for l in fr:
            _tuple = l.strip('\n').split(',')
            if _tuple == ['']:
                # Skip blank lines (e.g. a trailing newline at the end of the file).
                continue
            _stats[_tuple[0]] = float(_tuple[1])
    return _stats


winobias_stats = _load_occupation_stats('resources/occupations_stats_winobias.txt')
print ("Number of data points in WinoBias dataset: ", len(winobias_stats))

census_stats = _load_occupation_stats('resources/occupations_stats_census.txt')
# This count belongs to the census file; the label previously said "WinoBias"
# (copy-paste slip).
print ("Number of data points in Census dataset: ", len(census_stats))


Number of data points in WinoBias dataset:  40
Number of data points in WinoBias dataset:  96


In [29]:
def calc_correlations(bias_stats, bias_text):
    """Correlate reference statistics with embedding-based bias scores.

    For every key of ``bias_stats`` the value from ``bias_stats`` is paired
    with the value stored under the same key in ``bias_text``. The absolute
    Spearman and Pearson correlation coefficients of the paired values are
    printed and returned.

    Returns
    -------
    dict
        ``{'spearman': ..., 'pearson': ...}`` with absolute coefficients.

    Raises
    ------
    KeyError
        If a key of ``bias_stats`` is missing from ``bias_text``.
    """
    _occupations = list(bias_stats)
    _stat_values = [bias_stats[_name] for _name in _occupations]
    _text_values = [bias_text[_name] for _name in _occupations]

    _spearman_rho = spearmanr(_stat_values, _text_values).correlation
    _pearson_r = pearsonr(_stat_values, _text_values)[0]
    results = {'spearman': abs(_spearman_rho), 'pearson': abs(_pearson_r)}

    print("Spearman %0.3f \t Pearson %0.3f" % (results['spearman'], results['pearson']))

    return results
    

# Correlate every bias measure with both sets of occupation statistics.
# Each entry: (printed label, key in `results`, per-occupation bias scores).
_bias_measures = (
    ('High Order - Directional', 'highorder_directional', bias_highorder_directional),
    ('High Order - Centroid', 'highorder_centroid', bias_highorder_centroid),
    ('High Order - Average', 'highorder_average', bias_highorder_average),
    ('First Order - Average', 'firstorder_average', bias_firstorder_average),
)
# Each entry: (printed heading, key in `results`, occupation statistics).
_stat_sources = (
    ("Labor Data", 'winobias', winobias_stats),
    ("Census Data", 'census', census_stats),
)

results = {'winobias': {}, 'census': {}}

for _heading, _source_key, _source_stats in _stat_sources:
    print(_heading)
    print("----------")
    for _label, _measure_key, _bias_scores in _bias_measures:
        print(_label)
        results[_source_key][_measure_key] = calc_correlations(_source_stats, _bias_scores)
    print()




Labor Data
----------
High Order - Directional
Spearman 0.511 	 Pearson 0.544
High Order - Centroid
Spearman 0.577 	 Pearson 0.599
High Order - Average
Spearman 0.596 	 Pearson 0.610
First Order - Average
Spearman 0.560 	 Pearson 0.574

Census Data
----------
High Order - Directional
Spearman 0.337 	 Pearson 0.445
High Order - Centroid
Spearman 0.394 	 Pearson 0.507
High Order - Average
Spearman 0.394 	 Pearson 0.507
First Order - Average
Spearman 0.423 	 Pearson 0.520

