In [1]:
%reload_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
import gensim
import numpy as np
import json
from brand.debiasing import EmbeddingDebias
from brand.utils import get_embedding_mat

In [2]:
# load word vector model
pretrained_embedding_file = '../data/GoogleNews-vectors-negative300.bin.gz'
kv = gensim.models.KeyedVectors.load_word2vec_format(pretrained_embedding_file, binary=True, limit=500000)

In [6]:
# load gender specific words
gender_cfg_file = '../vocab/gender.json'
with open(gender_cfg_file, 'r') as f:
    cfg = json.load(f)
    ds_gender = cfg['definite_sets']

In [7]:
# load race specific words
race_cfg_file = '../vocab/race.json'
with open(race_cfg_file, 'r') as f:
    cfg = json.load(f)
    ds_race = cfg['definite_sets']

In [8]:
# prepare definitial (direction) matrix
dmat_gender = []
for _, words in enumerate(ds_gender):
    mat = get_embedding_mat(words, kv)
    dmat_gender.append(np.asarray(mat))

In [9]:
# prepare definitial (direction) matrix
dmat_race = []
for _, words in enumerate(ds_race):
    mat = get_embedding_mat(words, kv)
    dmat_race.append(np.asarray(mat))

In [15]:
# create 2 debiasing workers
method = 'Hard'
k = 1
debias_gender = EmbeddingDebias(dmat_gender, embedding=kv, k=k, method=method)
debias_race = EmbeddingDebias(dmat_race, embedding=kv, k=k, method=method)

In [28]:
# check the brand embedding
test_brand_word = 'Shiseido' # personal care company
test_embedding = kv[test_brand_word]
print('length of embedding:', np.linalg.norm(test_embedding))

length of embedding: 4.031033


In [17]:
# dot product between subspace direction and the brand
dot_res_gender = test_embedding.dot(debias_gender._B)
print('dot product of gender and Shiseido:{}'.format(dot_res_gender))
dot_res_race = test_embedding.dot(debias_race._B)
print('dot product of race and Shiseido:{}'.format(dot_res_race))

dot product of gender and Shiseido:[0.80453193]
dot product of race and Shiseido:[0.3784532]


In [26]:
# debias gender
_, embedding_gender_debiased = debias_gender.debiasing([test_embedding.reshape((1, -1))])
dot_res_gender_debiased = embedding_gender_debiased[0].reshape(-1,).dot(debias_gender._B)
print('dot product of gender and Shiseido after gender debiasing:{}'.format(dot_res_gender_debiased))
dot_res_race_gender_debiased = embedding_gender_debiased[0].reshape(-1,).dot(debias_race._B)
print('dot product of race and Shiseido after gender debiasing:{}'.format(dot_res_race_gender_debiased))

dot product of gender and Shiseido after gender debiasing:[-7.450581e-09]
dot product of race and Shiseido after gender debiasing:[0.09195894]


In [31]:
# dot product between gender direction and race direction
dot_gender_race = debias_race._B.reshape(-1, ).dot(debias_gender._B.reshape(-1, )) 
dot_gender_race

0.018920083

In [32]:
# debias race then check its dot product with gender direction
_, embedding_race_debiased = debias_race.debiasing([test_embedding.reshape((1, -1))])
dot_res_race_debiased = embedding_race_debiased[0].reshape(-1,).dot(debias_._B)
print('dot product of race and Shiseido after race debiasing:{}'.format(dot_res_race_debiased))

dot product of race and Shiseido after race debiasing:[0.19869235]
