In [1]:
%reload_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
import gensim
import numpy as np
import json
from brand.debiasing import EmbeddingDebias
from brand.utils import get_embedding_mat

In [2]:
# Read the gender vocabulary config (JSON) and pull out its 'definite_sets'
# entry, which is what the rest of the notebook uses as `ds`.
gender_cfg_file = '../vocab/gender.json'
with open(gender_cfg_file, 'r') as f:
    cfg = json.loads(f.read())
ds = cfg['definite_sets']

In [3]:
# display the word sets read from cfg['definite_sets'] so they can be eyeballed
ds

[['he', 'she'],
 ['his', 'hers'],
 ['son', 'daughter'],
 ['father', 'mother'],
 ['male', 'female'],
 ['boy', 'girl'],
 ['uncle', 'aunt']]

In [4]:
# Load the pretrained word2vec vectors into a gensim KeyedVectors object.
pretrained_embedding_file = '../data/GoogleNews-vectors-negative300.bin.gz'
kv = gensim.models.KeyedVectors.load_word2vec_format(
    pretrained_embedding_file,
    binary=True,    # the file is in the binary word2vec format
    limit=500000,   # read at most this many vectors from the file
)

In [5]:
# Prepare the definitional (direction) matrices: one array per word set in
# `ds`, built from whatever `get_embedding_mat` returns for that set's words.
# A comprehension is used instead of `for _, words in enumerate(ds)`: the index
# was never used, and binding `_` at notebook top level shadows IPython's
# "last output" variable for the rest of the session.
dmat = [np.asarray(get_embedding_mat(words, kv)) for words in ds]

In [6]:
# Build the debiasing worker from the definitional matrices and the loaded
# embedding. `k` and `method` are kept as named settings so they are easy to
# tweak and re-run.
k = 1
method = 'Hard'
debias_worker = EmbeddingDebias(
    dmat,
    embedding=kv,
    k=k,
    method=method,
)

In [7]:
# Inspect the worker's internal matrices: for each private attribute, print
# its shape and then its full contents, in the order _B, _Qb, _C.
for attr_name in ('_B', '_Qb', '_C'):
    subspace_mat = getattr(debias_worker, attr_name)
    print(subspace_mat.shape)
    print(subspace_mat)

(300, 1)
[[-0.08313393]
 [-0.1334019 ]
 [-0.01469883]
 [ 0.07489313]
 [ 0.04135582]
 [-0.0174852 ]
 [ 0.04321396]
 [ 0.02006703]
 [-0.0616052 ]
 [ 0.06212714]
 [-0.01860954]
 [-0.01826932]
 [-0.02629876]
 [ 0.01580281]
 [ 0.11647368]
 [ 0.01311997]
 [ 0.00625287]
 [-0.08571745]
 [-0.00529978]
 [-0.01389976]
 [ 0.03244121]
 [-0.06551483]
 [-0.00398326]
 [-0.11401508]
 [-0.00909965]
 [ 0.01961642]
 [-0.03769291]
 [-0.00769611]
 [-0.02692243]
 [-0.08400678]
 [-0.08600696]
 [-0.04367273]
 [-0.04995795]
 [-0.0995248 ]
 [-0.05413671]
 [-0.06619735]
 [-0.00516187]
 [ 0.03748215]
 [-0.05637397]
 [ 0.00759167]
 [-0.08302037]
 [ 0.04906293]
 [-0.03752581]
 [-0.04260368]
 [ 0.01115519]
 [-0.04482107]
 [ 0.01968338]
 [ 0.05093568]
 [-0.02338805]
 [ 0.09740178]
 [ 0.01295252]
 [-0.06824786]
 [ 0.00698884]
 [ 0.04526343]
 [-0.05985754]
 [ 0.0075031 ]
 [-0.02451097]
 [ 0.01297342]
 [ 0.05540032]
 [-0.00108799]
 [ 0.05294229]
 [-0.03767921]
 [ 0.01205523]
 [ 0.01666154]
 [-0.01757122]
 [-0.02919705]
 

In [8]:
# check the brand embedding
test_brand_word = 'Shiseido' # personal care company
# Look the word up in the loaded KeyedVectors.
# NOTE(review): only the first 500000 vectors were loaded, so a brand outside
# that vocabulary presumably fails here with a KeyError — confirm.
test_embedding = kv[test_brand_word]
# bare last expression: the notebook displays the whole vector
test_embedding

array([-0.25195312,  0.29492188,  0.14648438, -0.04150391, -0.13378906,
        0.07421875,  0.10009766, -0.40625   , -0.09667969,  0.0378418 ,
       -0.19921875,  0.46875   ,  0.17578125,  0.2578125 , -0.12060547,
       -0.17675781,  0.04443359,  0.18359375,  0.17285156, -0.00811768,
        0.12988281, -0.00105286,  0.18945312, -0.18261719,  0.17285156,
        0.01275635,  0.18554688, -0.00561523, -0.13183594, -0.19628906,
       -0.12060547, -0.10839844, -0.37304688, -0.19824219, -0.22753906,
       -0.16113281,  0.07861328,  0.14648438,  0.38085938,  0.07617188,
       -0.14648438, -0.01525879, -0.02929688, -0.00823975, -0.04614258,
       -0.00823975,  0.16210938, -0.0234375 ,  0.22949219, -0.11181641,
       -0.01672363,  0.11523438,  0.29882812,  0.24511719, -0.11523438,
        0.05566406,  0.15722656,  0.02050781, -0.01672363, -0.37304688,
       -0.55078125,  0.04174805,  0.14648438, -0.09082031,  0.19726562,
        0.46484375,  0.10205078,  0.19921875,  0.23144531,  0.03

In [9]:
# project the test word onto the gender subspace
# `project` is given the word string, not `test_embedding`; presumably it looks
# the vector up in the `embedding=kv` passed at construction — confirm in
# brand.debiasing.
projected_embedding = debias_worker.project(test_brand_word)
# bare last expression: the notebook displays the projected vector
projected_embedding

array([-0.06688391, -0.10732608, -0.01182568,  0.06025391,  0.03327208,
       -0.0140674 ,  0.03476701,  0.01614457, -0.04956335,  0.04998327,
       -0.01497197, -0.01469825, -0.02115819,  0.01271386,  0.09370678,
        0.01055544,  0.00503063, -0.06896242, -0.00426384, -0.0111828 ,
        0.02609999, -0.05270877, -0.00320466, -0.09172877, -0.00732096,
        0.01578204, -0.03032515, -0.00619176, -0.02165996, -0.06758612,
       -0.06919535, -0.0351361 , -0.04019276, -0.08007087, -0.0435547 ,
       -0.05325788, -0.00415289,  0.03015559, -0.04535466,  0.00610774,
       -0.06679255,  0.0394727 , -0.03019072, -0.03427602,  0.00897471,
       -0.03605998,  0.01583591,  0.04097938, -0.01881643,  0.07836284,
        0.01042072, -0.05490758,  0.00562274,  0.03641588, -0.0481573 ,
        0.00603648, -0.01971986,  0.01043753,  0.04457133, -0.00087533,
        0.04259377, -0.03031413,  0.00969881,  0.01340474, -0.01413661,
       -0.02348996,  0.03482072,  0.09674884,  0.00982722, -0.01

In [10]:
# Score the brand against the gender direction: inner product of its embedding
# with the worker's `_B` matrix.
# NOTE(review): `test_embedding` is used exactly as stored (no normalisation in
# this cell), so scores for different brands also reflect vector length —
# confirm that is intended before comparing brands.
dot_res = np.dot(test_embedding, debias_worker._B)
dot_res

array([0.80453193], dtype=float32)

In [11]:
# Same gender-direction score for a second brand: look up its embedding and
# take the inner product with the worker's `_B` matrix.
test_brand_word2 = 'Tesla'
test_embedding2 = kv[test_brand_word2]
dot_res2 = np.dot(test_embedding2, debias_worker._B)
dot_res2

array([0.02907436], dtype=float32)

In [12]:
# Gender-direction score for 'Facebook': embedding lookup followed by the inner
# product with the worker's `_B` matrix.
test_brand_word3 = 'Facebook'
test_embedding3 = kv[test_brand_word3]
dot_res3 = np.dot(test_embedding3, debias_worker._B)
dot_res3

array([0.49307954], dtype=float32)

In [13]:
# Gender-direction score for 'Marlboro': embedding lookup followed by the inner
# product with the worker's `_B` matrix.
test_brand_word4 = 'Marlboro'
test_embedding4 = kv[test_brand_word4]
dot_res4 = np.dot(test_embedding4, debias_worker._B)
dot_res4

array([-0.11155926], dtype=float32)

In [14]:
# Gender-direction score for 'Amazon': embedding lookup followed by the inner
# product with the worker's `_B` matrix.
test_brand_word5 = 'Amazon'
test_embedding5 = kv[test_brand_word5]
dot_res5 = np.dot(test_embedding5, debias_worker._B)
dot_res5

array([0.5521182], dtype=float32)

In [19]:
# Gender-direction score for "Chanel": embedding lookup followed by the inner
# product with the worker's `_B` matrix.
test_brand6 = "Chanel"
test_embedding6 = kv[test_brand6]
dot_res6 = np.dot(test_embedding6, debias_worker._B)
dot_res6

array([0.8203788], dtype=float32)

In [22]:
# Gender-direction score for "BOSS": embedding lookup followed by the inner
# product with the worker's `_B` matrix.
test_brand7 = "BOSS"
test_embedding7 = kv[test_brand7]
dot_res7 = np.dot(test_embedding7, debias_worker._B)
dot_res7

array([-0.5512352], dtype=float32)