In [143]:
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx
import numpy as np

import pandas as pd

from sklearn.decomposition import PCA
import pickle
import json

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn.cluster import KMeans

In [144]:
import re
import string
from numpy import linalg as LA

In [145]:
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    The file is decoded as ``utf-8-sig``, so a leading UTF-8 byte-order mark
    (if the file has one) is not part of the returned text.

    Args:
        filename: Path of the file to read.

    Returns:
        str: The complete contents of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        return file.read()

In [146]:
def load_emb(glove_file, fullLoad=False, n_vecs=20000):
    """Load word vectors from a GloVe-style text file.

    Each non-empty row is expected to hold a token, a single space, and then
    the space-separated vector components.

    Args:
        glove_file: Path of the embedding text file (read as UTF-8).
        fullLoad: When True, every row is read and ``n_vecs`` is ignored.
        n_vecs: Number of leading rows to read when ``fullLoad`` is False.
            A value of 0 or less loads nothing.

    Returns:
        dict: Maps each token to a 1-D ``numpy`` array of floats. If a token
        occurs more than once, the last occurrence read wins.

    Raises:
        ValueError: If a non-empty row has no space-separated vector part or
            a component cannot be parsed as a float.
    """
    tok2vec = {}
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # embedding vocabularies routinely contain non-ASCII tokens.
    with open(glove_file, 'r', encoding='utf-8') as glove_fh:
        for i, row in enumerate(glove_fh):
            if not fullLoad and i >= n_vecs:
                break
            line = row.rstrip('\n')
            if not line:
                # Blank rows (e.g. a trailing empty line) carry no vector.
                continue
            # Split on a literal space only, so tokens containing other
            # whitespace characters are kept intact.
            word, vec = line.split(' ', 1)
            # Dropping empty pieces tolerates trailing or doubled spaces.
            tok2vec[word] = np.array([float(n) for n in vec.split(' ') if n])
    return tok2vec

In [147]:
def filterVocab(glove,genderwords):
    """Return the subset of ``glove`` whose keys pass the vocabulary filter.

    A word is kept only when all of the following hold:
      * it is at most 20 characters long,
      * it contains no digit,
      * ``str.islower()`` is true for it (no upper-case letters and at least
        one cased character),
      * it contains no character from ``string.punctuation``,
      * it is not one of ``genderwords``.

    Args:
        glove: Mapping from word to its vector.
        genderwords: Iterable of words to exclude.

    Returns:
        dict: A new dict with the surviving words and their original values,
        in the iteration order of ``glove``. ``glove`` itself is not modified.
    """
    # Sets make both membership checks constant-time per word instead of a
    # linear scan over the whole list for every vocabulary entry.
    excluded = set(genderwords)
    punctuation = set(string.punctuation)
    newDict = {}
    for w in glove:
        if (len(w) <= 20 and w.islower() and not re.search(r"\d", w)
                and punctuation.isdisjoint(w) and w not in excluded):
            newDict[w] = glove[w]
    return newDict

In [148]:
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, one entry per line.

    The entries are joined with ``'\\n'``; no newline is added after the last
    entry. An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the output file (written as UTF-8).
    """
    data = '\n'.join(lines)
    # UTF-8 is written explicitly so the output does not depend on the
    # platform default encoding and can be read back by load_doc.
    # The context manager closes the handle even when write() raises.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)

In [149]:
def getProjectionHeShe(glove,filterGlove):
    """Project every vector in ``filterGlove`` onto the ``he - she`` direction.

    Args:
        glove: Mapping that must contain the keys ``'he'`` and ``'she'``.
        filterGlove: Mapping from word to vector; its vectors are the ones
            projected. Assumes all vectors have the same length as the
            ``'he'`` / ``'she'`` vectors -- TODO confirm against caller.

    Returns:
        numpy.ndarray: Dot product of each word vector with the (unnormalized)
        ``he - she`` difference, in the iteration order of ``filterGlove``.
        Because of the final ``squeeze()``, a single-word mapping yields a
        0-d array rather than a length-1 array.
    """
    gender_direction = torch.Tensor(glove['he'] - glove['she'])

    # One row per word, stacked into a single matrix.
    rows = []
    for word in filterGlove:
        rows.append(torch.Tensor(filterGlove[word]).view(1, -1))
    word_matrix = torch.cat(rows)
    print(word_matrix.shape)

    projected = torch.matmul(word_matrix, gender_direction)
    return projected.squeeze().numpy()

In [150]:
def getProjectionSubsapace(emb,filteremb, D, var_ratio=0.5, norm=True):
    """Project every vector in ``filteremb`` onto the first principal gender direction.

    The direction is the leading right-singular vector of the matrix whose
    rows are the halved differences ``(emb[a] - emb[b]) / 2`` for each pair
    ``(a, b)`` in ``D``.

    Args:
        emb: Mapping from word to vector; must contain every word named in ``D``.
        filteremb: Mapping from word to vector; its vectors are the ones projected.
        D: Sequence of word pairs; only the first two entries of each pair are used.
        var_ratio: Accepted for interface compatibility; not used. The result
            has always been built from the first component only.
        norm: Accepted for interface compatibility; not used.

    Returns:
        numpy.ndarray: One projection value per word, in the iteration order
        of ``filteremb``. The sign of the direction is whatever the SVD
        returns, so only relative order and magnitude are meaningful.
        Because of the final ``squeeze()``, a single-word mapping yields a
        0-d array.
    """
    # Stack the halved difference of every gender pair as one row each.
    C = torch.cat(
        [torch.Tensor((emb[pair[0]] - emb[pair[1]]) / 2).view(1, -1) for pair in D],
        dim=0,
    )
    print(C.shape)

    # Columns of V are the principal directions of C, ordered by decreasing
    # singular value.
    _, _, V = torch.svd(C)

    # Only the first direction is used. An earlier version also derived a
    # component count from var_ratio, but that value was discarded right away
    # and its `k_idx.data[0]` lookup indexes a 0-dim tensor, which raises
    # IndexError on current PyTorch -- so that computation is removed.
    B = V[:, :1]

    WN = torch.cat([torch.Tensor(filteremb[n]).view(1, -1) for n in filteremb])
    print(WN.shape)

    return torch.matmul(WN, B).squeeze().numpy()

In [151]:
def normalizeEmb(emb):
    """Scale every vector in ``emb`` to unit L2 norm.

    The mapping is updated in place (each value is replaced by its normalized
    copy) and the same mapping object is returned for convenience.

    Args:
        emb: Mapping from word to a numpy vector.

    Returns:
        The same ``emb`` object, with all non-zero vectors normalized.
        Vectors whose norm is zero are left unchanged.
    """
    for k in emb.keys():
        length = LA.norm(emb[k])
        # A zero vector has no direction; dividing by its norm would turn it
        # into NaNs that then poison every projection and sort downstream.
        if length > 0:
            emb[k] = emb[k] / length
    return emb

In [9]:
# Read the male and female word lists; each file is split on '\n' into one
# entry per line.
# NOTE(review): if a file ends with a newline, split('\n') leaves a trailing
# empty string in the list -- confirm against the word-list files.
# NOTE(review): the absolute /scratch paths are machine-specific.
maleBiasWords = load_doc('/scratch/um367/EMB/wordlists/male_word_file.txt').split('\n')
femaleBiasWords = load_doc('/scratch/um367/EMB/wordlists/female_word_file.txt').split('\n')
# Combined list; it is passed to filterVocab later to exclude these words.
genderwords = maleBiasWords + femaleBiasWords
len(genderwords)

448

In [18]:
# Load only the first 50000 rows of the embedding file (fullLoad=False).
# NOTE(review): the absolute /scratch path is machine-specific.
glove_file = '/scratch/um367/EMB/embeddings/wiki_glove.txt'
glove = load_emb(glove_file, fullLoad=False, n_vecs=50000)
len(glove)

50000

In [36]:
# normalizeEmb rewrites the dict's vectors in place and returns the same
# dict, so the unnormalized vectors are no longer available after this cell.
glove = normalizeEmb(glove)

In [37]:
# Drop the gender words and every word rejected by the vocabulary filter.
filtered_glove = filterVocab(glove,genderwords)
len(filtered_glove) # 47,699 in the recorded run

47699

In [38]:
# One he-minus-she projection value per word of filtered_glove, in the
# dict's iteration order (the function also prints the word-matrix shape).
projections = getProjectionHeShe(glove,filtered_glove)

torch.Size([47699, 300])


In [41]:
# Pair each filtered word with its projection and rank the pairs by
# ascending projection value; biasedwords keeps only the words, in that order.
sortedP = sorted(zip(filtered_glove.keys(), projections), key=lambda pair: pair[1])
biasedwords = [word for word, _ in sortedP]

In [42]:
# biasedwords is sorted by ascending he-minus-she projection, so its last
# 500 entries are the words with the largest projections.
save_doc(biasedwords[-500:], 'gloveMaleBiasedWords.txt')

In [43]:
# biasedwords is sorted by ascending he-minus-she projection, so its first
# 500 entries are the words with the smallest (most negative) projections.
save_doc(biasedwords[:500], 'gloveFemaleBiasedWords.txt')