In [1]:
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx
import numpy as np

import pandas as pd

from sklearn.decomposition import PCA
import pickle
import json

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics
from sklearn.cluster import KMeans

In [19]:
from numpy import linalg as LA
from sklearn.ensemble  import RandomForestClassifier as rfc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    The file is decoded as ``utf-8-sig``, so a leading UTF-8 byte-order mark
    (if present) is not part of the returned text.

    Args:
        filename: Path of the file to read.

    Returns:
        The full decoded contents of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        return file.read()

In [4]:
def load_emb(glove_file, fullLoad=False, n_vecs=20000):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is expected to be ``<token> <v1> <v2> ...``: the token
    is everything before the first space, the rest are float components.

    Args:
        glove_file: Path of the embedding text file (read as UTF-8).
        fullLoad: When True, read the whole file and ignore ``n_vecs``.
        n_vecs: When ``fullLoad`` is False, stop after this many lines of the
            file have been read (at least one line is always read).

    Returns:
        dict mapping each token to a 1-D ``numpy`` float array. If a token
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line has no space after the token or a
            component cannot be parsed as a float.
    """
    tok2vec = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(glove_file, 'r', encoding='utf-8') as glove_fh:
        for i, row in enumerate(glove_fh):
            row = row.rstrip('\r\n')
            # Blank lines (e.g. a trailing empty line) carry no vector.
            if row.strip():
                # Split only on the first space so the token is kept verbatim.
                word, vec = row.split(' ', 1)
                # split() without arguments tolerates repeated/trailing spaces.
                tok2vec[word] = np.array([float(n) for n in vec.split()])
            if not fullLoad and i + 1 >= n_vecs:
                break
    return tok2vec

In [5]:
def clusterPerformance(vectors,labels,verbose = False,words = None):
    """Score how well 2-means clustering of ``vectors`` recovers ``labels``.

    KMeans with two clusters is fitted 20 times. For each fit the fraction of
    points whose cluster id equals the label is computed; because cluster ids
    are arbitrary, the better of that fraction and its complement is kept.

    Args:
        vectors: Data passed to ``KMeans.fit``.
        labels: Binary labels, one per row of ``vectors``.
        verbose: When True, print the words that the *last* fit assigned to
            the wrong side (under the better cluster-to-label mapping).
        words: Words aligned with ``vectors``/``labels``; only used when
            ``verbose`` is True. If omitted, the module-level ``words``
            variable is used, as before.

    Returns:
        Mean of the 20 per-fit accuracies.

    Raises:
        KeyError: If ``verbose`` is True, ``words`` is omitted and no
            module-level ``words`` exists.
    """
    if verbose and words is None:
        # Backward-compatible fallback to the notebook-level variable; looked
        # up before the fits so a missing name fails fast.
        words = globals()["words"]
    accuracy = []
    # NOTE: KMeans is created without random_state, so results may vary
    # between calls.
    for i in range(20):
        kmeans = KMeans(n_clusters=2)
        kmeans.fit(vectors)
        p = sum(kmeans.labels_ == labels)/ len(labels)
        accuracy.append(max(p,1-p))
    if verbose:
        # `p` and `kmeans` refer to the final fit only.
        if p > 1-p:
            # Cluster ids already line up with labels: mismatches are errors.
            print([w for w,l,tl in zip(words,kmeans.labels_,labels) if l!=tl])
        else:
            # Cluster ids are flipped relative to labels: matches are errors.
            print([w for w,l,tl in zip(words,kmeans.labels_,labels) if l==tl])
    return np.mean(accuracy)

In [6]:
def getConceptScore(emb_vecs,concept,maleBiasWords=None,femaleBiasWords=None):
    """Return how differently a concept relates to two word groups.

    The score is the absolute difference between the mean dot product of the
    concept vector with the male-biased words and the mean dot product with
    the female-biased words.

    Args:
        emb_vecs: Mapping from word to vector.
        concept: Key of the concept word in ``emb_vecs``.
        maleBiasWords: Words of the first group. If omitted, the module-level
            ``maleBiasWords`` variable is used, as before.
        femaleBiasWords: Words of the second group. If omitted, the
            module-level ``femaleBiasWords`` variable is used, as before.

    Returns:
        ``abs(mean(male dot products) - mean(female dot products))``.

    Raises:
        KeyError: If ``concept`` or any listed word is missing from
            ``emb_vecs``, or a word list is omitted and the corresponding
            module-level variable does not exist.
    """
    # Explicit arguments mirror getConceptVectors; the globals remain the
    # default so existing two-argument calls keep working.
    if maleBiasWords is None:
        maleBiasWords = globals()["maleBiasWords"]
    if femaleBiasWords is None:
        femaleBiasWords = globals()["femaleBiasWords"]
    c = emb_vecs[concept]
    scrM = [np.dot(c,emb_vecs[w]) for w in maleBiasWords]
    scrF = [np.dot(c,emb_vecs[w]) for w in femaleBiasWords]
    return np.abs(np.mean(scrM) - np.mean(scrF))

In [7]:
def getConceptVectors(emb_vecs,concepts,maleBiasWords,femaleBiasWords):
    """Build a concept-similarity feature vector for every listed word.

    For each word, the feature vector holds the dot product of the word's
    embedding with each concept's embedding, in the order of ``concepts``.

    Args:
        emb_vecs: Mapping from word to vector.
        concepts: Concept words (keys of ``emb_vecs``) defining the features.
        maleBiasWords: Words whose feature vectors form the first result.
        femaleBiasWords: Words whose feature vectors form the second result.

    Returns:
        ``(male_rows, female_rows)``: two lists of 1-D ``numpy`` arrays, each
        aligned with its input word list.
    """
    def concept_profile(word):
        # One dot product per concept, preserving the order of `concepts`.
        return np.array([np.dot(emb_vecs[name], emb_vecs[word]) for name in concepts])

    male_rows = [concept_profile(word) for word in maleBiasWords]
    female_rows = [concept_profile(word) for word in femaleBiasWords]
    return male_rows, female_rows

In [8]:
def normalizeEmb(emb):
    """Scale every vector in ``emb`` to unit Euclidean length, in place.

    Args:
        emb: Mapping from word to vector. It is modified in place: each value
            is replaced by the vector divided by its norm.

    Returns:
        The same ``emb`` object, for call chaining.
    """
    for k in emb.keys():
        norm = LA.norm(emb[k])
        # A zero-length vector has no direction; leave it untouched rather
        # than dividing by zero and filling it with NaN.
        if norm > 0:
            emb[k] = emb[k]/norm
    return emb

In [22]:
def removeProtectedAttribute(emb, n_keep=299):
    """Return a copy of ``emb`` with every vector cut to its leading components.

    Args:
        emb: Mapping from word to vector (anything supporting slicing).
        n_keep: Number of leading components to keep from each vector.
            The default of 299 presumably drops the last component of a
            300-dimensional vector - TODO confirm the embedding width.

    Returns:
        A new dict with the same keys as ``emb`` and truncated vectors.
        The ``emb`` mapping itself is not modified.
    """
    # Operate on the mapping that was passed in, not on a notebook global.
    return {w: vec[:n_keep] for w, vec in emb.items()}

In [9]:
# Read the male-biased word list: the file is split on '\n', so there is one
# entry per line (a trailing newline in the file would yield an empty last entry).
maleBiasWords = load_doc('../pig/gloveMaleBiasedWords.txt').split('\n')
# Show how many entries were read.
len(maleBiasWords)

500

In [10]:
# Read the female-biased word list: the file is split on '\n', so there is one
# entry per line (a trailing newline in the file would yield an empty last entry).
femaleBiasWords = load_doc('../pig/gloveFemaleBiasedWords.txt').split('\n')
# Show how many entries were read.
len(femaleBiasWords)

500

In [None]:
# Concept words used as feature axes: each word is later described by its dot
# product with every concept vector. The commented-out lists below are
# alternative concept sets that are currently disabled; only the last
# assignment is active.
#concepts = ['sport','cloth','family','work','he','she','war','politics']
#concepts = ['sports','war','emotion','cloth']

#concepts = ['fashion','sports','outdoor','sex','food','baby','cosmetic','power','violence','love','emotion'\
#            ,'religion','science','marriage','politics']

#concepts = ['sports','army','religion','travel','crime','politics']
concepts = ['sports','army','religion','politics','cloth','food','violence','love','education','sex']

In [11]:
# Load the first 50000 lines of the GloVe embedding file.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing it.
glove_file = '/scratch/um367/EMB/embeddings/wiki_glove.txt'
glove = load_emb(glove_file, fullLoad=False, n_vecs=50000)
# Show how many distinct tokens were loaded.
len(glove)

50000

In [12]:
# Scale every vector to unit length. normalizeEmb updates the dict in place
# and returns the same object, so re-running this cell is harmless.
glove = normalizeEmb(glove)

In [15]:
# One feature vector per biased word: its dot product with each concept vector.
maleFeatures,femaleFeatures = getConceptVectors(glove,concepts,maleBiasWords,femaleBiasWords)

In [16]:
# Female-biased entries come first, then male-biased ones, in all three lists,
# so words[i], features[i] and labels[i] describe the same word.
words = [*femaleBiasWords, *maleBiasWords]
features = [*femaleFeatures, *maleFeatures]
# Label convention: False = female-biased word, True = male-biased word.
labels = [False] * len(femaleFeatures) + [True] * len(maleFeatures)

In [17]:
# Stack the per-word feature vectors into a matrix and the labels into a vector.
X = np.array(features)
Y = np.array(labels)
# Hold out 30% of the words for testing; the fixed random_state makes the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2018, shuffle=True)
print(X_train.shape)

(700, 10)


In [18]:
# Train a random forest to predict the bias label from the concept features
# and report held-out precision/recall.
# random_state fixes the forest's internal randomness so the report is
# reproducible across kernel restarts (same seed as the train/test split).
clf = rfc(random_state=2018)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.89      0.89      0.89       151
        True       0.89      0.89      0.89       149

    accuracy                           0.89       300
   macro avg       0.89      0.89      0.89       300
weighted avg       0.89      0.89      0.89       300





In [20]:
# Load the first 50000 lines of the second ("gnglove") embedding file.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing it.
gnglove_file = '/scratch/um367/EMB/embeddings/wiki_gnglove.txt'
gnglove = load_emb(gnglove_file, fullLoad=False, n_vecs=50000)
# Show how many distinct tokens were loaded.
len(gnglove)

50000

In [23]:
# Scale the vectors to unit length, then truncate each one with
# removeProtectedAttribute.
# NOTE(review): truncation happens after normalisation, so the truncated
# vectors are generally no longer unit length, unlike the `glove` vectors
# used above - confirm this order is intended.
gnglove = normalizeEmb(gnglove)
gnglove = removeProtectedAttribute(gnglove)

In [27]:
# Rebuild the concept features from the gnglove vectors; this overwrites the
# maleFeatures/femaleFeatures/words/features/labels computed for glove.
maleFeatures, femaleFeatures = getConceptVectors(
    gnglove, concepts, maleBiasWords, femaleBiasWords
)
# Female-biased entries first, then male-biased ones, in all three lists.
words = [*femaleBiasWords, *maleBiasWords]
features = [*femaleFeatures, *maleFeatures]
# Label convention: False = female-biased word, True = male-biased word.
labels = [False] * len(femaleFeatures) + [True] * len(maleFeatures)

In [28]:
# Stack the per-word feature vectors into a matrix and the labels into a vector.
X = np.array(features)
Y = np.array(labels)
# Same 70/30 split settings and random_state as the glove experiment, so both
# classifiers are evaluated on the same held-out row positions.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2018, shuffle=True)
print(X_train.shape)

(700, 10)


In [29]:
# Train a random forest on the gnglove concept features and report held-out
# precision/recall.
# random_state fixes the forest's internal randomness so the report is
# reproducible across kernel restarts (same seed as the train/test split).
clf = rfc(random_state=2018)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.88      0.91      0.90       151
        True       0.90      0.88      0.89       149

    accuracy                           0.89       300
   macro avg       0.89      0.89      0.89       300
weighted avg       0.89      0.89      0.89       300



