In [1]:
import os
from collections import Counter
from numpy import float32
import json
import pandas as pd
from preprocessing_text import l_of_ls, invoc
from computing_PMI import list_pmi, comp_freq
from tqdm import tqdm
import seaborn as sns
from statsmodels.stats.weightstats import ztest
from gensim.models import KeyedVectors
import numpy as np
from sklearn.manifold import TSNE
import random
from responsibly.we import weat
from matplotlib import pylab as plt
from scipy import stats
from sklearn.manifold import TSNE
from matplotlib import pylab as plt
import logging
import responsibly
from responsibly.we import BiasWordEmbedding

[nltk_data] Downloading package stopwords to /home/nicola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# A seed value is fixed for the replicability of experiments
seed_value = 42  
random.seed(seed_value)

In [3]:
# Reading seeds file
f = open(os.getcwd()+'/data/seeds/dict_PMI_WE.json')
seeds = json.load(f)

In [4]:
# There could be some seeds that are duplicates. These are removed.
for k,v in seeds.items():
    seeds[k] = list(set(v))

In [5]:
# Reading of sentences tokens
with open(os.getcwd()+'/data/tokens/sentences_tokens.txt', 'r') as f:
    l1 = []
    for ele in f:
        line = ele.split()
        l1.append(line)
tokens_l = list(map(l_of_ls, l1))

In [6]:
# Also if the original file does not contain empy list or None,
# the loading process can add these 2 type of elements. So, these
# elements are removed.
tokens_l = [lst for lst in tokens_l if lst is not None and any(lst)]

# Some tokens could contain empty spaces. there are removed.
tokens_l = [[t for t in ts if t != ''] for ts in tokens_l]

In [7]:
# tokens_l is transformed in a single list with all the tokens of all the senteces
# for counting the requency of each token.
all_tokens = [elemento for lista in tokens_l if lista for elemento in lista]

# freq is a dict where the keys are the tokens, and the value the absolute frequency. 
freq = Counter(all_tokens)

In [8]:
# A group for calculating PMI, for detecting gender bias
A_g = seeds["espacio_m"]
# B group for calculating PMI, for detecting gender bias
B_g = seeds["espacio_f"]
# A group for calculating PMI, for detecting religious bias
A_r = seeds['christianity words']
# B group for calculating PMI, for detecting religious bias
B_r = seeds["islam words"]

In [9]:
# The following keys and associated seeds are removed for creating 
# a seeds dictionary where each seeds list is represented as X (PMI)
seeds.pop("espacio_m") # A_g
seeds.pop("espacio_f") #B_g
seeds.pop("christianity words") # A_r
seeds.pop("islam words") # B_r
seeds.pop("profesiones_male") # not usefull for measuring the two types of bias
seeds.pop("profesiones_female") # not usefull for measuring the two types of bias
seeds.pop("islam") # not usefull for measuring the two types of bias
seeds.pop("pleasant 6") # equal to pleasantness
seeds.pop("unpleasant 6") # equal to unpleasantness

['agonía', 'desagradable', 'fracaso', 'terrible', 'horrible', 'guerra']

In [None]:
####################################################################################################################
# Experiments about Pointwise Mutual Information (PMI)
####################################################################################################################

In [None]:
# Computing PMI for gender bias case
pmi_g = list_pmi(seeds, A_g, B_g, tokens_l, freq)

In [None]:
# Add, at the values of pmi_g dictionary, the mean frequency of the seeds
comp_freq(pmi_g, seeds, freq)

In [None]:
# Computing PMI for religion bias case
pmi_r = list_pmi(seeds, A_r, B_r, tokens_l, freq)

In [None]:
# Add, at the values of pmi_r dictionary, the mean frequency of the seeds
comp_freq(pmi_r, seeds, freq)

In [None]:
# Converting pmi_g to DataFrame
df_g = pd.DataFrame.from_dict(pmi_g, orient='index', columns = ["PMI", "freq_mean"])

In [None]:
# Converting pmi_r to DataFrame
df_r = pd.DataFrame.from_dict(pmi_r, orient='index', columns = ["PMI", "freq_mean"])

In [None]:
# It could be happen that, given a key, that any seeds is contained in the tokens file.
# In this case, the key is removed. 
df_g = df_g[df_g['PMI'] != False]
df_r = df_r[df_r['PMI'] != False]

In [None]:
sns.scatterplot(x="freq_mean", y="PMI", data=df_r)

In [None]:
r_g = np.corrcoef(df_g.loc[:,"PMI"].astype(float32), df_g.loc[:,"freq_mean"].astype(float32))
# Correlation coefficient
round(r_g[0,1],2)

In [None]:
r_r = np.corrcoef(df_r.loc[:,"PMI"].astype(float32), df_r.loc[:,"freq_mean"].astype(float32))
# Correlation coefficient
round(r_r[0,1],2)

In [None]:
plt.hist(df_g.loc[:,"PMI"].astype(float32), bins=10, edgecolor='black')
plt.xlabel('PMI')
plt.ylabel('Frequency')
plt.title('Histogram of $PMI_{gender}$')
plt.show()

In [None]:
plt.hist(df_r.loc[:,"PMI"].astype(float32), bins=10, edgecolor='black')
plt.xlabel('PMI')
plt.ylabel('Frequency')
plt.title('Histogram of $PMI_{religion}$')
plt.show()

In [None]:
# Approximated Z score test for gender bias
ztest_Score_g, p_value_g = ztest(df_g.loc[:,"PMI"], value = 0.0, alternative='two-sided')#0.0053188242738449465 0.00532
round(p_value_g,5)

In [None]:
# Approximated Z score test for gender bias
ztest_Score_r, p_value_r = ztest(df_r.loc[:,"PMI"], value = 0.0, alternative='two-sided')#0.12617294102300938 0.12617
round(p_value_r,5)

In [None]:
####################################################################################################################
# Experiments about Word Embedding 
####################################################################################################################

In [10]:
# Directory that contains the 30-d vectors
file_path = os.getcwd()+'/data/WE/emb_pr_es_30.vec'
model = KeyedVectors.load_word2vec_format(file_path, binary=False)

In [None]:
# Take the most common words in the corpus between 200 and 600
words = [word for word in model.index2word[200:600]]

# convert the words to vectors
embeddings = np.array([model[word] for word in words])

# perform T-SNE
words_embedded = TSNE(n_components=2).fit_transform(embeddings)

# visualize
plt.figure(figsize=(20, 20))
for i, label in enumerate(words):
    x, y = words_embedded[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                 ha='right', va='bottom', size=11)
plt.show()

In [11]:
# Object that hands gender bias using the vectors omega
we_genero = BiasWordEmbedding(model, only_lower=True)

# Computing of the gender bias direction
we_genero._identify_direction('Masculino', 'Femenino',
                                          definitional=(A_g, B_g),
                                          method='sum')

In [None]:
# Projections plot for exploration purpose
f, ax = plt.subplots(1, figsize=(10, 10))
we_genero.plot_projection_scores(
    seeds["math 1"]+["hombre"]+["mujer"],# + espacio_f[:1] + espacio_m[:1],
    n_extreme=20,
    ax=ax);

In [12]:
we_relig = BiasWordEmbedding(model, only_lower=True)
we_relig._identify_direction('Mundo Cristiano', 'Mundo Islámico',
                                          definitional=(A_r, B_r),
                                          method='sum')

In [None]:
f, ax = plt.subplots(1, figsize=(10, 10))
we_relig.plot_projection_scores(
    random.sample(seeds["terrorism"],10)+["misil"]+['jesús']+['muhammad'],
    n_extreme=20,
    ax=ax);

In [None]:
# Retrieving of the keys for which was computed PMI in gender case
keys_f_g = list(df_g.index)
# Retrieving of the keys for which was computed PMI in religion case
keys_f_r = list(df_r.index)

# Dictionary that will contain the Bolukbasi values for each key (Direct bias values)
# gender
db_g = {}
for k in keys_f_g :
    W = list(seeds[k]) 
    for w in W:    
        if w not in we_genero.model.index2word :
            W.remove(w)
    db_g[k] = we_genero.calc_direct_bias(W, c=None)

In [None]:
# religion
db_r = {}
for k in keys_f_r :
    W = list(seeds[k]) 
    for w in W:    
        if w not in we_genero.model.index2word :
            W.remove(w)
    db_r[k] = we_relig.calc_direct_bias(W, c=None)

In [None]:
# mean frequency VS Bolukbasi value for each group of seeds, gender case
plt.scatter(df_g.loc[:,"freq_mean"], list(db_g.values()), c='blue',s=20, alpha=0.5)
plt.xlabel("freq_mean")
plt.ylabel("bolukbasi metric values")
plt.show()

In [None]:
"""
This is the corr. coef. between the PMI values and Bolukbasi values, gender case

r_g2 = np.corrcoef(np.array(df_g.loc[:,"PMI"]).astype('float32'), list(db_g.values()))
round(r_g2[0,1],2)
"""

In [None]:
r_g1 = np.corrcoef(df_g.loc[:,"freq_mean"], list(db_g.values()))
# corr. coef. for (freq_mean,Bolukbasi values), gender case
round(r_g1[0,1],2)

In [None]:
"""
This is the corr. coef. between the PMI values and Bolukbasi values, religion case

r_g2 = np.corrcoef(np.array(df_r.loc[:,"PMI"]).astype('float32'), list(db_r.values()))
round(r_g2[0,1],2)
"""

In [None]:
# mean frequency VS Bolukbasi value for each group of seeds, religion case
plt.scatter(df_r.loc[:,"freq_mean"], list(db_r.values()), c='blue',s=20, alpha=0.5)
plt.xlabel("freq_mean")
plt.ylabel("bolukbasi metric values")
plt.show()

In [None]:
r_g2 = np.corrcoef(df_r.loc[:,"freq_mean"], list(db_r.values()))
round(r_g2[0,1],2)

In [None]:
"""
This is the corr. coef. between the PMI values and Bolukbasi values, religion case

r_g2 = np.corrcoef(np.array(df_r.loc[:,"PMI"]).astype('float32'), list(db_r.values()))
round(r_g2[0,1],2)
"""


In [None]:
# Gender bias
plt.hist(list(db_g.values()), bins=10, edgecolor='black')
plt.xlabel('DirectBias')
plt.ylabel('Frequency')
plt.title("Histogram of the Bolukbasi's metric on gender bias")
plt.show()

In [None]:
# Religious bias
plt.hist(list(db_r.values()), bins=10, edgecolor='black')
plt.xlabel('DirectBias')
plt.ylabel('Frequency')
plt.title("Histogram of the Bolukbasi's metric on religion bias")
plt.show()

In [None]:
# Approximated Z score for gender bias
ztest_Score_g, p_value_g = ztest(list(db_g.values()), value = 0.0, alternative='two-sided')#3.197867413085225e-78
p_value_g

In [None]:
# Approximated Z score for religious bias
ztest_Score_r, p_value_r = ztest(list(db_r.values()), value = 0.0, alternative='two-sided')#3.5039766047728225e-36
p_value_r

In [None]:
#####################################################################################################################
# Experiments about WEAT
#####################################################################################################################

In [13]:
fa_science_1 =  seeds['math 1'] #X1
sa_arts_1 =  seeds['arts 1'] #Y1
fa_science_2 =  seeds['science 1'] #X2
sa_arts_2 =  seeds['arts 2'] #Y2
fa_ins = seeds['instruments'] #X3
sa_wea = seeds['weapons'] #Y3
fa_car_1 =  seeds['career'] #X4
sa_fam_1 =  seeds['family'] #Y4
fa_car_2 =  seeds['career words'] #X5
sa_fam_2 =  seeds['family words'] #Y5
fa_pl_2 =  seeds['pleasantness'] #X6
sa_unpl_2 = seeds['unpleasantness'] #Y6

# Dictionary with key as the two concepts of Xi,Yi, and the words in Xi, Yi as values for each key
# Target sets: (X1, Y1), (X2, Y2), (X3, Y3), ...
# Targets sets for gender
targ_g = {"math 1_arts 1":[fa_science_1, sa_arts_1],
          "science 1_arts 2":[fa_science_2, sa_arts_2],
          "career_family":[fa_car_1, sa_fam_1],
          "career words_family words":[fa_car_2, sa_fam_2],
          "pleasantness_unpleasantness":[fa_pl_2, sa_unpl_2]
         }

# Targets sets for religion
targ_r = {
          "instruments_weapons":[fa_ins, sa_wea],
          "pleasantness_unpleasantness":[fa_pl_2, sa_unpl_2]
         }

In [14]:
# Removing of the words that have not a vector representation
targ_g = invoc(targ_g, we_genero)
targ_r = invoc(targ_r, we_relig)

In [15]:
# WEAT Gender
# The results, w.r.t the thesis, could be different in terms of value BUT NOT in terms of test decisions.
# the difference is due to the permutation test done in WEAT by responsibly package 
first_target, second_target, first_attribute, second_attribute = {}, {}, {}, {}
# Attribute set A
first_attribute["words"] = A_g
first_attribute["name"] = "masc"
# Attribute set B
second_attribute["words"] =  B_g
second_attribute["name"] = "fem"
# List that contains WEAT output
l_g = []
for k,v in targ_g.items():
    if len(v[0]) > len(v[1]) :
        length = len(v[1])
    else :
        length = len(v[0])  
    first_target["name"] = k.split("_")[0]
    first_target["words"] = random.sample(v[0],length)

    second_target["name"] = k.split("_")[1]
    second_target["words"] = random.sample(v[1],length)

    l_g.append(weat.calc_single_weat(model, first_target, 
                      second_target, first_attribute, 
                      second_attribute, with_pvalue=True, pvalue_kwargs=None))

In [16]:
# WEAT Religion
# The results, w.r.t the thesis, could be different in terms of value BUT NOT in terms of test decisions.
# the difference is due to the permutation test done in WEAT by responsibly package 
first_target, second_target, first_attribute, second_attribute = {}, {}, {}, {}
# Attribute set A
first_attribute["words"] = A_r
first_attribute["name"] = "cris"
# Attribute set B
second_attribute["words"] = random.sample(B_r, len(A_r))
second_attribute["name"] = "isl"
l_r = []
for k,v in targ_r.items() :
    if len(v[0]) > len(v[1]) :
        length = len(v[1])
    else :
        length = len(v[0])  
    if length > 10:
        length = 10
    first_target["name"] = k.split("_")[0]
    first_target["words"] = random.sample(v[0],length)
    second_target["name"] = k.split("_")[1]
    second_target["words"] = random.sample(v[1],length)
    l_r.append(weat.calc_single_weat(model, first_target, 
                      second_target, first_attribute, 
                      second_attribute, with_pvalue=True, pvalue_kwargs=None))

In [17]:
# Results of WEAT for gender bias. The results about targets 'math 1 vs. arts 1' 
# and 'career words vs. family words' are reported in the thesis
l_g

[{'Target words': 'math 1 vs. arts 1',
  'Attrib. words': 'masc vs. fem',
  's': 0.6327524855732918,
  'd': 1.5296344,
  'p': 0.001456876456876457,
  'Nt': '7x2',
  'Na': '9x2'},
 {'Target words': 'science 1 vs. arts 2',
  'Attrib. words': 'masc vs. fem',
  's': 0.43256331980228424,
  'd': 0.9069404,
  'p': 0.05128205128205128,
  'Nt': '7x2',
  'Na': '9x2'},
 {'Target words': 'career vs. family',
  'Attrib. words': 'masc vs. fem',
  's': 0.40579575300216675,
  'd': 1.1145061,
  'p': 0.032467532467532464,
  'Nt': '6x2',
  'Na': '9x2'},
 {'Target words': 'career words vs. family words',
  'Attrib. words': 'masc vs. fem',
  's': 0.38511137664318085,
  'd': 1.2442881,
  'p': 0.004079254079254079,
  'Nt': '7x2',
  'Na': '9x2'},
 {'Target words': 'pleasantness vs. unpleasantness',
  'Attrib. words': 'masc vs. fem',
  's': -0.1668972671031952,
  'd': -0.5045142,
  'p': 0.7824675324675324,
  'Nt': '6x2',
  'Na': '9x2'}]

In [18]:
# Results of WEAT for religious bias. 
l_r

[{'Target words': 'instruments vs. weapons',
  'Attrib. words': 'cris vs. isl',
  's': 1.6455432921648026,
  'd': 1.4525306,
  'p': 8.118816168351773e-05,
  'Nt': '10x2',
  'Na': '15x2'},
 {'Target words': 'pleasantness vs. unpleasantness',
  'Attrib. words': 'cris vs. isl',
  's': 0.2793673425912857,
  'd': 0.8637376,
  'p': 0.07792207792207792,
  'Nt': '6x2',
  'Na': '15x2'}]

In [19]:
# WEAT Gender, with different attributes sets for studying the seeds frequency effects effect 
first_target, second_target, first_attribute, second_attribute = {}, {}, {}, {}

A_g1, B_g1 = ['señor','él'], ['señora', 'ella']
A_g2, B_g2 = ['hombre','padre'], ['mujer', 'madre']
A_g3, B_g3 = ['chico','abuelo'], ['chica', 'abuela']
l_att = [[A_g1, B_g1], [A_g2, B_g2], [A_g3, B_g3]]

l_g1 = []
for att in tqdm(l_att):
    first_attribute["name"] = str(att[0])
    second_attribute["name"] = str(att[1])
    first_attribute["words"] = att[0]
    second_attribute["words"] = att[1]
    for k,v in targ_g.items() :
        if len(v[0]) > len(v[1]) :
            length = len(v[1])
        else :
            length = len(v[0])  
        first_target["name"] = k.split("_")[0]
        first_target["words"] = random.sample(v[0],length)
        second_target["name"] = k.split("_")[1]
        second_target["words"] = random.sample(v[1],length)
        l_g1.append(weat.calc_single_weat(model, first_target, 
                      second_target, first_attribute, 
                      second_attribute, with_pvalue=True, pvalue_kwargs=None))

100%|██████████| 3/3 [00:00<00:00,  4.73it/s]


In [20]:
# WEAT Gender, with different attributes sets for studying the seeds frequency effects effect 
first_target, second_target, first_attribute, second_attribute = {}, {}, {}, {}
A_r1, B_r1 = ['iglesia','cristiano'], ['islam', 'musulmán']
A_r2, B_r2 = ['salvación','evangelio'], ['velo', 'mezquita']
A_r3, B_r3 = ['mesías','bautismo'], ['sultan', 'allah']
l_att = [[A_r1, B_r1], [A_r2, B_r2], [A_r3, B_r3]]

l_r1 = []
for att in tqdm(l_att):
    first_attribute["name"] = str(att[0])
    second_attribute["name"] = str(att[1])
    first_attribute["words"] = att[0]
    second_attribute["words"] = att[1]
    for k,v in targ_r.items() :
        if len(v[0]) > len(v[1]) :
            length = len(v[1])
        else :
            length = len(v[0]) 
        if length > 10:
            length = 10
        first_target["name"] = k.split("_")[0]
        first_target["words"] = random.sample(v[0],length)
        second_target["name"] = k.split("_")[1]
        second_target["words"] = random.sample(v[1],length)
        l_r1.append(weat.calc_single_weat(model, first_target, 
                      second_target, first_attribute, 
                      second_attribute, with_pvalue=True, pvalue_kwargs=None))

100%|██████████| 3/3 [00:09<00:00,  3.30s/it]


In [21]:
# Results of WEAT gender for different attributes sets
l_g1

[{'Target words': 'math 1 vs. arts 1',
  'Attrib. words': "['señor', 'él'] vs. ['señora', 'ella']",
  's': 0.7323949933052063,
  'd': 1.5870523,
  'p': 0.001456876456876457,
  'Nt': '7x2',
  'Na': '2x2'},
 {'Target words': 'science 1 vs. arts 2',
  'Attrib. words': "['señor', 'él'] vs. ['señora', 'ella']",
  's': 0.5280629396438599,
  'd': 1.1130006,
  'p': 0.016025641025641024,
  'Nt': '7x2',
  'Na': '2x2'},
 {'Target words': 'career vs. family',
  'Attrib. words': "['señor', 'él'] vs. ['señora', 'ella']",
  's': 0.4626176059246063,
  'd': 1.2256644,
  'p': 0.01948051948051948,
  'Nt': '6x2',
  'Na': '2x2'},
 {'Target words': 'career words vs. family words',
  'Attrib. words': "['señor', 'él'] vs. ['señora', 'ella']",
  's': 0.5014137327671051,
  'd': 1.302963,
  'p': 0.005536130536130536,
  'Nt': '7x2',
  'Na': '2x2'},
 {'Target words': 'pleasantness vs. unpleasantness',
  'Attrib. words': "['señor', 'él'] vs. ['señora', 'ella']",
  's': -0.13719740509986877,
  'd': -0.39243847,
  'p

In [22]:
# Results of WEAT religious for different attributes sets
l_r1

[{'Target words': 'instruments vs. weapons',
  'Attrib. words': "['iglesia', 'cristiano'] vs. ['islam', 'musulmán']",
  's': 1.6629449054598808,
  'd': 1.2523228,
  'p': 0.0021596051007815712,
  'Nt': '10x2',
  'Na': '2x2'},
 {'Target words': 'pleasantness vs. unpleasantness',
  'Attrib. words': "['iglesia', 'cristiano'] vs. ['islam', 'musulmán']",
  's': 0.6260063201189041,
  'd': 1.2141101,
  'p': 0.017316017316017316,
  'Nt': '6x2',
  'Na': '2x2'},
 {'Target words': 'instruments vs. weapons',
  'Attrib. words': "['salvación', 'evangelio'] vs. ['velo', 'mezquita']",
  's': 1.4136398285627365,
  'd': 1.5881547,
  'p': 3.78878087856416e-05,
  'Nt': '10x2',
  'Na': '2x2'},
 {'Target words': 'pleasantness vs. unpleasantness',
  'Attrib. words': "['salvación', 'evangelio'] vs. ['velo', 'mezquita']",
  's': 0.6587641835212708,
  'd': 1.3027245,
  'p': 0.01406926406926407,
  'Nt': '6x2',
  'Na': '2x2'},
 {'Target words': 'instruments vs. weapons',
  'Attrib. words': "['mesías', 'bautismo'] 

In [23]:
# Mean frequencies
(freq['señor']+freq['él']+freq['señora']+freq['ella'])/4

19970.75

In [24]:
(freq['chico']+freq['abuelo']+freq['chica']+freq['abuela'])/4

20.75

In [25]:
(freq['iglesia']+freq['cristiano']+freq['islam']+freq['musulmán'])/4

187.5

In [26]:
(freq['mesías']+freq['bautismo']+freq['sultan']+freq['allah'])/4

1.25