In [None]:
import os
import random
import json
from preprocessing_text import read_text, \
                               lower_case, \
                               is_in_par, \
                               remove_point, \
                               get_stopwords, \
                               get_en_dict, \
                               translate_dict

In [None]:
#####################################################################################################################
# Creation of tokens file from the Europarl texts.
#####################################################################################################################

In [None]:
# Location of the Europarl text files, resolved against the current working directory.
path_folder_txt = f"{os.getcwd()}/data/texts"

# Load the corpus. Per the original note, the result is a list in which
# each element is one sentence (a str).
sentences = read_text(path_folder_txt)

In [None]:
# Drop every sentence that is_in_par flags as enclosed in parentheses.
# Rationale: in the Europarl texts, parenthesised sentences carry details about
# the course of the European Parliament sitting, and those details are
# considered not useful for detecting bias.
sentences = list(filter(lambda text: not is_in_par(text), sentences))

In [None]:
# Seed the standard-library RNG once, so that the sentence sample (and hence the
# tokens and seeds used in the thesis experiments) can be regenerated.
seed_value = 42
random.seed(a=seed_value)

In [None]:
# Sample of sentences, taken to reduce the computational cost.
# The target size is 900000, capped at the number of available sentences:
# random.sample raises ValueError when asked for more items than the population
# holds, which made this cell crash on a partial corpus. With a corpus of at
# least 900000 sentences the sample is the same as before.
n_sampled_sentences = min(900000, len(sentences))
sentences = random.sample(sentences, n_sampled_sentences)

# Whitespace tokenisation: each sentence becomes a list of tokens.
sentences_tokens = [sentence.split() for sentence in sentences]

In [None]:
# Remove the final point (full stop) from tokens.
# The work is delegated to remove_point from preprocessing_text; its exact rules
# (e.g. which tokens are touched) are not visible here — check the helper.
sentences_tokens = remove_point(sentences_tokens)

In [None]:
# Characters that are deleted wherever they occur inside a token.
chars_to_delete = ('-', '–', ':', ';', '"')
deletion_table = str.maketrans(dict.fromkeys(chars_to_delete))

# Per token: first strip trailing commas, then delete the characters above,
# then trim surrounding whitespace.
# NOTE(review): commas are stripped *before* the quotes are deleted, so a token
# such as `word,"` ends up as `word,`. The token file written later is
# comma-separated, so such tokens may be split when it is read back — confirm
# whether this is intended.
sentences_tokens = [
    [word.rstrip(',').translate(deletion_table).strip() for word in sentence_words]
    for sentence_words in sentences_tokens
]

In [None]:
# At this step some tokens may have been reduced to '' or "'".
# These tokens are removed. The previous condition,
# `token != '' or token != "'"`, was true for every token (no token equals both
# values at once), so nothing was ever filtered; a membership test expresses
# the intended "neither '' nor \"'\"".
sentences_tokens = [[token for token in tokens if token not in ('', "'")] for tokens in sentences_tokens]

# Tokens are converted to lower case.
sentences_tokens = lower_case(sentences_tokens)

# A sentence may contain the same token more than once; duplicates are removed.
# dict.fromkeys keeps the first occurrence of each token in its original order,
# so the result is identical on every run. list(set(...)) gave an order that
# depends on the interpreter's string-hash seed, which undermined the fixed
# random seed used above.
sentences_tokens = [list(dict.fromkeys(tokens)) for tokens in sentences_tokens]

# Sentences left without any token are removed.
sentences_tokens = [tokens for tokens in sentences_tokens if tokens]

In [None]:
# Getting Spanish stopwords.
stopw = get_stopwords()

# Frozen set for constant-time membership tests over the whole corpus.
# Assumes get_stopwords() returns an iterable of stopword strings — TODO confirm.
stopw_lookup = frozenset(stopw)

# Stopwords are removed.
sentences_tokens = [[token for token in tokens if token not in stopw_lookup] for tokens in sentences_tokens]

# A sentence made only of stopwords is now an empty list. Drop it, otherwise it
# is written as a blank line in the tokens file.
sentences_tokens = [tokens for tokens in sentences_tokens if tokens]

In [None]:
# Save the preprocessed sentence tokens to a txt file that is used for the
# chapter 3 experiments: one sentence per line, tokens separated by commas.
# The encoding is set explicitly because the tokens contain accented Spanish
# characters and the platform default encoding is not UTF-8 everywhere; code
# that reads this file should open it with encoding="utf-8" as well.
# The `with` statement closes the file, so no explicit close() is needed.
with open("data/tokens/sentences_tokens.txt", 'w', encoding="utf-8") as file:
    for tokens in sentences_tokens:
        line = ','.join(map(str, tokens))
        file.write(line + '\n')

In [None]:
#####################################################################################################################
# Creation of spanish seeds file from the seeds extracted from https://aclanthology.org/2021.acl-long.148/ and
# from https://github.com/PLN-FaMAF/Bias-in-word-embeddings/blob/main/main_tutorial_bias_word_embedding.ipynb
#####################################################################################################################

In [None]:
# Path of the JSON file with the seeds gathered in
# https://aclanthology.org/2021.acl-long.148/
path_seeds = os.getcwd()+"/data/seeds/gathered_seeds.json"

# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open(path_seeds, encoding="utf-8") as f:
    json_data = json.load(f)

In [None]:
# As stated in the thesis (Chapter 3: Analysis of Results), two characteristics,
# gender and religion, have been taken into account. For each characteristic,
# certain keys (concepts) are employed to detect bias within the D and W spaces.

# Keys for gender.
keys_g = ["pleasant", "unpleasant", 'instruments', 'weapons','pleasantness','unpleasantness',
        'career', 'family', 'math 1', 'arts 1', 'science 1', 'arts 2', 
        'careers', 'depressed 1', 'physically ill', 'occupations', 'adjectives sensitive',  
        'profesiones_neutras', 'verbos', 'profesiones_colectivos', 'sustantivos_abstractos', 'adjetivos_neutros', 
        'temporary', 'permanent', 'pleasant 6', 'unpleasant 6', 'adjectives appearance','adjectives intelligence', 
        "adjectives otherization", 'adjectives princeton', 'clothing', 'sports', 'family words', 'career words',
        'attractive', 'ugliness', 'violence'
       ]

# Keys for religion.
# The two "low\/neutral" keys contain a literal backslash. It is written here as
# an escaped backslash ('\\/') because '\/' is not a valid Python escape
# sequence and triggers a SyntaxWarning on recent interpreters; the resulting
# string value is unchanged.
# NOTE(review): json.load turns the JSON escape "\/" into a plain "/", so these
# two keys (with the backslash) probably never match a key of the parsed seeds
# file — confirm against get_en_dict and the JSON data.
# NOTE(review): 'high competence' is listed twice; kept as in the original.
keys_r = ['profesiones_neutras', 'verbos', 'profesiones_colectivos', 'sustantivos_abstractos','adjetivos_neutros',
          "pleasant", 'unpleasant', 'instruments', 'weapons', 'violence', 
          'attractive', 'ugliness', 'positive_emotion', 'negative_emotion', 
          'high morality and low\\/neutral warmth','low\\/neutral and morality high warmth',
          "high competence",'careers', 'depressed 1', 'terrorism', 'sports', 'domestic_work', 'high competence',
           'math 1', 'arts 1', 'science 1', 'arts 2', 'christianity', 'islam', 'islam words', 'christianity words'
         ]

In [None]:
# Extract the English seed dictionary from the loaded JSON data, using the
# selected gender keys (keys_g) and religion keys (keys_r).
# get_en_dict comes from preprocessing_text; how it treats keys that are absent
# from json_data is not visible here — check the helper.
dict_en = get_en_dict(json_data, keys_g, keys_r)

In [None]:
# The values (seeds) associated with the keys are translated
# from English to Spanish.
# NOTE(review): translate_dict presumably calls an external translation service
# (the notes in this notebook mention a query length limit), so this cell may
# need network access and may not be reproducible offline — confirm in
# preprocessing_text.
dict_es = translate_dict(dict_en)

In [None]:
# Some preprocessing actions are done by hand. The actions are:
#  1) Some lists are not translated because of the query length limit. These are translated by hand.
#     One example is the list of seeds associated with the key 'careers'. These seeds, like others,
#     are translated by hand.
#  2) Some keys are removed because they are associated with gender-specific terms or because they are
#     not suitable for measuring religious bias. This is the investigator's choice: another investigator
#     could decide to maintain these keys. It is recommended to carry out this selection of keys by hand
#     to better check which terms are best for measuring the bias. The normal command for removing
#     a key and its seeds is:  dict_es.pop(key)
#  3) Some seeds are not useful for detecting bias. These seeds are removed from their key with the
#     command: dict_es[key].remove(seed_in_value_list).

In [None]:
# Hand-written Spanish seed lists that are added to dict_es. They are taken from
# https://github.com/PLN-FaMAF/Bias-in-word-embeddings/blob/main/main_tutorial_bias_word_embedding.ipynb
# Each variable is a one-entry dict mapping a concept key to its list of seeds.

# Gender-neutral professions.
pn = {
    "profesiones_neutras": [
        "chofer", "columnista", "publicista", "naturista", "asistente", "taxista",
        "psiquiatra", "policía", "dentista", "florista", "docente", "periodista",
        "electricista", "economista", "atleta", "terapeuta", "piloto", "modelo",
        "estudiante", "comerciante", "chef", "cantante", "militar",
    ]
}

# Verbs.
ver = {
    "verbos": [
        "comprar", "vender", "dormir", "despertar", "soñar", "llorar", "gritar",
        "hablar", "preguntar", "pensar", "inventar", "bailar", "cantar", "cocinar",
        "sentir", "bordar", "tejer", "coser", "razonar", "argumentar", "cursar",
        "programar",
    ]
}

# Professions named as collective fields.
pc = {
    "profesiones_colectivos": [
        "ingeniería", "arquitectura", "psicología", "enfermería", "medicina",
        "carpintería", "presidencia", "biología", "cocina", "docencia", "abogacía",
        "cirugía", "neurocirugía", "actuación", "música", "canto",
    ]
}

# Abstract nouns.
sa = {
    "sustantivos_abstractos": [
        "inteligencia", "belleza", "humildad", "sabiduría", "poder", "cariño",
        "bondad", "ambición", "delicadeza", "amabilidad", "paciencia",
        "popularidad", "fama", "generosidad", "honestidad", "canto", "maldad",
        "soberbia", "violencia",
    ]
}

# Gender-neutral adjectives.
an = {
    "adjetivos_neutros": [
        "inteligente", "humilde", "amable", "dulce", "audaz", "paciente", "popular",
        "flexible", "grande", "brillante", "inocente", "fácil", "agradable",
        "infeliz", "capaz", "difícil", "temperamental",
    ]
}

# Professions in their feminine form.
pfr = {
    "profesiones_female": [
        "arquitecta", "ingeniera", "diseñadora", "doctora", "abogada", "profesora",
        "contadora", "científica", "bióloga", "cocinera", "psicóloga", "enfermera",
        "obrera", "actriz",
    ]
}

# Professions in their masculine form.
pmr = {
    "profesiones_male": [
        "arquitecto", "ingeniero", "diseñador", "doctor", "abogado", "profesor",
        "contador", "científico", "biólogo", "cocinero", "psicólogo", "enfermero",
        "obrero", "actor",
    ]
}

# Words that define the feminine and masculine spaces.
espacio_f = {
    "espacio_f": [
        "mujer", "ella", "chica", "niña", "esposa", "señora", "hermana", "madre",
        "abuela",
    ]
}
espacio_m = {
    "espacio_m": [
        "hombre", "él", "chico", "niño", "esposo", "señor", "hermano", "padre",
        "abuelo",
    ]
}

# All the one-entry dicts above, in the order in which they are merged into dict_es.
l_d = [pn, ver, pc, sa, an, pfr, pmr, espacio_f, espacio_m]

In [None]:
# Merge every hand-written seed group from l_d into dict_es.
# An entry whose key already exists in dict_es replaces the existing one.
for seed_group in l_d:
    dict_es.update(seed_group)

In [None]:
# The dictionary is saved as JSON in the current working directory.
# (The file used for the experiment is in the data/seeds folder.)
# The `with` statement closes the file even if json.dump raises; the previous
# open()/close() pair left the handle open in that case.
with open("dict_PMI_WE.json", "w", encoding="utf-8") as file:
    json.dump(dict_es, file)