# INFORMATION RETRIEVAL PROJECT

---
## Gender stereotypes in parliamentary speeches

In word embedding models, each word is assigned a high-dimensional vector such that the geometry of the vectors captures semantic relations between the words – e.g. vectors that are closer together have been shown to correspond to more similar words. Recent work in machine learning demonstrates that word embeddings also capture common stereotypes, as these stereotypes are likely to be present, even if subtly, in the large corpora of training texts. These stereotypes are automatically learned by the embedding algorithm and could be problematic in many contexts if the embedding is then used for sensitive applications such as search rankings, product recommendations, or translations. An important direction of research is the development of algorithms to debias word embeddings.

This project aims to use word embeddings to study historical trends – specifically, trends in gender and ethnic stereotypes in Italian parliamentary speeches from 1948 to 2020.

In [1]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import os
from collections import defaultdict, OrderedDict
from tqdm.auto import tqdm

from INFORET_project import load_embed_model
# import matplotlib.pylab as plt
# Show up to 100 rows and 100 columns when pandas displays a DataFrame.
pd.set_option("display.max_rows", 100, "display.max_columns", 100)

In [2]:
from INFORET_project import YEARS

In [3]:
# Bare expression: the notebook echoes the value of YEARS as the cell output.
YEARS

['1948_1968', '1968_1985', '1985_2000', '2000_2020']

In [4]:
# Load the embeddings for the first period in YEARS; later cells read model.wv.
model = load_embed_model(YEARS[0])

In [7]:
# Rebind `model` to the embeddings for the second period in YEARS
# (whatever `model` referred to before is replaced).
model = load_embed_model(YEARS[1])

In [8]:
# Rebind `model` to the embeddings for the third period in YEARS
# (whatever `model` referred to before is replaced).
model = load_embed_model(YEARS[2])

In [16]:
# Rebind `model` to the embeddings for the fourth period in YEARS
# (whatever `model` referred to before is replaced).
model = load_embed_model(YEARS[3])

# 5. DEBIASING WORDS AND ASSESSING EAB

In [6]:
from INFORET_project import Debias
from INFORET_project.data import gendered_neutral_words

In [6]:
# Build a Debias helper over the currently loaded vectors, using the
# 'uomo'/'donna' pair and the 'single' direction-identification option,
# then hard-debias with the project's equality sets.
d = Debias(
    model.wv,
    definitional_pairs=['uomo', 'donna'],
    identify_direction='single',
)
debiased_model_single = d.hard_debias(
    equality_sets=gendered_neutral_words['equality_sets'],
)

In [7]:
# Same procedure as for the other direction options, here with
# identify_direction='sum'; `d` is rebound to the new Debias object.
d = Debias(
    model.wv,
    definitional_pairs=['uomo', 'donna'],
    identify_direction='sum',
)
debiased_model_sum = d.hard_debias(
    equality_sets=gendered_neutral_words['equality_sets'],
)

In [7]:
# Same procedure as for the other direction options, here with
# identify_direction='pca'; `d` is rebound to the new Debias object.
d = Debias(
    model.wv,
    definitional_pairs=['uomo', 'donna'],
    identify_direction='pca',
)
debiased_model_pca = d.hard_debias(
    equality_sets=gendered_neutral_words['equality_sets'],
)

---

In [5]:
from INFORET_project.data import gendered_neutral_words
from INFORET_project import EAB
from INFORET_project import WORDS_GROUP

Print only the top-biased word of each word group, to check whether the bias is reduced when the debiased models are used.

In [7]:
# Averaged EAB version.
#
# For every period in YEARS and every entry of WORDS_GROUP, print the first
# item returned by EAB.get_top_bias as (word, |score_0 - score_1|): once for
# the vectors in model.wv and once for each hard-debiased variant.
#
# NOTE(review): Debias/hard_debias are re-run for every word group although
# their arguments do not depend on `word`. Hoisting them to the per-year level
# is only safe if hard_debias leaves model.wv untouched -- confirm first.

eab_options = {'use_avg_gender': True, 'type_most_similar': 'cosmul'}

for year in tqdm(YEARS, desc='Passing years'):
    print(f'\nYEAR: {year}')
    model = load_embed_model(year)

    for word in tqdm(WORDS_GROUP, desc='Passing group of words'):
        print(f'\n\nGROUP OF WORDS: {word}\n')

        # Score model.wv directly (no Debias step in this branch).
        print('\nMODEL: Not debiased model')
        score = EAB(model.wv, **eab_options)
        top_bias = score.get_top_bias(pred_positive_word=word, verbose=False)[0]
        pair_scores = top_bias[1]
        print((top_bias[0], abs(pair_scores[0] - pair_scores[1])))

        # Same measurement after hard-debiasing with each direction option.
        for deb_type in ['single', 'sum', 'pca']:
            d = Debias(
                model.wv,
                definitional_pairs=['uomo', 'donna'],
                identify_direction=deb_type,
            )
            debiased_model = d.hard_debias(
                equality_sets=gendered_neutral_words['equality_sets'],
            )

            score = EAB(debiased_model, **eab_options)
            print(f'\nMODEL: Debiased model {deb_type}')
            top_bias = score.get_top_bias(pred_positive_word=word, verbose=False)[0]
            pair_scores = top_bias[1]
            print((top_bias[0], abs(pair_scores[0] - pair_scores[1])))


HBox(children=(HTML(value='Passing years'), FloatProgress(value=0.0, max=4.0), HTML(value='')))


YEAR: 1948_1968


HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('grasso', 0.06127673238515857)

MODEL: Debiased model single
('splendido', 0.331249711662531)

MODEL: Debiased model sum
('bello', 0.021406903862953186)

MODEL: Debiased model pca
('brutto', 0.3139020971953869)


GROUP OF WORDS: family


MODEL: Not debiased model
('bambino', 0.04381868690252305)

MODEL: Debiased model single
('genitore', 0.3135473936796188)

MODEL: Debiased model sum
('matrimonio', 0.03230702877044678)

MODEL: Debiased model pca
('bambino', 0.2875015281140805)


GROUP OF WORDS: career


MODEL: Not debiased model
('capo', 0.0394399270415306)

MODEL: Debiased model single
('capo', 0.37785068228840824)

MODEL: Debiased model sum
('potere', 0.004791694879531849)

MODEL: Debiased model pca
('capo', 0.35457774773240086)


GROUP OF WORDS: rage


MODEL: Not debiased model
('aggressivo', 0.0696475218981504)

MODEL: Debiased model single
('cattivo', 0.3821953348815441)

MODEL: Debiased model sum
('crudele', 0.01668818

HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('sensuale', 0.03292045444250108)

MODEL: Debiased model single
('grasso', 0.3629248946905136)

MODEL: Debiased model sum
('bello', 0.01601635217666625)

MODEL: Debiased model pca
('grasso', 0.004736405611038219)


GROUP OF WORDS: family


MODEL: Not debiased model
('bambino', 0.05103850066661836)

MODEL: Debiased model single
('figlio', 0.3157605163753032)

MODEL: Debiased model sum
('bambino', 0.03485752195119857)

MODEL: Debiased model pca
('famiglia', 0.020816104114055656)


GROUP OF WORDS: career


MODEL: Not debiased model
('presidente', 0.04962961599230764)

MODEL: Debiased model single
('professionale', 0.3571805231273174)

MODEL: Debiased model sum
('onorevole', 0.023849748820066463)

MODEL: Debiased model pca
('onorevole', 0.017385783791542098)


GROUP OF WORDS: rage


MODEL: Not debiased model
('intollerante', 0.0602129846811294)

MODEL: Debiased model single
('crudele', 0.3721710965037346)

MODEL: Debiased model s

HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('frivolo', 0.06346932649612425)

MODEL: Debiased model single
('grasso', 0.3786844291724265)

MODEL: Debiased model sum
('grasso', 0.007533681392669689)

MODEL: Debiased model pca
('bello', 0.2999991778284311)


GROUP OF WORDS: family


MODEL: Not debiased model
('bambino', 0.025714072585105885)

MODEL: Debiased model single
('famiglia', 0.3828860007226468)

MODEL: Debiased model sum
('bambino', 0.016521964967250835)

MODEL: Debiased model pca
('famiglia', 0.32024010941386233)


GROUP OF WORDS: career


MODEL: Not debiased model
('capo', 0.012652518227696419)

MODEL: Debiased model single
('professionale', 0.37201162241399294)

MODEL: Debiased model sum
('professionale', 0.011811992526054416)

MODEL: Debiased model pca
('professionale', 0.31129593886435036)


GROUP OF WORDS: rage


MODEL: Not debiased model
('cattivo', 0.04554537683725357)

MODEL: Debiased model single
('brutale', 0.40025616548955445)

MODEL: Debiased model 

HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('brutto', 0.03416837677359583)

MODEL: Debiased model single
('bello', 0.3816258057486266)

MODEL: Debiased model sum
('bello', 0.04264675229787823)

MODEL: Debiased model pca
('splendido', 0.6485581190325319)


GROUP OF WORDS: family


MODEL: Not debiased model
('bambino', 0.03165259659290315)

MODEL: Debiased model single
('famiglia', 0.40286856140010063)

MODEL: Debiased model sum
('matrimonio', 0.052633196115493774)

MODEL: Debiased model pca
('matrimonio', 0.668773240223527)


GROUP OF WORDS: career


MODEL: Not debiased model
('capo', 0.0712145067751408)

MODEL: Debiased model single
('professionale', 0.3887537821661681)

MODEL: Debiased model sum
('presidente', 0.004960994422435749)

MODEL: Debiased model pca
('onorevole', 0.6859465164132417)


GROUP OF WORDS: rage


MODEL: Not debiased model
('intollerante', 0.058555851876735676)

MODEL: Debiased model single
('intollerante', 0.4310970946680755)

MODEL: Debiased mode

In [8]:
# Not averaged EAB version.
#
# Same report as the averaged variant, but EAB is given the explicit gender
# words 'donna' and 'uomo' instead of use_avg_gender=True. For every period in
# YEARS and every entry of WORDS_GROUP, print the first item returned by
# EAB.get_top_bias as (word, |score_0 - score_1|): once for the vectors in
# model.wv and once for each hard-debiased variant.
#
# NOTE(review): Debias/hard_debias are re-run for every word group although
# their arguments do not depend on `word`. Hoisting them to the per-year level
# is only safe if hard_debias leaves model.wv untouched -- confirm first.

eab_options = {
    'gender_female': 'donna',
    'gender_male': 'uomo',
    'type_most_similar': 'cosmul',
}

for year in tqdm(YEARS, desc='Passing years'):
    print(f'\nYEAR: {year}')
    model = load_embed_model(year)

    for word in tqdm(WORDS_GROUP, desc='Passing group of words'):
        print(f'\n\nGROUP OF WORDS: {word}\n')

        # Score model.wv directly (no Debias step in this branch).
        print('\nMODEL: Not debiased model')
        score = EAB(model.wv, **eab_options)
        top_bias = score.get_top_bias(pred_positive_word=word, verbose=False)[0]
        pair_scores = top_bias[1]
        print((top_bias[0], abs(pair_scores[0] - pair_scores[1])))

        # Same measurement after hard-debiasing with each direction option.
        for deb_type in ['single', 'sum', 'pca']:
            d = Debias(
                model.wv,
                definitional_pairs=['uomo', 'donna'],
                identify_direction=deb_type,
            )
            debiased_model = d.hard_debias(
                equality_sets=gendered_neutral_words['equality_sets'],
            )

            score = EAB(debiased_model, **eab_options)
            print(f'\nMODEL: Debiased model {deb_type}')
            top_bias = score.get_top_bias(pred_positive_word=word, verbose=False)[0]
            pair_scores = top_bias[1]
            print((top_bias[0], abs(pair_scores[0] - pair_scores[1])))


HBox(children=(HTML(value='Passing years'), FloatProgress(value=0.0, max=4.0), HTML(value='')))


YEAR: 1948_1968


HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('brutto', 0.02924209404736755)

MODEL: Debiased model single
('rozzo', 0.017551195248961426)

MODEL: Debiased model sum
('piacevole', 0.02327295523136863)

MODEL: Debiased model pca
('rozzo', 0.03575541973114016)


GROUP OF WORDS: family


MODEL: Not debiased model
('accudire', 0.10531398877501488)

MODEL: Debiased model single
('accudire', 0.0012819427996872967)

MODEL: Debiased model sum
('accudire', 0.01660062503069648)

MODEL: Debiased model pca
('accudire', 0.03742551654577242)


GROUP OF WORDS: career


MODEL: Not debiased model
('ambizione', 0.033331044763326656)

MODEL: Debiased model single
('denaro', 0.01843912042677398)

MODEL: Debiased model sum
('potere', 0.02327295523136863)

MODEL: Debiased model pca
('capo', 0.03575541973114016)


GROUP OF WORDS: rage


MODEL: Not debiased model
('crudele', 0.02310821935534474)

MODEL: Debiased model single
('intollerante', 0.017551195248961426)

MODEL: Debiased model sum
('a

HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('brutto', 0.05616789534687999)

MODEL: Debiased model single
('magro', 0.010845014639198736)

MODEL: Debiased model sum
('magro', 0.025574414059519768)

MODEL: Debiased model pca
('piacevole', 0.05172188170254233)


GROUP OF WORDS: family


MODEL: Not debiased model
('bambino', 0.16640051901340486)

MODEL: Debiased model single
('matrimonio', 8.754432179181038e-09)

MODEL: Debiased model sum
('matrimonio', 0.017241074144840107)

MODEL: Debiased model pca
('matrimonio', 0.026900038495659917)


GROUP OF WORDS: career


MODEL: Not debiased model
('capo', 0.0022651735227555148)

MODEL: Debiased model single
('potere', 0.009640903957188196)

MODEL: Debiased model sum
('denaro', 0.025448774918913952)

MODEL: Debiased model pca
('ambizione', 0.05262226797640335)


GROUP OF WORDS: rage


MODEL: Not debiased model
('odioso', 0.034859059005975745)

MODEL: Debiased model single
('crudele', 0.010845014639198736)

MODEL: Debiased model s

HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('splendido', 0.05066542923450465)

MODEL: Debiased model single
('rozzo', 0.00570427570492027)

MODEL: Debiased model sum
('rozzo', 0.01831345930695527)

MODEL: Debiased model pca
('rozzo', 0.038520068116486006)


GROUP OF WORDS: family


MODEL: Not debiased model
('matrimonio', 0.07286201417446136)

MODEL: Debiased model single
('matrimonio', 3.5949051513384234e-08)

MODEL: Debiased model sum
('matrimonio', 0.010125575959682442)

MODEL: Debiased model pca
('matrimonio', 0.01236168798059234)


GROUP OF WORDS: career


MODEL: Not debiased model
('capo', 0.1179308181628585)

MODEL: Debiased model single
('onorevole', 0.00228290464729064)

MODEL: Debiased model sum
('onorevole', 0.01831345930695527)

MODEL: Debiased model pca
('lavoro', 0.04110869318246846)


GROUP OF WORDS: rage


MODEL: Not debiased model
('brutale', 0.015967693179845782)

MODEL: Debiased model single
('aggressivo', 0.009836094267666295)

MODEL: Debiased mode

HBox(children=(HTML(value='Passing group of words'), FloatProgress(value=0.0, max=12.0), HTML(value='')))



GROUP OF WORDS: adj_appearence


MODEL: Not debiased model
('splendido', 0.006678805407136668)

MODEL: Debiased model single
('frivolo', 0.016938852146268002)

MODEL: Debiased model sum
('rozzo', 0.0023882292211055978)

MODEL: Debiased model pca
('rozzo', 0.005445801839232378)


GROUP OF WORDS: family


MODEL: Not debiased model
('accudire', 0.12301618522033098)

MODEL: Debiased model single
('matrimonio', 0.021910957992076985)

MODEL: Debiased model sum
('matrimonio', 0.002913539856672309)

MODEL: Debiased model pca
('matrimonio', 0.00015134401619432136)


GROUP OF WORDS: career


MODEL: Not debiased model
('ambizione', 0.0026756316423415805)

MODEL: Debiased model single
('potere', 0.01918617561459557)

MODEL: Debiased model sum
('capo', 0.0023882292211055978)

MODEL: Debiased model pca
('capo', 0.005445801839232378)


GROUP OF WORDS: rage


MODEL: Not debiased model
('odioso', 0.15286566242575642)

MODEL: Debiased model single
('aggressivo', 0.014013432338833898)

MODEL: Debiased 