### Imports

In [1]:
# Importem llibreries necessàries
import pandas as pd

# Import custom functions
import sys
sys.path.insert(0, "code/")
from bias_utils import setup_models, uni_tokenize

### Setup models AINA (català)

In [2]:
# Identifier of the AINA Catalan RoBERTa checkpoint used for the whole notebook.
AINA_MODEL_ID = "projecte-aina/roberta-base-ca-v2"

# setup_models hands back the tokenizer and the masked-LM model as a pair.
tokenizer, model = setup_models(AINA_MODEL_ID)

Creating AutoTokenizer.
Creating AutoModelForMaskedLM.
Describe model:
------------------------------------------------------------
Model type => RobertaForMaskedLM
Token type => RobertaTokenizerFast
MASK_TOKEN => <mask>
MASK_ID    => 4 



### Read Wikipedia data


In [17]:
# Some manual modifications (disabled: every statement below is commented out)
# ================================================================================================
# Each line overwrites the masculine/feminine job forms for the rows whose Catalan label
# (or masculine form) matches the given pattern.
# NOTE(review): this cell carries execution count 17 yet sits above the cell that defines
# df_raw and wikidata_raw_file; if re-enabled in this position it would fail on a fresh
# top-to-bottom run. Move it below the data-loading cell before uncommenting.
# df_raw.loc[df_raw.label_ca.str.startswith("músic"), ["job_masc", "job_fem"]] = ["músic", "música"]
# df_raw.loc[df_raw.label_ca.str.startswith("metg"), ["job_masc", "job_fem"]] = ["metge", "metgessa"]
# df_raw.loc[df_raw.label_ca.str.startswith("teatrò"), ["job_masc", "job_fem"]] = "teatròleg", "teatròloga"
# df_raw.loc[df_raw.label_ca.str.contains("esquiador"), ["job_masc", "job_fem"]]= "esquiador", "esquiadora"
# df_raw.loc[df_raw.label_ca.str.contains("odontòleg"), ["job_masc", "job_fem"]] = "dentista"
# df_raw.loc[df_raw.job_masc == "pastor", "job_fem"] = "pastora"
# df_raw.loc[df_raw.job_masc == "fuster", "job_fem"] = "fustera"

# Save the changes back to the original file => write Excel
# ================================================================================================
# df_raw.to_excel(wikidata_raw_file, index=False)

In [3]:
# Folder holding the profession spreadsheets, and the raw input workbook inside it.
DATA = "data/wikipedia/"
wikidata_raw_file = f"{DATA}wiki_top_10500_professions.xlsx"

# Load the raw table and replace every missing value with "" so that later cells
# can detect an absent label with a plain == "" comparison.
# Note: an empty job_masc stays empty here; it is not backfilled from label_ca.
df_raw = pd.read_excel(wikidata_raw_file).fillna("")

# Preview the first rows; the cell's own output is the (rows, columns) shape.
display(df_raw.head())
df_raw.shape

Unnamed: 0,profession,female,male,total,job_masc,label_ca,job_fem,altLabel_ca,label_en,altLabel_en,ratio_female,first_word
0,Q82955,64060,285339,349399,polític,polític,política,política,politician,"political figure, pol, political leader, polit.",0.183343,polític
1,Q937857,45535,259910,305445,futbolista,futbolista,futbolista,jugador de futbol,association football player,"football player, footballer, soccer player, as...",0.149078,futbolista
2,Q33999,88965,110810,199775,actor,actor,actriu,actriu,actor,"actress, actors, actresses",0.445326,actor
3,Q36180,58064,126111,184175,escriptor,escriptor,escriptora,escriptora,writer,"author, authors, writers",0.315265,escriptor
4,Q1622272,23634,102240,125874,professor,professor d'universitat,professora,"professora d'universitat, professor universitari",university teacher,"professor, university professor, lecturer, tut...",0.187759,professor


(10553, 12)

#### wiki_top_100_professions.xlsx

In [4]:
# Columns written to the top-100 workbook, in export order.
cols = ['profession', 'female', 'male', 'total', 'ratio_female', 'label_ca', 'job_masc', 'job_fem']

# Order all professions by head count (largest first), keep the export columns,
# then take the first 100 rows.
df_100 = (
    df_raw
    .sort_values(by="total", ascending=False)
    .loc[:, cols]
    .head(100)
)

display(df_100.head(10))

# Persist the selection next to the raw workbook.
file_100 = f"{DATA}wiki_top_100_professions.xlsx"
df_100.to_excel(file_100, index=False)

Unnamed: 0,profession,female,male,total,ratio_female,label_ca,job_masc,job_fem
0,Q82955,64060,285339,349399,0.183343,polític,polític,política
1,Q937857,45535,259910,305445,0.149078,futbolista,futbolista,futbolista
2,Q33999,88965,110810,199775,0.445326,actor,actor,actriu
3,Q36180,58064,126111,184175,0.315265,escriptor,escriptor,escriptora
4,Q1622272,23634,102240,125874,0.187759,professor d'universitat,professor,professora
5,Q1930187,26197,75962,102159,0.256434,periodista,periodista,periodista
6,Q1028181,22803,65989,88792,0.256814,pintor,pintor,pintora
7,Q177220,38651,46978,85629,0.451377,cantant,cantant,cantant
8,Q10800557,27955,31667,59622,0.468871,actor de cinema,actor,actriu
9,Q36834,7772,51001,58773,0.132238,compositor,compositor,compositora


#### First word (label ca)

In [6]:
# Derive "first_word": the first whitespace-separated token of the Catalan label.
# Only rows with a non-empty label_ca are updated; all other rows keep their current value.
# The filter deliberately does NOT require job_fem to be empty: a narrower variant with that
# extra condition was assigned to the same name and immediately overwritten, so it never applied.
filtre = df_raw.label_ca != ""

# Vectorised split on whitespace; a label made only of whitespace produces a missing
# value instead of raising IndexError as an element-wise split()[0] would.
df_raw.loc[filtre, "first_word"] = df_raw.loc[filtre, "label_ca"].str.split().str[0]

### Clean data

* 1. Preparació manual de noms femenins i simplificació de termes compostos (actor de cinema => actor/actriu)
* 2. Filtrem columnes i valors (ometem professions sense valors "fem")
* 3. Agrupem valors del mateix grup de professió
* 4. Eliminació de paraules fora del vocabulari



In [7]:
# 1. Keep rows that have a feminine form and more than 10 people in total
# ================================================================================================
cols = ['profession', 'female', 'male', 'total', 'job_masc', 'label_ca', 'job_fem']
has_job_fem = df_raw.job_fem != ""
above_min_total = df_raw.total > 10
df_jobs = df_raw.loc[has_job_fem & above_min_total, cols]
# Note: rows whose job_masc is empty are kept as they are (no backfill from label_ca).


# 2. Collapse rows that share the same masculine/feminine pair
# ================================================================================================
# The Wikidata id of the first row in each group is kept; the head counts are summed.
df_jobs = (
    df_jobs
    .groupby(["job_masc", "job_fem"])
    .agg(
        profession=("profession", "first"),
        female=("female", "sum"),
        male=("male", "sum"),
        total=("total", "sum"),
    )
    .reset_index()
)


# 3. Share of women and men within each grouped profession
# ================================================================================================
df_jobs = df_jobs.assign(
    ratio_fem=df_jobs["female"] / df_jobs["total"],
    ratio_masc=df_jobs["male"] / df_jobs["total"],
)


# 4. Token ids for both word forms
# ================================================================================================
def _unitoken_id(word):
    """Look up the token id of `word` via uni_tokenize, passing -1 as the fallback value
    (per the original note, words that need several tokens end up as -1)."""
    return uni_tokenize(tokenizer, word, -1)


for gender_col in ("job_masc", "job_fem"):
    df_jobs[gender_col + "_id"] = df_jobs[gender_col].apply(_unitoken_id)


# Preview the result
df_jobs.head(10)

Unnamed: 0,job_masc,job_fem,profession,female,male,total,ratio_fem,ratio_masc,job_masc_id,job_fem_id
0,acadèmic,acadèmica,Q3400985,3971,9154,13125,0.302552,0.697448,8333,11989
1,activista,activista,Q15253558,7614,7210,14824,0.513627,0.486373,16738,16738
2,actor,actriu,Q33999,153097,198409,351506,0.435546,0.564454,10740,13751
3,administrador,administradora,Q21281706,334,1039,1373,0.243263,0.756737,29509,-1
4,administratiu,administrativa,Q16532929,113,203,316,0.357595,0.642405,6718,6894
5,advocat,advocada,Q40348,7161,35609,42770,0.16743,0.83257,11163,32308
6,agent,agent,Q519076,386,1343,1729,0.22325,0.77675,11990,11990
7,agricultor,agricultora,Q131512,980,4464,5444,0.180015,0.819985,-1,-1
8,alcalde,alcaldessa,Q30185,117,1077,1194,0.09799,0.90201,7084,18383
9,analista,analista,Q485178,171,561,732,0.233607,0.766393,41652,41652


#### Filtrem paraules multi-token

In [9]:
# Boolean masks: True where the word form received a positive token id.
# The id columns were filled with -1 for words that are not a single token,
# so those rows fail the > 0 check.
fem_unitoken = df_jobs.job_fem_id>0
masc_unitoken = df_jobs.job_masc_id>0
both_unitoken = masc_unitoken & fem_unitoken

# Professions usable for the analysis: both forms are single tokens.
df_jobs_clean = df_jobs[both_unitoken]


# Report how many professions were dropped and why. Counts are taken directly from
# the masks instead of materialising a filtered frame for each line.
n_jobs = len(df_jobs)
print("Professions descartades:", int((~both_unitoken).sum()), "de", n_jobs)
print("Professions femení, no reconegudes:", int((masc_unitoken & ~fem_unitoken).sum()), "de", n_jobs)
print("Professions masculí, no reconegudes:", int((~masc_unitoken & fem_unitoken).sum()), "de", n_jobs)
print("Professions (fem i masc) no reconegudes:", int((~masc_unitoken & ~fem_unitoken).sum()), "de", n_jobs)


# Save the cleaned file
# ====================================================================================
# DATA already ends with "/", so the file name is appended without a leading slash
# (the previous "/wiki_..." form produced a doubled separator in the path).
file_cleaned = DATA + "wiki_professions_cleaned.xlsx"
df_jobs_clean.to_excel(file_cleaned, index=False)

Professions descartades: 103 de 210
Professions femení, no reconegudes: 38 de 210
Professions masculí, no reconegudes: 1 de 210
Professions (fem i masc) no reconegudes: 64 de 210


In [10]:
# Show the cleaned professions ordered from the smallest to the largest head count.
df_jobs_clean.sort_values(by="total", ascending=True)

Unnamed: 0,job_masc,job_fem,profession,female,male,total,ratio_fem,ratio_masc,job_masc_id,job_fem_id
71,doctor,doctora,Q25141651,20,5,25,0.800000,0.200000,5136,18530
63,detectiu,detectiu,Q1058617,6,88,94,0.063830,0.936170,34778,34778
193,taxista,taxista,Q2961580,8,89,97,0.082474,0.917526,38757,38757
40,carter,cartera,Q2180295,16,92,108,0.148148,0.851852,47082,15924
37,cambrer,cambrera,Q157195,52,60,112,0.464286,0.535714,26840,41941
...,...,...,...,...,...,...,...,...,...,...
85,escriptor,escriptora,Q36180,68101,147372,215473,0.316054,0.683946,8319,19840
98,futbolista,futbolista,Q937857,45536,259998,305534,0.149037,0.850963,14619,14619
87,esportista,esportista,Q3665646,52709,273905,326614,0.161380,0.838620,29430,29430
169,polític,política,Q82955,64347,286607,350954,0.183349,0.816651,2054,1274


In [11]:
# Row counts after each cleaning stage
# ====================================================================================
over_10 = df_raw.total > 10          # professions with more than 10 people
labelled = df_raw.job_fem != ""      # professions that have a feminine form

# (label, count) pairs, printed in pipeline order.
stage_counts = [
    ("Arxiu inicial:     ", len(df_raw)),
    ("Total major de 10: ", int(over_10.sum())),
    ("Dades amb etiqueta:", int((labelled & over_10).sum())),
    ("Dades agrupades:   ", len(df_jobs)),
    ("Dades netejades:   ", len(df_jobs_clean)),
]
for stage_label, stage_count in stage_counts:
    print(stage_label, stage_count)

Arxiu inicial:      10553
Total major de 10:  3262
Dades amb etiqueta: 1140
Dades agrupades:    210
Dades netejades:    107


In [12]:
# Export the professions that were discarded because at least one of the two
# word forms did not get a positive token id.
file_multitoken = "data/professions_multitoken.xlsx"
cols = [ 'profession', 'job_masc', 'job_fem', 'job_masc_id', 'job_fem_id']

# Human-readable headers for the exported sheet, keyed by the internal column name.
export_headers = {
    'profession': 'Wikidata ID',
    'job_masc': 'Masculí',
    'job_fem': 'Femení',
    'job_masc_id': 'Token ID (masc)',
    'job_fem_id': 'Token ID (fem)',
}

# "not (masc and fem)" is the same selection as "not masc or not fem".
discarded = ~(masc_unitoken & fem_unitoken)
df_multitoken = df_jobs.loc[discarded, cols].rename(columns=export_headers)

df_multitoken.to_excel(file_multitoken, index=False)

### Professions del mateix grup

In [13]:
# For every masculine/feminine pair, count how many Wikidata professions map to it
# and show the 15 pairs with the most members (ascending order).
cols = ["profession", "job_masc","job_fem"]
(
    df_raw.loc[df_raw.job_fem != "", cols]
    .groupby(cols[1:])
    .count()
    .sort_values("profession")
    .tail(15)
    .reset_index()
)

Unnamed: 0,job_masc,job_fem,profession
0,dissenyador,dissenyadora,25
1,cantant,cantant,25
2,diputat,diputada,27
3,mestre,mestra,27
4,ministre,ministra,27
5,músic,música,30
6,escriptor,escriptora,31
7,esportista,esportista,31
8,activista,activista,35
9,entrenador,entrenadora,40


In [14]:
# List every raw profession that was mapped to the masculine form "escriptor".
cols = ['label_ca', 'profession', 'female', 'male', 'total']
df_raw.loc[df_raw.job_masc == "escriptor", cols]

Unnamed: 0,label_ca,profession,female,male,total
3,escriptor,Q36180,58064,126111,184175
109,escriptor de literatura infantil,Q4853732,4182,3527,7709
123,escriptor de no-ficció,Q15980158,1360,4960,6320
142,assagista,Q11774202,1360,3766,5126
161,escriptor de ciència-ficció,Q18844224,972,3529,4501
202,prosista,Q12144794,685,2471,3156
356,escriptor de contes,Q15949613,548,728,1276
499,escriptor de gènere policíac,Q10297252,193,527,720
746,escriptor dialectal,Q61043360,129,248,377
807,escriptor d'himnes,Q13424456,82,226,308
