In [1]:
import pandas as pd
from IPython.display import display, Markdown

from data_processing import get_correct_wrong_pred_df, get_category_gender_partition, get_false_preds, get_subcategories_count_per_run, most_common

## Data

In [2]:
nlexique = pd.read_csv('../data/nlexique.csv')
nlexique = nlexique[['lexeme', 'sg', 'gen']].rename(columns={'sg': 'phon'}).dropna().reset_index(drop=True)
nlexique

Unnamed: 0,lexeme,phon,gen
0,à-côté,akOte,m
1,à-coup,aku,m
2,à-peu-près,apØpʁɛ,m
3,à-pic,apik,m
4,à-plat,apla,m
...,...,...,...
30999,zurichois,zyʁikwa,m
31000,zydeco,zidəko,f
31001,zygoma,zigOma,m
31002,zygote,zigɔt,m


In [3]:
echantinom = pd.read_csv('../data/Echantinom-full-20210902.csv')
echantinom = echantinom[~echantinom['lemma'].isin(nlexique[nlexique['gen'] == 'b']['lexeme'])] #[['lemma', 'phon', 'gen']]
echantinom

Unnamed: 0,lemma,gen,phon,freq_lex_books,freq_lex_subtitles,freq_frcow,last_process_broad,last_process_narrow,prefix,compound,...,autonomous_base,base_stem_phon,sfx_allomorph,der_stem_phon,edit_distance,pattern,pattern_tf,pattern_rel_tf,base_der_sim,offset_sim
0,berlingue,m,bɛʁ.lɛ̃g,0.34,0.00,34,nonconcat,apocope,0,0,...,,,,,,,,,,
1,corton,m,kɔʁ.tɔ̃,0.27,0.03,398,suffix,suffix,0,0,...,True,kuʁ,ɔ̃,kɔʁt,2,_u_~_ɔ_tɔ̃,1,0.015625,0.222162783145905,0.158108526129264
2,dabuche,f,da.byʃ,0.54,0.00,3,suffix,suffix,0,0,...,True,UNKNOWN,yʃ,dab,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
3,faf,m,faf,0.88,0.07,3422,nonconcat,apocope,0,0,...,,,,,,,,,,
4,gail,f,gaj,0.61,0.00,2471,simplex,native_simplex,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,palu,m,pa.ly,0.14,0.80,1431,nonconcat,apocope,0,0,...,,,,,,,,,,
4996,talc,m,talk,1.49,1.40,2775,simplex,borrowing,0,0,...,,,,,,,,,,
4997,sauvetage,m,sO.və.taʒ,3.72,8.32,60875,suffix,suffix,0,0,...,True,sOv,aʒ,sOvət,2,_~_ətaʒ,1,0.011494252873563,0.252674728631973,0.614848479997584
4998,plaid,m,plɛd,1.15,0.34,2889,simplex,borrowing,0,0,...,,,,,,,,,,


In [4]:
orth_preds_x10 = pd.read_csv('../results/echantinom_orth_preds_x10.csv')
orth_preds_x10.rename(columns={"Predicted Gender": "orth_pred", "True Gender": "true", "Form": 'lemma'}, inplace=True)
orth_preds_x10

Unnamed: 0,lemma,orth_pred,true,Class Probabilities,Set,Run
0,an,m,m,"[('n', {'f': 0.4085073173046112, 'm': 0.591492...",test,1
1,bi,m,m,"[('i', {'f': 0.16074861586093903, 'm': 0.83925...",test,1
2,as,m,m,"[('s', {'f': 0.2868475317955017, 'm': 0.713152...",test,1
3,té,m,m,"[('é', {'f': 0.3696044683456421, 'm': 0.630395...",test,1
4,pie,f,f,"[('e', {'f': 0.11107771843671799, 'm': 0.88892...",test,1
...,...,...,...,...,...,...
45315,approvisionnement,m,m,"[('t', {'f': 0.07809196412563324, 'm': 0.92190...",test,10
45316,sous-alimentation,f,f,"[('n', {'f': 0.38623788952827454, 'm': 0.61376...",test,10
45317,cul-de-basse-fosse,m,m,"[('e', {'f': 0.23680555820465088, 'm': 0.76319...",test,10
45318,marie-couche-toi-là,f,f,"[('à', {'f': 0.28149473667144775, 'm': 0.71850...",test,10


## Run analysis

In [5]:
# We want to know how many errors are the same over all runs
errors = orth_preds_x10[orth_preds_x10['orth_pred'] != orth_preds_x10['true']]
error_counts = errors.groupby('lemma').agg({
    'Run':'nunique',
    'orth_pred': most_common,
    'true': 'first'
    })  \
    .reset_index() \
    .sort_values(by='Run', ascending=False)\
    .rename(columns={'Run': 'Nb_runs'})

# check if each lemma has different values in the 'orth_pred' column depending on the runs
orth_pred_counts = errors.groupby('lemma')['orth_pred'].nunique().reset_index()
len(orth_pred_counts[orth_pred_counts['orth_pred'] > 1])                                # THere are no lemmas that have different gender predictions 

run_counts = error_counts['Nb_runs'].value_counts().reset_index()
run_counts.columns = ['Nb_runs', 'Nb_errors']
run_counts.sort_values(by="Nb_runs", ascending=False)

Unnamed: 0,Nb_runs,Nb_errors
0,10,179
4,9,56
7,8,47
9,7,41
6,6,49
8,5,43
3,4,59
5,3,55
2,2,91
1,1,179


In [6]:
# what does the model always get wrong
error_lemmas = error_counts[error_counts['Nb_runs'] == 10]
error_lemmas

Unnamed: 0,lemma,Nb_runs,orth_pred,true
556,perpendiculaire,10,m,f
693,squaw,10,m,f
469,mer,10,m,f
184,chèche,10,f,m
185,chèvrefeuille,10,f,m
...,...,...,...,...
499,nage,10,m,f
102,boisson,10,m,f
795,zouave,10,f,m
554,pence,10,f,m


In [7]:
# Check if all lemmas in error_lemmas are in echantnom
lemmas_not_in_echantinom = error_lemmas[~error_lemmas['lemma'].isin(echantinom['lemma'])]
print(lemmas_not_in_echantinom)                                                          

Empty DataFrame
Columns: [lemma, Nb_runs, orth_pred, true]
Index: []


In [8]:
# What are these errors
merged_errors_echantinom = pd.merge(error_lemmas, echantinom, on='lemma') 

# Remove columns with only NaN values
cols_to_remove = [col for col in merged_errors_echantinom.columns if merged_errors_echantinom[col].isna().all()]
merged_errors_echantinom.drop(columns=cols_to_remove, inplace=True)
print(f"Removed columns with only NaN values: {cols_to_remove}")

merged_errors_echantinom.head(20)

Removed columns with only NaN values: []


Unnamed: 0,lemma,Nb_runs,orth_pred,true,gen,phon,freq_lex_books,freq_lex_subtitles,freq_frcow,last_process_broad,...,autonomous_base,base_stem_phon,sfx_allomorph,der_stem_phon,edit_distance,pattern,pattern_tf,pattern_rel_tf,base_der_sim,offset_sim
0,perpendiculaire,10,m,f,f,pɛʁ.pɑ̃.di.ky.lɛʁ,0.74,0.04,0,conversion,...,,,,,,,,,,
1,squaw,10,m,f,f,skwo,6.08,0.0,461,simplex,...,,,,,,,,,,
2,mer,10,m,f,f,mɛʁ,257.57,106.61,603223,simplex,...,,,,,,,,,,
3,chèche,10,f,m,m,ʃɛʃ,0.68,0.01,1275,simplex,...,,,,,,,,,,
4,chèvrefeuille,10,f,m,m,ʃɛ.vʁə.fœj,2.7,0.11,1659,polylexical,...,,,,,,,,,,
5,cimetière,10,f,m,m,si.mə.tjɛʁ,44.19,31.34,83755,simplex,...,,,,,,,,,,
6,superstar,10,m,f,f,sy.pɛʁs.taʁ,0.41,1.82,6679,simplex,...,,,,,,,,,,
7,mayo,10,m,f,f,ma.jo,0.0,0.77,973,nonconcat,...,,,,,,,,,,
8,sueur,10,m,f,f,sɥœʁ,60.34,11.71,35392,suffix,...,True,sɥ,œʁ,sɥ,0.0,_~_œʁ,11.0,0.846153846153846,0.474891513586044,0.444119388776185
9,matuche,10,f,m,m,ma.tyʃ,0.41,0.0,34,nonconcat,...,True,,,,,,,,,


In [9]:
# There is 1 lemma that doesn't have the same labelled gender in echantinom and in nlexique
merged_errors_echantinom[merged_errors_echantinom['true'] != merged_errors_echantinom['gen']].reset_index()[['lemma', 'true', 'gen']]

Unnamed: 0,lemma,true,gen
0,dine,m,f


In [10]:
# Look for pattern
merged_errors_echantinom['orth_pred'].value_counts()            # There are 97 wrongly predicted m and 82 wrongly predicted f

m    97
f    82
Name: orth_pred, dtype: int64

In [11]:
merged_errors_echantinom['last_process_broad'].value_counts()   # We have mostly simplex nouns 

simplex        119
suffix          21
nonconcat       15
conversion       9
polylexical      9
prefix           6
Name: last_process_broad, dtype: int64

In [12]:
merged_errors_echantinom['suffix'].unique()                     # The suffixes causing issues:
merged_errors_echantinom['suffix'].value_counts()   

0       151
eurF     13
onF       4
aine      2
aire      1
uche      1
in        1
ite       1
ose       1
aneM      1
if        1
ôse       1
ique      1
Name: suffix, dtype: int64

In [26]:
merged_errors_echantinom['last_process_narrow'].value_counts() # We have 22 borrowings, which are:
merged_errors_echantinom[merged_errors_echantinom['last_process_narrow'] == 'borrowing']['lemma'].tolist()

['squaw',
 'superstar',
 'start-up',
 'maharani',
 'soul',
 'canasta',
 'tong',
 'miss',
 'caïque',
 'girl',
 'rallye',
 'drive',
 'kipa',
 'kacha',
 'razzia',
 'vahiné',
 'williams',
 'baffle',
 'obi',
 'barbecue',
 'pin-up',
 'pence']

## Orthographic error analysis

In [5]:
print('\nCorrect and wrong orthographic predictions per gender:')
# print(get_correct_wrong_pred_df(orth_preds_x10, proportions=False).to_markdown(index=False))
distributions = get_correct_wrong_pred_df(orth_preds_x10, pred_col='orth_pred', proportions=False)
distributions_prop = get_correct_wrong_pred_df(orth_preds_x10, pred_col='orth_pred', proportions=True)
display(distributions, distributions_prop)


Correct and wrong orthographic predictions per gender:


Unnamed: 0,Run,f_true,m_true,f_false,m_false
0,1,1446,2674,199,213
1,2,1435,2665,208,224
2,3,1451,2663,210,208
3,4,1402,2721,152,257
4,5,1420,2692,181,239
5,6,1437,2660,213,222
6,7,1457,2645,228,202
7,8,1446,2663,210,213
8,9,1506,2585,288,153
9,10,1479,2643,230,180


Unnamed: 0,Run,f_true,m_true,f_false,m_false
0,1,0.879,0.926,0.121,0.074
1,2,0.873,0.922,0.127,0.078
2,3,0.874,0.928,0.126,0.072
3,4,0.902,0.914,0.098,0.086
4,5,0.887,0.918,0.113,0.082
5,6,0.871,0.923,0.129,0.077
6,7,0.865,0.929,0.135,0.071
7,8,0.873,0.926,0.127,0.074
8,9,0.839,0.944,0.161,0.056
9,10,0.865,0.936,0.135,0.064


### last_process_broad

In [6]:
category = 'last_process_broad' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred')
all_runs_proportions = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred', proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for last_process_broad over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,689,1254,98,114
suffix,608,1092,62,84
conversion,166,307,19,20
polylexical,78,161,8,10
nonconcat,39,57,3,9
prefix,28,53,6,7


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,0.875,0.917,0.125,0.083
suffix,0.907,0.929,0.093,0.071
conversion,0.897,0.939,0.103,0.061
polylexical,0.907,0.942,0.093,0.058
nonconcat,0.929,0.864,0.071,0.136
prefix,0.824,0.883,0.176,0.117


In [7]:
run = 1

print(f"\n [Run {run}] Gender partition for {category}:")
run1 = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred', run=run)
run1_prop = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred', run=run, proportion=True)
display(run1, run1_prop)


 [Run 1] Gender partition for last_process_broad:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,635,1117,92,87
suffix,565,963,53,70
conversion,159,285,16,15
polylexical,74,144,8,8
nonconcat,34,50,3,6
prefix,25,51,6,5


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,0.873,0.928,0.127,0.072
suffix,0.914,0.932,0.086,0.068
conversion,0.909,0.95,0.091,0.05
polylexical,0.902,0.947,0.098,0.053
nonconcat,0.919,0.893,0.081,0.107
prefix,0.806,0.911,0.194,0.089


In [8]:
run_data = orth_preds_x10[orth_preds_x10['Run'] == 3]
crosstab = pd.crosstab(echantinom['last_process_broad'], run_data['true'])
# TODO: why nothing from run 3 onwards?
crosstab = crosstab.loc[crosstab.sum(axis=1).sort_values(ascending=False).index]
crosstab

true
last_process_broad


#### False f

In [10]:
pred_gender = 'f'
true_gender = 'm'
category = 'last_process_broad'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
conversion,13,13,15,10,12,17,16,11,21
nonconcat,8,7,11,7,8,9,13,8,9
polylexical,19,24,22,16,23,19,20,24,27
prefix,6,4,5,3,6,7,4,8,8
simplex,138,147,144,102,117,143,155,144,198
suffix,15,13,13,14,15,18,20,15,25


In [None]:
subcategory = 'simplex'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
# TODO: 138 rows where we're supposed to get 92 f_false
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false simplex count: 138


array(['site', 'cube', 'mime', 'vice', 'vide', 'dine', 'vote', 'pèze',
       'mile', 'cola', 'culte', 'drone', 'amble', 'limbe', 'nonce',
       'monde', 'fifre', 'clone', 'ongle', 'nimbe', 'prote', 'torse',
       'blase', 'palpe', 'galbe', 'renne', 'orgue', 'morse', 'birbe',
       'tison', 'pence', 'grime', 'baile', 'gypse', 'angle', 'rifle',
       'saule', 'arôme', 'kyrie', 'sauna', 'drive', 'tulle', 'sosie',
       'gamma', 'pagne', 'agrume', 'prêche', 'caïque', 'vergne', 'litige',
       'madère', 'porche', 'baffle', 'pulque', 'curare', 'flegme',
       'cigare', 'lierre', 'calque', 'latino', 'buffle', 'cirque',
       'casque', 'poison', 'stupre', 'couple', 'junkie', 'rallye',
       'druide', 'cierge', 'congre', 'zouave', 'lambda', 'chèche',
       'drille', 'bidule', 'causse', 'comble', 'jacques', 'vampire',
       'tumulte', 'bidasse', 'bacille', 'guinche', 'concile', 'frisbee',
       'silence', 'gruyère', 'emblème', 'fourgue', 'gymnase', 'marsala',
       'braille', 'murm

In [None]:
subcategory = 'polylexical'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false polylexical count: 19


array(['adverbe', 'fanzine', 'confrère', 'unetelle', 'discobole',
       'bidonville', 'bain-marie', 'passe-droit', 'saint-pierre',
       'portefeuille', 'chasse-neige', 'faire-valoir', 'claque-merde',
       'mille-feuille', 'chèvrefeuille', 'soutien-gorge', 'croquemitaine',
       'homme-grenouille', 'contre-la-montre'], dtype=object)

In [None]:
subcategory = 'suffix'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false suffix count: 15


array(['ovule', 'butane', 'nivôse', 'pendule', 'sulfure', 'cyanure',
       'lactose', 'ventôse', 'lignite', 'globule', 'caniche', 'fascicule',
       'demandeur', 'capitaine', 'nourrisson'], dtype=object)

In [None]:
subcategory = 'conversion'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false conversion count: 13


array(['onze', 'sauté', 'double', 'balaise', 'parjure', 'atlante',
       'immeuble', 'uniforme', 'acquitté', 'burlesque', 'maxillaire',
       'plantigrade', 'barbiturique'], dtype=object)

#### False m

In [12]:
pred_gender = 'm'
true_gender = 'f'
category = 'last_process_broad'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
conversion,5,7,9,12,9,9,5,7,4
nonconcat,14,14,13,16,16,15,12,14,14
polylexical,12,10,14,16,15,8,13,13,11
prefix,8,7,11,9,8,6,8,9,5
simplex,147,163,138,177,167,161,142,149,98
suffix,27,25,23,27,24,23,24,21,21


In [None]:
subcategory = 'simplex'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false simplex count: 147


array(['foi', 'ire', 'nef', 'ive', 'glu', 'bru', 'obi', 'mer', 'clé',
       'eau', 'cage', 'loge', 'robe', 'cour', 'soif', 'miss', 'care',
       'tong', 'girl', 'part', 'gail', 'nage', 'rage', 'diva', 'toge',
       'puce', 'zone', 'kipa', 'sape', 'hâte', 'city', 'soul', 'acre',
       'cave', 'noix', 'gent', 'tribu', 'piste', 'câpre', 'douma',
       'serre', 'façon', 'spore', 'hydre', 'pogne', 'bugle', 'fleur',
       'corne', 'vodka', 'terre', 'vertu', 'bible', 'ancre', 'flore',
       'brume', 'spire', 'bribe', 'alène', 'nacre', 'taule', 'jauge',
       'grâce', 'plume', 'horde', 'kacha', 'smala', 'squaw', 'trame',
       'savate', 'carène', 'satire', 'trique', 'huître', 'baraka',
       'guenon', 'frange', 'truite', 'tumeur', 'sangle', 'capote',
       'armada', 'razzia', 'pagode', 'strate', 'alcôve', 'galène',
       'crypte', 'igname', 'cadène', 'poudre', 'brebis', 'gabare',
       'bastos', 'lymphe', 'pin-up', 'cloque', 'rumeur', 'vahiné',
       'fenêtre', 'syllabe', 'pirogu

In [None]:
subcategory = 'suffix'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false suffix count: 27


array(['catin', 'sueur', 'pudeur', 'fureur', 'piqûre', 'levure',
       'vigueur', 'parenté', 'candeur', 'minceur', 'ferveur', 'boisson',
       'louange', 'chanson', 'chaleur', 'rondeur', 'blondeur', 'noirceur',
       'passoire', 'rousseur', 'grandeur', 'splendeur', 'blancheur',
       'bronchite', 'corpuscule', 'bassinoire', 'rôtissoire'],
      dtype=object)

In [None]:
subcategory = 'nonconcat'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false nonconcat count: 14


array(['mob', 'info', 'cata', 'keuf', 'mayo', 'diapo', 'conso', 'impro',
       'philo', 'nympho', 'porcif', 'nounou', 'dondon', 'thalasso'],
      dtype=object)

In [None]:
subcategory = 'polylexical'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false polylexical count: 12


array(['madame', 'silicone', 'parabole', 'bande-son', 'pause-café',
       'notre-dame', "presqu'île", 'claire-voie', 'flanc-garde',
       'grand-route', 'grand-voile', 'tête-de-mort'], dtype=object)

### last_process_narrow

In [None]:
category = 'last_process_narrow' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred')
all_runs_proportions = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred', proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for last_process_narrow over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
suffix,608,1092,62.0,84.0
native_simplex,504,935,78.0,89.0
borrowing,144,265,18.0,22.0
conversion-A,139,231,14.0,18.0
native_compound,46,101,6.0,7.0
prefix,28,53,6.0,7.0
antonomasia,38,50,2.0,2.0
neoclassical_compound,14,28,1.0,3.0
agglomerate,17,25,1.0,0.0
apocope,13,28,1.0,0.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
suffix,0.907,0.929,0.093,0.071
native_simplex,0.866,0.913,0.134,0.087
borrowing,0.889,0.923,0.111,0.077
conversion-A,0.908,0.928,0.092,0.072
native_compound,0.885,0.935,0.115,0.065
prefix,0.824,0.883,0.176,0.117
antonomasia,0.95,0.962,0.05,0.038
neoclassical_compound,0.933,0.903,0.067,0.097
agglomerate,0.944,1.0,0.056,0.0
apocope,0.929,1.0,0.071,0.0


#### False f

In [13]:
pred_gender = 'f'
true_gender = 'm'
category = 'last_process_narrow'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
acronym,0,0,0,0,0,1,1,1,0
agglomerate,2,5,2,2,7,2,3,3,3
antonomasia,13,13,14,9,9,14,13,14,15
apocope,3,2,3,3,2,3,5,2,2
apocope_with_appendix,3,3,4,2,3,3,6,3,3
blend,1,1,1,1,1,1,1,1,1
borrowing,22,24,24,10,20,21,35,21,32
conversion-A,12,13,15,10,12,17,15,11,20
conversion-NUM,1,0,0,0,0,0,0,0,0
conversion-V0,0,0,0,0,0,0,1,0,1


In [None]:
subcategory = 'native_simplex'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false native_simplex count: 103


array(['site', 'cube', 'mime', 'vice', 'vide', 'dine', 'vote', 'pèze',
       'mile', 'culte', 'drone', 'amble', 'limbe', 'nonce', 'monde',
       'fifre', 'clone', 'ongle', 'nimbe', 'prote', 'torse', 'blase',
       'palpe', 'galbe', 'renne', 'orgue', 'morse', 'birbe', 'tison',
       'grime', 'baile', 'gypse', 'angle', 'saule', 'arôme', 'tulle',
       'pagne', 'agrume', 'prêche', 'vergne', 'litige', 'porche',
       'curare', 'flegme', 'cigare', 'lierre', 'calque', 'buffle',
       'cirque', 'casque', 'poison', 'stupre', 'couple', 'druide',
       'cierge', 'congre', 'zouave', 'chèche', 'drille', 'bidule',
       'causse', 'comble', 'vampire', 'tumulte', 'bidasse', 'bacille',
       'guinche', 'concile', 'silence', 'emblème', 'fourgue', 'gymnase',
       'murmure', 'salaire', 'symbole', 'falbala', 'flingue', 'bastion',
       'vacarme', 'chevesne', 'cantique', 'grimoire', 'quarante',
       'genièvre', 'conclave', 'margrave', 'triomphe', 'scarabée',
       'carrosse', 'brahmane', 'c

In [None]:
subcategory = 'borrowing'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false borrowing count: 22


array(['cola', 'pence', 'rifle', 'kyrie', 'sauna', 'drive', 'gamma',
       'caïque', 'baffle', 'pulque', 'latino', 'junkie', 'rallye',
       'lambda', 'frisbee', 'bouddha', 'bazooka', 'folklore', 'barbecue',
       'dies irae', 'cheese-cake', 'strip-tease'], dtype=object)

In [None]:
subcategory = 'suffix'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false suffix count: 15


array(['ovule', 'butane', 'nivôse', 'pendule', 'sulfure', 'cyanure',
       'lactose', 'ventôse', 'lignite', 'globule', 'caniche', 'fascicule',
       'demandeur', 'capitaine', 'nourrisson'], dtype=object)

In [None]:
subcategory = 'antonomasia'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false antonomasia count: 13


array(['sosie', 'madère', 'jacques', 'gruyère', 'marsala', 'braille',
       'mercure', 'narcisse', 'matamore', 'macintosh', 'champagne',
       'bourgogne', 'macfarlane'], dtype=object)

In [None]:
subcategory = 'native_compound'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false native_compound count: 13


array(['bidonville', 'bain-marie', 'passe-droit', 'saint-pierre',
       'portefeuille', 'chasse-neige', 'faire-valoir', 'claque-merde',
       'mille-feuille', 'chèvrefeuille', 'soutien-gorge', 'croquemitaine',
       'homme-grenouille'], dtype=object)

In [None]:
subcategory = 'conversion-A'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false conversion-A count: 12


array(['sauté', 'double', 'balaise', 'parjure', 'atlante', 'immeuble',
       'uniforme', 'acquitté', 'burlesque', 'maxillaire', 'plantigrade',
       'barbiturique'], dtype=object)

In [None]:
subcategory = 'prefix'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false prefix count: 6


array(['vicomte', 'tricorne', 'demi-frère', 'entrecuisse', 'arrière-goût',
       'contrepoison'], dtype=object)

#### False m

In [14]:
pred_gender = 'm'
true_gender = 'f'
category = 'last_process_narrow'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
agglomerate,3,2,3,3,3,2,2,1,2
antonomasia,2,2,1,1,3,2,3,1,1
apocope,10,10,8,10,10,10,7,10,10
apocope_with_appendix,1,1,1,2,2,1,1,1,1
borrowing,27,25,24,30,26,26,22,26,19
conversion-A,5,5,8,11,8,7,4,7,4
conversion-V,0,0,0,0,0,0,1,0,0
conversion-V0,0,1,0,1,0,1,0,0,0
conversion-V12,0,1,1,0,1,1,0,0,0
native_compound,7,8,10,11,12,5,9,11,8


In [None]:
subcategory = 'native_simplex'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false native_simplex count: 118


array(['foi', 'ire', 'nef', 'ive', 'glu', 'bru', 'mer', 'clé', 'eau',
       'cage', 'loge', 'robe', 'cour', 'soif', 'care', 'part', 'gail',
       'nage', 'rage', 'toge', 'puce', 'zone', 'sape', 'hâte', 'acre',
       'cave', 'noix', 'gent', 'tribu', 'piste', 'câpre', 'serre',
       'façon', 'spore', 'hydre', 'pogne', 'bugle', 'fleur', 'corne',
       'terre', 'vertu', 'bible', 'ancre', 'flore', 'brume', 'spire',
       'bribe', 'alène', 'nacre', 'taule', 'jauge', 'grâce', 'plume',
       'horde', 'trame', 'savate', 'carène', 'satire', 'trique', 'huître',
       'guenon', 'frange', 'truite', 'tumeur', 'sangle', 'capote',
       'pagode', 'strate', 'alcôve', 'galène', 'crypte', 'igname',
       'cadène', 'poudre', 'brebis', 'gabare', 'lymphe', 'cloque',
       'rumeur', 'fenêtre', 'syllabe', 'pirogue', 'arnaque', 'cuiller',
       'septime', 'riposte', 'tartane', 'bagarre', 'victime', 'imposte',
       'horloge', 'couleur', 'attaque', 'mandore', 'fanfare', 'chicane',
       'enclume',

In [None]:
subcategory = 'borrowing'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false borrowing count: 27


array(['obi', 'miss', 'tong', 'girl', 'diva', 'kipa', 'city', 'soul',
       'douma', 'vodka', 'kacha', 'smala', 'squaw', 'baraka', 'armada',
       'razzia', 'pin-up', 'vahiné', 'canasta', 'alhambra', 'start-up',
       'fantasia', 'maharani', 'williams', 'superstar', 'trattoria',
       'garden-party'], dtype=object)

In [None]:
subcategory = 'suffix'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false suffix count: 27


array(['catin', 'sueur', 'pudeur', 'fureur', 'piqûre', 'levure',
       'vigueur', 'parenté', 'candeur', 'minceur', 'ferveur', 'boisson',
       'louange', 'chanson', 'chaleur', 'rondeur', 'blondeur', 'noirceur',
       'passoire', 'rousseur', 'grandeur', 'splendeur', 'blancheur',
       'bronchite', 'corpuscule', 'bassinoire', 'rôtissoire'],
      dtype=object)

In [None]:
subcategory = 'apocope'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false apocope count: 10


array(['mob', 'info', 'cata', 'mayo', 'diapo', 'conso', 'impro', 'philo',
       'nympho', 'thalasso'], dtype=object)

In [None]:
subcategory = 'prefix'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false prefix count: 8


array(['malfaçon', 'ex-femme', 'mi-pente', 'mini-jupe', 'mi-carême',
       'mi-juillet', 'contrefaçon', 'avant-première'], dtype=object)

### Compound

In [None]:
category = 'compound' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred')
all_runs_proportions = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred', proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for compound over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1540,2774,188.0,233.0
neoclassical,20,42,1.0,4.0
VERB-NOUN,15,33,4.0,3.0
NOUN-NOUN,15,27,1.0,1.0
ADJ-NOUN,11,24,1.0,2.0
NOUN-ADJ,2,12,0.0,1.0
ADJ-ADJ,1,4,0.0,0.0
VERB-ADV,2,2,0.0,0.0
ADV-ADJ,1,1,1.0,0.0
ADV-NOUN,0,2,0.0,0.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.891,0.923,0.109,0.077
neoclassical,0.952,0.913,0.048,0.087
VERB-NOUN,0.789,0.917,0.211,0.083
NOUN-NOUN,0.938,0.964,0.062,0.036
ADJ-NOUN,0.917,0.923,0.083,0.077
NOUN-ADJ,1.0,0.923,0.0,0.077
ADJ-ADJ,1.0,1.0,0.0,0.0
VERB-ADV,1.0,1.0,0.0,0.0
ADV-ADJ,0.5,1.0,0.5,0.0
ADV-NOUN,,1.0,,0.0


#### False f

In [15]:
pred_gender = 'f'
true_gender = 'm'
category = 'compound'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,182,189,190,138,164,197,212,190,263
ADJ-ADJ,0,0,0,0,1,0,0,0,0
ADJ-NOUN,1,3,1,1,2,1,1,1,1
NOUN-ADJ,0,0,0,0,0,1,1,0,1
NOUN-NOUN,5,5,6,3,3,4,3,7,7
VERB-NOUN,6,3,8,4,6,4,5,4,7
VERB-VERB,1,0,0,0,0,0,1,0,0
neoclassical,4,8,5,6,5,6,5,8,9


In [None]:
subcategory = 'VERB-NOUN'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false VERB-NOUN count: 6


array(['passe-droit', 'portefeuille', 'chasse-neige', 'claque-merde',
       'soutien-gorge', 'croquemitaine'], dtype=object)

In [None]:
subcategory = 'NOUN-NOUN'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false NOUN-NOUN count: 5


array(['bidonville', 'bain-marie', 'saint-pierre', 'chèvrefeuille',
       'homme-grenouille'], dtype=object)

In [None]:
subcategory = 'neoclassical'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false neoclassical count: 4


array(['adverbe', 'confrère', 'discobole', 'plantigrade'], dtype=object)

#### False m

In [16]:
pred_gender = 'm'
true_gender = 'f'
category = 'compound'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,201,215,194,240,223,212,190,198,141
ADJ-NOUN,3,2,4,6,6,1,4,6,3
ADV-ADJ,0,0,1,1,1,1,0,0,0
ADV-NOUN,1,1,1,1,1,1,1,1,1
NOUN-ADJ,0,0,1,0,1,0,1,0,0
NOUN-NOUN,3,5,4,4,4,3,3,4,4
neoclassical,5,3,3,5,3,4,5,4,4


In [None]:
subcategory = 'neoclassical'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false neoclassical count: 5


array(['philo', 'nympho', 'thalasso', 'silicone', 'parabole'],
      dtype=object)

In [None]:
subcategory = 'ADJ-NOUN'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false ADJ-NOUN count: 3


array(['claire-voie', 'grand-route', 'grand-voile'], dtype=object)

In [None]:
subcategory = 'NOUN-NOUN'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false NOUN-NOUN count: 3


array(['bande-son', 'pause-café', 'flanc-garde'], dtype=object)

### Suffix broad

In [None]:
category = 'suffix_broad' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred')
all_runs_proportions = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred', proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for suffix_broad over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,930,1725,124.0,148.0
eurM1,62,126,9.0,9.0
ment,73,111,8.0,10.0
ion,58,104,3.0,11.0
ier,52,82,3.0,7.0
...,...,...,...,...
en,1,0,0.0,0.0
one,0,1,0.0,0.0
aque,1,0,0.0,0.0
ille,1,0,0.0,0.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.882,0.921,0.118,0.079
eurM1,0.873,0.933,0.127,0.067
ment,0.901,0.917,0.099,0.083
ion,0.951,0.904,0.049,0.096
ier,0.945,0.921,0.055,0.079
...,...,...,...,...
en,1.000,,0.000,
one,,1.000,,0.000
aque,1.000,,0.000,
ille,1.000,,0.000,


#### False f

In [17]:
pred_gender = 'f'
true_gender = 'm'
category = 'suffix_broad'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)


Run,1,2,3,4,5,6,7,8,9
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,181,192,193,135,162,192,204,191,258
Vche,2,1,1,2,2,1,2,1,2
ain,2,2,3,2,2,2,2,2,2
aire,0,1,1,2,1,1,1,2,4
aneM,1,1,1,1,1,1,1,1,1
ant,0,0,0,0,1,0,0,1,1
ate,0,1,0,0,1,1,1,0,1
cule,1,0,0,1,0,3,2,0,1
el,0,0,0,0,0,0,0,0,1
eurM1,1,0,0,0,0,1,3,0,2


In [None]:
subcategory = 'ule'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false ule count: 3


array(['ovule', 'pendule', 'globule'], dtype=object)

In [None]:
subcategory = 'Vche'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false Vche count: 2


array(['matuche', 'caniche'], dtype=object)

In [None]:
subcategory = 'ureM'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false ureM count: 2


array(['sulfure', 'cyanure'], dtype=object)

#### False m

In [18]:
pred_gender = 'm'
true_gender = 'f'
category = 'suffix_broad'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,182,197,181,224,210,195,176,187,128
Vche,0,1,0,1,1,0,1,0,0
ade,0,0,0,0,0,1,0,1,0
aire,1,1,1,1,1,1,1,1,1
ange,1,0,1,1,0,1,0,0,0
cule,1,1,1,2,2,0,1,1,1
esque,0,0,0,1,0,0,0,0,0
eurF,15,15,15,15,15,15,14,15,15
ier,0,0,0,0,0,0,0,1,0
if,1,1,1,1,1,1,1,1,1


In [None]:
subcategory = 'eurF'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false eurF count: 15


array(['sueur', 'pudeur', 'fureur', 'vigueur', 'candeur', 'minceur',
       'ferveur', 'chaleur', 'rondeur', 'blondeur', 'noirceur',
       'rousseur', 'grandeur', 'splendeur', 'blancheur'], dtype=object)

In [None]:
subcategory = 'onF'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false onF count: 4


array(['boisson', 'chanson', 'malfaçon', 'contrefaçon'], dtype=object)

In [None]:
subcategory = 'oir'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false oir count: 3


array(['passoire', 'bassinoire', 'rôtissoire'], dtype=object)

### Conversion

In [None]:
category = 'conversion' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred')
all_runs_proportions = get_category_gender_partition(category, echantinom, orth_preds_x10, pred_col='orth_pred', proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for conversion over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1151,2061,140.0,177.0
V0,222,446,32.0,32.0
A,219,363,21.0,33.0
V12,6,28,2.0,1.0
V13,2,11,0.0,0.0
VINF,2,6,0.0,0.0
ADV,2,3,0.0,0.0
N,2,2,0.0,1.0
V,2,2,0.0,0.0
PRO,0,1,1.0,0.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.892,0.921,0.108,0.079
V0,0.874,0.933,0.126,0.067
A,0.912,0.917,0.088,0.083
V12,0.75,0.966,0.25,0.034
V13,1.0,1.0,0.0,0.0
VINF,1.0,1.0,0.0,0.0
ADV,1.0,1.0,0.0,0.0
N,1.0,0.667,0.0,0.333
V,1.0,1.0,0.0,0.0
PRO,0.0,1.0,1.0,0.0


#### False f

In [19]:
pred_gender = 'f'
true_gender = 'm'
category = 'conversion'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,152,166,162,120,144,166,182,161,219
A,12,14,17,11,12,17,16,12,22
NUM,1,0,0,0,0,0,0,0,0
V0,34,28,31,21,25,30,30,37,47


In [None]:
subcategory = 'V0'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false V0 count: 34


array(['cube', 'mime', 'vice', 'vide', 'vote', 'pèze', 'amble', 'fifre',
       'clone', 'nimbe', 'palpe', 'galbe', 'tison', 'grime', 'angle',
       'prêche', 'calque', 'casque', 'couple', 'comble', 'sulfure',
       'tumulte', 'guinche', 'silence', 'cyanure', 'fourgue', 'murmure',
       'salaire', 'flingue', 'vacarme', 'triomphe', 'carrosse',
       'matamore', 'bastringue'], dtype=object)

In [None]:
subcategory = 'A'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] f_false A count: 12


array(['sauté', 'double', 'balaise', 'parjure', 'atlante', 'immeuble',
       'uniforme', 'acquitté', 'burlesque', 'maxillaire', 'plantigrade',
       'barbiturique'], dtype=object)

#### False m

In [20]:
pred_gender = 'm'
true_gender = 'f'
category = 'conversion'

get_subcategories_count_per_run(orth_preds_x10, pred_gender, true_gender, category, echantinom)

Run,1,2,3,4,5,6,7,8,9
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,158,163,154,190,171,163,147,163,120
A,7,6,9,13,10,9,7,9,6
V,0,0,0,0,0,0,1,0,0
V0,48,56,44,54,57,49,49,41,27
V12,0,1,1,0,1,1,0,0,0


In [None]:
subcategory = 'V0'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false V0 count: 48


array(['glu', 'loge', 'robe', 'soif', 'nage', 'rage', 'zone', 'sape',
       'hâte', 'cave', 'piste', 'serre', 'façon', 'pogne', 'fleur',
       'corne', 'terre', 'ancre', 'brume', 'nacre', 'jauge', 'grâce',
       'plume', 'trame', 'carène', 'trique', 'frange', 'sangle', 'capote',
       'poudre', 'cloque', 'levure', 'fenêtre', 'syllabe', 'arnaque',
       'riposte', 'louange', 'bagarre', 'victime', 'couleur', 'attaque',
       'fanfare', 'chicane', 'applique', 'histoire', 'vertèbre',
       'silicone', 'harangue'], dtype=object)

In [None]:
subcategory = 'A'
false_rows= get_false_preds(run, echantinom, 'orth_pred', pred_gender, true_gender, orth_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()

[Run 1] m_false A count: 7


array(['nympho', 'dextre', 'mormone', 'secrète', 'bayadère', 'verticale',
       'perpendiculaire'], dtype=object)