In [1]:
import pandas as pd
from IPython.display import display, Markdown

from data_processing import get_correct_wrong_pred_df, get_category_gender_partition, get_false_preds, get_subcategories_count_per_run

## Data

In [2]:
nlexique = pd.read_csv('../data/nlexique.csv')
nlexique = nlexique[['lexeme', 'sg', 'gen']].rename(columns={'sg': 'phon'}).dropna().reset_index(drop=True)
nlexique

Unnamed: 0,lexeme,phon,gen
0,à-côté,akOte,m
1,à-coup,aku,m
2,à-peu-près,apØpʁɛ,m
3,à-pic,apik,m
4,à-plat,apla,m
...,...,...,...
30999,zurichois,zyʁikwa,m
31000,zydeco,zidəko,f
31001,zygoma,zigOma,m
31002,zygote,zigɔt,m


In [3]:
echantinom = pd.read_csv('../data/Echantinom-full-20210902.csv')
echantinom = echantinom[~echantinom['lemma'].isin(nlexique[nlexique['gen'] == 'b']['lexeme'])] #[['lemma', 'phon', 'gen']]
echantinom

Unnamed: 0,lemma,gen,phon,freq_lex_books,freq_lex_subtitles,freq_frcow,last_process_broad,last_process_narrow,prefix,compound,...,autonomous_base,base_stem_phon,sfx_allomorph,der_stem_phon,edit_distance,pattern,pattern_tf,pattern_rel_tf,base_der_sim,offset_sim
0,berlingue,m,bɛʁ.lɛ̃g,0.34,0.00,34,nonconcat,apocope,0,0,...,,,,,,,,,,
1,corton,m,kɔʁ.tɔ̃,0.27,0.03,398,suffix,suffix,0,0,...,True,kuʁ,ɔ̃,kɔʁt,2,_u_~_ɔ_tɔ̃,1,0.015625,0.222162783145905,0.158108526129264
2,dabuche,f,da.byʃ,0.54,0.00,3,suffix,suffix,0,0,...,True,UNKNOWN,yʃ,dab,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN
3,faf,m,faf,0.88,0.07,3422,nonconcat,apocope,0,0,...,,,,,,,,,,
4,gail,f,gaj,0.61,0.00,2471,simplex,native_simplex,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,palu,m,pa.ly,0.14,0.80,1431,nonconcat,apocope,0,0,...,,,,,,,,,,
4996,talc,m,talk,1.49,1.40,2775,simplex,borrowing,0,0,...,,,,,,,,,,
4997,sauvetage,m,sO.və.taʒ,3.72,8.32,60875,suffix,suffix,0,0,...,True,sOv,aʒ,sOvət,2,_~_ətaʒ,1,0.011494252873563,0.252674728631973,0.614848479997584
4998,plaid,m,plɛd,1.15,0.34,2889,simplex,borrowing,0,0,...,,,,,,,,,,


In [4]:
phon_preds_x10 = pd.read_csv('../results/echantinom_phon_preds_x10.csv')
phon_preds_x10.rename(columns={"Predicted Gender": "phon_pred", "True Gender": "true", "Form": 'lemma'}, inplace=True)
phon_preds_x10

Unnamed: 0,lemma,phon_pred,true,Class Probabilities,Set,Run
0,ɛ,m,f,"[('ɛ', {'f': 0.14882417023181915, 'm': 0.85117...",test,1
1,u,f,m,"[('u', {'f': 0.3157658278942108, 'm': 0.684234...",test,1
2,ɛ,m,m,"[('ɛ', {'f': 0.14882417023181915, 'm': 0.85117...",test,1
3,o,m,f,"[('o', {'f': 0.5670557618141174, 'm': 0.432944...",test,1
4,ba,m,m,"[('a', {'f': 0.39993923902511597, 'm': 0.60006...",test,1
...,...,...,...,...,...,...
45315,kɔ̃fidɑ̃sjalite,f,f,"[('e', {'f': 0.5326544642448425, 'm': 0.467345...",test,10
45316,ɛ̃kɔ̃patibilite,f,f,"[('e', {'f': 0.5326544642448425, 'm': 0.467345...",test,10
45317,suzalimɑ̃tasjɔ̃,f,f,"[('̃', {'f': 0.6368642449378967, 'm': 0.363135...",test,10
45318,tʁɑ̃splɑ̃tasjɔ̃,f,f,"[('̃', {'f': 0.6368642449378967, 'm': 0.363135...",test,10


## Phonetic error analysis

In [5]:
print('\nCorrect and wrong phonetic predictions per gender:')
# print(get_correct_wrong_pred_df(phon_preds_x10, proportions=False).to_markdown(index=False))
distributions = get_correct_wrong_pred_df(phon_preds_x10, pred_col= 'phon_pred', proportions=False)
distributions_prop = get_correct_wrong_pred_df(phon_preds_x10, pred_col= 'phon_pred', proportions=True)
display(distributions, distributions_prop)


Correct and wrong phonetic predictions per gender:


Unnamed: 0,Run,f_true,m_true,f_false,m_false
0,1,1221,2398,475,438
1,2,1282,2320,553,377
2,3,1188,2464,409,471
3,4,1154,2519,354,505
4,5,1181,2378,495,478
5,6,1221,2438,435,438
6,7,1120,2538,335,539
7,8,1154,2555,318,505
8,9,1148,2502,371,511
9,10,1201,2342,531,458


Unnamed: 0,Run,f_true,m_true,f_false,m_false
0,1,0.72,0.846,0.28,0.154
1,2,0.699,0.86,0.301,0.14
2,3,0.744,0.84,0.256,0.16
3,4,0.765,0.833,0.235,0.167
4,5,0.705,0.833,0.295,0.167
5,6,0.737,0.848,0.263,0.152
6,7,0.77,0.825,0.23,0.175
7,8,0.784,0.835,0.216,0.165
8,9,0.756,0.83,0.244,0.17
9,10,0.693,0.836,0.307,0.164


### last_process_broad

In [6]:
category = 'last_process_broad' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',)
all_runs_proportions = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for last_process_broad over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,15.0,34.0,221,183
suffix,0.0,1.0,181,191
conversion,2.0,1.0,60,48
polylexical,1.0,1.0,17,26
nonconcat,4.0,6.0,15,15
prefix,0.0,0.0,9,10


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,0.064,0.157,0.936,0.843
suffix,0.0,0.005,1.0,0.995
conversion,0.032,0.02,0.968,0.98
polylexical,0.056,0.037,0.944,0.963
nonconcat,0.211,0.286,0.789,0.714
prefix,0.0,0.0,1.0,1.0


In [7]:
run = 1

print(f"\n [Run {run}] Gender partition for {category}:")
run1 = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred', run=run)
run1_prop = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred', run=run, proportion=True)
display(run1, run1_prop)


 [Run 1] Gender partition for last_process_broad:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,12.0,30.0,185,150
suffix,0.0,1.0,157,161
conversion,2.0,1.0,55,39
nonconcat,4.0,5.0,13,15
polylexical,1.0,1.0,15,20
prefix,0.0,0.0,7,10


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simplex,0.061,0.167,0.939,0.833
suffix,0.0,0.006,1.0,0.994
conversion,0.035,0.025,0.965,0.975
nonconcat,0.235,0.25,0.765,0.75
polylexical,0.062,0.048,0.938,0.952
prefix,0.0,0.0,1.0,1.0


In [8]:
# TODO: why nothing from run 3 onwards?
run_data = phon_preds_x10[phon_preds_x10['Run'] == 3]
crosstab = pd.crosstab(echantinom['last_process_broad'], run_data['true'])
crosstab = crosstab.loc[crosstab.sum(axis=1).sort_values(ascending=False).index]
crosstab

true
last_process_broad


#### False f

In [9]:
pred_gender = 'f'
true_gender = 'm'
category = 'last_process_broad'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
nonconcat,2,2,2,1,2,2,1,1,2
polylexical,1,1,0,1,1,2,0,0,0
simplex,15,16,10,10,11,10,7,7,8


In [24]:
subcategory = 'simplex'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false simplex count: 15


array(['as', 'bis', 'bal', 'sas', 'van', 'val', 'imam', 'past', 'atlas',
       'blini', 'islam', 'fanal', 'salami', 'bikini', 'falbala'],
      dtype=object)

In [25]:
subcategory = 'nonconcat'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false nonconcat count: 2


array(['faf', 'pastaga'], dtype=object)

#### False m

In [10]:
pred_gender = 'm'
true_gender = 'f'
category = 'last_process_broad'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
last_process_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
simplex,6,6,7,7,6,6,7,6,6


In [27]:
subcategory = 'simplex'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring m_false:
[Run 1] m_false simplex count: 6


array(['pli', 'alma', 'diva', 'kipa', 'smala', 'bastos'], dtype=object)

### last_process_narrow

In [28]:
category = 'last_process_narrow' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',)
all_runs_proportions = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for last_process_narrow over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
suffix,0.0,1.0,181.0,191.0
native_simplex,5.0,16.0,164.0,145.0
borrowing,6.0,17.0,49.0,28.0
conversion-A,2.0,1.0,45.0,42.0
native_compound,0.0,0.0,11.0,14.0
apocope,3.0,2.0,9.0,7.0
antonomasia,4.0,0.0,6.0,10.0
prefix,0.0,0.0,9.0,10.0
reduplication,0.0,3.0,2.0,5.0
agglomerate,0.0,0.0,1.0,8.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
suffix,0.0,0.005,1.0,0.995
native_simplex,0.03,0.099,0.97,0.901
borrowing,0.109,0.378,0.891,0.622
conversion-A,0.043,0.023,0.957,0.977
native_compound,0.0,0.0,1.0,1.0
apocope,0.25,0.222,0.75,0.778
antonomasia,0.4,0.0,0.6,1.0
prefix,0.0,0.0,1.0,1.0
reduplication,0.0,0.375,1.0,0.625
agglomerate,0.0,0.0,1.0,1.0


#### False f

In [11]:
pred_gender = 'f'
true_gender = 'm'
category = 'last_process_narrow'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
acronym,1,1,0,1,1,2,0,0,0
antonomasia,2,2,1,1,1,2,1,0,1
apocope,1,1,1,1,1,1,1,1,1
apocope_with_appendix,1,1,1,0,1,1,0,0,1
borrowing,5,5,4,3,4,2,2,3,1
native_simplex,8,8,5,6,6,6,4,4,6
onomatopeic,0,1,0,0,0,0,0,0,0


In [30]:
subcategory = 'native_simplex'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false native_simplex count: 8


array(['as', 'bis', 'bal', 'sas', 'val', 'islam', 'fanal', 'falbala'],
      dtype=object)

In [31]:
subcategory = 'borrowing'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false borrowing count: 5


array(['van', 'imam', 'past', 'blini', 'salami'], dtype=object)

In [32]:
subcategory = 'antonomasia'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false antonomasia count: 2


array(['atlas', 'bikini'], dtype=object)

#### False m

In [12]:
pred_gender = 'm'
true_gender = 'f'
category = 'last_process_narrow'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
last_process_narrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
antonomasia,1,1,1,1,1,1,1,1,1
borrowing,4,4,4,5,3,4,4,4,4
native_simplex,1,1,2,1,2,1,2,1,1


In [34]:
subcategory = 'borrowing'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring m_false:
[Run 1] m_false borrowing count: 4


array(['alma', 'diva', 'kipa', 'smala'], dtype=object)

### Compound

In [35]:
category = 'compound' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',)
all_runs_proportions = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for compound over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,20.0,42.0,484.0,453.0
neoclassical,2.0,1.0,7.0,5.0
NOUN-NOUN,0.0,0.0,3.0,8.0
VERB-NOUN,0.0,0.0,3.0,4.0
ADJ-NOUN,0.0,0.0,2.0,1.0
NOUN-ADJ,0.0,0.0,2.0,1.0
ADJ-ADJ,0.0,0.0,1.0,0.0
ADV-NOUN,0.0,0.0,1.0,0.0
ADV-ADJ,0.0,0.0,0.0,1.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.04,0.085,0.96,0.915
neoclassical,0.222,0.167,0.778,0.833
NOUN-NOUN,0.0,0.0,1.0,1.0
VERB-NOUN,0.0,0.0,1.0,1.0
ADJ-NOUN,0.0,0.0,1.0,1.0
NOUN-ADJ,0.0,0.0,1.0,1.0
ADJ-ADJ,0.0,,1.0,
ADV-NOUN,0.0,,1.0,
ADV-ADJ,,0.0,,1.0


#### False f

In [13]:
pred_gender = 'f'
true_gender = 'm'
category = 'compound'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,18,19,12,12,14,14,8,8,10


In [37]:
subcategory = '0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false 0 count: 18


array(['as', 'faf', 'bis', 'bal', 'sas', 'van', 'val', 'sima', 'imam',
       'past', 'atlas', 'blini', 'islam', 'fanal', 'salami', 'bikini',
       'falbala', 'pastaga'], dtype=object)

#### False m

In [14]:
pred_gender = 'm'
true_gender = 'f'
category = 'compound'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,6,6,7,7,6,6,7,6,6


In [39]:
subcategory = '0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring m_false:
[Run 1] m_false 0 count: 6


array(['pli', 'alma', 'diva', 'kipa', 'smala', 'bastos'], dtype=object)

### Suffix broad

In [40]:
category = 'suffix_broad' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',)
all_runs_proportions = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for suffix_broad over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,22.0,41.0,298.0,264.0
ment,0.0,0.0,26.0,20.0
ion,0.0,0.0,18.0,22.0
eurM1,0.0,0.0,16.0,23.0
ier,0.0,0.0,16.0,14.0
...,...,...,...,...
if,0.0,1.0,0.0,0.0
ose,0.0,0.0,1.0,0.0
one,0.0,0.0,1.0,0.0
is,0.0,0.0,1.0,0.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.069,0.134,0.931,0.866
ment,0.000,0.000,1.000,1.000
ion,0.000,0.000,1.000,1.000
eurM1,0.000,0.000,1.000,1.000
ier,0.000,0.000,1.000,1.000
...,...,...,...,...
if,,1.000,,0.000
ose,0.000,,1.000,
one,0.000,,1.000,
is,0.000,,1.000,


#### False f

In [15]:
pred_gender = 'f'
true_gender = 'm'
category = 'suffix_broad'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,18,19,12,12,14,14,8,8,10


In [42]:
subcategory = '0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false 0 count: 18


array(['as', 'faf', 'bis', 'bal', 'sas', 'van', 'val', 'sima', 'imam',
       'past', 'atlas', 'blini', 'islam', 'fanal', 'salami', 'bikini',
       'falbala', 'pastaga'], dtype=object)

#### False m

In [16]:
pred_gender = 'm'
true_gender = 'f'
category = 'suffix_broad'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
suffix_broad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,6,6,7,7,6,6,7,6,6


In [44]:
subcategory = '0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring m_false:
[Run 1] m_false 0 count: 6


array(['pli', 'alma', 'diva', 'kipa', 'smala', 'bastos'], dtype=object)

### Conversion

In [45]:
category = 'conversion' 
print(f"\nGender partition for {category} over all runs:")
all_runs = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',)
all_runs_proportions = get_category_gender_partition(category, echantinom, phon_preds_x10, pred_col= 'phon_pred',proportion=True)
display(all_runs, all_runs_proportions)


Gender partition for conversion over all runs:


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,16.0,33.0,352.0,335.0
V0,3.0,9.0,69.0,67.0
A,3.0,1.0,74.0,66.0
V12,0.0,0.0,2.0,2.0
V13,0.0,0.0,2.0,1.0
V,0.0,0.0,2.0,0.0
N,0.0,0.0,1.0,0.0
VINF,0.0,0.0,1.0,0.0
ADV,0.0,0.0,0.0,1.0
PRO,0.0,0.0,0.0,1.0


Unnamed: 0_level_0,f_true,m_true,f_false,m_false
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.043,0.09,0.957,0.91
V0,0.042,0.118,0.958,0.882
A,0.039,0.015,0.961,0.985
V12,0.0,0.0,1.0,1.0
V13,0.0,0.0,1.0,1.0
V,0.0,,1.0,
N,0.0,,1.0,
VINF,0.0,,1.0,
ADV,,0.0,,1.0
PRO,,0.0,,1.0


#### False f

In [17]:
pred_gender = 'f'
true_gender = 'm'
category = 'conversion'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,16,15,10,8,13,11,6,6,8
V0,2,4,2,4,1,3,2,2,2


In [47]:
subcategory = '0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false 0 count: 16


array(['as', 'faf', 'bal', 'van', 'val', 'sima', 'imam', 'past', 'atlas',
       'blini', 'islam', 'fanal', 'salami', 'bikini', 'falbala',
       'pastaga'], dtype=object)

In [48]:
subcategory = 'V0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring f_false:
[Run 1] f_false V0 count: 2


array(['bis', 'sas'], dtype=object)

#### False m

In [18]:
pred_gender = 'm'
true_gender = 'f'
category = 'conversion'

get_subcategories_count_per_run(phon_preds_x10, pred_gender, true_gender, category, echantinom, col='phon_pred')

Run,1,2,3,4,5,6,7,8,9
conversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,5,5,6,6,5,5,6,5,5
V0,1,1,1,1,1,1,1,1,1


In [50]:
subcategory = '0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring m_false:
[Run 1] m_false 0 count: 5


array(['alma', 'diva', 'kipa', 'smala', 'bastos'], dtype=object)

In [51]:
subcategory = 'V0'

print(f'\nExploring {pred_gender}_false:')
false_rows= get_false_preds(run, echantinom, 'phon_pred', pred_gender, true_gender, phon_preds_x10, category, subcategory)
print(f"[Run {run}] {pred_gender}_false {subcategory} count: {len(false_rows['lemma'].unique())}")
false_rows['lemma'].unique()


Exploring m_false:
[Run 1] m_false V0 count: 1


array(['pli'], dtype=object)