In [None]:
import pandas as pd

from visualization import view_curve
from stats import Distance, SuffixAverage, avg_dist_to_suffix_avg

### Quantifying differences between plots

In [None]:
# Read both prediction tables from ../results, in the order listed.
orth_preds, orth_preds_x10 = map(pd.read_csv, (
    '../results/echantinom_orth_preds.csv',
    '../results/echantinom_orth_preds_x10.csv',
))

### Comparison of individual words with average suffix

In [None]:
# Inputs for this cell: the word to inspect, the suffix whose average it is
# compared against, and the gender passed to the probability and plot calls.
word = 'action'
suffix = 'ion'
gender = 'f'

# `distance` is also used by the suffix-vs-suffix comparison cells further down.
distance = Distance(orth_preds)
sfx_avg = SuffixAverage(orth_preds, suffix)
sfx_avg_probs = sfx_avg.prob(gender=gender)

# Report the distance between the word and the suffix average, to two decimals.
print(f'Euclidean distance between "{word}" and average of all words with suffix "{suffix}" : {distance.euclidean(word, sfx_avg_probs):.2f}\n')

# Draw the word's curve first, then the suffix average.
# NOTE(review): presumably both are drawn on the same figure - confirm in `visualization` / `stats`.
view_curve([word], orth_preds, binary=True, gender=gender)
sfx_avg.plot(gender=gender)

In [None]:
# Map each suffix found in the predictions to the number of rows carrying it.
suffixes = orth_preds.suffix.value_counts().to_dict()

# The selected gender has an impact on the average distances.
# With gender set to 'f', the feminine suffixes have artificially lower average
# distances than masculine ones, and vice versa.
# NOTE(review): the meaning of the positional arguments 'True' and 2 is defined
# in stats.avg_dist_to_suffix_avg - confirm that the string 'True' is intended.

# One row per suffix: its average distance to the suffix average and its row count.
df = pd.DataFrame({
    'suffix': list(suffixes),
    'avg_dist_to_suffix_avg': [
        avg_dist_to_suffix_avg(orth_preds, sfx, 'True', 2)
        for sfx in suffixes
    ],
    'num_samples': list(suffixes.values()),
})

df

I'm adding to this dataframe:
- The number of training items that end in the string corresponding to the suffix
- The entropy (in bits) of the gender distribution in that set of training items (0 = only one gender, 1 = perfectly balanced between two genders)

In [None]:
import numpy as np
# Training items; the loop below reads the `lexeme` and `gen` columns of this table.
train=pd.read_csv('../data/orth_train.csv')

def sfx_to_ending(s):
    """Return the string ending that corresponds to a suffix label.

    Some labels carry a trailing 'M' or 'F' tag (as in 'eurM' / 'eurF').
    That tag is dropped so the remainder can be matched against the end of
    a word; labels without such a tag are returned unchanged.

    Args:
        s: Suffix label (a string).

    Returns:
        ``s`` without its last character when that character is 'M' or 'F',
        otherwise ``s`` itself. The empty string is returned as is.
    """
    # str.endswith is safe on '' whereas indexing s[-1] raises IndexError.
    if s.endswith(('M', 'F')):
        return s[:-1]
    return s

# For every suffix row, look at the training items whose lexeme ends in the
# corresponding string and record how many there are and the base-2 entropy
# of their `gen` value distribution.
# The values are gathered in lists and attached as whole columns: assigning
# cell by cell with df.loc[i, col] first creates the column NaN-filled, which
# turns the integer counts into floats.
train_sizes = []
entropies = []
for suffix_label in df['suffix']:
    ending = sfx_to_ending(suffix_label)
    # na=False keeps the mask strictly boolean even if some lexemes are missing.
    subset = train[train.lexeme.str.endswith(ending, na=False)]
    vc = subset.gen.value_counts(normalize=True)
    train_sizes.append(len(subset))
    # abs() turns the -0.0 obtained for a single-valued (or empty) subset into 0.0.
    entropies.append(abs(-np.sum(vc * np.log2(vc))))

df['train_size'] = train_sizes
df['entropy'] = entropies

df

We now have expectations that:

- All other things being equal, there should be more variation in average distance when the training set is smaller.
- All other things being equal, the entropy of the gender balance in the training set should correlate with the average distance.

The following figure suggests that this is broadly correct.

In [None]:
import seaborn as sns

# Scatter of entropy against average distance, one point per suffix, with the
# marker size given by train_size. Suffixes with 5 or fewer samples and the
# '0' suffix are left out of the figure.
plot_data = df[(df.num_samples > 5) & (df.suffix != '0')]

# `data` is passed by keyword: seaborn releases before 0.12 do not accept it
# as a positional argument.
g = sns.relplot(
    data=plot_data,
    x='entropy',
    y='avg_dist_to_suffix_avg',
    size='train_size',
    aspect=1.3,
)
g.set(xlabel='Entropy', ylabel='Average Euclidian distance to centroid')
g.legend.set_title('Size of\ntraining set')


### Comparing the averages of different suffixes

In [None]:
# The two suffixes whose average curves are compared in this cell.
sfx_a, sfx_b = 'eurM', 'eurF'

# Build each suffix average and take its probabilities for gender 'f'.
sfx_a_avg = SuffixAverage(orth_preds, sfx_a)
sfx_a_avg_probs = sfx_a_avg.prob(gender='f')

sfx_b_avg = SuffixAverage(orth_preds, sfx_b)
sfx_b_avg_probs = sfx_b_avg.prob(gender='f')

# `distance` is the Distance(orth_preds) object created in the word-vs-suffix cell.
print(f'Euclidean distance between average of all words with suffix "{sfx_a}" & average of all word with suffix "{sfx_b}": {distance.euclidean(sfx_a_avg_probs, sfx_b_avg_probs):.2f}')

# Both averages are plotted with the same options, A first.
plot_options = {'gender': 'f', 'min_dp': 3, 'scale': True}
sfx_a_avg.plot(**plot_options)
sfx_b_avg.plot(**plot_options)

In [None]:
# The two suffixes whose average curves are compared in this cell.
sfx_a, sfx_b = 'on', 'ion'

# Build each suffix average and take its probabilities for gender 'f'.
sfx_a_avg = SuffixAverage(orth_preds, sfx_a)
sfx_a_avg_probs = sfx_a_avg.prob(gender='f')

sfx_b_avg = SuffixAverage(orth_preds, sfx_b)
sfx_b_avg_probs = sfx_b_avg.prob(gender='f')

# `distance` is the Distance(orth_preds) object created in the word-vs-suffix cell.
print(f'Euclidean distance between average of all words with suffix "{sfx_a}" & average of all word with suffix "{sfx_b}": {distance.euclidean(sfx_a_avg_probs, sfx_b_avg_probs):.2f}')

# Both averages are plotted with the same options, A first.
plot_options = {'gender': 'f', 'min_dp': 3, 'scale': True}
sfx_a_avg.plot(**plot_options)
sfx_b_avg.plot(**plot_options)