In [1]:
import os
from biashandler import *
import numpy as np
import statsmodels.api as sm
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fitter import Fitter, get_common_distributions, get_distributions
from scipy.stats import shapiro 
from scipy.stats import ks_2samp

In [2]:
# Loading of the Spanish CrowS-Pairs dataset, with original (1503),
# paraphrased (170), random (170) examples   
path_crows = os.getcwd()+'/data/crowspairs/es_en.csv'

In [3]:
#  Initiation of two BiasInfo objects. They use the same model ("BETO", see thesis)
#  and the difference between the object is only about the used metric for 
#  computing the PLLs. The two metrics are explained in the thesis (see Mcrows for "cp"
#  and Msal for "sz").
bias_cp = BiasInfo(path_crows, "beto", "cp")
bias_sz = BiasInfo(path_crows, "beto", "sz")

In [4]:
# Docs of sent_to_df is reported in biashandler.py

# PLLs for original examples using the two metrics. Remove # for computing.
df_cp_or = bias_cp.sent_to_df("sent_more", "sent_less")
df_sz_or = bias_sz.sent_to_df("sent_more", "sent_less")

Model: beto


Some weights of BertForMaskedLM were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|▏                                         | 5/1503 [00:02<12:27,  2.00it/s]


KeyboardInterrupt: 

In [5]:
# PLLs for paraphrased examples using the two metrics. Remove # for computing.
#df_cp_par = bias_cp.sent_to_df("sent_more_par", "sent_less_par")
#df_sz_par = bias_sz.sent_to_df("sent_more_par", "sent_less_par")

In [None]:
# PLLs for random examples using the two metrics. Remove # for computing.
#df_cp_ran = bias_cp.sent_to_df("sent_more_ran", "sent_less_ran")
#df_sz_ran = bias_sz.sent_to_df("sent_more_ran", "sent_less_ran")

In [None]:
# The dfs are saved as csv files. In fact, the computing of PLLs is heavy and for this
# reason the dfs are saved as csv files.
#df_cp_or.to_csv(os.getcwd()+"/data/crowspairs/cp_beto_or.csv")
#df_sz_or.to_csv(os.getcwd()+"/data/crowspairs/sz_beto_or.csv")
#df_cp_par.to_csv(os.getcwd()+"/data/crowspairs/cp_beto_par.csv")
#df_sz_par.to_csv(os.getcwd()+"/data/crowspairs/sz_beto_par.csv")
#df_cp_ran.to_csv(os.getcwd()+"/data/crowspairs/cp_beto_ran.csv")
#df_sz_ran.to_csv(os.getcwd()+"/data/crowspairs/sz_beto_ran.csv")

In [None]:
# Loading of the csv files created above
scores_cp = pd.read_csv(os.getcwd()+"/data/crowspairs/csv_experiments_ch4/cp_beto_or.csv")
scores_cp_par = pd.read_csv(os.getcwd()+"/data/crowspairs/csv_experiments_ch4/cp_beto_par.csv")
scores_cp_ran = pd.read_csv(os.getcwd()+"/data/crowspairs/csv_experiments_ch4/cp_beto_ran.csv")
scores_sal = pd.read_csv(os.getcwd()+"/data/crowspairs/csv_experiments_ch4/sz_beto_or.csv")
scores_sal_par = pd.read_csv(os.getcwd()+"/data/crowspairs/csv_experiments_ch4/sz_beto_par.csv")
scores_sal_ran = pd.read_csv(os.getcwd()+"/data/crowspairs/csv_experiments_ch4/sz_beto_ran.csv")

In [None]:
# The PLL values of original examples, paraphrased examples, and random examples are combined.
cp_scores_all = scores_to_df(scores_cp, scores_cp_par, scores_cp_ran)
sz_scores_all = scores_to_df(scores_sal, scores_sal_par, scores_sal_ran)

In [None]:
#####################################################################################################################
# Detecting bias in m
#####################################################################################################################

In [None]:
# information about the prop. In this part, only the original examples are considered. 
# (paraphrased and random examples are used for robustness analysis)
info_props(cp_scores_all, 0.05)

In [None]:
info_props(sz_scores_all, 0.05)

In [None]:
cp_scores_all

In [None]:
#####################################################################################################################
# Robustness of m
#####################################################################################################################

In [None]:
info_props(cp_scores_all.head(170), 0.05)

In [None]:
info_props(sz_scores_all, 0.05)

In [None]:
info_props(sz_scores_all.head(170), 0.05)

In [None]:
# KL, caso 580

In [None]:
shapiro(cp_scores_all.loc[:, "sent_more_score"])

In [None]:
shapiro(cp_scores_all.loc[:, "sent_less_score"])

In [None]:
shapiro(sz_scores_all.loc[:, "sent_more_score"])

In [None]:
shapiro(sz_scores_all.loc[:, "sent_less_score"])

In [None]:
ks_2samp(cp_scores_all.loc[:, "sent_more_score"], cp_scores_all.loc[:, "sent_less_score"])

In [None]:
ks_2samp(sz_scores_all.loc[:, "sent_more_score"], sz_scores_all.loc[:, "sent_less_score"])

In [None]:
# KL, 580, cp
bias_cp.kl_div(cp_scores_all.loc[:, "sent_more_score"], cp_scores_all.loc[:, "sent_less_score"] )

In [None]:
type(bias_cp.kl_div(cp_scores_all.loc[:, "sent_more_score"], cp_scores_all.loc[:, "sent_less_score"] ))

In [None]:
# KL, 580, sz
bias_cp.kl_div(sz_scores_all.loc[:, "sent_more_score"], sz_scores_all.loc[:, "sent_less_score"] )

In [None]:
plt.hist(cp_scores_all.loc[:, "sent_more_score"], bins=10, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL')
plt.ylabel('Frequency')
plt.title('Histogram of $ster_{cp}$')

# Mostra l'istogramma
plt.show()

In [None]:
plt.hist(cp_scores_all.loc[:, "sent_less_score"], bins=10, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL')
plt.ylabel('Frequency')
plt.title('Histogram of $noster_{cp}$')

# Mostra l'istogramma
plt.show()

In [None]:
plt.hist(sz_scores_all.loc[:, "sent_more_score"], bins=10, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL')
plt.ylabel('Frequency')
plt.title('Histogram of $ster_{sz}$')

# Mostra l'istogramma
plt.show()

In [None]:
plt.hist(cp_scores_all.loc[:, "sent_less_score"], bins=10, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL')
plt.ylabel('Frequency')
plt.title('Histogram of $noster_{sz}$')

# Mostra l'istogramma
plt.show()

In [None]:
sl_cp_scores = (cp_scores_all.loc[:, "sent_less_score"]).values
f_1 = Fitter(sl_cp_scores,
           distributions=['gamma',
                          'lognorm',
                          'skewnorm',
                          'genextreme',
                          "beta",
                          "burr",
                         "norm"])
f_1.fit()

In [None]:
f_1.summary()

In [None]:
f_1.get_best()

In [None]:
sl_cp_scores = (cp_scores_all.loc[:, "sent_more_score"]).values
f_2 = Fitter(sl_cp_scores,
           distributions = ['gamma',
                          'lognorm',
                          'skewnorm',
                          'genextreme',
                          "beta",
                          "burr",
                          "norm"])
f_2.fit()

In [None]:
f_2.summary()

In [None]:
f_2.get_best()

In [None]:
sl_cp_scores = (sz_scores_all.loc[:, "sent_less_score"]).values
f_3 = Fitter(sl_cp_scores,
           distributions=['gamma',
                          'lognorm',
                          'skewnorm',
                          'genextreme',
                          "beta",
                          "burr",
                          "norm"])
f_3.fit()

In [None]:
f_3.summary()

In [None]:
f_3.get_best()

In [None]:
sl_cp_scores = (sz_scores_all.loc[:, "sent_more_score"]).values
f_4 = Fitter(sl_cp_scores,
           distributions = ['gamma',
                          'lognorm',
                          'skewnorm',
                          'genextreme',
                          "beta",
                          "burr",
                          "norm"]) 
f_4.fit()

In [None]:
f_4.summary()

In [None]:
f_4.get_best()

In [None]:
# 3) Test sulla media della distribuzione delle differenze tra ster, no ster per cp e sz. Esperimenti svolti sui 580 

In [None]:
#cp_scores_all
#sz_scores_all

In [None]:
cp_scores_all

In [None]:
diff_scores_cp = cp_scores_all.loc[:,'sent_more_score'] - cp_scores_all.loc[:,'sent_less_score']
plt.hist(diff_scores_cp, bins=100, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('Valori')
plt.ylabel('Frequenza')
plt.title('Istogramma del Vettore')

# Mostra l'istogramma
plt.show()

In [None]:
shapiro(diff_scores_cp)

In [None]:
diff_scores_sz = sz_scores_all.loc[:,'sent_more_score'] - sz_scores_all.loc[:,'sent_less_score']
plt.hist(diff_scores_sz, bins=100, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('Valori')
plt.ylabel('Frequenza')
plt.title('Istogramma del Vettore')

# Mostra l'istogramma
plt.show()

In [None]:
shapiro(diff_scores_sz)

In [None]:
# Sarebbe interessante studiare gli outliers.. vediamo se c'è tempo

In [None]:
# Shapiro suggerisce che le due distribuzioni NON sono normali. Usiamo il Z score approssimato.

In [None]:
q_low = diff_scores_sz.quantile(0.01)
q_hi  = diff_scores_sz.quantile(0.99)

In [None]:

diff_scores_sz = diff_scores_sz[(diff_scores_sz < q_hi) & (diff_scores_sz > q_low)]
plt.hist(diff_scores_sz, bins=100, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL differences')
plt.ylabel('Frequency')
plt.title('Histogram of $diff_{sz}$')

# Mostra l'istogramma
plt.show()

In [None]:
shapiro(diff_scores_sz)

In [None]:
q_low = diff_scores_cp.quantile(0.01)
q_hi  = diff_scores_cp.quantile(0.99)

In [None]:

diff_scores_cp = diff_scores_cp[(diff_scores_cp < q_hi) & (diff_scores_cp > q_low)]
plt.hist(diff_scores_cp, bins=100, edgecolor='black')

# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL differences')
plt.ylabel('Frequency')
plt.title('Histogram of $diff_{cp}$')

# Mostra l'istogramma
plt.show()

In [None]:
shapiro(diff_scores_cp)

In [None]:
from statsmodels.stats.weightstats import ztest

In [None]:
ztest_Score_cp, p_value_cp = ztest(diff_scores_cp,value = 0.0, alternative='two-sided')
ztest_Score_sz, p_value_sz = ztest(diff_scores_sz,value = 0.0, alternative='two-sided')

In [None]:
p_value_cp

In [None]:
p_value_sz

In [None]:
from scipy.stats import wilcoxon
res = wilcoxon(diff_scores_cp)
res.statistic, res.pvalue

In [None]:
scores_cp_par

In [None]:
# Robustezza

In [None]:
scores_cp = scores_cp.head(170)
#scores_cp_par = pd.read_csv(os.getcwd()+"/data/cp_beto_par.csv")
#scores_cp_ran = pd.read_csv(os.getcwd()+"/data/cp_beto_ran.csv")
scores_sal = scores_sal.head(170)
#scores_sal_par = pd.read_csv(os.getcwd()+"/data/sz_beto_par.csv")
#scores_sal_ran = pd.read_csv(os.getcwd()+"/data/sz_beto_ran.csv")

In [None]:
type(bias_cp.agree(scores_cp, scores_cp_par, "my_agree", -3.5, 3.5))

In [None]:
s = sum(bias_cp.agree(scores_cp, scores_cp_par, "my_agree", -3.5, 3.5))

In [None]:
s

In [None]:
pr, p_value = s/170, test_pr_small(s, 170, 0.8, 0.05, alt = "smaller")  

In [None]:
pr

In [None]:
p_value

In [None]:
sm.stats.proportions_ztest([98,133], [170,170])

In [None]:
a = scores_cp.iloc[:,3] - scores_cp.iloc[:,4] > 0

In [None]:
b = scores_cp_par.iloc[:,3] - scores_cp_par.iloc[:,4] > 0

In [None]:
sum((scores_cp.iloc[:,3] - scores_cp.iloc[:,4]).between(-0.5, 0.5))

In [None]:
v = ((scores_cp.iloc[:,3] - scores_cp.iloc[:,4])-(scores_cp_par.iloc[:,3] - scores_cp_par.iloc[:,4]) ).between(-5, 5)

In [None]:
a = scores_cp.iloc[:,3] - scores_cp.iloc[:,4]
b = scores_cp_par.iloc[:,3] - scores_cp_par.iloc[:,4]
c = scores_cp_ran.iloc[:,3] - scores_cp_par.iloc[:,4]

In [None]:
sum(a.between(-0.25,0.25))/170 * 100

In [None]:
sum(b.between(-0.25,0.25))/170 * 100

In [None]:
plt.hist(a, bins=10, edgecolor='black')
# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL')
plt.ylabel('Frequency')
plt.title('Histogram of $diff_{or}$')
# Mostra l'istogramma
plt.show()

In [None]:
plt.hist(b, bins=10, edgecolor='black')
# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL')
plt.ylabel('Frequency')
plt.title('Histogram of $diff_{par}$')
# Mostra l'istogramma
plt.show()

In [None]:
plt.hist(c, bins=10, edgecolor='black')
# Aggiungi etichette agli assi e un titolo
plt.xlabel('PLL')
plt.ylabel('Frequency')
plt.title('Histogram of $diff_{ran}$')
# Mostra l'istogramma
plt.show()

In [None]:
ks_2samp(a, b)

In [None]:
ztest_Score, p_value = ztest((a-b), value = 0.0, alternative='two-sided')

In [None]:
ztest_Score, p_value

In [None]:
ztest_Score, p_value = ztest((a-c), value = 0.0, alternative='two-sided')

In [None]:
ztest_Score, p_value