# Statistical analysis eval runs

## Setup

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option("display.width", 100)
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm
import seaborn as sns

from scipy import stats
import math

from collections import Counter

pd.set_option('precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x)




### Load files

In [3]:
experiments = pd.read_csv('../evaluation/resources/2020/all_experiments_eval.csv')
experiments.source = experiments.source.apply(lambda row: row.replace('_TREC-Fair-Ranking-eval-sample-full-annotations-DocLevel-mixed_group-qrels_EEL',''))
experiments

Unnamed: 0,difference,disparity,relevance,difference_ind,disparity_ind,relevance_ind,qid,source,system
0,0.35200,2.93676,1.94984,1.11692,0.98604,0.00293,95,ac_controller_default_meta_9_eval_old_config,META+AC_auth_ignore
1,0.99169,1.73147,1.25625,0.37762,0.12248,0.09292,612,ac_controller_default_meta_9_eval_old_config,META+AC_auth_ignore
2,3.70055,2.63171,0.25572,0.93072,0.21487,0.03291,657,ac_controller_default_meta_9_eval_old_config,META+AC_auth_ignore
3,0.29945,1.98737,1.31715,0.67458,0.17690,0.14219,771,ac_controller_default_meta_9_eval_old_config,META+AC_auth_ignore
4,0.71517,1.65668,1.35692,0.57742,0.34905,0.17297,822,ac_controller_default_meta_9_eval_old_config,META+AC_auth_ignore
...,...,...,...,...,...,...,...,...,...
1795,0.05434,1.86368,1.61798,0.88898,1.06764,0.18810,119231,relevance_ranker_eval_TEXT_eval,TEXT
1796,0.01025,2.05438,1.91161,0.59300,1.08332,0.63665,123512,relevance_ranker_eval_TEXT_eval,TEXT
1797,0.28198,1.23715,0.85092,0.98350,1.08307,0.13868,126740,relevance_ranker_eval_TEXT_eval,TEXT
1798,0.03991,1.18031,1.05937,0.54097,1.06769,0.55050,128050,relevance_ranker_eval_TEXT_eval,TEXT


In [4]:
trec_runs = pd.read_csv('../evaluation/resources/2020/all_experiments_eval_tr.csv')
trec_runs.source = trec_runs.source.apply(lambda row: row.replace('_TREC-Fair-Ranking-eval-sample-full-annotations-DocLevel-mixed_group-qrels_EEL',''))
trec_runs['system'] = 'TREC mean'
trec_runs

Unnamed: 0,difference,disparity,relevance,difference_ind,disparity_ind,relevance_ind,qid,source,system
0,1.51007,1.34901,0.57693,1.37280,1.25664,0.01030,95,trec_run.Deltr-gammas,TREC mean
1,0.07098,1.98317,1.84246,0.93773,0.96372,0.23349,612,trec_run.Deltr-gammas,TREC mean
2,3.05426,1.75627,0.14114,2.07657,1.31770,0.01139,657,trec_run.Deltr-gammas,TREC mean
3,0.66066,2.41739,1.35155,2.08090,1.33319,0.01718,771,trec_run.Deltr-gammas,TREC mean
4,0.01574,2.10948,1.93304,0.63216,0.93686,0.43951,822,trec_run.Deltr-gammas,TREC mean
...,...,...,...,...,...,...,...,...,...
4595,0.26294,2.81128,1.98748,1.27396,1.26686,0.09522,119231,trec_run.UW_Kr_r60g20c20,TREC mean
4596,0.25114,3.36546,2.44670,1.72420,1.31765,0.18821,123512,trec_run.UW_Kr_r60g20c20,TREC mean
4597,0.98612,1.65765,0.70910,0.94758,1.07918,0.15469,126740,trec_run.UW_Kr_r60g20c20,TREC mean
4598,0.61913,1.11836,0.73879,0.54767,1.06771,0.54716,128050,trec_run.UW_Kr_r60g20c20,TREC mean


In [5]:
trec_means = pd.merge(trec_runs.groupby('qid').mean().reset_index(), trec_runs[['qid','system']].drop_duplicates(), on='qid',how='left')
trec_means

Unnamed: 0,qid,difference,disparity,relevance,difference_ind,disparity_ind,relevance_ind,system
0,95,0.60857,2.74279,1.72457,1.06185,0.93675,0.00582,TREC mean
1,612,1.01391,2.43918,1.59900,1.22390,0.91528,0.06618,TREC mean
2,657,3.40445,2.35234,0.26407,1.55008,0.86590,0.04874,TREC mean
3,771,1.00722,2.72797,1.33356,1.51243,0.98721,0.12842,TREC mean
4,822,0.65296,2.20431,1.66184,1.25084,0.99320,0.15834,TREC mean
...,...,...,...,...,...,...,...,...
195,119231,0.35149,2.34169,1.70841,0.77897,0.83099,0.12478,TREC mean
196,123512,0.42510,2.49884,1.92640,1.21086,0.93900,0.25556,TREC mean
197,126740,0.48605,1.60679,0.93370,0.85543,0.83660,0.07948,TREC mean
198,128050,0.80032,1.39898,0.78851,0.78801,0.85329,0.31978,TREC mean


In [6]:
trec_means.mean()

  """Entry point for launching an IPython kernel.


qid              43107.52500
difference           0.74901
disparity            2.16609
relevance            1.32899
difference_ind       1.17066
disparity_ind        0.90616
relevance_ind        0.16966
dtype: float64

### effect size methods

In [7]:
def omega_squared(ms_effect, ms_error, df_effect, ss_total, roundd=True):
    wsq = (df_effect * (ms_effect - ms_error)) / (ss_total - ms_error)
    if roundd:
        return round(wsq,5)
    return wsq
    

In [8]:
def omega_squared_partial(ms_effect, ms_error, df_effect, total_observations, roundd=True):
    wsqp = (df_effect * (ms_effect - ms_error)) / (df_effect * ms_effect + (total_observations - df_effect) * ms_error)
    if roundd:
        return round(wsqp,5)
    return wsqp

In [9]:
def eta_squared(ss_effect, ss_total, roundd=True):
    if roundd:
        return round(ss_effect/ss_total,5)
    else:
        return ss_effect/ss_total

In [10]:
def eta_squared_partial(ss_effect, ss_error, roundd=True):
    if roundd:
        return round(ss_effect/(ss_effect + ss_error),5)
    else:
        return ss_effect/(ss_effect + ss_error)

In [11]:
def cohens_d(m1, m2, ms_error):
    return abs(m1 - m2) / math.sqrt(ms_error)

In [12]:
def ci(m, num_observations, ms_error):
    t = stats.t.ppf(1-0.025, num_observations)
    return t*math.sqrt(ms_error/num_observations)

In [13]:
def print_etasq(factor):
    ss_effect = anova_lm(lm).sum_sq.loc[f'C({factor})']
    ss_error = anova_lm(lm).sum_sq.loc['Residual']
    ss_total = anova_lm(lm).sum_sq.sum()

    print(factor)
    
    print(f"eta-squared\teta-squared partial")
    print(f"{eta_squared(ss_effect, ss_total)}\t\t\t{eta_squared_partial(ss_effect, ss_error)}")
    print()

In [14]:
def print_omegasq(factor, data, roundd=True):
    ms_effect = anova_lm(lm).mean_sq.loc[f'C({factor})']
    ms_error = anova_lm(lm).mean_sq.loc['Residual']
    df_effect = anova_lm(lm).df.loc[f'C({factor})']
    ss_total = anova_lm(lm).sum_sq.sum()
    N = len(data)
    
    print(ss_total - ms_error)
    print(df_effect * ms_effect + (N - df_effect) * ms_error)
    
    print(factor)
    
    print(f"omega-squared\tomega-squared partial")
    print(f"{omega_squared(ms_effect, ms_error, df_effect, ss_total,roundd)}\t\t\t{omega_squared_partial(ms_effect, ms_error, df_effect, N,roundd)}")
    print()

In [28]:
def effect_size_t_test(a,b,roundd=2):
    differences = a - b
    sample_mean = differences.sum()/len(differences)
    sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
    return round(abs(sample_mean) / math.sqrt(sample_variance),roundd)

In [46]:
stats.ttest_rel(experiments.query('system == "TEXT"')['difference'].reset_index(drop=True), experiments.query('system == "META"')['difference'].reset_index(drop=True))

Ttest_relResult(statistic=1.788084384591603, pvalue=0.07528385360281753)

The difference in scores is significant, p = 0.00000 < 0.05.

In [47]:
effect_size_t_test(experiments.query('system == "TEXT"')['difference'].reset_index(drop=True), experiments.query('system == "META"')['difference'].reset_index(drop=True),roundd=3)

0.127

In [43]:
differences = experiments.query('system == "TEXT"')['difference'].reset_index(drop=True) - experiments.query('system == "META"')['difference'].reset_index(drop=True)
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [44]:
c = ci(sample_mean,len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,2), round(ch,2)

(-0.01, 0.14)

In [45]:
round(sample_mean,3)

0.066

In [34]:
round(sample_variance,2)

0.27

In [35]:
stats.ttest_rel(experiments.query('system == "TEXT"')['difference_ind'].reset_index(drop=True), experiments.query('system == "META"')['difference_ind'].reset_index(drop=True))

Ttest_relResult(statistic=1.5044964503300546, pvalue=0.13403905334555483)

The difference in scores is significant, p = 0.00000 < 0.05.

In [36]:
effect_size_t_test(experiments.query('system == "TEXT"')['difference_ind'].reset_index(drop=True), experiments.query('system == "META"')['difference_ind'].reset_index(drop=True),roundd=3)

0.107

In [37]:
differences = experiments.query('system == "TEXT"')['difference_ind'].reset_index(drop=True) - experiments.query('system == "META"')['difference_ind'].reset_index(drop=True)
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [38]:
c = ci(sample_mean,len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,2), round(ch,2)

(-0.01, 0.06)

In [39]:
round(sample_mean,2)

0.03

In [40]:
round(sample_variance,2)

0.06

In [6]:
experiments.query('system == "TEXT" or system  == "META"').groupby('system').mean()

Unnamed: 0_level_0,difference,disparity,relevance,difference_ind,disparity_ind,relevance_ind,qid
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
META,0.84711,2.31496,1.35438,1.4482,1.25602,0.20582,43107.525
TEXT,0.91308,2.35879,1.34331,1.47438,1.25879,0.19412,43107.525


## Overall performance

In [None]:
experiments.drop('qid',axis=1).groupby('system').mean()

## Is there a significant improvement in the performance of LambdaMart_base + vs LambdaMart_mpt_15_0.9?

### Select data

In [None]:
s1 = 'LM_base'
s2 = 'LM_mpt_15_0.9'

In [None]:
data = experiments.query(f'system == "{s1}" or system == "{s2}"')
data = data.reset_index(drop=True)
data

In [None]:
data.groupby('system').mean()

Augmented version is worse lol.

In [None]:
from scipy import stats
a = data.query(f'system == "{s1}"').reset_index(drop=True)
b = data.query(f'system == "{s2}"').reset_index(drop=True)

### t-test eel

In [None]:
stats.ttest_rel(b['difference'], a['difference'])

NOT significant!

In [None]:
differences = b['difference'] - a['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
c = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,2), round(ch,2)

In [None]:
sample_mean

In [None]:
sample_variance

### t-test eel_ind

In [None]:
stats.ttest_rel(b['difference_ind'], a['difference_ind'])

The difference in scores is significant, p = 0.020 < 0.05.

In [None]:
round(effect_size_t_test(a['difference_ind'],b['difference_ind']),2)

The effect size is small.

In [None]:
differences = b['difference_ind'] - a['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
c = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,3), round(ch,2)

In [None]:
round(sample_mean,3)

In [None]:
round(sample_variance,2)

### Visualize means of the samples

In [None]:
fig, axs = plt.subplots(1,1,figsize=(10,10))
# for ax in axs:
#     ax.set_ylim(0, ymax)

# axs[0].set_title('Mean EEL (economic level)')
# axs[1].set_title('Mean EEL (individual)')

_ = sns.barplot(x='system', y='difference',data=data,ax=axs)
# _ = sns.barplot(x='system', y='difference_ind',data=data,ax=axs)


In [None]:
fig, axs = plt.subplots(1,1,figsize=(10,10))
# for ax in axs:
#     ax.set_ylim(0, ymax)

# axs[0].set_title('Mean EEL (economic level)')
# axs[1].set_title('Mean EEL (individual)')

# _ = sns.barplot(x='system', y='difference',data=data,ax=axs)
_ = sns.barplot(x='system', y='difference_ind',data=data,ax=axs)


## Is there a significant improvement in the performance of LambdaMart + RFR vs LambdaMart without RFR?

### Select data

In [None]:
s1 = 'LM_base'
s2 = 'LM_base+RFR_doc'

In [None]:
data = experiments.query(f'system == "{s1}" or system == "{s2}"')
data = data.reset_index(drop=True)
data

In [None]:
from scipy import stats
a = data.query(f'system == "{s1}"').reset_index(drop=True)
b = data.query(f'system == "{s2}"').reset_index(drop=True)

### t-test eel

In [None]:
stats.ttest_rel(a['difference'], b['difference'])

The difference in scores is significant, p = 0.00000 < 0.05.

In [None]:
effect_size_t_test(a['difference'],b['difference'],roundd=3)

The effect size is moderate.

In [None]:
differences = a['difference'] - b['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
c = ci(sample_mean,len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,2), round(ch,2)

In [None]:
round(sample_mean,2)

In [None]:
round(sample_variance,2)

### t-test eel_ind

In [None]:
stats.ttest_rel(a['difference_ind'], b['difference_ind'])

The difference in scores is significant, p = 0.00000 < 0.05.

In [None]:
round(effect_size_t_test(a['difference_ind'],b['difference_ind']),2)

In [None]:
differences = a['difference_ind'] - b['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
c = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,2), round(ch,2)

In [None]:
round(sample_mean,2)

In [None]:
round(sample_variance,2)

### Visualize means of the samples

In [None]:
fig, axs = plt.subplots(1,2,figsize=(20,10))
# for ax in axs:
#     ax.set_ylim(0, ymax)

axs[0].set_title('Mean EEL (economic level)')
axs[1].set_title('Mean EEL (individual)')

_ = sns.barplot(x='system', y='difference',data=data,ax=axs[0])
_ = sns.barplot(x='system', y='difference_ind',data=data,ax=axs[1])


## Is there a significant improvement in the performance of AC base vs AC doc to doc for META?

### Select data

In [None]:
experiments.system.unique()

In [None]:
s1 = 'META+AC_auth_ignore'
s2 = 'META+AC_doc'

In [None]:
data = experiments.query(f'system == "{s1}" or system == "{s2}"')
data = data.reset_index(drop=True)
data

In [None]:
from scipy import stats
a = data.query(f'system == "{s1}"').reset_index(drop=True)
b = data.query(f'system == "{s2}"').reset_index(drop=True)

### t-test eel

In [None]:
stats.ttest_rel(a['difference'], b['difference'])

The difference in scores is NOT significant, p = 0.059 > 0.05.

In [None]:
differences = a['difference'] - b['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
effect_size_t_test(a['difference'], b['difference'])

In [None]:
c = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,5), round(ch,3)

In [None]:
round(sample_mean,3)

In [None]:
round(sample_variance,3)

### t-test eel_ind

In [None]:
stats.ttest_rel(a['difference_ind'], b['difference_ind'])

The difference in scores is significant, p = 0.00000 < 0.05.

In [None]:
effect_size_t_test(a['difference_ind'], b['difference_ind'])

The effect size is moderate.

In [None]:
differences = a['difference_ind']- b['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
c = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,3), round(ch,3)

In [None]:
round(sample_mean,3)

In [None]:
round(sample_variance,3)

In [None]:
data.groupby('system').mean()

### Visualize means of the samples

In [None]:
fig, axs = plt.subplots(1,2,figsize=(20,10))

axs[0].set_title('Mean EEL (economic level)')
axs[1].set_title('Mean EEL (individual)')

sns.barplot(x='system', y='difference',data=data,ax=axs[0])
sns.barplot(x='system', y='difference_ind',data=data,ax=axs[1])


## Is there a significant improvement in the performance of AC base vs AC doc to doc for TEXT?

### Select data

In [None]:
experiments.system.unique()

In [None]:
s1 = 'TEXT+AC_auth_ignore'
s2 = 'TEXT+AC_doc'

In [None]:
data = experiments.query(f'system == "{s1}" or system == "{s2}"')
data = data.reset_index(drop=True)
data

In [None]:
from scipy import stats
a = data.query(f'system == "{s1}"').reset_index(drop=True)
b = data.query(f'system == "{s2}"').reset_index(drop=True)

### t-test eel

In [None]:
stats.ttest_rel(a['difference'],b['difference'])

The difference in scores is significant, p = 0.00000 < 0.05.

In [None]:
effect_size_t_test(a['difference'],b['difference'])

The effect size is moderate.

In [None]:
differences = a['difference'] - b['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
c = ci(sample_mean,len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,4), round(ch,4)

In [None]:
round(sample_mean,3)

In [None]:
round(sample_variance,3)

### t-test eel_ind

In [None]:
stats.ttest_rel(a['difference_ind'],b['difference_ind'])

The difference in scores is significant, p = 0.00000 < 0.05.

In [None]:
effect_size_t_test(a['difference_ind'],b['difference_ind'],roundd=3)

The effect size is moderate.

In [None]:
differences = a['difference_ind']-b['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)

In [None]:
c = ci(sample_mean,len(differences), sample_variance)
cl, ch = (sample_mean - c, sample_mean + c)
round(cl,3), round(ch,2)

In [None]:
round(sample_mean,4)

In [None]:
round(sample_variance,3)

### Visualize means of the samples

In [None]:
fig, axs = plt.subplots(1,2,figsize=(20,10))

axs[0].set_title('Mean EEL (economic level)')
axs[1].set_title('Mean EEL (individual)')

sns.barplot(x='system', y='difference',data=data,ax=axs[0])
sns.barplot(x='system', y='difference_ind',data=data,ax=axs[1])


## Is there a significant difference between the TREC mean and AC(dtd) or RFR(dtd)?

### Select data

In [None]:
experiments.system.unique()

In [None]:
experiments.groupby('system').mean()

In [None]:
s1 = 'LM_mpt_15_0.9'
s2 = 'LM_base+RFR_doc'
s3 = 'META+AC_doc'
s4 = 'TEXT+AC_doc'
s5 = 'TREC mean'

In [None]:
data = experiments.query(f'system == "{s1}" or system == "{s2}" or system == "{s3}" or system == "{s4}"')
data = pd.concat([data,trec_means])
data = data.reset_index(drop=True)

data

In [None]:
data.groupby('system').mean()

In [None]:
from scipy import stats
a = data.query(f'system == "{s1}"').reset_index(drop=True)
b = data.query(f'system == "{s2}"').reset_index(drop=True)
c = data.query(f'system == "{s3}"').reset_index(drop=True)
d = data.query(f'system == "{s4}"').reset_index(drop=True)
e = data.query(f'system == "{s5}"').reset_index(drop=True)

#### t-test eel

In [None]:
stats.ttest_rel(e['difference'], a['difference'])

In [None]:
stats.ttest_rel(e['difference'], b['difference'])

In [None]:
stats.ttest_rel(e['difference'], c['difference'])

In [None]:
stats.ttest_rel(e['difference'], d['difference'])

The difference in scores is significant, p = 0.00000 < 0.05.

In [None]:
effect_size_t_test(a['difference'], e['difference'])

In [None]:
effect_size_t_test(b['difference'], e['difference'])

In [None]:
effect_size_t_test(c['difference'], e['difference'])

In [None]:
effect_size_t_test(d['difference'], e['difference'])

The effect size is LARGE.

In [None]:
differences = e['difference'] - a['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))

In [None]:
differences = e['difference'] - b['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))

In [None]:
differences = e['difference'] - c['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))

In [None]:
differences = e['difference'] - d['difference']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))

#### t-test eel ind

In [None]:
stats.ttest_rel(a['difference_ind'], e['difference_ind'])

In [None]:
stats.ttest_rel(b['difference_ind'], e['difference_ind'])

In [None]:
stats.ttest_rel(c['difference_ind'], e['difference_ind'])

In [None]:
stats.ttest_rel(d['difference_ind'], e['difference_ind'])

The difference_ind in scores is significant, p = 0.00000 < 0.05.

In [None]:
effect_size_t_test(a['difference_ind'], e['difference_ind'])

In [None]:
effect_size_t_test(b['difference_ind'], e['difference_ind'])

In [None]:
effect_size_t_test(c['difference_ind'], e['difference_ind'])

In [None]:
effect_size_t_test(d['difference_ind'], e['difference_ind'])

The effect size is LARGE.

In [None]:
differences = e['difference_ind'] - a['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))

In [None]:
differences = e['difference_ind'] - b['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))

In [None]:
differences = e['difference_ind'] - c['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))

In [None]:
differences = e['difference_ind'] - d['difference_ind']
sample_mean = differences.sum()/len(differences)
sample_variance = (differences - sample_mean).apply(lambda x: x*x).sum()/len(differences - 1)
cc = ci(sample_mean, len(differences), sample_variance)
cl, ch = (sample_mean - cc, sample_mean + cc)

print(round(cl,4),',', round(ch,4))
print(round(sample_mean,3))
print(round(sample_variance,3))