# Statistical analysis of eval runs

## Setup

In [79]:
%matplotlib inline

In [80]:
import pandas as pd
pd.set_option("display.width", 100)
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm
import seaborn as sns

from scipy import stats
import math

from collections import Counter

# Use the fully qualified option name: the bare 'precision' pattern is
# ambiguous in pandas versions that also define 'styler.format.precision'
# and raises OptionError there.
pd.set_option('display.precision', 5)
# Render every float with exactly five decimals in displayed frames.
pd.set_option('display.float_format', lambda x: '%.5f' % x)




In [81]:

def pivot_table(table_path):
    """Load a long-format TSV of (key, qid, value) rows and widen it.

    The file is read without a header row; its three tab-separated columns
    are named ``key``, ``qid`` and ``value``. The returned frame has one row
    per ``qid`` and one column per distinct ``key``, with ``qid`` restored as
    an ordinary column.
    """
    long_rows = pd.read_csv(table_path, sep='\t', names=['key', 'qid', 'value'])
    return (
        long_rows
        .pivot(index='qid', columns='key', values='value')
        .reset_index()
    )


### effect size methods

In [82]:
def omega_squared(df_effect, ms_effect, ms_error, ss_total):
    """Return the omega-squared effect size of one ANOVA effect.

    Computes ``df_effect * (ms_effect - ms_error) / (ss_total + ms_error)``,
    i.e. (SS_effect - df_effect * MS_error) / (SS_total + MS_error).

    Args:
        df_effect: degrees of freedom of the effect.
        ms_effect: mean square of the effect.
        ms_error: mean square of the error (residual) term.
        ss_total: total sum of squares.
    """
    # The error mean square is ADDED to the total sum of squares in the
    # denominator; subtracting it (as the previous version did) inflates the
    # estimate.
    wsq = (df_effect * (ms_effect - ms_error)) / (ss_total + ms_error)
    return wsq
    

In [83]:
def omega_squared_partial(df_effect, ms_effect, ms_error, N):
    """Return the partial omega-squared effect size of one ANOVA effect.

    The value is ``df_effect * (ms_effect - ms_error)`` divided by
    ``df_effect * ms_effect + (N - df_effect) * ms_error``.
    """
    numerator = df_effect * (ms_effect - ms_error)
    denominator = df_effect * ms_effect + (N - df_effect) * ms_error
    return numerator / denominator

In [84]:
def cohens_d(m1, m2, ms_error):
    """Return the absolute gap between two means in units of ``sqrt(ms_error)``."""
    scale = math.sqrt(ms_error)
    gap = abs(m1 - m2)
    return gap / scale

In [85]:
def ci(dfe, V, num_observations, alpha=0.05):
    """Return the half-width of a two-sided Student-t confidence interval.

    Args:
        dfe: degrees of freedom of the t distribution.
        V: variance estimate.
        num_observations: number of observations the mean is based on.
        alpha: two-sided significance level. The default of 0.05 reproduces
            the previously hard-coded 95% interval (upper tail of 0.025).

    Returns:
        ``t(1 - alpha / 2, dfe) * sqrt(V / num_observations)``.
    """
    t = stats.t.ppf(1 - alpha / 2, dfe)
    return t * math.sqrt(V / num_observations)

In [86]:
def print_omegasq(anova, factor, N, roundd):
    """Print omega-squared and partial omega-squared for one ANOVA factor.

    Reads the ``mean_sq``, ``df`` and ``sum_sq`` columns of ``anova``: the
    effect is looked up under the row label ``C(<factor>)`` and the error
    term under ``Residual``; the total sum of squares is the sum of the
    ``sum_sq`` column. Both effect sizes are rounded to ``roundd`` decimals
    and printed under a header line, followed by an empty line.
    """
    term = f'C({factor})'
    ms_effect = anova.mean_sq.loc[term]
    ms_error = anova.mean_sq.loc['Residual']
    df_effect = anova.df.loc[term]
    ss_total = anova.sum_sq.sum()

    print("omega-squared\tomega-squared partial")
    wsq = round(omega_squared(df_effect, ms_effect, ms_error, ss_total), roundd)
    wsq_partial = round(omega_squared_partial(df_effect, ms_effect, ms_error, N), roundd)
    print(f"{wsq}\t\t\t{wsq_partial}")
    print()

In [87]:
def plot_levels_t(V, df, data, feature, dv,plotthese=['difference','disparity','relevance']):
    """Draw one bar chart per measure, with one bar per level of ``feature``.

    The ``difference`` panel is always drawn on the first axis and carries
    error bars of half-width ``ci(df, V, 20)``. ``disparity`` and
    ``relevance`` panels follow, in that order, when they are listed in
    ``plotthese``; each takes the next free axis, so any subset of measures
    can be requested (previously ``relevance`` was hard-wired to the third
    axis and raised when fewer axes existed).

    Args:
        V: variance estimate forwarded to ``ci``.
        df: degrees of freedom forwarded to ``ci``.
        data: DataFrame holding ``feature`` and the measure columns.
        feature: column whose levels become the bars; rows are sorted by it.
        dv: dependent-variable name; when it contains ``'ind'`` the
            ``<measure>_ind`` columns are plotted instead of ``<measure>``.
        plotthese: measures to show. The list is only read, never mutated.
    """
    # Global side effect: changes the axis-label size for later figures too.
    plt.rcParams["axes.labelsize"] = 13

    datasorted = data.sort_values(by=feature)

    # NOTE(review): 20 observations per level and exactly two levels are
    # hard-coded here; a feature with a different number of levels will not
    # match the length of `yerr`.
    cis = [ci(df, V, 20)] * 2

    extras = [measure for measure in ('disparity', 'relevance') if measure in plotthese]
    # Keep the original column count, but never fewer axes than panels drawn.
    ncols = max(len(plotthese), 1 + len(extras))
    # squeeze=False always yields a 2-D array of Axes, so indexing works the
    # same way for one panel and for several.
    fig, axs = plt.subplots(1, ncols, figsize=(20, 10), squeeze=False)
    axs = axs[0]

    addition = '_ind' if 'ind' in dv else ''
    label_style = dict(label_type='center', fmt="%.2f", color='white', fontsize=13)

    _ = sns.barplot(x=feature, y=f'difference{addition}', ci=None, yerr=cis, data=datasorted, ax=axs[0])
    # With `yerr` the bars are labelled from containers[1]; presumably the
    # error-bar container is registered first - confirm if matplotlib changes.
    _ = axs[0].bar_label(axs[0].containers[1], **label_style)

    for position, measure in enumerate(extras, start=1):
        panel = axs[position]
        _ = sns.barplot(x=feature, y=f'{measure}{addition}', ci=None, data=datasorted, ax=panel)
        _ = panel.bar_label(panel.containers[0], **label_style)


In [88]:
def sample_mean(a,b):
    """Return the mean of the element-wise differences ``a - b``."""
    paired_diff = a - b
    return paired_diff.mean()

In [89]:
def bigV(a,b):
    """Return the sample variance of the paired differences ``a - b``.

    Squared deviations from the mean difference are summed and divided by
    ``len(a) - 1``.
    """
    mean_diff = sample_mean(a, b)
    deviations = (a - b) - mean_diff
    squared_total = (deviations ** 2).sum()
    return squared_total / (len(a) - 1)

In [90]:
def effect_size_t_test(a,b,roundd=2):
    """Return |mean(a - b)| / sqrt(bigV(a, b)), rounded to ``roundd`` decimals."""
    mean_diff = sample_mean(a, b)
    variance = bigV(a, b)
    standardised = abs(mean_diff) / math.sqrt(variance)
    return round(standardised, roundd)

## LM with post-processing

### Load files

In [91]:
# Directory holding the per-query evaluation output, and the qrels tag that
# every result file name shares.
EVAL_DIR = '../evaluation/resources/2020/eval_results/eval/'
QRELS_TAG = '_TREC-Fair-Ranking-eval-sample-full-annotations-DocLevel-mixed_group-qrels_EEL'
METRICS = ('difference', 'disparity', 'relevance')


def _load_metrics(run_name, file_suffix, column_suffix):
    """Pivot one result file and append ``column_suffix`` to its metric columns."""
    path = f'{EVAL_DIR}{run_name}{QRELS_TAG}{file_suffix}.tsv'
    renames = {metric: f'{metric}{column_suffix}' for metric in METRICS}
    return pivot_table(path).rename(renames, axis=1)


def _load_system(run_name, system):
    """Load the ``..._EEL.tsv`` and ``..._EEL_ind.tsv`` files of one run.

    Returns ``(eel, err, merged)``: the two pivoted frames (metric columns
    suffixed ``_e`` and ``_i`` respectively) and their merge on ``qid`` with
    a constant ``system`` column added.
    """
    eel = _load_metrics(run_name, '', '_e')
    err = _load_metrics(run_name, '_ind', '_i')
    merged = pd.merge(err, eel, on='qid')
    merged['system'] = system
    return eel, err, merged


# Same three systems, file names and kernel variable names as before; only
# the copy-pasted load/rename/merge sequence is factored out.
base_eel, base_err, base = _load_system('lambdamart_mrfr_eval_lm_ndcg_nofeat_random_state=0', 'base')
rfr_eel, rfr_err, rfr = _load_system('mrfr_default_lm_eval_auth_ignore', 'rfre')
ac_eel, ac_err, ac = _load_system('ac_controller_default_lm_eval_auth_ignore_9_linear', 'ac')

# Long table with one row per (system, qid).
ppdata = pd.concat([base, rfr, ac])

In [92]:
# Preview the first rows of the merged per-query table labelled 'base'.
base.head()

key,qid,difference_i,disparity_i,relevance_i,difference_e,disparity_e,relevance_e,system
0,95,1.47009,1.33333,0.0,1.63203,1.70608,0.69449,base
1,612,1.73004,1.33236,0.02165,0.20394,2.9405,2.25464,base
2,657,2.10123,1.33333,0.00688,3.74959,2.36969,0.10019,base
3,771,1.93125,1.32943,0.09012,1.84176,3.64599,1.3753,base
4,822,0.667,1.07943,0.49337,0.11004,1.40178,1.53204,base


In [93]:
# Preview the first rows of the merged per-query table labelled 'rfre'.
rfr.head()

key,qid,difference_i,disparity_i,relevance_i,difference_e,disparity_e,relevance_e,system
0,95,0.15414,0.02596,0.00428,0.15507,1.97292,1.56638,rfre
1,612,0.56617,0.2438,0.0593,0.83268,1.84165,1.39085,rfre
2,657,0.8084,0.18259,0.07792,2.41469,1.54883,0.35721,rfre
3,771,0.72244,0.28937,0.1745,0.14628,1.76154,1.28082,rfre
4,822,0.47677,0.43898,0.26826,0.27543,1.66227,1.57959,rfre


In [94]:
# Preview the first rows of the merged per-query table labelled 'ac'.
ac.head()

key,qid,difference_i,disparity_i,relevance_i,difference_e,disparity_e,relevance_e,system
0,95,0.99926,0.8625,0.0,0.14837,2.0303,1.59842,ac
1,612,0.52184,0.26427,0.09171,0.7766,1.75416,1.37515,ac
2,657,0.84467,0.16673,0.05186,2.3912,1.45757,0.32332,ac
3,771,0.77716,0.30603,0.15548,0.22145,1.88787,1.30639,ac
4,822,0.50549,0.41273,0.24078,0.71759,1.51177,1.28326,ac


### Performance

In [95]:
# Per-system mean of every column, rounded to three decimals.
# NOTE(review): `qid` is averaged as well; that column of the result is the
# mean of an identifier and carries no meaning.
ppdata.groupby('system').mean().round(3).reset_index()

key,system,qid,difference_i,disparity_i,relevance_i,difference_e,disparity_e,relevance_e
0,ac,43107.525,0.577,0.336,0.181,0.437,1.863,1.334
1,base,43107.525,1.422,1.248,0.215,0.855,2.209,1.297
2,rfre,43107.525,0.612,0.336,0.164,0.45,1.872,1.332


### Select data

In [122]:
# Labels of the `system` column used to pick the frames that are compared.
s1, s2, s3 = 'base', 'ac', 'rfre'

### ac

In [123]:
# `stats` is already imported in the setup cell, so it is not re-imported here.
# Rows of the two compared systems, re-indexed from 0 so they pair by position.
a = ppdata.query(f'system == "{s1}"').reset_index(drop=True)
b = ppdata.query(f'system == "{s2}"').reset_index(drop=True)
# The paired statistics in the following cells match rows by position, so both
# systems must list the same queries in the same order.
assert a['qid'].equals(b['qid']), "systems are not aligned on qid; paired statistics would be invalid"

#### t-test eel

In [124]:
# Paired t-test on the `difference_e` column (loaded from the *_EEL.tsv files).
# NOTE(review): the samples are passed as (a, b) whereas the mean difference
# `sm` in the next cell is computed as b - a, so the two have opposite signs.
stats.ttest_rel(a['difference_e'], b['difference_e'])

Ttest_relResult(statistic=8.0742324496481, pvalue=6.369959840783443e-14)

In [125]:
# Per-query paired differences, oriented as b minus a.
differences = b['difference_e'] - a['difference_e']
# Mean and sample variance of those differences.
sm = differences.mean()
sv = bigV(a['difference_e'], b['difference_e'])

In [104]:
# Half-width of the t-based interval around the mean paired difference,
# using n - 1 degrees of freedom for n pairs.
c = ci(dfe=len(differences) - 1, V=sv, num_observations=len(differences))
cl = sm - c
ch = sm + c
(round(cl, 2), round(ch, 2))

(-0.52, -0.32)

In [105]:
# Mean paired difference (b minus a) for `difference_e`.
sm

-0.41845890999999996

In [106]:
# Sample variance of the paired differences for `difference_e`.
sv

0.5371964552022632

In [107]:
# Effect size for `difference_e`, rounded to two decimals.
effect_size_t_test(a['difference_e'], b['difference_e'], roundd=2)

0.57

#### t-test eel_ind

In [109]:
# Paired t-test on the `difference_i` column (loaded from the *_EEL_ind.tsv files).
# NOTE(review): the samples are passed as (a, b) whereas the mean difference
# `sm` in the next cell is computed as b - a, so the two have opposite signs.
stats.ttest_rel(a['difference_i'], b['difference_i'])

Ttest_relResult(statistic=25.3825428856392, pvalue=2.5809112278064424e-64)

In [110]:
# Per-query paired differences, oriented as b minus a.
differences = b['difference_i'] - a['difference_i']
# Mean and sample variance of those differences.
sm = differences.mean()
sv = bigV(a['difference_i'], b['difference_i'])

In [111]:
# Half-width of the t-based interval around the mean paired difference,
# using n - 1 degrees of freedom for n pairs.
c = ci(dfe=len(differences) - 1, V=sv, num_observations=len(differences))
cl = sm - c
ch = sm + c
(round(cl, 2), round(ch, 2))

(-0.91, -0.78)

In [112]:
# Mean paired difference (b minus a) for `difference_i`.
sm

-0.84534248

In [113]:
# Sample variance of the paired differences for `difference_i`.
sv

0.22183247548432122

In [114]:
# Effect size for `difference_i`, rounded to two decimals.
effect_size_t_test(a['difference_i'], b['difference_i'], roundd=2)

1.79

### rfre

In [126]:
# `stats` is already imported in the setup cell, so it is not re-imported here.
# Rows of the two compared systems, re-indexed from 0 so they pair by position.
a = ppdata.query(f'system == "{s1}"').reset_index(drop=True)
b = ppdata.query(f'system == "{s3}"').reset_index(drop=True)
# The paired statistics in the following cells match rows by position, so both
# systems must list the same queries in the same order.
assert a['qid'].equals(b['qid']), "systems are not aligned on qid; paired statistics would be invalid"

#### t-test eel

In [127]:
# Paired t-test on the `difference_e` column (loaded from the *_EEL.tsv files).
# NOTE(review): the samples are passed as (a, b) whereas the mean difference
# `sm` in the next cell is computed as b - a, so the two have opposite signs.
stats.ttest_rel(a['difference_e'], b['difference_e'])

Ttest_relResult(statistic=6.86591530923515, pvalue=8.227819845911793e-11)

In [128]:
# Per-query paired differences, oriented as b minus a.
differences = b['difference_e'] - a['difference_e']
# Mean and sample variance of those differences.
sm = differences.mean()
sv = bigV(a['difference_e'], b['difference_e'])

In [129]:
# Half-width of the t-based interval around the mean paired difference,
# using n - 1 degrees of freedom for n pairs.
c = ci(dfe=len(differences) - 1, V=sv, num_observations=len(differences))
cl = sm - c
ch = sm + c
(round(cl, 2), round(ch, 2))

(-0.52, -0.29)

In [130]:
# Mean paired difference (b minus a) for `difference_e`.
sm

-0.405755095

In [131]:
# Sample variance of the paired differences for `difference_e`.
sv

0.698491419103403

In [132]:
# Effect size for `difference_e`, rounded to two decimals.
effect_size_t_test(a['difference_e'], b['difference_e'], roundd=2)

0.49

#### t-test eel_ind

In [133]:
# Paired t-test on the `difference_i` column (loaded from the *_EEL_ind.tsv files).
# NOTE(review): the samples are passed as (a, b) whereas the mean difference
# `sm` in the next cell is computed as b - a, so the two have opposite signs.
stats.ttest_rel(a['difference_i'], b['difference_i'])

Ttest_relResult(statistic=21.96122571081236, pvalue=4.416650349448093e-55)

In [134]:
# Per-query paired differences, oriented as b minus a.
differences = b['difference_i'] - a['difference_i']
# Mean and sample variance of those differences.
sm = differences.mean()
sv = bigV(a['difference_i'], b['difference_i'])

In [135]:
# Half-width of the t-based interval around the mean paired difference,
# using n - 1 degrees of freedom for n pairs.
c = ci(dfe=len(differences) - 1, V=sv, num_observations=len(differences))
cl = sm - c
ch = sm + c
(round(cl, 2), round(ch, 2))

(-0.88, -0.74)

In [136]:
# Mean paired difference (b minus a) for `difference_i`.
sm

-0.8105962599999998

In [137]:
# Sample variance of the paired differences for `difference_i`.
sv

0.27247460764614306

In [138]:
# Effect size for `difference_i`, rounded to two decimals.
effect_size_t_test(a['difference_i'], b['difference_i'], roundd=2)

1.55