In [1]:
cd ..

/home/max/Documents/remb


In [2]:
import pandas as pd
from scipy.stats import ttest_rel
from statsmodels.stats.multitest import multipletests

In [3]:
def read_grouped_results(path, run=None):
    """Load a grouped-results CSV and keep only the metric columns of the last epoch.

    Parameters
    ----------
    path : str or path-like
        CSV file to read. It must contain the columns ``epoch``, ``onto``,
        ``run`` and ``elapsed``; every other column is returned untouched.
    run : optional
        When given, only rows whose ``run`` column equals this value are kept.
        ``None`` (the default) keeps rows from every run.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``epoch`` equals the largest epoch among the (optionally
        run-filtered) rows, without the ``epoch``, ``onto``, ``run`` and
        ``elapsed`` columns, re-indexed from 0.
    """
    results = pd.read_csv(path)

    if run is not None:
        belongs_to_run = results.run == run
        results = results[belongs_to_run]

    # The maximum is taken after the run filter, so it is the last epoch of
    # the selected run rather than of the whole file.
    last_epoch = results.epoch.max()
    final_rows = results[results.epoch == last_epoch]

    bookkeeping_columns = ['epoch', 'onto', 'run', 'elapsed']
    metrics_only = final_rows.drop(columns=bookkeeping_columns)
    return metrics_only.reset_index(drop=True)

def paired_test(metrics1, metrics2, *, alpha):
    """Run a two-sided paired t-test per metric column and Holm-correct the p-values.

    Parameters
    ----------
    metrics1 : pandas.DataFrame
        One column per metric, one row per paired observation. Its column
        labels name the rows of the returned table.
    metrics2 : pandas.DataFrame or array-like
        Second sample with the same number of rows. Row ``i`` is paired with
        row ``i`` of ``metrics1`` purely by position. When it is a DataFrame
        whose columns differ from ``metrics1.columns`` (for example in order),
        it is re-selected by ``metrics1.columns`` so that each metric is
        compared with itself.
    alpha : float
        Significance level, used both for the uncorrected decision and as the
        family-wise level given to ``multipletests``.

    Returns
    -------
    result : pandas.DataFrame
        Columns ``metric``, ``tstatistic``, ``pvalue``, ``reject_null``
        (``pvalue < alpha``), ``pvalue_corr`` and ``reject_null_corr``
        (both from the Holm procedure).
    alpha_corr : float
        Fourth value returned by ``multipletests`` (its Bonferroni-corrected
        alpha).

    Raises
    ------
    AssertionError
        If the two samples have a different number of rows.
    KeyError
        If ``metrics2`` is a DataFrame lacking one of ``metrics1``'s columns.
    """
    assert len(metrics1) == len(metrics2), 'paired samples must have the same number of rows'

    # ttest_rel works on the underlying arrays and therefore pairs columns by
    # position, while the result below is labelled with metrics1.columns.
    # Align by label first so a differently ordered metrics2 cannot produce
    # silently mislabelled statistics.
    if isinstance(metrics2, pd.DataFrame) and not metrics2.columns.equals(metrics1.columns):
        metrics2 = metrics2[metrics1.columns]

    test = ttest_rel(metrics1, metrics2, axis=0, alternative='two-sided')
    reject, pvalue_corr, _, alpha_corr = multipletests(test.pvalue, method='holm', alpha=alpha)
    result = pd.DataFrame(dict(
        metric=metrics1.columns,
        tstatistic=test.statistic,
        pvalue=test.pvalue,
        reject_null=test.pvalue < alpha,
        pvalue_corr=pvalue_corr,
        reject_null_corr=reject,
    ))
    return result, alpha_corr

# Significance level that the analysis cells below pass to paired_test as `alpha`.
alpha = 0.05

In [4]:
# Last-epoch test metrics of the 'trained reasoner' run from two experiment
# directories; the headings printed below label them as the relaxed and the
# restricted reasoner on the synthetic data set.
relaxed = read_grouped_results(
    'local/out/exp/20220715T194328/test-grouped.csv',
    run='trained reasoner',
)
restricted = read_grouped_results(
    'local/out/exp/20220715T194304/test-grouped.csv',
    run='trained reasoner',
)

# Each call prints a heading, the frame, and a blank separator line.
print('metrics (relaxed reasoner; synthetic data set)', relaxed, '', sep='\n')
print('metrics (restricted reasoner; synthetic data set)', restricted, '', sep='\n')
#print('difference'); print(relaxed - restricted); print()
# Only the result table is shown; the corrected alpha returned alongside it is discarded.
print('paired t-test')
print(paired_test(relaxed, restricted, alpha=alpha)[0])

metrics (relaxed reasoner; synthetic data set)
       acc        f1      prec    recall   roc_auc    pr_auc
0   0.9580  0.886792  0.976261  0.812346  0.970268  0.941187
1   0.9465  0.875437  0.961637  0.803419  0.985810  0.965872
2   0.9675  0.900154  0.986532  0.827684  0.974260  0.954037
3   0.9370  0.874502  0.940043  0.817505  0.970542  0.947574
4   0.9620  0.907990  0.976562  0.848416  0.990079  0.974268
5   0.9845  0.959157  1.000000  0.921519  0.997789  0.992763
6   0.9715  0.916053  0.993610  0.849727  0.990277  0.974060
7   0.9635  0.909542  0.978667  0.849537  0.987917  0.972060
8   0.9510  0.882494  0.943590  0.828829  0.984789  0.962724
9   0.9620  0.883077  0.976190  0.806180  0.984752  0.956592
10  0.9615  0.928105  0.982213  0.879646  0.989363  0.981836
11  0.9695  0.911466  0.969136  0.860274  0.992495  0.975054
12  0.9670  0.931677  0.984683  0.884086  0.984247  0.973529
13  0.9315  0.853476  0.921478  0.794821  0.972233  0.940800
14  0.9420  0.846966  0.972727  0.7500

In [5]:
# Last-epoch per-ontology test metrics for the two reasoner-head variants
# (no run filter is applied to these files).
unfrozen = read_grouped_results(
    'local/out/exp/20220719T213232/Unfrozen reasoner head test by onto.csv'
)
pretrained = read_grouped_results(
    'local/out/exp/20220719T213232/Frozen pre-trained reasoner head test by onto.csv'
)

# read_grouped_results drops the 'onto' column, so the rows are relabelled by hand.
# NOTE(review): this assumes both CSVs list the ontologies in exactly this
# order - confirm against the files.
index = ['wildlife', 'demacare', 'stuff', 'swo', 'ontodt', 'pizza']
unfrozen.index = index
pretrained.index = index

# Each call prints a heading, the frame, and a blank separator line.
print('metrics (unfrozen)', unfrozen, '', sep='\n')
print('metrics (pretrained)', pretrained, '', sep='\n')
#print('difference'); print(unfrozen - pretrained); print()
# Only the result table is shown; the corrected alpha returned alongside it is discarded.
print('paired t-test')
print(paired_test(unfrozen, pretrained, alpha=alpha)[0])
print()

metrics (unfrozen)
               acc        f1      prec    recall   roc_auc    pr_auc
wildlife  0.963875  0.846603  0.869684  0.824716  0.980259  0.921401
demacare  0.995563  0.944919  0.998361  0.896907  0.997942  0.977227
stuff     0.990313  0.978880  0.985324  0.972519  0.996959  0.992135
swo       0.963469  0.937683  0.917484  0.958792  0.992886  0.980843
ontodt    0.975250  0.851463  0.918656  0.793429  0.982054  0.918722
pizza     0.970969  0.964665  0.965362  0.963968  0.993978  0.991950

metrics (pretrained)
               acc        f1      prec    recall   roc_auc    pr_auc
wildlife  0.942813  0.744056  0.810481  0.687694  0.951796  0.836294
demacare  0.995594  0.945328  0.998362  0.897644  0.997655  0.976188
stuff     0.965812  0.925961  0.925836  0.926086  0.983099  0.940981
swo       0.955844  0.923444  0.917923  0.929031  0.986082  0.963994
ontodt    0.967250  0.796030  0.898112  0.714785  0.970425  0.882810
pizza     0.932844  0.917986  0.921750  0.914253  0.976780  0.

In [6]:
# Trained vs. random reasoner, both read from the same results file
# (experiment 20220715T194304), compared at the last epoch.
trained = read_grouped_results(
    'local/out/exp/20220715T194304/test-grouped.csv',
    run='trained reasoner',
)
random = read_grouped_results(
    'local/out/exp/20220715T194304/test-grouped.csv',
    run='random reasoner',
)

# Disabled difference printout, adjusted to this cell's variables.
#print('difference'); print(trained - random); print()
# Only the result table is shown; the corrected alpha returned alongside it is discarded.
print('paired t-test')
print(paired_test(trained, random, alpha=alpha)[0])

paired t-test
    metric  tstatistic        pvalue  reject_null   pvalue_corr  \
0      acc   79.140439  2.106386e-25         True  1.263832e-24   
1       f1   44.693393  1.030833e-20         True  4.123330e-20   
2     prec   75.482185  5.162788e-25         True  2.581394e-24   
3   recall  -13.326150  4.326527e-11         True  4.326527e-11   
4  roc_auc   37.247267  3.167875e-19         True  9.503626e-19   
5   pr_auc   36.606602  4.385900e-19         True  9.503626e-19   

   reject_null_corr  
0              True  
1              True  
2              True  
3              True  
4              True  
5              True  


In [7]:
# Trained vs. random reasoner, both read from the same results file
# (experiment 20220715T194328), compared at the last epoch.
trained = read_grouped_results(
    'local/out/exp/20220715T194328/test-grouped.csv',
    run='trained reasoner',
)
random = read_grouped_results(
    'local/out/exp/20220715T194328/test-grouped.csv',
    run='random reasoner',
)

# Disabled difference printout, adjusted to this cell's variables.
#print('difference'); print(trained - random); print()
# Only the result table is shown; the corrected alpha returned alongside it is discarded.
print('paired t-test')
print(paired_test(trained, random, alpha=alpha)[0])

paired t-test
    metric  tstatistic        pvalue  reject_null   pvalue_corr  \
0      acc   17.888024  2.396874e-13         True  9.587498e-13   
1       f1   19.519654  4.952308e-14         True  2.476154e-13   
2     prec   20.133847  2.820985e-14         True  1.692591e-13   
3   recall   10.941499  1.212147e-09         True  1.212147e-09   
4  roc_auc   16.058313  1.649393e-12         True  4.948179e-12   
5   pr_auc   13.576316  3.137394e-11         True  6.274789e-11   

   reject_null_corr  
0              True  
1              True  
2              True  
3              True  
4              True  
5              True  
