In [213]:
# see also: https://machinelearningmastery.com/mcnemars-test-for-machine-learning/

import pandas as pd
import numpy as np
import random as rand
from statsmodels.stats.contingency_tables import mcnemar

In [214]:
all_pred = pd.read_csv('STEL_single-pred-quadruple_all-models.tsv', sep='\t')

In [215]:
len(all_pred)

1830

In [216]:
def sample_random(sim_str):
    pred_sim = all_pred[sim_str].copy()
    rand_ids = pred_sim.index[pred_sim == 0].tolist()
    for cur_id in rand_ids:
        pred_sim.at[cur_id] = rand.choice([1,2])
    print('Replaced {} values'.format(len(rand_ids)))
    return pred_sim
    
def no_random(sim_str_1, sim_str_2):
    tmp_all_pred = all_pred.copy()
    all_indices = tmp_all_pred.index.to_list()
    rand_ids_1 = set(tmp_all_pred.index[tmp_all_pred[sim_str_1] == 0].tolist())
    rand_ids_2 = set(tmp_all_pred.index[tmp_all_pred[sim_str_2] == 0].tolist())
    rand_ids = list(rand_ids_1.union(rand_ids_2))
    # print(rand_ids)
    no_random_ids = [cur_id for cur_id in all_indices if cur_id not in rand_ids]
    tmp_all_pred = tmp_all_pred.iloc[no_random_ids]
    correct_preds = all_pred['Correct Alternative'][no_random_ids]
    # tmp_all_pred.reset_index(drop=True, inplace=True)
    # tmp_all_pred.sort_index(inplace=True)
    #for cur_id in no_random_ids:
    #    tmp_all_pred = tmp_all_pred.drop(cur_id)
    # print(tmp_all_pred[sim_str_1])
    # print(tmp_all_pred[sim_str_2])
    print('Dropped {} values'.format(len(rand_ids)))
    tmp_all_pred[sim_str_1] == tmp_all_pred[sim_str_2]
    return tmp_all_pred[sim_str_1], tmp_all_pred[sim_str_2], correct_preds
    
def _calc_sig_from_df(pred_sim_1, pred_sim_2, correct_predictions=all_pred['Correct Alternative']):    
    p1_correctness = pred_sim_1 == correct_predictions
    p2_correctness = pred_sim_2 == correct_predictions
    yes_yes = p1_correctness & p2_correctness
    nbr_yes_yes = yes_yes[yes_yes == True].sum()
    no_no = ~p1_correctness & ~p2_correctness
    nbr_no_no = no_no[no_no == True].sum()
    yes_no = p1_correctness & ~p2_correctness
    nbr_yes_no = yes_no[yes_no == True].sum()
    no_yes = ~p1_correctness & p2_correctness
    nbr_no_yes = no_yes[no_yes == True].sum()
    table = [[nbr_yes_yes, nbr_yes_no], [nbr_no_yes, nbr_no_no]]
    result = mcnemar(table, exact=True)
    print('statistic=%.3f, p-value=%.10f' % (result.statistic, result.pvalue))
    return result.pvalue
    
def significance_testing(sim_str_1, sim_str_2):
    pred_sim_1 = sample_random(sim_str_1)
    pred_sim_2 = sample_random(sim_str_2)
    return _calc_sig_from_df(pred_sim_1, pred_sim_2)

def sig_test_no_random(sim_str_1, sim_str_2):
    pred_sim_1, pred_sim_2, correct_preds = no_random(sim_str_1, sim_str_2)
    print('working with length {}'.format(len(pred_sim_1)))
    return _calc_sig_from_df(pred_sim_1, pred_sim_2, correct_preds)

def mean_sig_test(sim_str_1, sim_str_2):
    p_values = []
    for _ in range(100):
        p_values.append(significance_testing(sim_str_1, sim_str_2))
    mean_p_value = np.mean(p_values)
    print('p-value=%.10f' % (mean_p_value))


In [217]:
significance_testing('CasedBertSimilarity', 'UncasedBertSimilarity')

Replaced 0 values
Replaced 0 values
statistic=206.000, p-value=0.0084887766


0.00848877655387761

In [218]:
significance_testing('BERTCasedNextSentenceSimilarity', 'BERTUncasedNextSentenceSimilarity')

Replaced 35 values
Replaced 187 values
statistic=259.000, p-value=0.0001224951


0.0001224950520884254

In [219]:
sig_test_no_random('BERTCasedNextSentenceSimilarity', 'BERTUncasedNextSentenceSimilarity')

Dropped 199 values
working with length 1631
statistic=218.000, p-value=0.0002653185


0.00026531846381390114

In [220]:
mean_sig_test('BERTCasedNextSentenceSimilarity', 'BERTUncasedNextSentenceSimilarity')

Replaced 35 values
Replaced 187 values
statistic=261.000, p-value=0.0001289105
Replaced 35 values
Replaced 187 values
statistic=256.000, p-value=0.0000971075
Replaced 35 values
Replaced 187 values
statistic=259.000, p-value=0.0000659569
Replaced 35 values
Replaced 187 values
statistic=259.000, p-value=0.0001224951
Replaced 35 values
Replaced 187 values
statistic=271.000, p-value=0.0007815768
Replaced 35 values
Replaced 187 values
statistic=256.000, p-value=0.0000165169
Replaced 35 values
Replaced 187 values
statistic=255.000, p-value=0.0000159997
Replaced 35 values
Replaced 187 values
statistic=251.000, p-value=0.0000140590
Replaced 35 values
Replaced 187 values
statistic=251.000, p-value=0.0000323531
Replaced 35 values
Replaced 187 values
statistic=265.000, p-value=0.0000904162
Replaced 35 values
Replaced 187 values
statistic=266.000, p-value=0.0003512641
Replaced 35 values
Replaced 187 values
statistic=268.000, p-value=0.0001778535
Replaced 35 values
Replaced 187 values
statistic=267

In [221]:
significance_testing('BERTCasedNextSentenceSimilarity', 'RobertaSimilarity')

Replaced 35 values
Replaced 0 values
statistic=293.000, p-value=0.0000000000


1.7901220789015757e-11

In [222]:
sig_test_no_random('BERTCasedNextSentenceSimilarity', 'RobertaSimilarity')
mean_sig_test('BERTCasedNextSentenceSimilarity', 'RobertaSimilarity')

Dropped 35 values
working with length 1795
statistic=286.000, p-value=0.0000000000
Replaced 35 values
Replaced 0 values
statistic=295.000, p-value=0.0000000001
Replaced 35 values
Replaced 0 values
statistic=292.000, p-value=0.0000000000
Replaced 35 values
Replaced 0 values
statistic=293.000, p-value=0.0000000000
Replaced 35 values
Replaced 0 values
statistic=297.000, p-value=0.0000000001
Replaced 35 values
Replaced 0 values
statistic=294.000, p-value=0.0000000000
Replaced 35 values
Replaced 0 values
statistic=298.000, p-value=0.0000000001
Replaced 35 values
Replaced 0 values
statistic=294.000, p-value=0.0000000000
Replaced 35 values
Replaced 0 values
statistic=296.000, p-value=0.0000000001
Replaced 35 values
Replaced 0 values
statistic=294.000, p-value=0.0000000000
Replaced 35 values
Replaced 0 values
statistic=296.000, p-value=0.0000000001
Replaced 35 values
Replaced 0 values
statistic=296.000, p-value=0.0000000001
Replaced 35 values
Replaced 0 values
statistic=296.000, p-value=0.0000

In [223]:
significance_testing('MpnetSentenceBertSimilarity', 'ParaMpnetSentenceBertSimilarity')

Replaced 0 values
Replaced 0 values
statistic=272.000, p-value=0.0000003603


3.602681412702784e-07

In [224]:
significance_testing('CasedBertSimilarity', 'RobertaSimilarity')

Replaced 0 values
Replaced 0 values
statistic=214.000, p-value=0.0000000000


6.804315534243815e-29

In [225]:
sig_test_no_random('LIWCStyleSimilarity','LIWCSimilarity')
mean_sig_test('LIWCStyleSimilarity','LIWCSimilarity')

Dropped 1148 values
working with length 682
statistic=143.000, p-value=0.9530249959
Replaced 1141 values
Replaced 158 values
statistic=408.000, p-value=0.1097733700
Replaced 1141 values
Replaced 158 values
statistic=379.000, p-value=0.0256281951
Replaced 1141 values
Replaced 158 values
statistic=403.000, p-value=0.0165202446
Replaced 1141 values
Replaced 158 values
statistic=396.000, p-value=0.0204267185
Replaced 1141 values
Replaced 158 values
statistic=389.000, p-value=0.0069618464
Replaced 1141 values
Replaced 158 values
statistic=420.000, p-value=0.1574953770
Replaced 1141 values
Replaced 158 values
statistic=394.000, p-value=0.0499666530
Replaced 1141 values
Replaced 158 values
statistic=390.000, p-value=0.0042623768
Replaced 1141 values
Replaced 158 values
statistic=392.000, p-value=0.0071612224
Replaced 1141 values
Replaced 158 values
statistic=412.000, p-value=0.0684890877
Replaced 1141 values
Replaced 158 values
statistic=384.000, p-value=0.0023591249
Replaced 1141 values
Repl

In [226]:
sig_test_no_random('LIWCStyleSimilarity', 'LIWCFunctionSimilarity')
mean_sig_test('LIWCStyleSimilarity', 'LIWCFunctionSimilarity')

Dropped 1232 values
working with length 598
statistic=135.000, p-value=0.1460711812
Replaced 1141 values
Replaced 508 values
statistic=450.000, p-value=0.2269926368
Replaced 1141 values
Replaced 508 values
statistic=438.000, p-value=0.2464553574
Replaced 1141 values
Replaced 508 values
statistic=434.000, p-value=0.1282862209
Replaced 1141 values
Replaced 508 values
statistic=436.000, p-value=0.2199750031
Replaced 1141 values
Replaced 508 values
statistic=448.000, p-value=0.2023709395
Replaced 1141 values
Replaced 508 values
statistic=439.000, p-value=0.5478621333
Replaced 1141 values
Replaced 508 values
statistic=435.000, p-value=0.3675958666
Replaced 1141 values
Replaced 508 values
statistic=449.000, p-value=0.3093756622
Replaced 1141 values
Replaced 508 values
statistic=453.000, p-value=0.6922428765
Replaced 1141 values
Replaced 508 values
statistic=441.000, p-value=0.6402048627
Replaced 1141 values
Replaced 508 values
statistic=451.000, p-value=0.2807418579
Replaced 1141 values
Repl

In [227]:
sig_test_no_random('LIWCFunctionSimilarity','LIWCSimilarity')
mean_sig_test('LIWCStyleSimilarity','LIWCSimilarity')

Dropped 547 values
working with length 1283
statistic=259.000, p-value=0.2653829318
Replaced 1141 values
Replaced 158 values
statistic=406.000, p-value=0.0456555344
Replaced 1141 values
Replaced 158 values
statistic=393.000, p-value=0.0199891127
Replaced 1141 values
Replaced 158 values
statistic=392.000, p-value=0.0236131913
Replaced 1141 values
Replaced 158 values
statistic=384.000, p-value=0.0223135709
Replaced 1141 values
Replaced 158 values
statistic=392.000, p-value=0.0058828755
Replaced 1141 values
Replaced 158 values
statistic=398.000, p-value=0.0291162714
Replaced 1141 values
Replaced 158 values
statistic=392.000, p-value=0.0043519408
Replaced 1141 values
Replaced 158 values
statistic=384.000, p-value=0.0402616963
Replaced 1141 values
Replaced 158 values
statistic=380.000, p-value=0.0094476897
Replaced 1141 values
Replaced 158 values
statistic=394.000, p-value=0.0036212179
Replaced 1141 values
Replaced 158 values
statistic=406.000, p-value=0.0306273171
Replaced 1141 values
Repl