In [1]:
import json
from scipy import stats
import pandas as pd

In [2]:
t_test_folder = '../output/intrusion/'
eve_model = "eve"
methods = [eve_model,"word2vec_sg","word2vec_cbow","fasttext_cbow","fasttext_sg","glove"]
dataset_ids = ["animal_classes", "european_cities", "movie_genres", "cuisine", "music_genres", "nobel_laureates",
               "country_continent"]

In [3]:
def standard_t_test(a, b):
    t, pvalue = stats.ttest_ind(a, b)
    return t, pvalue

def pair_t_test(a, b):
    t, pvalue = stats.ttest_rel(a, b)
    return t, pvalue

def load_items(filepath):
    print("Loading %s ..." % filepath)
    return json.load(open(filepath))

In [4]:
results = dict()
for dataset_id in dataset_ids:
    # Load the data
    print()
    results[dataset_id] =load_items(t_test_folder + "results-for-tests-%s.json" % dataset_id )


Loading ../output/intrusion/results-for-tests-animal_classes.json ...

Loading ../output/intrusion/results-for-tests-european_cities.json ...

Loading ../output/intrusion/results-for-tests-movie_genres.json ...

Loading ../output/intrusion/results-for-tests-cuisine.json ...

Loading ../output/intrusion/results-for-tests-music_genres.json ...

Loading ../output/intrusion/results-for-tests-nobel_laureates.json ...

Loading ../output/intrusion/results-for-tests-country_continent.json ...


In [5]:
distribution = dict()
distribution['all'] = dict()
for method in methods:
    distribution['all'][method] = list()
for dataset_id in dataset_ids:
    print('Processing', dataset_id)
    distribution[dataset_id] = dict()
    for method in methods:
        distribution[dataset_id][method] = list(zip(*results[dataset_id][method]))[1]
        distribution['all'][method] += distribution[dataset_id][method]

Processing animal_classes
Processing european_cities
Processing movie_genres
Processing cuisine
Processing music_genres
Processing nobel_laureates
Processing country_continent


In [6]:
result_cols = ["Dataset","Method","Standard t-stats","Standard p-value","Pairwise t-stats", "Pairwise p-value"]
result_rows = list()
for dataset_id in dataset_ids:
    for i in range(len(methods)):
        for j in range(i+1, len(methods)):
            dist_a = distribution[dataset_id][methods[i]]
            dist_b = distribution[dataset_id][methods[j]]
            s_t, s_pvalue = standard_t_test(dist_a, dist_b)
            p_t, p_pvalue = pair_t_test(dist_a, dist_b)
            if methods[i] == eve_model or methods[j] == eve_model:
                result_rows.append([dataset_id, methods[i] + ', ' + methods[j], s_t, s_pvalue, p_t, p_pvalue])

for i in range(len(methods)):
    for j in range(i+1, len(methods)):
        dist_a = distribution['all'][methods[i]]
        dist_b = distribution['all'][methods[j]]
        s_t, s_pvalue = standard_t_test(dist_a, dist_b)
        p_t, p_pvalue = pair_t_test(dist_a, dist_b)
        if methods[i] == eve_model or methods[j] == eve_model:
            result_rows.append(['all', methods[i] + ', ' + methods[j], s_t, s_pvalue, p_t, p_pvalue])
        
print('preparing dataframe')
df_results = pd.DataFrame(result_rows, columns=result_cols)
df_results

preparing dataframe


Unnamed: 0,Dataset,Method,Standard t-stats,Standard p-value,Pairwise t-stats,Pairwise p-value
0,animal_classes,"eve, word2vec_sg",745.839799,0.0,1433.278113,0.0
1,animal_classes,"eve, word2vec_cbow",824.137907,0.0,1513.655448,0.0
2,animal_classes,"eve, fasttext_cbow",880.544687,0.0,1544.174976,0.0
3,animal_classes,"eve, fasttext_sg",711.328942,0.0,1405.219611,0.0
4,animal_classes,"eve, glove",1010.348198,0.0,1666.247292,0.0
5,european_cities,"eve, word2vec_sg",-214.663573,0.0,372.93223,0.0
6,european_cities,"eve, word2vec_cbow",69.707905,0.0,515.288276,0.0
7,european_cities,"eve, fasttext_cbow",117.705245,0.0,538.114822,0.0
8,european_cities,"eve, fasttext_sg",-277.870906,0.0,353.113874,0.0
9,european_cities,"eve, glove",572.433342,0.0,880.064324,0.0


In [7]:
df_results.to_csv("intrusion_significance.csv")