# General Imports

In [None]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import ttest_ind
from itertools import combinations

from sklearn.model_selection import ParameterGrid

# Get rid of pesky sk-learn version warnings, since we aren't using those variables anyway (note: this silences every warning, not only sk-learn's)
import warnings
warnings.filterwarnings("ignore")

# Loading Data

In [None]:
def load_dict(filename, verbose=False):
    '''
    Loads dictionary of metrics from given filename
    
    Args:
    - filename (str): file to load
    - verbose=False (bool): specifies if exact filename should be used. if False, .pickle extension appended to filename if not already present
    Return
    - dictionary (dict): data found in file
    - None (None): return None val in case exception is raised and dictionary file does not exist
    '''
    # Only a trailing '.pickle' counts as the extension: a '.pickle' substring
    # elsewhere in the path must not suppress appending it.
    if not verbose and not filename.endswith('.pickle'):
        filename += '.pickle'

    try:
        with open(filename, 'rb') as pickle_file:
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            return pickle.load(pickle_file)
    except FileNotFoundError as e:
        # Best effort: report the missing file and let the caller deal with None
        print(e)
        return None

In [None]:
# Loading configuration
data_dict = {}
datasets = ['yelp', 'sub_ob', 'clickbait']
models = ['svm', 'logreg', 'RandomForestClassifier', 'nn']
index = 2 # Final (of 3) data checkpoint

# One checkpoint file per model/data set pair, stored under (model, dataset)
checkpoint_template = '../checkpoints/{model}/{model}_{dataset}_{index}'
for dataset in datasets:
    for model in models:
        checkpoint_path = checkpoint_template.format(model=model, dataset=dataset, index=index)
        data_dict[(model, dataset)] = load_dict(checkpoint_path)

# Analysis

In [None]:
# Keys read from each trial's metric dictionary when building the test-set tables
TEST_METRICS = ['acc_test', 'precision_test', 'recall_test', 'f1_test']
# Keys read from each trial's metric dictionary when building the train-set tables
TRAIN_METRICS = ['acc_train', 'precision_train', 'recall_train', 'f1_train']
# Data set names used to index the loaded checkpoint data
DATASETS = ['yelp', 'sub_ob', 'clickbait']

## Test/train set performance (across 3 trials) for each algorithm/dataset combo (Raw + Mean)

In [None]:
def round_values(values):
    '''
    Rounds every value to 3 decimal places, descending one level into nested sequences

    Args:
    - values (iterable): scalars and/or flat sequences (list, tuple, np.ndarray) of scalars
    Return
    - new_vals (list): rounded scalars, with each nested sequence replaced by a list of its rounded elements
    '''
    new_vals = []
    for value in values:
        # isinstance also accepts subclasses; tuples are rounded element-wise
        # like lists instead of failing inside round()
        if isinstance(value, (list, tuple, np.ndarray)):
            new_vals.append([round(val, 3) for val in value])
        else:
            new_vals.append(round(value, 3))
    return new_vals

In [None]:
# Per-trial results keyed by (algorithm, dataset, metric):
# algo_data_test_dict backs Table 1, algo_data_train_dict backs Table 3
algo_data_test_dict, algo_data_train_dict = {}, {}

for (algorithm, dataset), metric_dict in data_dict.items():
    # Metric dictionaries of the three trials stored for this algorithm/data set pair
    trial_metrics = [metric_dict[(dataset, trial)] for trial in range(3)]

    # Table 1: each trial's metric is averaged first, since precision, recall
    # and F1 are not averaged in the stored data
    for metric in TEST_METRICS:
        test_values = [np.mean(trial[metric]) for trial in trial_metrics]
        test_entry = {}
        test_entry['mean'] = np.mean(test_values)
        test_entry['values'] = test_values
        test_entry['round_values'] = round_values(test_values)
        algo_data_test_dict[(algorithm, dataset, metric)] = test_entry

    # Table 3: train metrics are stored as recorded, one entry per trial
    for metric in TRAIN_METRICS:
        train_values = [trial[metric] for trial in trial_metrics]
        train_entry = {}
        train_entry['mean'] = np.mean(train_values)
        train_entry['values'] = train_values
        algo_data_train_dict[(algorithm, dataset, metric)] = train_entry

## Mean test set performance (across 3 trials x 3 data sets) for each algorithm

In [None]:
# Table 2 p-values: per-trial test values pooled across data sets, keyed by (algorithm, metric)
overall_algo_dict = {}

# Table 2 results: mean of the pooled values, keyed by (algorithm, metric)
mean_algo_dict = {}

for (algorithm, dataset, metric), value_dict in algo_data_test_dict.items():
    # Extend a list owned by overall_algo_dict. Storing value_dict['values'] itself
    # and appending to it later would also grow the per-data-set list held in
    # algo_data_test_dict, corrupting the samples used by the later t-tests.
    overall_algo_dict.setdefault((algorithm, metric), []).extend(value_dict['values'])

for key, pooled_values in overall_algo_dict.items():
    mean_algo_dict[key] = np.mean(pooled_values)

## t-test

In [None]:
# Best algorithms
datasets = ['yelp', 'sub_ob', 'clickbait']

# Table 5 - Table 1 p-values
# Data set whose 'nn' test results serve as the reference sample for each metric
best_test_datasets = {
    'acc_test': 'clickbait',
    'precision_test': 'clickbait',
    'recall_test': 'sub_ob',
    'f1_test': 'clickbait',
}
best_algo_1_dict = {
    metric_name: [
        np.mean(value)
        for value in algo_data_test_dict[('nn', dataset_name, metric_name)]['values']
    ]
    for metric_name, dataset_name in best_test_datasets.items()
}

# Table 6 - Table 2 p-values
# Reference samples are the pooled 'nn' values (the same list objects as in overall_algo_dict)
best_algo_2_dict = {
    metric_name: overall_algo_dict[('nn', metric_name)]
    for metric_name in ('acc_test', 'precision_test', 'recall_test', 'f1_test')
}

# Table 7 - Table 3 p-values
# Every train metric uses the 'nn' results on clickbait, averaged per trial
best_algo_3_dict = {
    metric_name: [
        np.mean(value)
        for value in algo_data_train_dict[('nn', 'clickbait', metric_name)]['values']
    ]
    for metric_name in ('acc_train', 'precision_train', 'recall_train', 'f1_train')
}

In [None]:
# Table 5
# t-test every algorithm/data set result against the reference sample of the
# same metric and report the pairs whose p-value lies strictly between 0.05 and 1
for best_metric, best_values in best_algo_1_dict.items():
    for (model, dataset, metric), value_dict in algo_data_test_dict.items():
        if best_metric != metric:
            continue
        t_stat, p_value = ttest_ind(value_dict['values'], best_values)
        if p_value > 0.05 and p_value < 1:
            print('test: {}\np: {}\n'.format((model, dataset, metric), p_value))

In [None]:
# Table 6
# t-test each algorithm's pooled values against the reference sample of the
# same metric and print every p-value
for best_metric, best_values in best_algo_2_dict.items():
    for (algorithm, metric), pooled_values in overall_algo_dict.items():
        if best_metric != metric:
            continue
        t_stat, p_value = ttest_ind(pooled_values, best_values)
        print('test: {}\np: {}\n'.format((algorithm, metric), p_value))

In [None]:
# Table "7"
# Compare against the *training* results: best_algo_3_dict is keyed by train
# metric names, which never occur in algo_data_test_dict, so looping over the
# test results would skip every comparison.
for best_metric, best_values in best_algo_3_dict.items():
    for (model, dataset, metric), value_dict in algo_data_train_dict.items():
        if best_metric == metric:
            # Collapse each trial to a single number, mirroring how best_values were built
            trial_values = [np.mean(value) for value in value_dict['values']]
            stat, p = ttest_ind(trial_values, best_values)
            if 0.05 < p < 1:
                print(model, dataset, metric)

# Note that there is no table 7 in the paper, but this was done to see if there would be any statistical significance during training,
# even though training performance doesn't mean much in comparison to testing performance

## Heat Map

In [None]:
def draw_heatmap(errors, D_list, title, ylabel='max depth D', cbar_label='error', figsize=(2, 4)):
    '''
    Draws and shows an annotated seaborn heat map with one labelled row per entry of D_list

    Args:
    - errors (array-like): values to plot, passed straight to sns.heatmap and annotated with 3 decimals
    - D_list (list-like): y-axis tick labels
    - title (str): figure title
    - ylabel='max depth D' (str): y-axis label
    - cbar_label='error' (str): color bar label
    - figsize=(2, 4) (tuple): figure size handed to plt.figure
    Return
    - None: the figure is displayed with plt.show()
    '''
    plt.figure(figsize=figsize)
    ax = sns.heatmap(errors, annot=True, fmt='.3f', yticklabels=D_list, xticklabels=[])
    ax.collections[0].colorbar.set_label(cbar_label)
    ax.set(ylabel=ylabel)
    # Widen the y-limits by half a cell on each side.
    # NOTE(review): presumably a workaround for heat map rows being cut off in
    # some matplotlib releases - confirm it is still needed.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.title(title)
    plt.show()

In [None]:
def get_mean(*lists):
    '''
    Averages the passed in lists position by position.
    Positions beyond the end of the shortest list are dropped
    Args: 
    - *lists (list-like): lists to average element-wise
    Returns:
    - list holding the mean of each position
    '''
    means = []
    # zip stops at the shortest input, which is what truncates uneven lists
    for column in zip(*lists):
        means.append(np.mean(column))
    return means

# Sanity checks
assert get_mean([1,2,3], [4,5,6]) == [2.5,3.5,4.5]
assert get_mean([1,1,1], []) == []
assert get_mean([0,0,0], [2,4,6]) == [1,2,3]

### SVM

In [None]:
# Collect the SVM validation scores (one sequence per data set and trial), then
# average them per parameter combo across all data sets and trials
svm_cv_scores = [
    data_dict[('svm', dataset)][(dataset, i)]['cv_results']['mean_test_score']
    for dataset in DATASETS
    for i in range(3)
]
svm_validation_means = get_mean(*svm_cv_scores)

# Parameter combos used to label the heat map rows
c_vals = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
svm_grid_spec = {'kernel': ['linear'], 'C': c_vals}
svm_param_grid = list(ParameterGrid(svm_grid_spec))
draw_heatmap(svm_validation_means, svm_param_grid, 'test')

### Logistic Regression

### Random Forest