# General Imports

In [19]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import ttest_ind
from itertools import combinations

from sklearn.model_selection import ParameterGrid

# Rid of pesky sk-learn version warnings since we aren't using those variables anyway
import warnings
warnings.filterwarnings("ignore")

# Loading Data

In [2]:
def load_dict(filename, verbose=False):
    '''
    Loads dictionary of metrics from given filename
    
    Args:
    - filename (str): file to load
    - verbose=False (bool): sepcifies if exact filename should be used. if False, .pickle extension appended to filename if not already present
    Return
    - dictionary (dict): data found in file
    - None (None): return None val in case exception is raised and dictionary file does not exist
    '''
    if (not verbose) and ('.pickle' not in filename):
        filename += '.pickle'

    try:
        with open(filename, 'rb') as pickle_file: 
            dictionary = pickle.load(pickle_file) 
    except FileNotFoundError as e:
        print(e)
        return None
    
    return dictionary

In [3]:
# Initialize loading variables
data_dict = {}
datasets = ['yelp', 'sub_ob', 'clickbait']
models = ['svm', 'logreg', 'randomforest', 'nn']
index = 2 # Loads final (of 3) data checkpoint for data

# Load data
for dataset in datasets:
    for model in models:
        data_dict[(model, dataset)] = load_dict('../checkpoints/{model}/{model}_{dataset}_{index}'.format(model=model, dataset=dataset, index=index))

# Analysis

In [7]:
TEST_METRICS = ['acc_test', 'precision_test', 'recall_test', 'f1_test']
TRAIN_METRICS = ['acc_train', 'precision_train', 'recall_train', 'f1_train']
DATASETS = ['yelp', 'sub_ob', 'clickbait']

## Test/train set performance (across 3 trials) for each algorithm/dataset combo (Raw + Mean)

In [5]:
def round_values(values):
    new_vals = []
    for value in values:
        if type(value) in (list, np.ndarray):
            interior_list = []
            for val in value:
                interior_list.append(round(val, 3))
            new_vals.append(interior_list)
        else:
            new_vals.append(round(value, 3))
    return new_vals

In [24]:
algo_data_test_dict = {}
algo_data_train_dict = {}

for (algorithm, dataset), metric_dict in data_dict.items():
    for metric in TEST_METRICS:
        # Take the mean to get the precision, recall, and F1 (since not averaged initially)
        values = [np.mean(metric_dict[(dataset, index)][metric]) for index in range(3)]
        
        ## Change metric name
        #metric = metric[:metric.find('_')]
        #if metric == 'acc': metric = 'ACC'
        #elif metric == 'precision': metric = 'PREC'
        #elif metric == 'recall': metric = 'REC'
        #elif metric == 'f1': metric = 'F1'
        
        ## Change algorithm name
        #if algorithm == 'svm': algorithm = 'SVM'
        #elif algorithm == 'logreg': algorithm = 'LogReg'
        #elif algorithm == 'randomforest': algorithm = 'RF'
    
        #algo_data_test_dict[(algorithm, dataset.capitalize(), metric)] = {'mean': np.mean(values), 'values': round_values(values)}
        algo_data_test_dict[(algorithm, dataset, metric)] = {'mean': np.mean(values), 'values': values}
    for metric in TRAIN_METRICS:
        values = [metric_dict[(dataset, index)][metric] for index in range(3)]
        algo_data_train_dict[(algorithm, dataset, metric)] = {'mean': np.mean(values), 'values': values}

In [7]:
test_df = pd.DataFrame.from_dict(algo_data_test_dict, orient='index')
test_df.drop('mean', inplace=True, axis=1)
test_df['values'] = test_df['values'].apply(list)
test_df
test_df.to_csv('../algodatatestdict.csv')

## Mean test set performance (across 3 trials x 3 data sets) for each algorithm

In [63]:
mean_algo_dict = {}

for (algorithm, dataset, metric), value_dict in algo_data_test_dict.items():
    try:
        [mean_algo_dict[(algorithm, metric)].append(value) for value in value_dict['values']]
    except KeyError:
        mean_algo_dict[(algorithm, metric)] = value_dict['values']
        
#for key in mean_algo_dict.keys():
#    mean_algo_dict[key] = np.mean(mean_algo_dict[key])  

## t-test

In [66]:
# Best algorithms
datasets = ['yelp', 'sub_ob', 'clickbait']
best_algo_1_dict, best_algo_2_dict, best_algo_3_dict = {}, {}, {}

# Table 1
best_algo_1_dict['acc_test'] = [np.mean(value) for value in algo_data_test_dict[('nn', 'clickbait', 'acc_test')]['values']]
best_algo_1_dict['precision_test'] = [np.mean(value) for value in algo_data_test_dict[('nn', 'clickbait', 'precision_test')]['values']]
best_algo_1_dict['recall_test'] = [np.mean(value) for value in algo_data_test_dict[('nn', 'sub_ob', 'recall_test')]['values']]
best_algo_1_dict['f1_test'] = [np.mean(value) for value in algo_data_test_dict[('nn', 'clickbait', 'f1_test')]['values']]

# Table 2
best_algo_2_dict['acc_test'] = mean_algo_dict[('nn', 'acc_test')]
best_algo_2_dict['precision_test'] = mean_algo_dict[('nn', 'precision_test')]
best_algo_2_dict['recall_test'] = mean_algo_dict[('nn', 'recall_test')]
best_algo_2_dict['f1_test'] = mean_algo_dict[('nn', 'f1_test')]

# Table 3
best_algo_3_dict['acc_train'] = [np.mean(value) for value in algo_data_train_dict[('nn', 'clickbait', 'acc_train')]['values']]
best_algo_3_dict['precision_train'] = [np.mean(value) for value in algo_data_train_dict[('nn', 'clickbait', 'precision_train')]['values']]
best_algo_3_dict['recall_train'] = [np.mean(value) for value in algo_data_train_dict[('nn', 'clickbait', 'recall_train')]['values']]
best_algo_3_dict['f1_train'] = [np.mean(value) for value in algo_data_train_dict[('nn', 'clickbait', 'f1_train')]['values']]

In [45]:
# Table 1
for best_metric, best_values in best_algo_1_dict.items():
    for (model, dataset, metric), value_dict in algo_data_test_dict.items():
        if best_metric == metric:
            stat, p = ttest_ind(value_dict['values'], best_values)
            print('test: {}\np: {}\n'.format((model, dataset, metric), p))

test: ('svm', 'yelp', 'acc_test')
p: 1.1249249688436414e-05

test: ('logreg', 'yelp', 'acc_test')
p: 5.589064660055868e-06

test: ('randomforest', 'yelp', 'acc_test')
p: 6.240236280414811e-06

test: ('nn', 'yelp', 'acc_test')
p: 7.788995058657149e-05

test: ('svm', 'sub_ob', 'acc_test')
p: 4.353158535423539e-06

test: ('logreg', 'sub_ob', 'acc_test')
p: 3.947133673463829e-06

test: ('randomforest', 'sub_ob', 'acc_test')
p: 3.7585590837153907e-06

test: ('nn', 'sub_ob', 'acc_test')
p: 0.028727349432309892

test: ('svm', 'clickbait', 'acc_test')
p: 1.9139281877275435e-05

test: ('logreg', 'clickbait', 'acc_test')
p: 1.905593457949983e-05

test: ('randomforest', 'clickbait', 'acc_test')
p: 1.88527653641544e-05

test: ('nn', 'clickbait', 'acc_test')
p: 1.0

test: ('svm', 'yelp', 'precision_test')
p: 2.1542209453553922e-05

test: ('logreg', 'yelp', 'precision_test')
p: 1.2955782710460094e-05

test: ('randomforest', 'yelp', 'precision_test')
p: 1.3098570456567658e-05

test: ('nn', 'yelp', 'p

In [67]:
# Table 2
for best_metric, best_values in best_algo_2_dict.items():
    for (algorithm, metric), value_dict in mean_algo_dict.items():
        if best_metric == metric:
            stat, p = ttest_ind(value_dict, best_values)
            print('test: {}\np: {}\n'.format((algorithm, metric), p))

test: ('svm', 'acc_test')
p: 0.00029056880933519245

test: ('logreg', 'acc_test')
p: 0.00023076718830527577

test: ('randomforest', 'acc_test')
p: 0.0001223460955157153

test: ('nn', 'acc_test')
p: 1.0

test: ('svm', 'precision_test')
p: 0.011572656646295318

test: ('logreg', 'precision_test')
p: 0.005866730324708857

test: ('randomforest', 'precision_test')
p: 0.002534969880948449

test: ('nn', 'precision_test')
p: 1.0

test: ('svm', 'recall_test')
p: 1.484662789107798e-11

test: ('logreg', 'recall_test')
p: 1.3013649507016699e-11

test: ('randomforest', 'recall_test')
p: 6.732947333304875e-12

test: ('nn', 'recall_test')
p: 1.0

test: ('svm', 'f1_test')
p: 1.7874243923286493e-07

test: ('logreg', 'f1_test')
p: 3.261990196074667e-07

test: ('randomforest', 'f1_test')
p: 3.3901419716765365e-07

test: ('nn', 'f1_test')
p: 1.0



In [41]:
# Table 3
for best_metric, best_values in best_algo_3_dict.items():
    for (model, dataset, metric), value_dict in algo_data_test_dict.items():
        if best_metric == metric:
            stat, p = ttest_ind(value_dict['values'], best_values)
            if 0.05 < p < 1:
                print(model, dataset, metric)
                #print('test: {}\nstat: {}\np: {}'.format((model, dataset, metric), stat, p))

## Heat Map

In [4]:
def draw_heatmap(errors, D_list, title):
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(errors, annot=True, fmt='.3f', yticklabels=D_list, xticklabels=[])
    ax.collections[0].colorbar.set_label('error')
    ax.set(ylabel='max depth D')
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.title(title)
    plt.show()

In [5]:
def get_mean(*lists):
    '''
    Computes element-wise mean for each element of the passed in lists. 
    If lists are uneven, shrinks lists to shortest size
    Args: 
    - *lists (list-like): lists to perform averaging over
    Returns:
    - list of element-wise means
    '''
    return [np.mean(values) for values in zip(*lists)]

assert get_mean([1,2,3], [4,5,6]) == [2.5,3.5,4.5]
assert get_mean([1,1,1], []) == []
assert get_mean([0,0,0], [2,4,6]) == [1,2,3]

### SVM

In [22]:
# Gets the list of validation performance for each paremeter combo across all data sets and trials for SVM
svm_validation_means = get_mean(*[data_dict[('svm', dataset)][(dataset, i)]['cv_results']['mean_test_score'] 
                    for dataset in DATASETS for i in range(3)])

c_vals = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
svm_param_grid = list(ParameterGrid({'kernel': ['linear'], 'C': c_vals}))
draw_heatmap(svm_validation_means, svm_param_grid, 'test')

IndexError: Inconsistent shape between the condition and the input (got (132, 1) and (132,))

<Figure size 144x288 with 0 Axes>

### Logistic Regression

### Random Forest