# Dependencies

In [None]:
# Imports
import re
import os
import json
import random
from copy import deepcopy
from os.path import join
import sys
base_path = os.path.abspath(os.path.join('..'))
if base_path not in sys.path:
    sys.path.append(base_path)
    
import math
import numpy as np
import scipy as sc
from scipy.stats import entropy, normaltest, mode
import pandas as pd
import sklearn as sk
from sklearn import svm, tree
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import seaborn

from IPython.display import display

import pickle

import seaborn as sns
import matplotlib as mpl
mpl.rcParams['figure.figsize']
import matplotlib.patches as mpatches

from collections import Counter

import itertools
from itertools import compress, product
import matplotlib.pyplot as plt

from time import time, strftime

from pprint import pprint

## Helpers

In [None]:
from helpers.analysis import *
from helpers.processing import *

In [None]:
def format_outcomes_df(outcomes_df, outcome_variable_name, outcomes, id_field='fid'):
    """Return the id and outcome columns, restricted to the requested outcomes.

    Missing values in the outcome column are replaced with ``False`` before
    filtering. The caller's frame is never modified: the work happens on a
    copy of the two columns that are returned.

    Args:
        outcomes_df: DataFrame containing at least ``id_field`` and
            ``outcome_variable_name`` columns.
        outcome_variable_name: Name of the outcome column to keep.
        outcomes: Collection of outcome values to keep. When empty or None,
            no row filtering is applied.
        id_field: Name of the identifier column to keep.

    Returns:
        A new DataFrame with columns ``[id_field, outcome_variable_name]``.
    """
    print('Subsetting outcomes')

    # Copy only the needed columns: cheaper than copying the whole frame and
    # avoids the in-place fillna on a column selection, which mutated the
    # caller's data (and does nothing under pandas copy-on-write).
    subset = outcomes_df[[id_field, outcome_variable_name]].copy()
    subset[outcome_variable_name] = subset[outcome_variable_name].fillna(value=False)

    if outcomes:
        subset = subset[subset[outcome_variable_name].isin(outcomes)]
    return subset

In [None]:
def plot_learning_curve(train_sizes, train_scores, test_scores, title='Learning Curve', ylim=None,
                        save_path="learning_curve.svg"):
    """Plot mean training and cross-validation scores against training-set size.

    Each curve is drawn as a line through the per-size mean with a shaded band
    of one standard deviation on either side.

    Args:
        train_sizes: X-axis values, one per row of the score arrays.
        train_scores: 2-D array of training scores; mean/std are taken over axis 1.
        test_scores: 2-D array of validation scores; mean/std are taken over axis 1.
        title: Figure title.
        ylim: Optional ``(low, high)`` pair passed to ``plt.ylim``.
        save_path: Where to write the figure (always in SVG format). Defaults
            to the previously hard-coded ``"learning_curve.svg"``; pass None
            to skip saving.

    Returns:
        The ``matplotlib.pyplot`` module, after the figure has been shown.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.grid()

    # (scores, colour, legend label) for each curve, in drawing order.
    curves = [
        (np.asarray(train_scores), "r", "Training score"),
        (np.asarray(test_scores), "g", "Cross-validation score"),
    ]
    summaries = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), color, label)
        for scores, color, label in curves
    ]

    # Shaded +/- 1 std bands first, then the mean lines.
    for mean, std, color, _ in summaries:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=color)
    for mean, _, color, label in summaries:
        plt.plot(train_sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")
    if save_path is not None:
        plt.savefig(save_path, format="svg")
    plt.show()
    return plt

In [None]:
def get_k_fold_scores(model, X, y, n_splits=5, train_size=None, test_size=None, shuffle=True, sampling_mode='over', model_config=None, resample_test=False):
    """Fit and score a fresh classifier on each stratified train/test split.

    Splitter selection:
      * ``n_splits > 1``: ``StratifiedKFold`` (takes precedence even when
        ``train_size`` is also given).
      * otherwise, if ``train_size`` is set: ``StratifiedShuffleSplit``.
      * neither: ``ValueError``.

    Args:
        model: Short model name understood by ``get_new_classifier``.
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Outcome Series (indexed positionally with ``.iloc``).
        n_splits: Number of folds / shuffle-split iterations.
        train_size: Train fraction for the shuffle split.
        test_size: Test fraction for the shuffle split; defaults to
            ``1 - train_size`` when falsy.
        shuffle: Whether ``StratifiedKFold`` shuffles before splitting.
        sampling_mode: Forwarded to ``resample_X_and_y``.
        model_config: Extra keyword arguments for the classifier; None means
            no extra arguments.
        resample_test: When True, the test split is resampled the same way
            as the training split before scoring.

    Returns:
        dict with keys ``'train_scores'``, ``'test_scores'``, ``'models'`` and
        ``'per_class_trains'``, each a list with one entry per split.

    Raises:
        ValueError: If no splitter can be built from the arguments.
    """
    print('Getting K-Fold scores with model {}, {} folds, {} sampling'.format(model, n_splits, sampling_mode))

    model_config = {} if model_config is None else model_config

    splitter = None
    if train_size:
        if not test_size:
            test_size = 1 - train_size
        splitter = StratifiedShuffleSplit(n_splits=n_splits, train_size=train_size, test_size=test_size, random_state=RANDOM_STATE)
    if n_splits > 1:
        splitter = StratifiedKFold(
            n_splits=n_splits,
            shuffle=shuffle,
            # Newer scikit-learn rejects a random_state when shuffle is False.
            random_state=RANDOM_STATE if shuffle else None
        )
    if splitter is None:
        raise ValueError(
            'Cannot build a splitter: pass train_size for a stratified '
            'shuffle split or n_splits > 1 for stratified k-fold'
        )

    train_scores = []
    test_scores = []
    models = []
    per_class_trains = []

    for i, (train_index, test_index) in enumerate(splitter.split(X, y)):
        start_time = time()
        X_train_raw, X_test_raw = X.iloc[train_index], X.iloc[test_index]
        y_train_raw, y_test_raw = y.iloc[train_index], y.iloc[test_index]

        # Helper defined outside this file view; presumably guards against
        # train/test overlap -- confirm against helpers.
        test_leakage(X_train_raw, X_test_raw, y_train_raw, y_test_raw)

        X_train, y_train, per_class_train = resample_X_and_y(X_train_raw, y_train_raw, sampling_mode=sampling_mode)

        X_test, y_test = X_test_raw, y_test_raw
        if resample_test:
            X_test, y_test, _ = resample_X_and_y(X_test_raw, y_test_raw, sampling_mode=sampling_mode)

        clf = get_new_classifier(model, model_config=model_config)
        clf.fit(X_train, y_train)
        train_score = clf.score(X=X_train, y=y_train)
        test_score = clf.score(X=X_test, y=y_test)

        test_scores.append(test_score)
        train_scores.append(train_score)
        per_class_trains.append(per_class_train)
        models.append(clf)

        # `importances` / `plot_importances` are not defined in this view;
        # the original comment labels this as permutation importance.
        imp = importances(clf, X_test, y_test)
        plot_importances(imp)

        print('K-fold split', i + 1, '{:.2f} sec'.format(time() - start_time))

    results = {
        'train_scores': train_scores,
        'test_scores': test_scores,
        'models': models,
        'per_class_trains': per_class_trains
    }

    return results

In [None]:
def get_new_classifier(model, model_config=None):
    """Build an unfitted classifier for a short model name.

    Args:
        model: One of ``'rf'`` (random forest), ``'lr'`` (logistic
            regression), ``'nb'`` (Gaussian naive Bayes), ``'dt'`` (decision
            tree) or ``'knn'`` (k-nearest neighbours).
        model_config: Extra keyword arguments. They are applied to ``'rf'``
            and ``'lr'`` only, where they override the built-in defaults;
            the other models ignore them, as before.

    Returns:
        A new, unfitted scikit-learn estimator.

    Raises:
        ValueError: If ``model`` is not a known name (previously this
            silently returned None and failed later at ``fit``).
    """
    overrides = dict(model_config or {})

    if model == 'rf':
        # Merge instead of passing both sets of keywords, so a config key
        # such as n_jobs overrides the default rather than raising
        # "got multiple values for keyword argument".
        params = {'random_state': RANDOM_STATE, 'verbose': 0, 'n_jobs': -1}
        params.update(overrides)
        return RandomForestClassifier(**params)
    if model == 'lr':
        params = {'solver': 'sag', 'random_state': RANDOM_STATE, 'verbose': 0, 'n_jobs': -1}
        params.update(overrides)
        return LogisticRegression(**params)
    if model == 'nb':
        return GaussianNB()
    if model == 'dt':
        return tree.DecisionTreeClassifier(
            random_state=RANDOM_STATE
        )
    if model == 'knn':
        return KNeighborsClassifier()

    raise ValueError('Unknown model name: {!r}'.format(model))

## Constants

In [None]:
# Seed passed as random_state to the splitters and classifiers in this notebook.
RANDOM_STATE = 42

# Notebook-wide settings. File-name and id-field entries are keyed by
# analysis level: 'dataset' or 'field'.
config = {
    'seed': RANDOM_STATE,
    # NOTE(review): 'processeds' looks like a possible typo -- confirm that
    # the directory really has this name before changing it.
    'features_directory': '../feature/processeds',
    'features_df_file_name': {
        'dataset': 'features_aggregate_single_pairwise.csv',
        'field': 'field_level_features.csv',
    },
    'outcomes_df_file_name': {
        'dataset': 'chart_outcomes.csv',
        'field': 'field_level_outcomes.csv',
    },
    # Identifier column used to join features with outcomes.
    'id_field': {
        'dataset': 'fid',
        'field': 'field_id',
    },
    # Pickle file loaded as the feature-name lookup.
    'feature_set_lookup_file_name': 'feature_names_by_type.pkl',
    'train_size': 0.6,
    'cv': 1,
    'sampling_mode': 'over',
    # Passed to pd.read_csv as nrows; None reads every row.
    'nrows': None,
    'log_outcomes': True,
}

In [None]:
# Results are written to a per-day folder: ../results/<YYYY-MM-DD>.
base_results_dir = '../results'
results_dir = join(base_results_dir, strftime('%Y-%m-%d'))

# makedirs creates the base folder and the dated folder in one call, and
# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Allowed outcome values for each dataset-level outcome variable, keyed by
# prediction task name.
dataset_prediction_task_to_outcomes = {
    'all_one_trace_type': {
        'two': ['line', 'bar'],
        'three': ['line', 'scatter', 'bar'],
        'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
    },
    'has_single_src': {'two': [True, False]},
    'num_x_axes': {'numeric': list(range(10))},
    'num_y_axes': {'numeric': list(range(10))},
}

# Same structure for field-level outcome variables.
field_prediction_task_to_outcomes = {
    'trace_type': {
        'two': ['line', 'bar'],
        'three': ['line', 'scatter', 'bar'],
        'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
    },
    'is_xsrc': {'two': [True, False]},
    'is_ysrc': {'two': [True, False]},
    'is_x_or_y': {'two': ['x', 'y']},
    'is_single_src': {'two': [True, False]},
}

# Lookup by analysis level, then outcome variable, then prediction task.
prediction_task_to_outcomes = {
    'dataset': dataset_prediction_task_to_outcomes,
    'field': field_prediction_task_to_outcomes,
}

## Data Loading

In [None]:
# Analysis level to load: 'dataset' or 'field' (selects the CSV files below).
dataset = 'dataset'
features_df = pd.read_csv(
    join(config['features_directory'], config['features_df_file_name'][dataset]),
    nrows=config['nrows']
)
outcomes_df = pd.read_csv(
    join(config['features_directory'], config['outcomes_df_file_name'][dataset]),
    nrows=config['nrows']
)
print('Features:', features_df.shape)
print('Outcomes:', outcomes_df.shape)

if dataset == 'field':
    def is_x_or_y(is_xsrc, is_ysrc):
        """Return 'x' or 'y' when exactly one source flag is set, else None."""
        # NOTE(review): a NaN flag is truthy in Python, so a row where both
        # flags are NaN is labelled 'x' -- confirm how missing flags are
        # encoded in the outcomes file.
        if is_xsrc and pd.isnull(is_ysrc):
            return 'x'
        if is_ysrc and pd.isnull(is_xsrc):
            return 'y'
        return None

    # otypes=[object] keeps the None results as real None values (otherwise
    # the output dtype is inferred from the first result, and None may be
    # coerced to a string), and lets an empty frame be processed.
    outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y, otypes=[object])(
        outcomes_df['is_xsrc'], outcomes_df['is_ysrc']
    )
    outcomes_df['is_single_src'] = outcomes_df['is_single_xsrc'] | outcomes_df['is_single_ysrc']

# NOTE: pickle can execute arbitrary code on load; only use a lookup file
# produced by this project's own pipeline.
with open(join(config['features_directory'], config['feature_set_lookup_file_name']), 'rb') as lookup_file:
    feature_names_by_type = pickle.load(lookup_file)

In [None]:
# One entry per prediction task. Index 0 is a None placeholder so that a
# task's list index equals its 'pref_id'.
tasks = [None] + [
    {
        'outcome_variable_name': outcome_variable_name,
        'prediction_task': prediction_task,
        'sampling_mode': sampling_mode,
        'pref_id': pref_id,
        'dataset': level,
    }
    for pref_id, (outcome_variable_name, prediction_task, sampling_mode, level) in enumerate(
        [
            # (outcome variable, prediction task, sampling mode, analysis level)
            ('all_one_trace_type', 'two', 'over', 'dataset'),
            ('all_one_trace_type', 'three', 'over', 'dataset'),
            ('all_one_trace_type', 'six', 'over', 'dataset'),
            ('has_single_src', 'two', 'over', 'dataset'),
            ('num_x_axes', 'numeric', 10000, 'dataset'),
            ('num_y_axes', 'numeric', 10000, 'dataset'),
            ('trace_type', 'two', 'over', 'field'),
            ('trace_type', 'three', 'over', 'field'),
            ('trace_type', 'six', 'over', 'field'),
            ('is_single_src', 'two', 'over', 'field'),
            ('is_x_or_y', 'two', 'over', 'field'),
        ],
        start=1,
    )
]

## Iterating Over Tasks

In [None]:
# Candidate models: 'name' is the key understood by get_new_classifier and
# 'config' is forwarded to it as model_config.
models = [
    { 'name': 'dt', 'config': {} },
    { 'name': 'lr', 'config': { 'multi_class': 'multinomial', 'max_iter': 100} },
    { 'name': 'nb', 'config': {} },
    { 'name': 'rf', 'config': {} },
    { 'name': 'knn', 'config': { 'n_jobs': -1, 'verbose': 2, 'n_neighbors': 3 } }
]

feature_sets = ['dimensions', 'types', 'values', 'names']

# One summary dict per (model, task, feature set) combination that was run.
task_results = []

# Currently: model index 3 ('rf'), tasks 1-7, feature set index 3 ('names').
for model_index, task_index, feature_set_index in product([3], range(1, 8), [3]):
    task = tasks[task_index]
    feature_set = feature_sets[feature_set_index]
    model = models[model_index]

    print('Task:', task['outcome_variable_name'], task['prediction_task'])
    print('Feature set:', feature_set)
    print('Model:', model['name'])

    # Only one analysis level's CSVs are loaded at a time; a task for the
    # other level would fail with a KeyError on its outcome column.
    if task['dataset'] != dataset:
        print('Skipping task: needs {!r}-level data but {!r} is loaded'.format(task['dataset'], dataset))
        print('\n')
        continue

    id_field = config['id_field'][task['dataset']]
    task_outcomes = prediction_task_to_outcomes[task['dataset']][task['outcome_variable_name']][task['prediction_task']]

    print(task_outcomes)

    outcomes_df_subset = format_outcomes_df(
        outcomes_df,
        task['outcome_variable_name'],
        task_outcomes,
        id_field=id_field
    )
    final_df = join_features_and_outcomes(features_df, outcomes_df_subset, on=id_field)

    # Every column left of the outcome column is treated as a feature.
    last_index = final_df.columns.get_loc(task['outcome_variable_name'])
    X, y = final_df.iloc[:, :last_index], final_df.iloc[:, last_index]

    print(y.value_counts())

    feature_set_names = [ c for c in get_feature_set_names_by_type(
        feature_names_by_type,
        task_type=task['dataset'],
        feature_set=feature_set
    ) if c in final_df.columns ]

    # get_k_fold_scores returns a single results dict; unpacking it into
    # three names raised "too many values to unpack".
    k_fold_results = get_k_fold_scores(
        model['name'],
        X[feature_set_names],
        y,
        n_splits=1,
        train_size=0.6,
        test_size=0.2,
        sampling_mode=task['sampling_mode'],
        model_config=model['config'],
        resample_test=True
    )

    test_scores = k_fold_results['test_scores']
    mean_k_fold_test = np.mean(test_scores)
    # Standard error of the mean = std / sqrt(number of scores). The earlier
    # formula also divided by 1.96 and took n from config['cv'] instead of
    # the scores actually collected.
    sem_k_fold_test = np.std(test_scores) / np.sqrt(len(test_scores))
    top_model = k_fold_results['models'][np.argmax(test_scores)]
    print('Mean of k-fold CV:', mean_k_fold_test)
    print('SEM of k-fold CV:', sem_k_fold_test)
    print('\n')
    task_results.append({
        'task': task,
        'model': model,
        'feature_set': feature_set,
        'mean': mean_k_fold_test,
        'sem': sem_k_fold_test,
        'top_fitted_model': top_model,
        'features': feature_set_names
    })