# Dependencies

In [2]:
# Imports
import re
import os
import json
import random
from copy import deepcopy
from os.path import join
import sys
base_path = os.path.abspath(os.path.join('..'))
if base_path not in sys.path:
    sys.path.append(base_path)
    
import math
import numpy as np
import scipy as sc
from scipy.stats import entropy, normaltest, mode
import pandas as pd
import sklearn as sk
from sklearn import svm
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import RFE
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.utils import shuffle
from sklearn.externals import joblib

import seaborn

from IPython.display import display, HTML

import pickle

import seaborn as sns
import matplotlib as mpl
mpl.rcParams['figure.figsize']
import matplotlib.patches as mpatches

from collections import Counter

import itertools
from itertools import compress
import matplotlib.pyplot as plt

from time import time, strftime

from pprint import pprint
pd.options.display.max_rows = 999

## Helpers

In [3]:
from helpers.analysis import *
from helpers.processing import *

In [4]:
def format_outcomes_df(outcomes_df, outcome_variable_name, outcomes, id_field='fid'):
    """Keep only the rows whose outcome value is one of ``outcomes``.

    Missing outcome values are replaced by ``False`` before filtering, so such
    rows survive only when ``False`` is itself one of the requested outcomes.

    Parameters
    ----------
    outcomes_df : pandas.DataFrame
        Frame containing at least the ``id_field`` and
        ``outcome_variable_name`` columns. Its outcome column is overwritten
        with the NaN-filled version (the same caller-visible effect the
        original in-place fill had).
    outcome_variable_name : str
        Name of the outcome column to filter on.
    outcomes : list
        Outcome values to keep.
    id_field : str, default 'fid'
        Name of the identifier column to carry into the result.

    Returns
    -------
    pandas.DataFrame
        The ``id_field`` and ``outcome_variable_name`` columns of the rows
        whose outcome is in ``outcomes``.
    """
    print('Subsetting outcomes')
    # Assign the filled column back explicitly. Calling fillna(inplace=True)
    # on `df[col]` is chained assignment: pandas warns about it and, under
    # copy-on-write, it no longer updates `outcomes_df` at all.
    outcomes_df[outcome_variable_name] = outcomes_df[outcome_variable_name].fillna(value=False)
    keep_row = outcomes_df[outcome_variable_name].isin(outcomes)
    return outcomes_df.loc[keep_row, [id_field, outcome_variable_name]]

In [5]:
from string import capwords
def format_feature_name(n):
    """Split a raw feature name into a display name and its aggregation.

    Underscores are replaced by spaces first. A name containing ``-agg-`` is
    split on that marker into ``(capwords(base), aggregation)``; any other
    name is returned title-cased together with an empty aggregation string.
    """
    spaced = n.replace('_', ' ')
    if '-agg-' not in spaced:
        return spaced.title(), ''
    # Unpacking into exactly two parts means a name with more than one
    # '-agg-' marker raises ValueError.
    base, aggregation = spaced.split('-agg-')
    return capwords(base), aggregation

In [6]:
def format_feature_importance_outcomes(statistic, columns, p=[], returned_fields=['statistic', 'statistic_norm', 'feature_name_formatted']):
    """Display a per-feature statistic as an HTML table, largest value first.

    Parameters
    ----------
    statistic : array-like
        One score per feature, in the same order as ``columns``.
    columns : iterable of str
        Raw feature names; each is formatted with ``format_feature_name``.
    p : array-like, optional
        One p-value per feature. When non-empty it is added as a ``'p'``
        column (include ``'p'`` in ``returned_fields`` to show it).
    returned_fields : list of str
        Columns of the result table to render. Available columns are
        ``feature_name``, ``feature_name_formatted``, ``aggregations``,
        ``statistic``, ``statistic_norm``, ``index`` (pre-sort position) and,
        when supplied, ``p``.

    Returns
    -------
    None
        The table is rendered with ``display(HTML(...))``.
    """
    # Use the `columns` argument; the previous version silently read the
    # notebook-global `X.columns` and ignored what the caller passed in.
    feature_names = list(columns)

    formatted_feature_names, feature_aggregations = [], []
    for formatted_name, aggregation in map(format_feature_name, feature_names):
        formatted_feature_names.append(formatted_name)
        feature_aggregations.append(aggregation)

    # Coerce to an array so that plain lists can be divided by the maximum.
    statistic = np.asarray(statistic)

    results_df = pd.DataFrame({
        'feature_name': feature_names,
        'feature_name_formatted': formatted_feature_names,
        'aggregations': feature_aggregations,
        'statistic': statistic,
        'statistic_norm': statistic / np.nanmax(statistic),
    })

    # `if p:` raises "truth value of an array is ambiguous" for numpy arrays
    # with more than one element, so test the length explicitly.
    if p is not None and len(p) > 0:
        results_df['p'] = p

    results_df = results_df.sort_values(['statistic'], ascending=False).reset_index()

    display(HTML(results_df[list(returned_fields)].to_html(index=False)))

## Load Data

In [7]:
# Directories, relative to the notebook's working directory.
features_directory = '../features/processed'
base_results_dir = '../results'
model_dir = '../models'

# Persisted classifier to inspect. `model` selects how importances are read
# further down ('lr' -> coef_, 'rf' -> feature_importances_).
model = 'rf'
model_file_name = 'clf__model-rf__dataset-dataset__featureset-names__outcome-all_one_trace_type__task-two__perclass-296203__acc-0.938055.pkl'

# Granularity ('dataset' or 'field') used when rebuilding the feature sets,
# and the feature-set label.
dataset, feature_set = 'dataset', 'names'

In [12]:
# Run configuration read by the loading and outcome-selection cells below.
config = dict(
    features_directory='../features/processed',
    features_df_file_name='features_aggregate_single_pairwise.csv',
    outcomes_df_file_name='chart_outcomes.csv',
    # Other outcome columns noted by the author: 'trace_type', 'is_ysrc'.
    outcome_variable_name='has_single_src',
    prediction_task='two',
    dataset='dataset',
    # Forwarded to pd.read_csv as `nrows`.
    nrows=None,
)

In [10]:
base_results_dir = '../results'

# One results sub-directory per calendar day, e.g. ../results/2024-01-31.
results_dir = join(base_results_dir, strftime('%Y-%m-%d'))

# makedirs creates the base directory as well as the dated one, and with
# exist_ok=True re-running the cell is a no-op. This replaces the racy
# "check os.path.exists, then os.mkdir" pairs.
os.makedirs(results_dir, exist_ok=True)

In [13]:
# Read the feature and outcome tables from the configured directory; both
# reads share the same `nrows` setting from `config`.
features_df, outcomes_df = (
    pd.read_csv(
        join(config['features_directory'], config[file_name_key]),
        nrows=config['nrows']
    )
    for file_name_key in ('features_df_file_name', 'outcomes_df_file_name')
)

In [15]:
# Outcome values that make up each prediction task, keyed first by outcome
# column and then by task name, for dataset-level outcomes.
dataset_prediction_task_to_outcomes = {
    'all_one_trace_type': {
        'two': ['line', 'bar'],
        'three': ['line', 'scatter', 'bar'],
        'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
    },
    'has_single_src': {
        'two': [ True, False ]
    }
}

# The same lookup for field-level outcomes.
field_prediction_task_to_outcomes = {
    'trace_type': {
        'two': ['line', 'bar'],
        'three': ['line', 'scatter', 'bar'],
        'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
    },
    'is_xsrc': {
        'two': [ True, False ]
    },
    'is_ysrc': {
        'two': [ True, False ]
    },
    'is_x_or_y': {
        'two': [ 'x', 'y' ]
    },
    'is_single_src': {
        'two': [ True, False ]
    }
}

# Pick the id column and the outcome lookup together. Previously an
# unrecognised `config['dataset']` got id_field='field_id' but left
# `prediction_task_to_outcomes` unbound (or stale from an earlier run).
if config['dataset'] == 'dataset':
    config['id_field'] = 'fid'
    prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
elif config['dataset'] == 'field':
    config['id_field'] = 'field_id'
    prediction_task_to_outcomes = field_prediction_task_to_outcomes
else:
    raise ValueError(
        "config['dataset'] must be 'dataset' or 'field', got %r" % (config['dataset'],)
    )

In [16]:
if config['dataset'] == 'field':
    def is_x_or_y(is_xsrc, is_ysrc):
        """Label a field 'x' or 'y' when exactly one source flag is set.

        Returns 'x' when ``is_xsrc`` is set and ``is_ysrc`` is missing, 'y'
        when ``is_ysrc`` is set and ``is_xsrc`` is missing, otherwise None.
        """
        # NaN is truthy in Python, so a bare `if is_xsrc and ...` labelled
        # rows with BOTH flags missing as 'x'. Require a non-null, truthy flag.
        x_is_set = pd.notnull(is_xsrc) and bool(is_xsrc)
        y_is_set = pd.notnull(is_ysrc) and bool(is_ysrc)
        if x_is_set and pd.isnull(is_ysrc): return 'x'
        if y_is_set and pd.isnull(is_xsrc): return 'y'
        return None

    # Build the column with a plain comprehension instead of np.vectorize:
    # without `otypes`, vectorize infers the output dtype from the first row,
    # so a leading 'x'/'y' gives a string dtype and later None results are
    # stored as text rather than as missing values.
    outcomes_df['is_x_or_y'] = [
        is_x_or_y(x_flag, y_flag)
        for x_flag, y_flag in zip(outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
    ]
    outcomes_df['is_single_src'] = outcomes_df['is_single_xsrc'] | outcomes_df['is_single_ysrc']

In [17]:
# Restrict the outcomes to the values that define the configured task.
outcomes_df_subset = format_outcomes_df(
    outcomes_df,
    config['outcome_variable_name'],
    prediction_task_to_outcomes[config['outcome_variable_name']][config['prediction_task']],
    id_field=config['id_field']
)

# Join features with outcomes, then drop the id column. The drop is done by
# reassignment rather than in place, so the frame returned by the helper is
# not mutated and the cell can be re-run safely.
final_df = join_features_and_outcomes(features_df, outcomes_df_subset, on=config['id_field'])
final_df = final_df.drop([config['id_field']], axis=1, errors='ignore')
last_index = final_df.columns.get_loc(config['outcome_variable_name'])

# Every column before the outcome column is a feature; the outcome column
# itself is the target.
X = final_df.iloc[:, :last_index]
y = final_df.iloc[:, last_index]

# Draw a fixed-size sample (sklearn's resample draws with replacement by
# default). The seed makes the subset reproducible across kernel restarts;
# previously every run produced a different sample.
subset = 100000
RESAMPLE_SEED = 0
X_subset, y_subset = resample(X, y, n_samples=subset, random_state=RESAMPLE_SEED)

Subsetting outcomes
Joining feature and outcome DFs


## From Persisted Models

In [None]:
# NOTE(review): 'feature_set_lookup_file_name' is not one of the keys set in
# the `config` dict defined earlier in this notebook, so this lookup raises
# KeyError until that entry is added.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load lookup files produced by this project.
lookup_path = join(config['features_directory'], config['feature_set_lookup_file_name'])
with open(lookup_path, 'rb') as lookup_file:  # closed deterministically, unlike a bare open()
    feature_names_by_type = pickle.load(lookup_file)

# Name fragments shared by the dataset-level and field-level groupings.
type_feature_names = ['data_type', 'general_type']
name_feature_names = ['in_name', 'edit_distance-', 'uppercase', 'shared_words', 'name_length']

if dataset == 'dataset':
    feature_names = feature_names_by_type['aggregate_single_field'] + feature_names_by_type['aggregate_pairwise_field']

    # Dataset-level groups are selected by matching fragments against the
    # aggregated feature names (prefix match for dimensions, substring match
    # for the other groups).
    dimensions_feature_names = ['exists-agg', 'length-agg']
    value_feature_names = [ 'min-agg', 'max-agg', 'mean-agg', 'median-agg', 'range-agg', 'var-agg', 'std-agg', 'range_overlap', 'is_normal', 'q25-agg', 'q75-agg', 'kurtosis-agg', '_none', 'unique', 'mode', 'anova', 'nested', 'chi_sq', 'ks_', 'correlation', 'shared_elements', 'identical', 'sequence', '_space-agg', 'sorted', 'entropy-agg-', 'gini-agg-', 'abs_dev-agg-', 'normality', 'monotonic', 'outliers', 'moment', 'quant_coeff_disp', 'skewness', 'value_length']

    unique_feature_sets = {
        'basic': [],  # feature_names_by_type['basic'],
        'dimensions': [ x for x in feature_names if any(x.startswith(e) for e in dimensions_feature_names) ],
        'types': [ x for x in feature_names if any(e in x for e in type_feature_names) ],
        'values': [ x for x in feature_names if any(e in x for e in value_feature_names) ],
        'names': [ x for x in feature_names if any(e in x for e in name_feature_names) ]
    }
elif dataset == 'field':
    feature_names = feature_names_by_type['single_field']

    # Field-level dimension and value groups are literal feature names; only
    # the type and name groups are matched by substring.
    dimensions_feature_names = ['length']
    value_feature_names = ['has_none', 'percentage_none', 'num_none', 'num_unique_elements', 'unique_percent', 'is_unique', 'list_entropy', 'mean_value_length', 'median_value_length', 'min_value_length', 'max_value_length', 'std_value_length', 'percentage_of_mode', 'mean', 'normalized_mean', 'median', 'normalized_median', 'var', 'std', 'coeff_var', 'min', 'max', 'range', 'normalized_range', 'entropy', 'gini', 'q25', 'q75', 'med_abs_dev', 'avg_abs_dev', 'quant_coeff_disp', 'skewness', 'kurtosis', 'moment_5', 'moment_6', 'moment_7', 'moment_8', 'moment_9', 'moment_10', 'percent_outliers_15iqr', 'percent_outliers_3iqr', 'percent_outliers_1_99', 'percent_outliers_3std', 'has_outliers_15iqr', 'has_outliers_3iqr', 'has_outliers_1_99', 'has_outliers_3std', 'normality_statistic', 'normality_p', 'is_normal_5', 'is_normal_1', 'is_sorted', 'is_monotonic', 'sortedness', 'lin_space_sequence_coeff', 'log_space_sequence_coeff', 'is_lin_space', 'is_log_space']

    unique_feature_sets = {
        'basic': [],  # feature_names_by_type['basic'],
        'dimensions': dimensions_feature_names,
        'types': [ x for x in feature_names if any(e in x for e in type_feature_names) ],
        'values': value_feature_names,
        'names': [ x for x in feature_names if any(e in x for e in name_feature_names) ]
    }
else:
    # Previously an unrecognised value skipped both branches silently and
    # left stale (or no) feature sets behind.
    raise ValueError("dataset must be 'dataset' or 'field', got %r" % (dataset,))

# Cumulative feature sets: each tier contains every feature of the tiers
# before it, in the order dimensions -> types -> values -> names.
feature_sets = {}
_cumulative_features = list(unique_feature_sets['basic'])
for _tier in ('dimensions', 'types', 'values', 'names'):
    # `+` builds a new list each time, so the tiers never alias one another.
    _cumulative_features = _cumulative_features + unique_feature_sets[_tier]
    feature_sets[_tier] = _cumulative_features

In [None]:
# Field-level feature sets. This cell used to run unconditionally, which
# replaced the dataset-level `feature_names`, `unique_feature_sets` and
# `feature_sets` built by the previous cell even when dataset == 'dataset'.
# It is now guarded so that it only applies to field-level analyses.
if dataset == 'field':
    feature_names = feature_names_by_type['single_field']
    dimensions_feature_names = ['length']
    type_feature_names = ['data_type', 'general_type']
    value_feature_names = ['has_none', 'percentage_none', 'num_none', 'num_unique_elements', 'unique_percent', 'is_unique', 'list_entropy', 'mean_value_length', 'median_value_length', 'min_value_length', 'max_value_length', 'std_value_length', 'percentage_of_mode', 'mean', 'normalized_mean', 'median', 'normalized_median', 'var', 'std', 'coeff_var', 'min', 'max', 'range', 'normalized_range', 'entropy', 'gini', 'q25', 'q75', 'med_abs_dev', 'avg_abs_dev', 'quant_coeff_disp', 'skewness', 'kurtosis', 'moment_5', 'moment_6', 'moment_7', 'moment_8', 'moment_9', 'moment_10', 'percent_outliers_15iqr', 'percent_outliers_3iqr', 'percent_outliers_1_99', 'percent_outliers_3std', 'has_outliers_15iqr', 'has_outliers_3iqr', 'has_outliers_1_99', 'has_outliers_3std', 'normality_statistic', 'normality_p', 'is_normal_5', 'is_normal_1', 'is_sorted', 'is_monotonic', 'sortedness', 'lin_space_sequence_coeff', 'log_space_sequence_coeff', 'is_lin_space', 'is_log_space']
    name_feature_names = ['in_name', 'edit_distance-', 'uppercase', 'shared_words', 'name_length']

    # Dimension and value groups are literal feature names; type and name
    # groups are selected by substring match against `feature_names`.
    unique_feature_sets = {
        'basic': [],  # feature_names_by_type['basic'],
        'dimensions': dimensions_feature_names,
        'types': [ x for x in feature_names if any(e in x for e in type_feature_names) ],
        'values': value_feature_names,
        'names': [ x for x in feature_names if any(e in x for e in name_feature_names) ]
    }

    # Cumulative sets: each tier adds its own features to all earlier tiers.
    feature_sets = {
        'dimensions': unique_feature_sets['basic'] + unique_feature_sets['dimensions'],
        'types': unique_feature_sets['basic'] + unique_feature_sets['dimensions'] + unique_feature_sets['types'],
        'values': unique_feature_sets['basic'] + unique_feature_sets['dimensions'] + unique_feature_sets['types'] + unique_feature_sets['values'],
        'names': unique_feature_sets['basic'] + unique_feature_sets['dimensions'] + unique_feature_sets['types'] + unique_feature_sets['values'] + unique_feature_sets['names']
    }

In [None]:
# Show the features assigned to the 'values' group as this cell's output.
unique_feature_sets['values']

## Load Persisted Model and Feature Importances

In [None]:
# Load the persisted classifier.
# NOTE(review): the file is read from `base_results_dir`, while `model_dir`
# is defined above and never used - confirm which directory holds the model.
# NOTE(security): joblib.load unpickles the file; only load trusted models.
clf = joblib.load(join(base_results_dir, model_file_name))

# Independent copy of the cumulative 'names' feature set, used below to
# label the importances.
feature_set_names = list(feature_sets['names'])

In [None]:
# Per-feature importance for the loaded classifier.
if model == 'lr':
    # Magnitude of the first row of the coefficient matrix.
    importances = np.abs(clf.coef_[0])
elif model == 'rf':
    importances = clf.feature_importances_
else:
    # Previously an unknown model type left `importances` unbound (or stale
    # from an earlier run of the cell).
    raise ValueError("model must be 'lr' or 'rf', got %r" % (model,))

columns = feature_set_names

# Importances are matched to names purely by position, so a length mismatch
# means the feature list does not correspond to the one the model was fit on.
if len(columns) != len(importances):
    raise ValueError(
        'Expected %d feature names to match the model, got %d'
        % (len(importances), len(columns))
    )

# Most important feature first.
indices = np.argsort(importances)[::-1]
feature_importance_table = [ [columns[i], importances[i]] for i in indices ]
feature_importance_df = pd.DataFrame(feature_importance_table, columns=['feature', 'importance'])

In [None]:
# Group aggregated variants of a feature (e.g. '<base>-agg-mean') under the
# part of the name before the '-agg-' marker.
feature_importance_df['aggregated'] = feature_importance_df['feature'].apply(lambda k: k.split('-agg-')[0])

# Aggregate only the numeric `importance` column: including the string
# `feature` column makes 'mean' raise on pandas 2.x, which no longer drops
# non-numeric columns silently.
(
    feature_importance_df
    .groupby('aggregated', as_index=False)[['importance']]
    .aggregate(['max', 'mean', 'sum'])
    .sort_values([('importance', 'max')], ascending=False)
)