# Model selection - drop questionnaire items

Reference: <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5019184/>

## Environment initialization

In [16]:
from __future__ import division, print_function

%autosave 0
%matplotlib notebook
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import model_selection
from sklearn import preprocessing

import sys
sys.path.append("../")

import mod_data
import mod_evaluation
import mod_compute
import mod_viewer
import mod_latent

Autosave disabled
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Notebook variables

In [17]:
# CSV file read by the data-import cell; the path is relative to the
# notebook's working directory.
data_path = '../data/source/all_data_wide.csv'

# Metric name forwarded to the model search as `metric_eval` and used as the
# dictionary key under which results are grouped for the plotting helpers.
metric_eval = 'val_categorical_accuracy'

## Execution params

In [18]:
# Directory the results cache is written to when `save_stats` is True.
results_path = 'data/results'

# Passed to the model search as `q_exc_init`
# (presumably the items excluded from the start -- confirm in mod_compute).
q_exc = []

# Metric identifier passed to the model search and to the cache-signature generator.
metric_id = 'categorical_accuracy_class'

# `model_ref_id` is the key looked up in `mod_compute.models`;
# `model_ref_kwargs` is forwarded as **kwargs to the search and training calls.
model_ref_id = 'linear'
model_ref_kwargs = {}

# Feature scaling applied in the pre-processing cell: None, 'minmax' or 'standard'.
scaling = None

# Both are forwarded to the model search; `n_splits` also becomes part of the
# cache signature. (`drop_max` is presumably the maximum number of items to
# drop -- confirm in mod_compute.)
n_splits = 25
drop_max = 120

# `random_state` of the train/validation split; appended to the cache
# signature when it is not None.
train_val_random = 10

# Only rows whose 'train' column equals `train` are kept; `test_size` is
# passed as the `test_size` argument of the train/validation split.
train = 1
test_size = 200

# When True, the save-results cell writes the results to `results_path`.
save_stats = False

## Running params

In [19]:
# Look up the reference model by its key.
model_ref = mod_compute.models[model_ref_id]

# The cache prefix names the model.
cache_pre = ''.join(('model_', model_ref_id))

# The cache suffix records the number of splits and, when a split seed is
# set, that seed as an extra '_r<seed>' part.
suffix_parts = [str(n_splits)]
if train_val_random is not None:
    suffix_parts.append('r' + str(train_val_random))
cache_post = '_'.join(suffix_parts)

cache_sig = mod_evaluation.cache_sig_gen(
    metric_id, cache_pre=cache_pre, cache_post=cache_post
)

print('evaluating', cache_sig)

evaluating model_linear_categorical_accuracy_class_25_r10


## Import train and holdout data

In [20]:
# Read the complete data set from disk.
df_data = pd.read_csv(data_path)

# Run the latent model on the reference columns and store its logits next to
# the raw data (the first return value of modelPP is not needed here).
_, logits = mod_latent.modelPP(df_data[mod_latent.s_latent_ref])

df_data[mod_data.logits_ref] = pd.DataFrame(logits)

# Keep only the rows whose 'train' flag matches the requested value.
df_data = df_data[df_data['train']==train]

# Split into train and validation parts, stratified on the arg-max of the
# logits columns. `model_selection` is imported explicitly in the imports
# cell: `import sklearn` alone does not guarantee that the
# `sklearn.model_selection` submodule is loaded.
df_train, df_val = model_selection.train_test_split(
    df_data,
    test_size=test_size,
    random_state=train_val_random,
    shuffle=True,
    stratify=np.argmax(np.array(df_data[mod_data.logits_ref]), axis=1)
)

df_train_val = (df_train, df_val)

In [21]:
# Turn the (train, validation) pair into separate x / y tables.
xy_frames = mod_data.load_data_df(df_train_val=df_train_val)
df_x_train, df_y_train, df_x_val, df_y_val = xy_frames

# Print the table shapes as a quick sanity check.
for shape_row in (
    ('x train:', df_x_train.shape, '\ty train:', df_y_train.shape),
    ('x val:\t', df_x_val.shape, '\ty val:\t', df_y_val.shape),
):
    print(*shape_row)

x train: (511, 176) 	y train: (511, 5)
x val:	 (200, 176) 	y val:	 (200, 5)


## Pre-processing

In [22]:
# Optional feature scaling. The scaler is fitted on the training features only
# and then applied to both the training and the validation features.
# NOTE: this cell rebinds df_x_train / df_x_val, so re-running it without
# re-running the data cells would scale already-scaled data.
if scaling is not None:

    if scaling=='minmax':
        scaler = sk.preprocessing.MinMaxScaler()

    elif scaling=='standard':
        scaler = sk.preprocessing.StandardScaler()

    else:
        # Fail loudly instead of hitting a NameError (or silently reusing a
        # scaler left over from a previous run) further down.
        raise ValueError(
            "unknown scaling {!r}; expected None, 'minmax' or 'standard'".format(scaling)
        )

    scaler.fit(df_x_train)

    columns = df_x_train.columns

    # scaler.transform returns a bare array; rebuild the frames with the
    # original column names and row index so they keep the same index as in
    # the unscaled path.
    df_x_train = pd.DataFrame(
        scaler.transform(df_x_train), columns=columns, index=df_x_train.index
    )
    df_x_val = pd.DataFrame(
        scaler.transform(df_x_val), columns=columns, index=df_x_val.index
    )

    print('scaling applied:', scaling)

else:
    print('no scaling applied')

no scaling applied


## Model search

In [None]:
# Run the sequential-drop model search on the training split only; the
# validation split is held back for the model-validation cell.
search_output = mod_compute.stats_seq_drop(
    metric_id, model_ref, df_x_train, df_y_train,
    q_exc_init=q_exc,
    drop_max=drop_max,
    n_splits=n_splits,
    metric_eval=metric_eval,
    **model_ref_kwargs
)

# The search returns a (stats, info) pair.
stats_best, info_best = search_output

HBox(children=(FloatProgress(value=0.0), Output()))

drop_q	 drop_sum	val_acc_m (val_acc_min - val_acc_max)	 name

1 	 16 		0.9533 	(0.8500 - 1.0000) 		 TEPS_0012
2 	 16 		0.9549 	(0.7548 - 1.0000) 		 1 + ACS_0009
3 	 16 		0.9630 	(0.8381 - 1.0000) 		 2 + PSAS_0013
4 	 16 		0.9605 	(0.8167 - 1.0000) 		 3 + PSAS_0009
5 	 16 		0.9627 	(0.8667 - 1.0000) 		 4 + FSS_0007
6 	 16 		0.9625 	(0.8500 - 1.0000) 		 5 + PSAS_0002
7 	 16 		0.9654 	(0.8548 - 1.0000) 		 6 + ACS_0002
8 	 16 		0.9708 	(0.8833 - 1.0000) 		 7 + PSAS_0011


## Model validation

In [None]:
# Re-train every model found by the search on the full training split and
# score it on the validation split.
stats_val = {}

for model_id in info_best:

    # Columns selected for this model, derived from its exclusion list.
    kept_columns = mod_compute.get_q_sel(info_best[model_id]['q_exc'])

    train_inputs, train_targets, val_inputs, val_targets = mod_compute.df_to_model_input(
        df_x_train[kept_columns],
        df_y_train,
        df_x_val[kept_columns],
        df_y_val
    )

    # Only the statistics (second return value) are kept.
    _, holdout_stats = mod_compute.model_train(
        train_inputs, train_targets, val_inputs, val_targets,
        model_ref,
        **model_ref_kwargs
    )

    # Stored as a one-element list, one entry per model id.
    stats_val[model_id] = [holdout_stats]

## Save results

In [None]:
# Persist the search and validation results when saving is enabled.
if save_stats:

    # os.makedirs (unlike os.mkdir) also creates missing parent directories,
    # which a nested `results_path` such as 'data/results' needs when the
    # parent folder does not exist yet.
    if not os.path.isdir(results_path):
        os.makedirs(results_path)

    results = {
        'info': info_best,
        'stats': stats_best,
        'stats_val': stats_val,
    }

    mod_evaluation.to_cache(cache_sig, results_path, results)

## Plot cross-validation results

In [None]:
# Group each result set under the evaluation metric name; the helpers below
# receive the three dictionaries in this order.
info = {metric_eval: info_best}
stats = {metric_eval: stats_best}
my_stats_val = {metric_eval: stats_val}
grouped_results = (info, stats, my_stats_val)

# Per-question tables for the search results and for the validation results.
df_questions, df_questions_val = mod_evaluation.get_df_questions(
    *grouped_results, ci=True
)

# Companion tables consumed by the conditional-accuracy plots.
df_questions_ca, df_questions_val_ca = mod_evaluation.get_df_questions_ca(
    *grouped_results, ci=False
)

In [None]:
# One [table, label] entry: the search results for the evaluation metric,
# labelled with the metric's entry in `sort_params_short`.
cv_curve = [df_questions[metric_eval], mod_viewer.sort_params_short[metric_eval]]

mod_viewer.plot_accuracy([cv_curve])

In [None]:
# Search-result tables for the evaluation metric.
cv_questions = df_questions[metric_eval]
cv_questions_ca = df_questions_ca[metric_eval]

mod_viewer.plot_conditional_accuracy(cv_questions, cv_questions_ca)

## Plot holdout results

In [None]:
# Two [table, label] entries: the search results and the validation results
# (labelled 'holdout'), drawn in a single plot_accuracy call.
cv_series = [df_questions[metric_eval], mod_viewer.sort_params_short[metric_eval]]
holdout_series = [df_questions_val[metric_eval], 'holdout']

mod_viewer.plot_accuracy([cv_series, holdout_series])

In [None]:
# Validation-result tables for the evaluation metric.
holdout_questions = df_questions_val[metric_eval]
holdout_questions_ca = df_questions_val_ca[metric_eval]

mod_viewer.plot_conditional_accuracy(holdout_questions, holdout_questions_ca)