# Model selection - drop questionnaire items

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5019184/

## Environment initialization

In [1]:
from __future__ import division, print_function

%autosave 0
%matplotlib notebook
%load_ext autoreload
%autoreload 2

import os
import sys

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import model_selection
from sklearn import preprocessing

# Make the project-local modules in the parent directory importable.
sys.path.append("../")

import mod_data
import mod_evaluation
import mod_compute
import mod_viewer
import mod_latent

Autosave disabled


## Notebook variables

In [2]:
# CSV file that the model-search cell loads with `pd.read_csv`.
data_path = "../data/source/all_data_wide.csv"

# Forwarded as the `metric_eval` keyword of `mod_compute.stats_seq_drop`.
metric_eval = "val_categorical_accuracy"

## Execution params

In [3]:
# --- Output -----------------------------------------------------------------
# Directory the search results are cached in; only used when `save_stats` is on.
# NOTE(review): this path is relative to the notebook's working directory,
# whereas `data_path` goes through `../` - confirm this is intended.
results_path = "data/results"

# --- Search set-up ----------------------------------------------------------
# Passed to `mod_compute.stats_seq_drop` as `q_exc_init`.
q_exc = list()

# First argument of both the cache-signature generator and `stats_seq_drop`.
metric_id = "categorical_accuracy_class"

# Key into `mod_compute.models`, plus the keyword arguments forwarded to
# `stats_seq_drop` and `model_train` along with the selected model.
model_ref_id = "linear"
model_ref_kwargs = dict()

# NOTE(review): not referenced by any cell visible in this notebook section.
scaling = None

# Both forwarded to `stats_seq_drop`; `n_splits` is also part of the cache key.
n_splits = 25
drop_max = 100

# --- Data partitioning ------------------------------------------------------
# Rows are kept when their `train` column equals this value; it is also part
# of the cache key.
train = 0
# `test_size` argument of `train_test_split` (size of the validation part).
test_size = 200

# --- Persistence ------------------------------------------------------------
# When True, each repetition's results are written below `results_path`.
save_stats = False

## Running params

In [4]:
# Look up the reference model in the registry exposed by `mod_compute`.
model_ref = mod_compute.models[model_ref_id]

# Prefix shared by every cache signature generated for this reference model.
cache_pre = "model_" + model_ref_id

evaluating model_linear_categorical_accuracy_class_25_r20


## Model search

In [8]:
# Number of random train/validation partitions to evaluate. Each index is used
# both as the `random_state` of the split and as the `_r<index>` cache suffix.
n_train_val_seeds = 50

# The source CSV is identical for every repetition: parse it once here and
# hand each iteration its own copy instead of re-reading the file every time.
df_source = pd.read_csv(data_path)

# Create the output directory up front so that a bad path fails before the
# expensive search starts. `os.makedirs` also creates missing parent
# directories, which `os.mkdir` does not.
if save_stats and not os.path.isdir(results_path):
    os.makedirs(results_path)

for train_val_random in range(n_train_val_seeds):

    cache_post = str(n_splits) + '_' + str(train) + '_r' + str(train_val_random)

    cache_sig = mod_evaluation.cache_sig_gen(
        metric_id,
        cache_pre=cache_pre,
        cache_post=cache_post
    )

    print('evaluating', cache_sig)

    # Fresh copy: new columns are added below, and the pristine source frame
    # must stay untouched for the next repetition.
    df_data = df_source.copy()

    # NOTE(review): kept inside the loop because it is not visible from here
    # whether `modelPP` is deterministic; if it is, hoist it next to the CSV load.
    _, logits = mod_latent.modelPP(df_data[mod_latent.s_latent_ref])

    df_data[mod_data.logits_ref] = pd.DataFrame(logits)

    # Keep only the rows whose `train` flag matches the configured value.
    df_data = df_data[df_data['train'] == train]

    # Stratify on the arg-max over the logit columns (presumably the class
    # label) so both parts keep the same class proportions.
    df_train, df_val = model_selection.train_test_split(
        df_data,
        test_size=test_size,
        random_state=train_val_random,
        shuffle=True,
        stratify=np.argmax(np.array(df_data[mod_data.logits_ref]), axis=1)
    )

    df_train_val = (df_train, df_val)

    df_x_train, df_y_train, df_x_val, df_y_val = mod_data.load_data_df(df_train_val=df_train_val)

    # Model search on the training part only; the validation part is reserved
    # for the hold-out check below.
    stats_best, info_best = mod_compute.stats_seq_drop(
        metric_id,
        model_ref,
        df_x_train,
        df_y_train,
        q_exc_init=q_exc,
        drop_max=drop_max,
        n_splits=n_splits,
        metric_eval=metric_eval,
        **model_ref_kwargs
    )

    ## Model validation
    # Retrain every candidate reported by the search on the full training part
    # and score it on the held-out validation part.
    stats_val = {}

    for model_id in info_best:

        # Columns to keep, derived from the candidate's excluded items.
        q_sel = mod_compute.get_q_sel(info_best[model_id]['q_exc'])

        x_train, y_train, x_val, y_val = mod_compute.df_to_model_input(
            df_x_train[q_sel], df_y_train, df_x_val[q_sel], df_y_val
        )

        _, my_stats = mod_compute.model_train(
            x_train, y_train, x_val, y_val,
            model_ref,
            **model_ref_kwargs
        )

        # Wrapped in a single-element list, as in the original cell.
        stats_val[model_id] = [my_stats]

    ## Save results
    if save_stats:

        results = {
            'info': info_best,
            'stats': stats_best,
            'stats_val': stats_val,
        }

        mod_evaluation.to_cache(cache_sig, results_path, results)

HBox(children=(FloatProgress(value=0.0), Output()))

drop_q	 drop_sum	val_acc_m (val_acc_min - val_acc_max)	 name

1 	 16 		0.9575 	(0.7778 - 1.0000) 		 BISBAS_0020
2 	 16 		0.9688 	(0.9000 - 1.0000) 		 1 + PANAS_0008
3 	 16 		0.9649 	(0.8556 - 1.0000) 		 2 + MIPIP_0009
4 	 16 		0.9742 	(0.8750 - 1.0000) 		 3 + ACS_0002
5 	 16 		0.9755 	(0.8778 - 1.0000) 		 4 + RPA_0004
6 	 16 		0.9713 	(0.8000 - 1.0000) 		 5 + ACS_0024
7 	 16 		0.9772 	(0.8750 - 1.0000) 		 6 + FIRST_0004
8 	 16 		0.9761 	(0.8750 - 1.0000) 		 7 + ACS_0017
9 	 16 		0.9629 	(0.7714 - 1.0000) 		 8 + FSS_0002
10 	 16 		0.9731 	(0.8333 - 1.0000) 		 9 + PANAS_0006
11 	 16 		0.9759 	(0.8778 - 1.0000) 		 10 + TEPS_0011
12 	 16 		0.9784 	(0.8861 - 1.0000) 		 11 + ACS_0008
13 	 16 		0.9729 	(0.8750 - 1.0000) 		 12 + PSAS_0006
14 	 16 		0.9784 	(0.8528 - 1.0000) 		 13 + ACS_0010
15 	 16 		0.9752 	(0.7889 - 1.0000) 		 14 + FSS_0005
16 	 16 		0.9768 	(0.8111 - 1.0000) 		 15 + RRS_0024
17 	 16 		0.9734 	(0.8000 - 1.0000) 		 16 + FSS_0007
18 	 16 		0.9741 	(0.8000 - 1.0000) 		 17 + ACS