# Basic Cross-Validation Experiment on the ExtraSensory data set with Backend Comparison

## Set up the Notebook

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys, os
# Put the parent directory at the front of the import path and make it the
# working directory. The path is resolved once, before the chdir, so both
# calls receive the same absolute location.
parent_dir = os.path.abspath('..')
sys.path.insert(0, parent_dir)
os.chdir(parent_dir)

# Disable multi-threading in NumPy: limit MKL, NumExpr and OpenMP to a
# single thread each.
for thread_var in ("MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS", "OMP_NUM_THREADS"):
    os.environ[thread_var] = "1"

## Import modules

In [None]:
from Blocks.data_loader import extrasensory_data_loader
from Blocks.filter import MisingLabelFilter,  MisingDataColumnFilter, Take
from Blocks.imputer import Imputer
from Blocks.normalizer import Normalizer
from Blocks.experimental_protocol import ExpTrainTest, ExpCV, ExpWithin
from Blocks.results_analysis import ResultsConcat, ResultsCVSummarize, DataYieldReport

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.dummy import DummyClassifier

import matplotlib.pyplot as plt

from Workflow.workflow import workflow
import Workflow.compute_graph
import time
import pandas as pd


## Define the workflow

This workflow performs a 5-fold cross-validation experiment on the ExtraSensory data set sleeping prediction task. The model used is logistic regression with a fixed regularization hyper-parameter. The workflow includes a column filter that screens out feature dimensions that are less than 20% observed, and a missing label filter that removes instances without labels. Next, the workflow performs mean imputation followed by feature normalization. Lastly, the cross-validation experiment is run on the pre-processed data set and results are evaluated using four metrics (accuracy, F1, precision and recall). The results from each fold are combined and then summarized.

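This demonstration compares the run time of three different workflow scheduler backends: sequential, multithreaded, and multiprocess. As written, the `configs` dictionary in the cell below enables only the multithreaded backend; uncomment the other entries to include the sequential and multiprocess backends in the comparison.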

In [None]:
# Models to evaluate, keyed by a short name; the dict is passed to ExpCV.
estimators = {"LR": LogisticRegression(solver="lbfgs", max_iter=100)}

# Scoring functions passed to the cross-validation experiment.
metrics = [accuracy_score, f1_score, precision_score, recall_score]

# Build the pipeline stage by stage; each stage receives the previous
# stage's result as its input.
df_raw = extrasensory_data_loader(label="SLEEPING")
df_cf = MisingDataColumnFilter(df_raw)
df_lf = MisingLabelFilter(df_cf)
df_imp = Imputer(df_lf)
df_norm = Normalizer(df_imp)
res_cv = ExpCV(df_norm, estimators, metrics=metrics)
res_cat = ResultsConcat(res_cv)
summary = ResultsCVSummarize(res_cat)

# Scheduler backends to time, each mapped to the worker counts to try.
# Only "multithread" is currently enabled; the other entries are disabled.
configs = {
    # "sequential": [1],
    "multithread": [2, 4],
    # "multiprocess": [1, 2, 4],
}

# Elapsed seconds per run, keyed as "<backend>(<workers>)".
results = {}
for config, worker_counts in configs.items():
    for workers in worker_counts:
        label = "%s(%d)" % (config, workers)
        print(config, workers)

        # The workflow object is created outside the timed region, so only
        # flow.run() is measured.
        flow = workflow([summary])

        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted during a run.
        start = time.perf_counter()
        output = flow.run(
            backend=config,
            num_workers=workers,
            monitor=True,
            from_scratch=True,
        )
        results[label] = time.perf_counter() - start
        print(config, workers, results[label])
    
