# Tuning

In this notebook, the ensemble classifiers are tuned. Most of the time, only the number of trees is tuned. In the following cases, other parameters also had to be tuned:
- Class Switching (Swt and SwtET): tuning the switching ratio **p_switch**.
- Random Patches (RadP and RadPET): tuning the number of samples **max_samples** and the number of features **max_features**.
- Vadaboost (Vad and VadET): tuning the regularization parameter **lbda**.

## Imports

In [16]:
import os, sys
sys.path.append("..")

import pickle
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform

from utils.filter import snr
from utils.rms_score import rms_metric
from utils.load_classifiers import load_classifiers_names, load_classifiers, load_grid_parameters
from utils.load_dataset import load_big_datasets_names, load_small_datasets_names, load_datasets_names

from sklearn.cross_validation import train_test_split, KFold, LeaveOneOut, StratifiedKFold
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

In [10]:
## Load the names of all classifiers and datasets
clf_names = load_classifiers_names()
data_names = load_datasets_names()
# Subsets of the dataset names; the tuning loop uses them to decide whether
# a run is done without and/or with feature selection.
small_data = load_small_datasets_names()
big_data = load_big_datasets_names()

# Single-argument print(...) yields the same output on Python 2 and Python 3
# (the former print statements were a syntax error on Python 3).
print("Classifiers names:  %s \n" % (clf_names,))
print("Datasets names:  %s" % (data_names,))

Classifiers names:  ['AdET', 'AdSt', 'Ad', 'RadPET', 'BagET', 'ArcX4', 'ET', 'VadET', 'SwtET', 'Vad', 'ArcX4ET', 'Bag', 'RF', 'LogB', 'CART', 'RadP', 'RotET', 'RotbET', 'Rotb', 'Rot', 'Swt'] 

Datasets names:  ['musk', 'relathe', 'madelon', 'pcmac', 'promoters', 'spam', 'leukemia', 'parkinson', 'ovarian', 'wpbc', 'ionosphere', 'basehock', 'breast_cancer', 'colon', 'pima', 'cleve', 'wdbc', 'spect', 'smk_can']


## Datasets

All the datasets are stored in an HDF5 file for convenient access. To get it, download the file from this [link](https://s3-eu-west-1.amazonaws.com/ensemble-comparison-data/datasets.h5) and store it in the `data` folder.

Example:
```python
datasets = pd.HDFStore("../data/datasets.h5")
# get the basehock dataset
X = datasets['basehock_data']
y = datasets['basehock_target']
```

In [11]:
### Load all the datasets
# Open the store read-only: the cells shown here only read from it, and with
# the default append mode a missing file would be silently created empty,
# deferring the failure to the first dataset lookup.
# NOTE(review): switch back to the default mode if another cell writes to it.
datasets = pd.HDFStore("../data/datasets.h5", mode="r")

In [None]:
# Scorer for the 1-rms metric (not originally in scikit-learn)
rms = rms_metric()
results_path = "./results/tuning/"

# Random seed of the train/validation split (then the experiment is reproducible)
tuning_seed = 0
# Random seed used to shuffle the stratified folds of every grid search
gs_seed = 1
# Number of jobs given to each grid search (1 = no parallelism)
n_jobs = 1
# Number of top-ranked features kept when feature selection is applied
n_selected_features = 100

# (key in the result dictionary, `scoring` argument of GridSearchCV);
# the searches are fitted in this order.
scorings = (("gs_acc", "accuracy"), ("gs_auc", "roc_auc"), ("gs_rms", rms))


def _ensure_dir(path):
    """Create the directory `path` (and missing parents) if needed; return `path`."""
    if not os.path.isdir(path):
        os.makedirs(path)
    return path


def _grid_search_summary(clf, grid_params, scoring, cv, X, y, n_jobs=1):
    """Fit one GridSearchCV and return what is stored for it.

    Returns a dict with the keys "best_params" (the `best_params_` attribute
    of the fitted search) and "grid_scores" (its `grid_scores_` attribute).
    """
    gs = GridSearchCV(clf, grid_params, scoring=scoring, n_jobs=n_jobs, cv=cv)
    gs.fit(X, y)
    return {"best_params": gs.best_params_, "grid_scores": gs.grid_scores_}


for clf_name in clf_names:
    # Load the classifier; its grid parameters are loaded afresh for each run
    clf = load_classifiers(clf_name)

    print("===== Tuning classifier %s =====" % clf_name)

    # Directory storing the tuning results of this classifier (pickle format)
    clf_path = _ensure_dir(os.path.join(results_path, clf_name))

    for data_name in data_names:
        print("===== Tuning %s ===== on %s" % (clf_name, data_name))

        data_path = _ensure_dir(os.path.join(clf_path, data_name))
        X = datasets[data_name + '_data']
        y = datasets[data_name + '_target']

        # Without feature selection: every classifier, except the "Rot*"
        # ones which are only run on the small datasets.
        run_without_fs = (not clf_name.startswith("Rot")) or (data_name in small_data)
        # With feature selection: only when the dataset is big.
        run_with_fs = data_name in big_data
        if not (run_without_fs or run_with_fs):
            continue

        # The grid searches are fitted on the 20% split (X_val, y_val);
        # X_train and y_train are not used in this cell.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=tuning_seed, stratify=y)

        # (feature_selection flag, output sub-directory, data to tune on)
        runs = []
        if run_without_fs:
            runs.append((False, "noFS", X_val))
        if run_with_fs:
            # Feature ranking is only needed (and thus only computed) here.
            # NOTE(review): the ranking uses the full dataset (X, y), not
            # only the split used for tuning -- confirm this is intended.
            best_features = snr(X, y)
            runs.append((True, "FS", X_val.iloc[:, best_features[:n_selected_features]]))

        for fs, dir_name, X_tune in runs:
            out_path = _ensure_dir(os.path.join(data_path, dir_name))

            # Reload the grid so that the override below never leaks into
            # another run.
            grid_params = load_grid_parameters(clf_name)
            # NOTE(review): this override is applied only without feature
            # selection, as in the original code -- confirm it should not
            # also apply to the feature-selection run.
            if not fs and clf_name.startswith('Swt') and data_name == "leukemia":
                grid_params['p_switch'] = np.arange(0.1, 0.25, 0.05)

            # The same folds are shared by the three grid searches
            kf = StratifiedKFold(y_val, n_folds=3, shuffle=True, random_state=gs_seed)

            result_tuning = {
                "classifier_name": clf_name,
                "dataset_name": data_name,
                "validation_seed": tuning_seed,
                "feature_selection": fs,
                "gs_seed": gs_seed,
            }
            for key, scoring in scorings:
                result_tuning[key] = _grid_search_summary(
                    clf, grid_params, scoring, kf, X_tune, y_val, n_jobs=n_jobs)

            # The file holds a pickle despite its .txt extension
            with open(os.path.join(out_path, "result_tuning.txt"), 'wb') as fp:
                pickle.dump(result_tuning, fp)