In [36]:
import os

import numpy as np
from sklearn.model_selection import GridSearchCV

%run ../util.ipynb

## Import data based on pipeline type

In [3]:
def import_data(pipeline_type, data_dir="../data_matrices"):
    """Load the feature matrix and the targets from ``.npy`` files.

    Parameters
    ----------
    pipeline_type : str
        When equal to ``'classification'`` the targets are rounded to the
        nearest integer (NumPy rounding, i.e. halves go to the even
        neighbour) and returned as a 1-D integer array with one label per
        sample.  For any other value the targets are returned as stored.
    data_dir : str, optional
        Directory that contains ``X.npy`` and ``Y.npy``.  The default is the
        location that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X, Y)`` as NumPy arrays.

    Raises
    ------
    ValueError
        If classification targets contain NaN or infinity (they cannot be
        turned into integer labels), or if the targets do not hold exactly
        one value per sample.
    """
    X = np.load(os.path.join(data_dir, "X.npy"))
    Y = np.load(os.path.join(data_dir, "Y.npy"))

    if pipeline_type == 'classification':
        # An integer cast of NaN/inf would silently yield garbage labels,
        # so reject such targets up front.
        if not np.all(np.isfinite(Y)):
            raise ValueError("classification targets must be finite to be rounded to integer labels")
        # Vectorised rounding; reshape keeps exactly one label per sample
        # (a column vector is flattened) and astype guarantees an integer
        # dtype even when Y is empty.
        Y = np.round(Y).reshape(len(Y)).astype(int)
    return X, Y

## Create model dictionary

In [20]:
def create_model_dict():
    """Build the two-level lookup table of available models.

    The outer key is the pipeline type (``'regression'`` or
    ``'classification'``), the inner key is the model name, and each value is
    a ``(pipeline, parameters)`` tuple.  The ``*_pipeline`` and
    ``*_parameters`` names are not defined in this function; they must
    already exist in the notebook namespace when it is called (presumably
    provided by ``pipelines.py`` / ``parameters.py`` -- verify).

    Returns
    -------
    dict
        ``{pipeline_type: {model_name: (pipeline, parameters)}}``.
    """
    regression_models = {
        'ridge': (ridge_regression_pipeline, ridge_regression_parameters),
        'lasso': (lasso_regression_pipeline, lasso_regression_parameters),
        'knn': (knn_regression_pipeline, knn_regression_parameters),
    }
    classification_models = {
        'knn': (knn_classification_pipeline, knn_classification_parameters),
        'svm': (svm_classification_pipeline, svm_classification_parameters),
        'lda': (lda_classification_pipeline, lda_classification_parameters),
        'qda': (qda_classification_pipeline, qda_classification_parameters),
    }
    return {
        'regression': regression_models,
        'classification': classification_models,
    }

## Grid Search

In [80]:
def grid_search(X, Y, pipeline_name, pipeline_type):
    """Tune one pipeline with ``GridSearchCV`` and report the best result.

    The pipeline and its parameter grid are looked up in the module-level
    ``model_dict`` as ``model_dict[pipeline_type][pipeline_name]``, so that
    dictionary must exist before this function is called.

    Parameters
    ----------
    X, Y
        Training data and targets, passed straight to ``GridSearchCV.fit``.
    pipeline_name : str
        Inner key into ``model_dict``.  For regression it is also used as
        the name of the pipeline step whose ``coef_`` is printed.
    pipeline_type : str
        ``'regression'`` (scored with ``neg_mean_squared_error``) or
        ``'classification'`` (scored with ``accuracy``).

    Returns
    -------
    tuple or None
        ``(score, best_estimator)``.  For regression the score is the
        negated ``best_score_`` (i.e. a positive MSE); for classification it
        is ``best_score_`` unchanged.  Any other ``pipeline_type`` that is
        present in ``model_dict`` yields ``None`` without fitting anything.
    """
    estimator, param_grid = model_dict[pipeline_type][pipeline_name]

    scoring_by_type = {
        'regression': 'neg_mean_squared_error',
        'classification': 'accuracy',
    }
    if pipeline_type not in scoring_by_type:
        return None

    search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                          scoring=scoring_by_type[pipeline_type])
    search.fit(X, Y)
    best_pipeline = search.best_estimator_

    if pipeline_type == 'classification':
        return search.best_score_, best_pipeline

    # Regression only: show the fitted coefficients when the tuned step
    # exposes them.
    model_step = best_pipeline.named_steps[pipeline_name]
    if hasattr(model_step, 'coef_'):
        print(model_step.coef_)
        print('\n')
    # Flip the sign so the caller gets a plain (positive) mean squared error.
    return -search.best_score_, best_pipeline

## MAIN

In [81]:
# Driver cell: tune the 'lasso' regression pipeline and print the result.
# The two scripts are re-run first; presumably they define the *_pipeline and
# *_parameters names that create_model_dict() looks up -- verify.
%run pipelines.py
%run parameters.py
# grid_search() reads this module-level `model_dict`, so build it beforehand.
model_dict = create_model_dict()


# Selects the entry model_dict[pipeline_type][pipeline_name] to tune.
pipeline_name = 'lasso'
pipeline_type = 'regression'
# Alternative setting (left disabled): run a classification pipeline instead.
# pipeline_type = 'classification'

# For 'classification' import_data rounds the targets to integers.
X,Y = import_data(pipeline_type)

# NOTE: despite its name, `params` receives the best fitted estimator returned
# by grid_search (grid.best_estimator_), not a dict of parameters.  `score` is
# the best grid-search score, sign-flipped by grid_search for regression.
score, params = grid_search(X, Y, pipeline_name, pipeline_type)

print(score, params)

[ -2.17029538e+00   2.58730965e-01   8.82280365e-02  -1.17100771e-01
  -5.84797218e-02   2.41384933e-03  -1.26204343e-02  -0.00000000e+00
   0.00000000e+00   0.00000000e+00  -5.29545583e-02  -7.97141502e-05]


0.135660708868 Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=12, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=0.055000000000000035, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False))])


In [89]:
# Driver cell: same steps as the 'lasso' cell, but tuning the 'ridge'
# regression pipeline.  The two scripts are re-run first; presumably they
# define the *_pipeline and *_parameters names that create_model_dict()
# looks up -- verify.
%run pipelines.py
%run parameters.py
# grid_search() reads this module-level `model_dict`, so build it beforehand.
model_dict = create_model_dict()


# Selects the entry model_dict[pipeline_type][pipeline_name] to tune.
pipeline_name = 'ridge'
pipeline_type = 'regression'
# Alternative setting (left disabled): run a classification pipeline instead.
# pipeline_type = 'classification'

# For 'classification' import_data rounds the targets to integers.
X,Y = import_data(pipeline_type)

# NOTE: despite its name, `params` receives the best fitted estimator returned
# by grid_search (grid.best_estimator_), not a dict of parameters.  `score` is
# the best grid-search score, sign-flipped by grid_search for regression.
score, params = grid_search(X, Y, pipeline_name, pipeline_type)

print(score, params)

[[-2.22529538  0.31373097  0.14322804 -0.17210077 -0.11347972  0.05741385]]


0.211751340381 Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge', Ridge(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])
