## Building a pipeline

In [1]:
# Enable inline plotting and pull numpy/matplotlib into the interactive namespace.
# NOTE(review): no explicit `import numpy as np` is visible in this notebook, so the
# `np` name used by later cells presumably comes from this magic -- confirm before removing it.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [3]:
# Load the digits dataset once, then unpack the feature matrix and the labels.
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

In [4]:
# The solver is stated explicitly: 'liblinear' supports both the 'l1' and 'l2'
# penalties that the grid search below explores, whereas the library default
# solver differs between scikit-learn releases and may reject 'l1'.
logistic = LogisticRegression(solver="liblinear")
pca = PCA()

# Chain the two steps into a single estimator. The step names ('pca', 'logistic')
# are the prefixes used to address each step's parameters, e.g. 'pca__n_components'.
pipe = Pipeline(steps=[("pca", pca), ("logistic", logistic)])

In [5]:
# Fit every step of the pipeline (PCA, then logistic regression) on the digits data.
# The call is the last expression of the cell, so its return value is displayed.
pipe.fit(X_digits, y_digits)

Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [6]:
# Predict the label of the first sample. The `[:1]` slice (rather than `[0]`)
# keeps the leading sample axis, so a one-row batch is passed to the pipeline.
pipe.predict(X_digits[:1])

array([0])

## Finding the best model

In [7]:
# GridSearchCV lives in `sklearn.model_selection` on current scikit-learn releases;
# the old `sklearn.grid_search` module is only kept as a fallback for legacy installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [8]:
# Candidate values for each hyper-parameter explored by the grid search.
n_components = [20, 40, 64]  # number of components kept by PCA
# Inverse of regularization strength: three log-spaced values, 1e-4, 1e-2 and 1e0.
# Only (start, stop, num) are passed; the fourth positional parameter of
# np.logspace is `endpoint`, not a base, so it is left at its default.
Cs = np.logspace(-4, 0, 3)
penalty = ["l1", "l2"]  # norm used by the logistic regression penalization
class_weight = [None, "balanced"]  # weights associated with classes

# Grid keys follow the '<pipeline step name>__<parameter name>' convention.
# Every combination is evaluated with 5-fold cross-validation on 8 parallel jobs.
estimator = GridSearchCV(
    pipe,
    {
        "pca__n_components": n_components,
        "logistic__C": Cs,
        "logistic__class_weight": class_weight,
        "logistic__penalty": penalty,
    },
    n_jobs=8,
    cv=5,
)
estimator.fit(X_digits, y_digits)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=None, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'logistic__class_weight': [None, 'balanced'], 'logistic__C': array([  1.00000e-04,   1.00000e-02,   1.00000e+00]), 'logistic__penalty': ['l1', 'l2'], 'pca__n_components': [20, 40, 64]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [9]:
# `grid_scores_` only exists on older scikit-learn releases; newer ones expose the
# per-candidate results as `cv_results_`. Use whichever is available and display it.
grid_scores = getattr(estimator, "grid_scores_", None)
if grid_scores is None:
    grid_scores = estimator.cv_results_
grid_scores

[mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l1', 'logistic__C': 0.0001},
 mean: 0.87702, std: 0.03613, params: {'logistic__class_weight': None, 'pca__n_components': 20, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88592, std: 0.03789, params: {'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.88703, std: 0.03829, params: {'logistic__class_weight': None, 'pca__n_components': 64, 'logistic__penalty': 'l2', 'logistic__C': 0.0001},
 mean: 0.09905, std: 0.00076, params: {'logistic__class_weight': 'balanced', 'pca__n_components': 20, 'logistic__penal

In [10]:
# Report the best mean cross-validated score, then the parameter combination behind it.
best_score, best_params = estimator.best_score_, estimator.best_params_
print(best_score)
print(best_params)

0.922092376183
{'logistic__class_weight': None, 'pca__n_components': 40, 'logistic__penalty': 'l1', 'logistic__C': 1.0}


## Exercise

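Using the same pipeline and grid-search approach, find the best model for the diabetes dataset. Note that this is a regression dataset, so the final pipeline step must be a regressor rather than `LogisticRegression`.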

http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes