---

# Pipelines

In [1]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline


In [2]:
# Load the Iris dataset and pull out the feature matrix and the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33
)

# Show the shapes of the training partition, then the test partition.
for part_X, part_y in ((X_train, y_train), (X_test, y_test)):
    print(part_X.shape, part_y.shape)

(112, 4) (112,)
(38, 4) (38,)


In [3]:
# Two-stage workflow: standardise the features, then classify them with
# k-nearest neighbours (default settings for both steps).
steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps=steps)

# Fit on the training split only. `knn_scaled` holds whatever fit() returned;
# the evaluation below goes through `pipeline` directly.
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 on the held-out split, then the overall score.
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", pipeline.score(X_test, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.73      1.00      0.85        11
           2       1.00      0.79      0.88        19

    accuracy                           0.89        38
   macro avg       0.91      0.93      0.91        38
weighted avg       0.92      0.89      0.90        38

Accuracy: 0.8947368421052632


In [4]:
# List every tunable parameter name of the pipeline. Parameters of a step are
# exposed as '<step name>__<parameter>' (e.g. 'knn__n_neighbors'); these are
# the keys a parameter grid for this pipeline has to use.
pipeline.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'scaler', 'knn', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'knn__algorithm', 'knn__leaf_size', 'knn__metric', 'knn__metric_params', 'knn__n_jobs', 'knn__n_neighbors', 'knn__p', 'knn__weights'])

In [5]:
# Search space for the 'knn' step of the pipeline, keyed by '<step>__<parameter>'.
# 11 neighbour counts x 2 weighting schemes x 2 metrics = 44 candidates.
parameters = {
    # odd neighbour counts from 1 to 21 inclusive
    'knn__n_neighbors': list(range(1, 22, 2)),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
}

In [6]:
# Exhaustive search over every combination in `parameters`, evaluating each
# candidate pipeline with 5-fold cross-validation.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,  # -1 requests all available processors for the parallel fits
    verbose=1,  # print a short progress summary while fitting
)

In [7]:
%%time
# Run the grid search on the training split only; the %%time cell magic on the
# first line reports how long the whole cell took.
gscv.fit(X_train, y_train)

Fitting 5 folds for each of 44 candidates, totalling 220 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


CPU times: user 385 ms, sys: 83.2 ms, total: 468 ms
Wall time: 6.28 s


[Parallel(n_jobs=-1)]: Done 220 out of 220 | elapsed:    6.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=Fal

In [8]:
# Predict the held-out split through the fitted search object.
gscv_y_pred = gscv.predict(X_test)

# Per-class precision / recall / F1 for those predictions.
print(classification_report(y_true=y_test, y_pred=gscv_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.73      1.00      0.85        11
           2       1.00      0.79      0.88        19

    accuracy                           0.89        38
   macro avg       0.91      0.93      0.91        38
weighted avg       0.92      0.89      0.90        38



In [9]:
# Parameter combination that scored best during the grid search.
print("Best combination of hyperparameters:", gscv.best_params_)

# NOTE: best_score_ is the cross-validated score of that combination, computed
# on folds of the training split -- it is not a score on X_test.
print("Best accuracy:", gscv.best_score_)

Best combination of hyperparameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__weights': 'uniform'}
Best accuracy: 0.9732142857142857


In [10]:
# Keep a direct handle on the estimator chosen by the grid search.
best_mod = gscv.best_estimator_

In [11]:
# Evaluate the extracted best estimator on the held-out split.
best_mod_y_pred = best_mod.predict(X_test)
print(classification_report(y_true=y_test, y_pred=best_mod_y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.73      1.00      0.85        11
           2       1.00      0.79      0.88        19

    accuracy                           0.89        38
   macro avg       0.91      0.93      0.91        38
weighted avg       0.92      0.89      0.90        38

