In [1]:
"""Create scikit-learn pipeline with grid search example."""
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd

# Fetch the classic iris multi-class classification dataset from scikit-learn.
iris_data = load_iris()

# Assemble one pandas DataFrame: the feature columns followed by a trailing
# 'target' column. Features and labels are stacked into a single array first,
# so the class labels end up stored as floats (0.0, 1.0, 2.0).
col_names = [*iris_data['feature_names'], 'target']
df = pd.DataFrame(
    np.column_stack((iris_data['data'], iris_data['target'])),
    columns=col_names,
)

# Separate the feature matrix from the label column, then hold out 20% of the
# rows as a test set. The seed is fixed so the split is repeatable.
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Machine learning pipeline: standardize the features, then fit an
# entropy-based decision tree.
# The tree gets a fixed random_state because scikit-learn's tree builder
# permutes the features at every split; without a seed, repeated runs may
# grow different trees and the search results below would not be
# reproducible. The value matches the seed used for the train/test split.
pipeline = Pipeline(steps=[('standardize', StandardScaler()),
                           ('decision_tree', DecisionTreeClassifier(criterion='entropy',
                                                                    random_state=42))])

# Grid of hyperparameters to search: tree depths 2 through 7. The
# '<step>__<param>' key routes the value to the 'decision_tree' step.
params = [{'decision_tree__max_depth': list(range(2, 8))}]

# Scorer used to measure model performance during cross validation:
# micro-averaged F1.
scorer = make_scorer(f1_score, average='micro')

# 10-fold cross-validated search over the grid, using all available cores
# and also recording the training-fold scores.
clf = GridSearchCV(estimator=pipeline, scoring=scorer,
                   param_grid=params, n_jobs=-1, cv=10, verbose=3,
                   return_train_score=True)

# Run the cross-validated grid search on the training split only.
clf.fit(X_train, y_train)

# Report the best cross-validation score, the winning hyperparameters, and
# the score of the search object on the held-out test set.
print(f'Best CV F1 Score:  {clf.best_score_!s}')
print(f'Best Model Params:  {clf.best_params_!s}')
print(f'Test set F1 Score:  {scorer(clf, X_test, y_test)!s}')


Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best CV F1 Score:  0.925
Best Model Params:  {'decision_tree__max_depth': 3}
Test set F1 Score:  1.0


In [2]:
# Rebuild the final pipeline with the tree depth chosen by the grid search.
# Reading it from clf.best_params_ (rather than hand-copying the printed
# value) keeps this cell in sync with the search if the grid or data change.
best_depth = clf.best_params_['decision_tree__max_depth']
pipeline = Pipeline(steps=[('standardize', StandardScaler()),
                           ('decision_tree', DecisionTreeClassifier(criterion='entropy',
                                                                    max_depth=best_depth))])

In [4]:
# Train the final pipeline (scaler + depth-limited tree) on the training split.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('standardize', StandardScaler()),
                ('decision_tree',
                 DecisionTreeClassifier(criterion='entropy', max_depth=3))])

In [5]:
# Predict class labels for the held-out test rows.
pipeline.predict(X=X_test)

array([1., 0., 2., 1., 1., 0., 1., 2., 1., 1., 2., 0., 0., 0., 0., 1., 2.,
       1., 1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0.])

In [7]:
import joblib


In [8]:
# Persist the fitted pipeline to disk; the call returns the list of files written.
joblib.dump(value=pipeline, filename="model.pkl")

['model.pkl']

In [10]:
# Reload the pipeline saved above. joblib files are pickle-based, so only
# load files that come from a trusted source.
model = joblib.load(filename="model.pkl")

In [11]:
# Preview the first rows of the held-out features instead of dumping the
# whole frame into the notebook output.
X_test.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4
31,5.4,3.4,1.5,0.4
64,5.6,2.9,3.6,1.3
141,6.9,3.1,5.1,2.3
68,6.2,2.2,4.5,1.5
82,5.8,2.7,3.9,1.2


In [12]:
# The pipeline was fitted on a DataFrame, so the single sample is passed as a
# one-row DataFrame with the same columns. A bare nested list carries no
# column names, which recent scikit-learn versions report as a feature-name
# mismatch warning.
sample = pd.DataFrame([[4.8, 3.1, 1.6, 0.2]], columns=X_test.columns)
model.predict(sample)

array([0.])

In [14]:
# Distinct class labels present in the test split, in order of first appearance.
pd.unique(y_test)

array([1., 0., 2.])