# Pipelines

The purpose of a pipeline in scikit-learn is to bundle multiple modeling/preprocessing steps into one, so that you can use them in cross-validation or hyperparameter search.

In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)


In [2]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)


In [3]:
p = make_pipeline(
        MinMaxScaler(),
        LogisticRegression()
      )
p.fit(X, y)
p.score(X, y)

scores = cross_val_score(p, X, y, cv=5)
print(scores)
print(scores.mean())
print(scores.std())

[0.76666667 0.83333333 0.76666667 0.86666667 0.96666667]
0.8400000000000001
0.07423685817106694


#### Alternatively, import Pipeline:

In [4]:
from sklearn.pipeline import Pipeline
from sklearn import svm

model = Pipeline([
        ('scaler', MinMaxScaler()),
        ('svc', svm.SVC(kernel='linear', C=1.0)),
    ])

model.fit(X, y)

print(model.score(X, y))

scores = cross_val_score(model, X, y, cv=5)
print(scores)
print(scores.mean())
print(scores.std())

0.9666666666666667
[0.96666667 0.96666667 0.96666667 0.93333333 1.        ]
0.9666666666666666
0.02108185106778919


## Building your own Estimator

Often it is neccessary to write code for feature engineering for which no scikit-learn function exists. If you want to apply such operations to new data, you can write your own Estimator class that can be integrated into a pipeline:

In [6]:
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class FillMissingValues(BaseEstimator, TransformerMixin):
    
    def __init__(self, strategy='mean'):
        """Store the hyperparameters"""
        self.strategy = strategy

    def fit(self, X):
        self.mean_ = X.mean()
        return self

    def transform(self, X):
        return X.fillna(self.mean_)

    def get_params(self, deep=True):
        """Scikit requires us to return a dictionary here"""
        return {"strategy": self.strategy}

    def set_params(self, **parameters):
        """Scikit uses this to set parameters"""
        for parameter, value in parameters.items():
             setattr(self, parameter, value)
        return self

X = pd.Series([1, 3, np.NaN, 4, 0])
m = FillMissingValues()
m.fit(X)
print(m.transform(X))

# use ClassifierMixin in you want to create your own model
#  (needs to have a predict() method)

0    1.0
1    3.0
2    2.0
3    4.0
4    0.0
dtype: float64
