Source: https://towardsdatascience.com/scikit-learn-pipeline-tutorial-with-parameter-tuning-and-cross-validation-e5b8280c01fb

### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import RandomizedSearchCV

### Load and Preprocess Data

In [2]:
data = pd.read_csv('../Data/bank-full.csv', sep=';')
target = data.pop('y')
target = target.map({'yes': 1, 'no':0})
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1234)

In [5]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [7]:
# mask for categorical columns
categorical_mask = (data.dtypes=='object')
# get a list of categorical columns
categorical_columns = data.columns[categorical_mask].tolist()
# get a list of numeric columns
num_cols = data.select_dtypes(include=['int64','float64']).columns.tolist()
# columns to apply OrdinalEncoder on.
oe_cols = [c for c in categorical_columns if data[c].nunique()>5]
# columns to apply OneHotEncoder on.
ohe_cols = [c for c in categorical_columns if data[c].nunique()<=5]
len(oe_cols), len(ohe_cols), len(num_cols)

(3, 7, 10)

In [8]:
## Prepare encoders and Imputer
ohe_unique_list = [data[c].unique().tolist() for c in ohe_cols]
oe_unique_list = [data[c].unique().tolist() for c in oe_cols]
ohe = OneHotEncoder(categories=ohe_unique_list)
oe = OrdinalEncoder(categories=oe_unique_list)
imp = SimpleImputer(strategy='constant', fill_value=0)

In [10]:
## put them all in a make_column_transformer
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html
preprocess = make_column_transformer(
    (oe, oe_cols),
    (ohe, ohe_cols),
    (imp, num_cols),
    remainder='passthrough'
)

### Prepare Pipeline with Preprocessing, Learner and Feature Selection

In [12]:
estimator = RandomForestClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html
# Compute the ANOVA F-value for the provided sample.
fs = SelectKBest(score_func=f_classif, k=5)
#selector = RFE(estimator, n_features_to_select=5, step=1)
steps = [
    ('preprocess', preprocess),
    ('select', fs),
    ('clf', estimator)
]
pipeline = Pipeline(steps)

### Fit the Pipeline
Now, It’s as simple as any other machine learning algorithm, we first fit and then use predict. Predict function does all other preprocessing and then applies the trained model.

In [13]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:,1]

pred_df = pd.DataFrame({'y': y_test,'y_pred': y_pred, 'y_pred_proba': y_pred_proba})
print(roc_auc_score(y_test, y_pred))

0.7553451129127914


In [14]:
pred_df

Unnamed: 0,y,y_pred,y_pred_proba
8992,0,0,0.0000
8278,0,0,0.0000
15815,0,0,0.0000
26590,1,1,0.5200
11866,0,0,0.0000
...,...,...,...
16198,1,1,0.5700
32816,0,0,0.0000
37945,0,1,0.5400
36386,1,0,0.2425


## Randomized Search with CV

In [16]:
param_grid = {
    'clf__max_depth': np.arange(3,10,1),
    'clf__n_estimators': np.arange(50,250,50)
}

In [17]:
rand_auc = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=5, scoring='roc_auc', cv=5, verbose=False)
rand_auc.fit(X_train, y_train)
print(rand_auc.best_score_)

0.9452372238370155


In [None]:
dir(rand_auc)

In [18]:
y_pred = rand_auc.predict_proba(X_test)[:,1]
pred_df = pd.DataFrame({'y': y_test,'y_pred': y_pred})
print(roc_auc_score(y_test, y_pred))

0.9442201222348885


## Evaluate Multiple Classifiers (See How Easy it is!)
Takes a couple of minutes!

In [None]:
%%time
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),    
    RandomForestClassifier(),
    GradientBoostingClassifier()
    ]

for classifier in classifiers:
    steps = [
        ('preprocess', preprocess),
        ('select', fs),
        ('clf', classifier)
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipeline.score(X_test, y_test))

As you can see, using scikit-learn’s Pipeline feature helps a lot in streamlining machine learning workflow and makes a data scientist's job easier and can focus their time on fine-tuning models, rather than doing data pre-processing steps repetitively.