In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Ordered (name, estimator) pairs: standardize the features, then classify.
steps = [
    ("standard_scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
]
steps

[('standard_scaler', StandardScaler()), ('classifier', LogisticRegression())]

In [3]:
# Chain the steps defined above into a single estimator.
pipe = Pipeline(steps=steps)
pipe

Pipeline(steps=[('standard_scaler', StandardScaler()),
                ('classifier', LogisticRegression())])

In [4]:
## Visualize the pipeline: switch scikit-learn's estimator display to "diagram".
from sklearn import set_config
set_config(display="diagram")

In [5]:
# Show the pipeline again, now with the "diagram" display configured above.
pipe

In [6]:
## Create a synthetic classification dataset
from sklearn.datasets import make_classification
# Seed the generator so the data (and every result derived from it) is
# reproducible on Restart & Run All; the train/test split below is seeded too.
X, y = make_classification(n_samples=1000, random_state=42)
X.shape

(1000, 20)

In [7]:
# Inspect the generated feature matrix.
X

array([[ 1.1365886 ,  0.02122456, -1.07046204, ..., -1.80110114,
         0.39132489,  0.84647861],
       [ 0.63413096,  0.95486302,  0.37757526, ..., -1.43548655,
         1.57435019,  0.09037192],
       [ 0.45197221,  0.16091323, -0.42487988, ...,  0.39801657,
        -1.36287848, -0.41350507],
       ...,
       [-1.01892843,  0.6770028 ,  0.2122892 , ..., -1.00062264,
         0.62746313,  1.27315236],
       [-1.79923784, -1.28953561,  2.06283912, ..., -0.46116267,
        -0.22926675,  0.71780701],
       [ 1.04492963,  0.3840274 ,  0.20720086, ..., -1.17617749,
         0.75323004, -0.26879561]])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train

array([[-1.94137688, -0.32940061, -0.5119563 , ...,  0.49956262,
        -1.61057437, -0.03964451],
       [-0.99972699,  0.04025459,  0.19224448, ..., -1.47495945,
         0.19013136, -0.79719642],
       [-2.48940714, -0.44061584,  1.1001332 , ...,  1.21701706,
        -0.53980454,  1.88143487],
       ...,
       [-1.44230358, -1.11642719,  0.47807353, ...,  0.6135859 ,
         1.67217253, -2.87657275],
       [ 1.52916211, -1.49704611,  0.29579666, ..., -0.15813191,
         2.24957518,  0.35184234],
       [-1.80847523,  0.67738341,  0.62188626, ...,  0.48984814,
        -1.16837837,  0.15135474]])

In [9]:
# Fit the scaler and then the logistic-regression classifier on the training split.
pipe.fit(X_train,y_train)

In [10]:
# Predict class labels for the held-out test split with the fitted pipeline.
y_pred=pipe.predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,

### Displaying a pipeline with standard scaler, dimensionality reduction and then estimator ###

In [11]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [12]:
# Three stages: standardize, project onto 3 principal components, classify with an SVM.
steps = [
    ("scaling", StandardScaler()),
    ("PCA", PCA(n_components=3)),
    ("SVC", SVC()),
]

In [13]:
# Rebind `pipe` to the scaling -> PCA -> SVC pipeline (replaces the earlier one).
pipe = Pipeline(steps=steps)
pipe

In [14]:
# Fit the scaling -> PCA -> SVC pipeline on the training split.
pipe.fit(X_train,y_train)

In [15]:
# Predict class labels for the test split with the fitted PCA/SVC pipeline.
pipe.predict(X_test)

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,

### Complex example of a column transformer ###

In [16]:
from sklearn.impute import SimpleImputer

In [17]:
## Numerical processing pipeline
import numpy as np

# Replace missing values (NaN) with the column mean, then standardize.
numeric_processor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)
numeric_processor

In [18]:
## Categorical processing pipeline
from sklearn.preprocessing import OneHotEncoder

# Fill missing categories with the literal "missing", then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on categories that
# were not seen during fit.
# The step name was misspelled ("imputation_consatnt"); step names become
# get_params()/set_params() keys and diagram labels, so it is spelled
# correctly here, matching the later categorical pipeline in this notebook.
categorical_processor = Pipeline(
    steps=[
        ("imputation_constant", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_processor

In [19]:
## Combine the processing techniques
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own sub-pipeline.
# NOTE(review): these column names look illustrative only. X_train at this
# point is a NumPy array without named columns, and this transformer is not
# fitted in the visible cells. Confirm before fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_processor, ["gender", "City"]),
        ("numerical", numeric_processor, ["age", "height"]),
    ]
)
preprocessor

In [20]:
from sklearn.pipeline import make_pipeline

# make_pipeline builds a Pipeline without explicit step names:
# preprocessing first, then the logistic-regression classifier.
pipe = make_pipeline(preprocessor, LogisticRegression())
pipe

In [21]:
# X_train is still the NumPy array from the earlier split (no named columns).
X_train

array([[-1.94137688, -0.32940061, -0.5119563 , ...,  0.49956262,
        -1.61057437, -0.03964451],
       [-0.99972699,  0.04025459,  0.19224448, ..., -1.47495945,
         0.19013136, -0.79719642],
       [-2.48940714, -0.44061584,  1.1001332 , ...,  1.21701706,
        -0.53980454,  1.88143487],
       ...,
       [-1.44230358, -1.11642719,  0.47807353, ...,  0.6135859 ,
         1.67217253, -2.87657275],
       [ 1.52916211, -1.49704611,  0.29579666, ..., -0.15813191,
         2.24957518,  0.35184234],
       [-1.80847523,  0.67738341,  0.62188626, ...,  0.48984814,
        -1.16837837,  0.15135474]])

## HyperParameter Tuning ##

In [23]:
import seaborn as sns

# Load seaborn's "tips" example dataset as a DataFrame and preview the first rows.
df = sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [25]:
# Separate the features from the regression target by column *name*.
# The earlier positional slice (iloc[:, 1:]) silently relied on `total_bill`
# being the first column; dropping it by name keeps X and y consistent even
# if the column order changes.
X = df.drop(columns="total_bill")
y = df["total_bill"]
X.head()

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.5,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4


In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed makes the split (and hence
# the predictions and grid-search results below) reproducible, matching the
# seeded split used earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [28]:
## Pipelining
# Numeric columns: mean-impute NaN values, then standardize.
_numeric_steps = [
    ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_preprocessor = Pipeline(steps=_numeric_steps)
numeric_preprocessor

In [29]:
# Categorical columns: fill missing values with the literal "missing",
# then one-hot encode (categories unseen at fit time are ignored at transform).
categorical_preprocessor = Pipeline(
    steps=[
        ("imputation_constant", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
categorical_preprocessor

In [30]:
# Apply each sub-pipeline to its own columns and concatenate the results.
# A plain Pipeline would run the two sub-pipelines *sequentially* on every
# column (one-hot encoding the numeric features, then scaling the encoded
# output), which is not the intent; ColumnTransformer runs them side by side.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["sex", "smoker", "day", "time"]),
        ("numerical", numeric_preprocessor, ["tip", "size"]),
    ]
)
preprocessor

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Column-wise preprocessing for the tips data, built from the sub-pipelines
# defined in this section (categorical_preprocessor / numeric_preprocessor)
# rather than the similarly named objects left over from the earlier example,
# so this section does not depend on that hidden state.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["sex", "smoker", "day", "time"]),
        ("numerical", numeric_preprocessor, ["tip", "size"]),
    ]
)

# Full model: preprocessing followed by a random-forest regressor.
# The step names ("preprocessor", "regressor") are the prefixes used for
# hyperparameter names such as "regressor__n_estimators".
# random_state makes the forest, and therefore the scores, reproducible.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)
pipe

In [37]:
# Fit the preprocessing + random-forest pipeline on the training split.
pipe.fit(X_train,y_train)

In [40]:
# Predict the target (total_bill) for the held-out test rows.
pipe.predict(X_test)

array([16.1251    , 20.07299167, 22.4644    , 11.76036667, 21.95635   ,
       15.1814    , 34.2959    , 11.2517    , 30.0861    , 23.1561    ,
       16.39077333, 23.64373333, 23.60532   , 19.3443    , 10.26003333,
       13.10480556, 28.687     , 23.910825  , 10.92620429, 19.557     ,
       14.00714786, 16.66641667, 20.3455    , 12.20498   , 19.8954    ,
       15.278     , 16.87501667, 30.5494    , 19.61391667, 24.35715   ,
       15.7564    , 27.2395    , 22.70208667, 21.706725  , 24.069725  ,
       18.8821    , 31.0472    , 29.3321    , 11.88574167, 17.8378    ,
       21.3853    , 18.26245   , 24.6434    , 16.09816667, 16.3066    ,
       32.6899    , 15.9641    , 10.26803333, 16.85285   ])

In [54]:
# Hyperparameter grid for the pipeline's "regressor" step
# (keys use the "<step name>__<parameter>" convention).
# max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3
# for RandomForestRegressor; 1.0 (use all features) is the equivalent value
# and is valid in both old and new versions.
param_grid = {
    "regressor__n_estimators": [100, 200, 300],
    "regressor__max_features": ["sqrt", 1.0, "log2"],
    "regressor__max_depth": [4, 6, 8, 10],
}

In [55]:
# Exhaustive search over param_grid with cross-validation, using all CPU cores.
gridsearchcv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=-1,
)
gridsearchcv

In [56]:
# Run the grid search: cross-validate every parameter combination on the training split.
gridsearchcv.fit(X_train, y_train)

In [57]:
# Predict on the test split using the best pipeline found by the search.
gridsearchcv.predict(X_test)

array([18.13408493, 17.78133153, 19.18174228, 14.85391654, 19.33724932,
       20.69656838, 29.28369869, 13.24899459, 28.01959456, 23.47018256,
       18.0116836 , 18.92328728, 19.23606474, 20.73235304, 13.36976069,
       12.78127425, 29.96503439, 25.91631552, 12.06091454, 21.45644199,
       16.51195038, 20.7895089 , 21.35683163, 13.75359768, 21.09962116,
       16.99710154, 16.85102651, 30.2858626 , 22.58264535, 22.69384745,
       17.39460861, 18.77811136, 21.98459414, 22.11938976, 25.01614125,
       21.70236515, 27.2110369 , 31.16404162, 11.93347091, 16.7383558 ,
       21.38153218, 17.84494347, 27.07414805, 17.57041362, 17.00792346,
       30.34186918, 14.31642281, 12.3830271 , 18.41898418])

In [58]:
# The pipeline configured with the best-scoring parameter combination.
gridsearchcv.best_estimator_

In [59]:
# The parameter combination from param_grid that achieved the best CV score.
gridsearchcv.best_params_

{'regressor__max_depth': 6,
 'regressor__max_features': 'sqrt',
 'regressor__n_estimators': 100}

In [60]:
# Mean cross-validated score of the best combination. No `scoring` was passed
# to GridSearchCV, so this is the regressor's default score (R^2).
gridsearchcv.best_score_

0.4633588546489921