## sklearn.pipeline.Pipeline
- Pipeline of transforms with a final estimator.
- That is, it sequentially applies a list of transforms and a final estimator.
- Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. 
- The final estimator only needs to implement fit.
- A pipeline assembles several steps so that they can be cross-validated together.
- Cross Validation: used to test the ability of a machine learning model to predict new data. This flags overfitting or selection bias and gives insights on how the model will generalize to an independent dataset.

##### What do pipelines do?
- I can apply methods like feature scaling, imputing and dimensionality reduction in pipeline fashion; any machine learning algorithm can then be applied on top, and all of it is cross-validated together.

In [1]:
from sklearn.pipeline import Pipeline
##feature Scaling
from sklearn.preprocessing import StandardScaler
#importing LR algo
from sklearn.linear_model import LogisticRegression

In [2]:
# Pair a preprocessing transformer with the final estimator (the ML algorithm).
# Each pipeline step is a (name, estimator) tuple; the tuples are collected in a list.
steps = [
    ("standard_scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
]

In [3]:
steps

[('standard_scaler', StandardScaler()), ('classifier', LogisticRegression())]

In [4]:
# Build the Pipeline from the list of (name, estimator) steps and keep it in `pipe`.
pipe = Pipeline(steps=steps)

In [5]:
pipe

Pipeline(steps=[('standard_scaler', StandardScaler()),
                ('classifier', LogisticRegression())])

In [6]:
# to visualise: set_config
from sklearn import set_config

In [7]:
# Show estimators as a diagram when they are displayed in the notebook.
set_config(display="diagram")

In [8]:
pipe

In [9]:
# Any dataset can be used here; this one is generated synthetically.
# make_classification() generates a random n-class classification problem.
# By default it produces 20 feature columns in X, 2 classes in y and 100 samples;
# n_samples is raised to 1000 here.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000)

In [10]:
X

array([[ 0.87206259,  0.00350708, -0.26201905, ...,  1.10163372,
         0.64360476,  0.16775987],
       [ 0.61835318,  0.51356027,  0.53625189, ..., -0.65818131,
        -0.76118548, -1.47710464],
       [-0.72910641, -2.01848758,  0.49350838, ...,  1.50520394,
        -0.64497849, -0.2373421 ],
       ...,
       [ 1.30219491, -0.3560658 , -1.33499749, ...,  0.91902825,
         0.93045216,  0.68493587],
       [ 0.19336917, -2.03414027,  0.32946048, ...,  0.86265806,
         0.89290973, -0.71611758],
       [ 1.66784729,  1.82019875,  0.11848861, ...,  0.44790398,
         0.32349038, -0.65883478]])

In [11]:
y

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,

In [12]:
X.shape

(1000, 20)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [14]:
X_train

array([[-0.96394176,  0.2543214 , -1.4668463 , ..., -0.85604681,
         0.41234556, -1.15593387],
       [-0.3441473 , -0.55628248, -0.71064703, ...,  0.12527108,
        -0.64690736, -0.0773938 ],
       [-1.02335617, -0.89617994, -1.66623345, ...,  0.27510921,
        -1.34659532, -0.06257454],
       ...,
       [ 2.32131478,  0.36427114,  0.49552722, ..., -0.27892843,
         0.2890696 , -2.10170944],
       [ 0.3894091 , -0.45742955, -0.25590278, ...,  0.7629092 ,
        -0.53483462,  0.42571336],
       [-1.77637354,  0.18860725,  1.34765762, ...,  1.04234035,
        -1.49603865,  0.05646045]])

In [15]:
y_train

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,

In [16]:
# Fit the whole pipeline (scaler, then classifier) on the training data.
# The cell output is the pipeline diagram because set_config(display="diagram")
# was already called above.
pipe.fit(X_train, y_train)

In [17]:
# Predict on the held-out data. At this point the pipeline only performs
# transform (not fit_transform) on the test data before the classifier predicts.
y_pred = pipe.predict(X_test)

In [18]:
y_pred

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,

#### Applying different techniques like scaling, dimension reduction etc

## Standard scaler, dimensionality reduction and estimator in pipeline

In [19]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [20]:
# Standardise -> reduce the features to 3 components -> apply the Support Vector Classifier.
steps = [
    ("scaling", StandardScaler()),
    ("PCA", PCA(n_components=3)),
    ("SVC", SVC()),
]

In [21]:
# Wrap the scaling -> PCA -> SVC steps in a second pipeline.
pipe2 = Pipeline(steps=steps)

In [22]:
# A single step can be used on its own by looking it up by its name.
# Only the scaling step is fit and applied here; the other steps are not used.
pipe2.named_steps["scaling"].fit_transform(X_train)

array([[-0.75162989,  0.25666186, -1.41104438, ..., -0.92011082,
         0.49204374, -1.22737481],
       [-0.27628295, -0.56647286, -0.66783012, ...,  0.10631227,
        -0.85263376, -0.16001875],
       [-0.79719734, -0.91162467, -1.60700777, ...,  0.26303755,
        -1.74085851, -0.14535315],
       ...,
       [ 1.76797412,  0.36831128,  0.51763242, ..., -0.31646583,
         0.33555004, -2.16334309],
       [ 0.28631288, -0.46609179, -0.22089446, ...,  0.77325874,
        -0.71036208,  0.3378714 ],
       [-1.37471863,  0.18993186,  1.35513055, ...,  1.06553363,
        -1.9305706 , -0.02755251]])

In [23]:
# Fit every step in one call instead of applying the techniques one by one.
pipe2.fit(X_train, y_train)

In [24]:
# Predict on the held-out data with the fitted scaling -> PCA -> SVC pipeline.
pipe2.predict(X_test)

array([1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,

### Column transformation using pipeline (imp)
- This handles missing values before applying other techniques in pipeline.
- Imputer: using imputer fill out missing value
- Apply scaling technique


#### Steps
1. Handling missing value
2. Handling numeric data
3. Handling categorical data.
4. Combining both numeric and categorical data 

In [25]:
from sklearn.impute import SimpleImputer
import numpy as np

In [26]:
# Numeric columns: replace missing values (NaN) with the column mean, then standardise.
numeric_processor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(strategy="mean", missing_values=np.nan)),
        ("scaler", StandardScaler()),
    ]
)

In [27]:
numeric_processor

In [28]:
# Categorical columns: fill missing values with the constant string "missing",
# then one-hot encode the categories (unknown categories are ignored by the encoder).
from sklearn.preprocessing import OneHotEncoder

categorical_processor = Pipeline(
    steps=[
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [29]:
categorical_processor

### To combine both processors, use ColumnTransformer()
- All the steps of the pipelines are listed inside ColumnTransformer()
- It transforms all the columns that are named inside it
- For instance, I have passed gender and city, which are categorical data, and age and height, which are numerical data.
- syntax = ColumnTransformer(list of (name, transformer, columns) tuples)
- This data is then transformed
- Now I need to apply an estimator on this transformation

In [30]:
#sklearn.compose.ColumnTransformer
from sklearn.compose import ColumnTransformer

In [31]:
# Send each group of columns to its own processor: the categorical processor gets
# gender and city, the numeric processor gets age and height.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_processor, ["gender", "city"]),
        ("numerical", numeric_processor, ["age", "height"]),
    ]
)

In [32]:
preprocessor
# pipeline would look like this

#### Add estimator
- this can be done using make_pipeline(): Construct a Pipeline from the given estimators.
- sklearn.pipeline.make_pipeline


In [33]:
from sklearn.pipeline import make_pipeline

In [34]:
# The final pipe has two parts:
#   1. the preprocessor, which in turn holds all the transformers;
#   2. the estimator, which is simply the ML algorithm.
# make_pipeline() builds the Pipeline from the given estimators.
final_pipe = make_pipeline(preprocessor, LogisticRegression())

In [35]:
final_pipe

In [55]:
## Example 2: using dataset

In [36]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [37]:
import seaborn as sns

In [38]:
# Load seaborn's "tips" example dataset into a DataFrame.
df = sns.load_dataset("tips")

In [39]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [40]:
# Target: the bill amount. Features: every column after the first one (total_bill).
y1 = df["total_bill"]
X1 = df.iloc[:, 1:]

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so the split
# (and everything fitted on it) is identical on every run, matching the seeded
# split used for the first example.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

In [42]:
# Numeric branch for the tips data: mean-impute NaN values, then standardise.
numeric_preprocessor1 = Pipeline(
    [
        ("imputation_mean", SimpleImputer(strategy="mean", missing_values=np.nan)),
        ("scaler", StandardScaler()),
    ]
)

In [43]:
from sklearn import set_config

In [44]:
# Show estimators as a diagram when they are displayed in the notebook.
set_config(display="diagram")

In [45]:
numeric_preprocessor1

In [46]:
# Categorical branch for the tips data: fill missing values with the constant
# string "missing", then one-hot encode (unknown categories are ignored).
categorical_preprocessor1 = Pipeline(
    [
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [47]:
categorical_preprocessor1

In [50]:
preprocessor1=Pipeline(
    steps=[("categorical",categorical_preprocessor1),("numerical",numeric_preprocessor1)]

)

In [51]:
preprocessor1


In [52]:
final_pipe2=Pipeline(
    [("preprocessor",preprocessor),("regressor",RandomForestRegressor())]

)

In [53]:
final_pipe2

In [54]:
pipe.fit(X_train,y_train)