# <center>PipeLines
- Pipelines chains together multiple steps so that the output of each step is used as input to the next step
<pre>
            +---------+     +---------+    +---------+     +---------+ 
            |         |     |         |    |         |     |         |
  input   ------->    ------->       ------->        ------->       ------->  output
            |         |     |         |    |         |     |         |
            +---------+     +---------+    +---------+     +---------+
    </pre>
  &rarr; Piplines makes it easy to apply the same preprocessing to train and test!

    

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

In [5]:
# step 1 =  train/test/split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Survived'), df['Survived'], test_size=.2, random_state=42)

In [6]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


In [7]:
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64

In [8]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
# applying imputer

si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')

si_age.fit(X_train[['Age']])
si_embarked.fit(X_test[['Embarked']])

X_train_age = si_age.transform(X_train[['Age']])
X_test_age = si_age.transform(X_test[['Age']])

X_train_embarked = si_embarked.transform(X_train[['Embarked']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

In [10]:
X_train_age.shape, X_train_embarked.shape

((712, 1), (712, 1))

In [11]:
# onehot encoding on Sex and Embarked
ohe_sex = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

ohe_sex.fit(X_train[['Sex']])
ohe_embarked.fit(X_test_embarked)

X_train_sex = ohe_sex.transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

X_train_embarked = ohe_embarked.transform(X_train_embarked)
X_test_embarked = ohe_embarked.transform(X_test_embarked)

In [12]:
X_train_embarked.shape

(712, 3)

now we have numpy array of columns = age, embarked, sex

we create are numpy array of all other columns and them concatenate them

In [13]:
X_train_remaining = X_train.drop(['Sex', 'Age', 'Embarked'], axis=1)
X_test_remaining = X_test.drop(['Sex', 'Age', 'Embarked'], axis=1)

X_train_remaining.shape

(712, 4)

In [14]:
X_train_transformed = np.concatenate((X_train_age, X_train_sex, X_train_remaining, X_train_embarked), axis=1)
X_test_transformed = np.concatenate((X_test_age, X_test_sex, X_test_remaining, X_test_embarked), axis=1)

X_train_transformed.shape


(712, 10)

**Training Model**

In [15]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed, y_train)

In [16]:
y_pred = clf.predict(X_test_transformed)
y_pred

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7932960893854749

Exporting Model

In [18]:
import pickle

In [19]:
pickle.dump(ohe_sex,open('models/26-a-ohe_sex.pkl', 'wb')) # useful when user upload male or female, this can apply onehot encoding on them
pickle.dump(ohe_embarked, open('models/26-a-ohe_embarked.pkl', 'wb')) # same as sex
pickle.dump(clf, open('models/26-a-clf.pkl', 'wb')) # this is our model

# Using model, (let us be user)

In [20]:
import pickle
import numpy as np

In [21]:
ohe_sex = pickle.load(open('models/26-a-ohe_sex.pkl', 'rb'))
ohe_embarked = pickle.load(open('models/26-a-ohe_embarked.pkl', 'rb'))
clf = pickle.load(open('models/clf.pkl', 'rb'))

In [22]:
# assume user input
# Pclass/gender/age/Sibsp/Parch/Fare/Embarked
test_input = np.array([2, 'male', 31.0, 0,0,10.5, 'S'], dtype=object).reshape(1,7)

In [23]:
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [24]:
test_input_sex = ohe_sex.transform(test_input[:,1].reshape(1,1))



In [25]:
test_input_sex

array([[0., 1.]])

In [26]:
test_input_embarked = ohe_embarked.transform(test_input[:,-1].reshape(1,1))

In [27]:
test_input_embarked

array([[0., 0., 1.]])

In [28]:
test_input_age = test_input[:,2].reshape(1,1)

In [29]:
test_input_age

array([[31.0]], dtype=object)

In [30]:
test_input_transformed = np.concatenate((test_input[:,[0,3,4,5]], test_input_age, test_input_embarked, test_input_sex), axis=1)

In [31]:
test_input_transformed.shape

(1, 10)

In [32]:
clf.predict(test_input_transformed)

array([1], dtype=int64)

In [33]:
# this is very lenthy and many chances of geting mistakes, so we use pipline to make it easy

# Using Pipeline to improve above code

In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [35]:
df = pd.read_csv('train.csv')

In [36]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


<h3> Let'sPlan

In [37]:
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True) # droping unusefull columns

In [38]:
# step 1 -> train/test/split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),
                                                   df['Survived'], test_size=0.2)

In [39]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
35,1,male,42.0,1,0,52.0,S
273,1,male,37.0,0,1,29.7,C
377,1,male,27.0,0,2,211.5,C
711,1,male,,0,0,26.55,S
632,1,male,32.0,0,0,30.5,C


In [40]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),  # while creating pipeline use index of the column, because numpy has no column name
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
], remainder='passthrough')

In [41]:
# One Hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1,6]),
], remainder='passthrough')

In [42]:
# scaling
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,10))
])

In [43]:
# Featuring Scaling
trf4 = SelectKBest(score_func=chi2, k=8)

In [44]:
# train the model
trf5 = DecisionTreeClassifier()

<h3>Create Pipeline</h3>

In [79]:
# recommanded method
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5),
])

# Pipeline Vs make_pipeline
Pipeline requires, naming of steps, make_pipeline does not.<br>
(Same applies to ColumnTransformer vs make_column_transformer)

In [65]:
# Alternate Syntex (Not Recommanded)
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [66]:
# train
pipe.fit(X_train, y_train)
# when we train a model then we use fit() function like is this case
# but if we are not training model and just preprocessing data then we have to call the fit_transformer() function

<h3>Exploring the Pipeline</h3> 

**This may help in Debugging**

In [67]:
# code here
pipe.named_steps # these are the steps of pipeline(making steps) (it is in dictionary)

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x00000213AC052B60>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [68]:
pipe.named_steps['columntransformer-1']  # we can access them by key name

In [69]:
pipe.named_steps['columntransformer-1'].transformers_  # list of ColumnTransformer

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [70]:
pipe.named_steps['columntransformer-1'].transformers_[0]  # Accessing first list (returns a tuple)

('impute_age', SimpleImputer(), [2])

In [71]:
pipe.named_steps['columntransformer-1'].transformers_[0][1]  # Accessing first element of tuple

In [72]:
# Now we can use its attributes to find some values, which can be calculated by our pipelines
# like:-
pipe.named_steps['columntransformer-1'].transformers_[0][1].statistics_

array([29.46640489])

In [73]:
# finding most frequent value from 'impute_embarked' of SimpleImputer of columntransformer-1
pipe.named_steps['columntransformer-1'].transformers_[1][1].statistics_


array(['S'], dtype=object)

In [74]:
# Predict
y_pred = pipe.predict(X_test)

In [75]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

0.6480446927374302

# Cross Validation Using Pipeline

In [76]:
# cross validation using cross_val_score
# Cross-validation is a technique used to evaluate the performance of a machine learning model. It helps ensure that the model's performance is not just good on one particular subset of the data but is generalizable to unseen data.
from sklearn.model_selection import cross_val_score
cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=5, scoring='accuracy').mean()

0.6320496405003447

1. **`estimator`**:
   - The model or pipeline you want to evaluate (e.g., `pipe`).

2. **`X`**:
   - The input data (features) used for training.

3. **`y`**:
   - The target data (labels) used for training.

4. **`cv`**:
   - The number of parts to split the data into (e.g., `cv=5` means 5 parts).

5. **`scoring`**:
   - The metric used to evaluate the model (e.g., `'accuracy'` to measure how many correct predictions the model makes).

6. **`n_jobs`**:
   - The number of CPU cores to use for computation. `-1` means using all available cores.

7. **`verbose`**:
   - Controls the amount of information printed during execution. Higher numbers mean more details.

8. **`fit_params`**:
   - Additional parameters to pass to the model during fitting (training).

9. **`pre_dispatch`**:
   - Controls the number of jobs to dispatch during parallel execution (if `n_jobs` is greater than 1).

10. **`error_score`**:
    - What to do if an error occurs during fitting. `np.nan` means to ignore the fold with the error and use `NaN` as the score.

# GridSearch using Pipeline

GridSearchCV is a method in scikit-learn used to find the best parameters for a given model by trying all possible combinations of a provided parameter grid. When combined with a pipeline, it allows for automated tuning of preprocessing steps and model parameters in a single workflow.

In [77]:
# gridsearchcv
params = {
    'trf5__max_depth': [1,2,3,4,5,None] # trf5 is the name of our model
}

In [83]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

1. **`pipe`**: This is your machine learning pipeline, which includes all the preprocessing steps and the model you want to tune.

2. **`params`**: This is a dictionary where the keys are the names of the parameters you want to tune, and the values are lists of the values you want to try for each parameter. For example, `{'decisiontreeclassifier__max_depth': [1, 2, 3, 4, 5, None]}`.

3. **`cv`**: This stands for cross-validation. It specifies the number of folds to use for splitting the data into training and validation sets. Here, `cv=5` means the data will be split into 5 folds, and the model will be trained and validated 5 times, each time with a different fold as the validation set.

4. **`scoring`**: This specifies the metric to evaluate the model's performance. Here, `scoring='accuracy'` means the model will be evaluated based on its accuracy, which is the proportion of correctly classified instances.


In [81]:
grid.best_score_

0.6334482418989461

In [82]:
grid.best_params_

{'trf5__max_depth': 1}

# Exporting the Pipeline 

In [85]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))

# Using Model which us Exported by adding the Pipeline 

In [87]:
import pickle
import numpy as np

In [88]:
pipe = pickle.load(open('pipe.pkl', 'rb'))

In [109]:
# Assume user input
test_input2 = np.array([2,'male',31.0,0,0,10.5,'S'], dtype=object).reshape(1,7)

In [110]:
pipe.predict(test_input2)



array([0], dtype=int64)