In [104]:
import numpy as np
import pandas as pd

In [105]:

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [106]:

df = pd.read_csv('train.csv')

In [107]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [108]:
# Step 1 -> train/test/split
xtrain,xtest,ytrain,ytest = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,)
xtrain

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
815,1,male,,0,0,0.0000,S
399,2,female,28.00,0,0,12.6500,S
644,3,female,0.75,2,1,19.2583,C
229,3,female,,3,1,25.4667,S
507,1,male,,0,0,26.5500,S
...,...,...,...,...,...,...,...
506,2,female,33.00,0,2,26.0000,S
831,2,male,0.83,1,1,18.7500,S
38,3,female,18.00,2,0,18.0000,S
734,2,male,23.00,0,0,13.0000,S


In [109]:
#imputation tranformer
t1=ColumnTransformer([
('i_age',SimpleImputer(),[2]),
 ('i_embarked',SimpleImputer(strategy='most_frequent'),[6])
    
],remainder='passthrough')
temp=pd.DataFrame(t1.fit_transform(xtrain))
temp
#orders of col has benn changed

Unnamed: 0,0,1,2,3,4,5,6
0,29.296432,S,1,male,0,0,0.0
1,28.0,S,2,female,0,0,12.65
2,0.75,C,3,female,2,1,19.2583
3,29.296432,S,3,female,3,1,25.4667
4,29.296432,S,1,male,0,0,26.55
...,...,...,...,...,...,...,...
707,33.0,S,2,female,0,2,26.0
708,0.83,S,2,male,1,1,18.75
709,18.0,S,3,female,2,0,18.0
710,23.0,S,2,male,0,0,13.0


In [110]:
#one hot encoding
t2=ColumnTransformer([
    ('ohe',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,3])
],remainder='passthrough')
temp2=pd.DataFrame(t2.fit_transform(temp))
temp2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,1.0,0.0,1.0,29.296432,1,0,0,0.0
1,0.0,0.0,1.0,1.0,0.0,28.0,2,0,0,12.65
2,1.0,0.0,0.0,1.0,0.0,0.75,3,2,1,19.2583
3,0.0,0.0,1.0,1.0,0.0,29.296432,3,3,1,25.4667
4,0.0,0.0,1.0,0.0,1.0,29.296432,1,0,0,26.55
...,...,...,...,...,...,...,...,...,...,...
707,0.0,0.0,1.0,1.0,0.0,33.0,2,0,2,26.0
708,0.0,0.0,1.0,0.0,1.0,0.83,2,1,1,18.75
709,0.0,0.0,1.0,1.0,0.0,18.0,3,2,0,18.0
710,0.0,0.0,1.0,0.0,1.0,23.0,2,0,0,13.0


In [111]:
#scaling
#3 new col was added during encoding
t3=ColumnTransformer([
    ('minmax',MinMaxScaler(),slice(0,10))  
],remainder='passthrough')
pd.DataFrame(t3.fit_transform(temp2))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,1.0,0.0,1.0,0.362860,0.0,0.000,0.000000,0.000000
1,0.0,0.0,1.0,1.0,0.0,0.346569,0.5,0.000,0.000000,0.024691
2,1.0,0.0,0.0,1.0,0.0,0.004147,1.0,0.250,0.166667,0.037590
3,0.0,0.0,1.0,1.0,0.0,0.362860,1.0,0.375,0.166667,0.049708
4,0.0,0.0,1.0,0.0,1.0,0.362860,0.0,0.000,0.000000,0.051822
...,...,...,...,...,...,...,...,...,...,...
707,0.0,0.0,1.0,1.0,0.0,0.409399,0.5,0.000,0.333333,0.050749
708,0.0,0.0,1.0,0.0,1.0,0.005152,0.5,0.125,0.166667,0.036598
709,0.0,0.0,1.0,1.0,0.0,0.220910,1.0,0.250,0.000000,0.035134
710,0.0,0.0,1.0,0.0,1.0,0.283740,0.5,0.000,0.000000,0.025374


In [112]:
#feature Selection
t4=SelectKBest(score_func=chi2,k=7)

In [113]:
#train model
t5=DecisionTreeClassifier()



**CREATE PIPELINE**

In [114]:
pipe=Pipeline([
    ('t1',t1),
    ('t2',t2),
    ('t3',t3),
    ('t4',t4),
    ('t5',t5)
    
])

pipe

## Alternative syntax ##

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [131]:
#tarin
pipe.fit(xtrain,ytrain)
#model has trained

## Explore The Pipeline ##

In [132]:
pipe.named_steps
#key value pair

{'t1': ColumnTransformer(remainder='passthrough',
                   transformers=[('i_age', SimpleImputer(), [2]),
                                 ('i_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 't2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 3])]),
 't3': ColumnTransformer(remainder='passthrough',
                   transformers=[('minmax', MinMaxScaler(), slice(0, 10, None))]),
 't4': SelectKBest(k=7, score_func=<function chi2 at 0x000002C89F8F63E0>),
 't5': DecisionTreeClassifier()}

In [120]:
pipe.named_steps['t1'].transformers_

[('i_age', SimpleImputer(), [2]),
 ('i_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [130]:
pipe.named_steps['t1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

## Predication ##

In [163]:
ypred=pipe.predict(xtest)
ypred

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [137]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest,ypred)

0.8268156424581006

**exporting the pipline**

In [139]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))