In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn import set_config
from sklearn.metrics import accuracy_score
set_config(display='diagram')

In [None]:
df = pd.read_csv('Titanic-Dataset.csv')
df.head()

In [None]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [6]:
df['Embarked'] = df['Embarked'].replace('nan',np.nan)
df.dropna(subset=['Embarked'], inplace=True)
df['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [7]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=.2,random_state=42)

In [8]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embared',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [9]:
# one-hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [10]:
# scaler
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,8))
])

In [11]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=5)

In [12]:
# train the model
trf5 = DecisionTreeClassifier()

### Creating pipeline

In [14]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5),
])

# alternative syntax
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [17]:
# train
pipe.fit(X_train,y_train)

0,1,2
,steps,"[('columntransformer-1', ...), ('columntransformer-2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embared', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ohe_sex_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function chi...001F8F86B2660>
,k,5

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


##### Explore the pipeline

In [29]:
print(pipe.named_steps['columntransformer-1'].transformers_[0][1].statistics_)
print(pipe.named_steps['columntransformer-1'].transformers_[1][1].statistics_)

[29.90574468]
['S']


In [30]:
# prediction
y_pred = pipe.predict(X_test)

In [33]:
# accuracy
accuracy_score(y_test,y_pred)

0.6235955056179775