In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('train.csv')
df.shape

(891, 12)

In [4]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'], axis=1, inplace=True)
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
436,0,3,female,21.0,2,2,34.375,S
355,0,3,male,28.0,0,0,9.5,S
554,1,3,female,22.0,0,0,7.775,S
158,0,3,male,,0,0,8.6625,S
151,1,1,female,22.0,1,0,66.6,S


# Pipeline Steps

1. `Missing` value Impute
2. OHE on `Sex` and `Embarked`
3. `Scaling` Features
4. `Features` selection
5. Model `Training`

# Train Test split

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),
                                                    df['Survived'],
                                                    test_size=0.2,
                                                    random_state=42)
X_train.shape, X_test.shape

((712, 7), (179, 7))

In [6]:
X_train.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
571,1,female,53.0,2,0,51.4792,S
170,1,male,61.0,0,0,33.5,S
615,2,female,24.0,1,2,65.0,S
368,3,female,,0,0,7.75,Q
123,2,female,32.5,0,0,13.0,S


# Transformer 1: missing value imputation

In [7]:
trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')

# Transformer 2: One Hot Encoding

In [8]:
trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1,6])
], remainder='passthrough')

# Transformer 3: Scaling

In [9]:
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,10))
])

# Transformer 4: Features Selection

In [10]:
trf4 = SelectKBest(score_func=chi2, k=9)

# Transformer 4: Model

In [11]:
trf5 = DecisionTreeClassifier()

# Create Pipeline

In [12]:
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5)
])

# Train Pipeline

In [13]:
pipe.fit(X_train, y_train)

# Precict Model

In [14]:
# Predict
y_pred = pipe.predict(X_test)

In [15]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

# Accuracy Check

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

# Export Model

In [17]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))