In [151]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [152]:
df=pd.read_csv("./data/titanic.csv")[['Age','Fare','SibSp','Parch','Survived']]

In [153]:
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.25,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.925,0,0,1
3,35.0,53.1,1,0,1
4,35.0,8.05,0,0,0


In [154]:
df.isnull().sum()

Age         177
Fare          0
SibSp         0
Parch         0
Survived      0
dtype: int64

In [155]:
df.shape

(891, 5)

In [156]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df.isnull().sum()

Age         0
Fare        0
SibSp       0
Parch       0
Survived    0
dtype: int64

In [157]:
df['Family']=df['SibSp'] + df['Parch']
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,Family
0,22.0,7.25,1,0,0,1
1,38.0,71.2833,1,0,1,1
2,26.0,7.925,0,0,1,0
3,35.0,53.1,1,0,1,1
4,35.0,8.05,0,0,0,0


In [158]:
df.drop(columns=['SibSp','Parch'],inplace=True)
df.head()

Unnamed: 0,Age,Fare,Survived,Family
0,22.0,7.25,0,1
1,38.0,71.2833,1,1
2,26.0,7.925,1,0
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [159]:
X=df.drop('Survived',axis=1)
y=df['Survived']

In [160]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(623, 3) (623,) (268, 3) (268,)


In [161]:
X_train.head()

Unnamed: 0,Age,Fare,Family
445,4.0,81.8583,2
650,29.699118,7.8958,0
172,1.0,11.1333,2
450,36.0,27.75,3
314,43.0,26.25,2


## ****Without Binarization****

In [162]:
clf=DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print("accuracy_score = ",accuracy_score(y_test,y_pred))

accuracy_score =  0.6492537313432836


## ****With Binarization****

In [163]:
# Applying Binarization
from sklearn.preprocessing import Binarizer

In [164]:
trf = ColumnTransformer([
    ('bin',Binarizer(copy=False),['Family'])
],remainder='passthrough')

In [165]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [166]:
pd.DataFrame(X_train_trf,columns=['Family','Age','Fare']).head()

Unnamed: 0,Family,Age,Fare
0,1.0,4.0,81.8583
1,0.0,29.699118,7.8958
2,1.0,1.0,11.1333
3,1.0,36.0,27.75
4,1.0,43.0,26.25


In [167]:
clf=DecisionTreeClassifier()
clf.fit(X_train_trf,y_train)
y_pred2=clf.predict(X_test_trf)
print("accuracy_score = ",accuracy_score(y_test,y_pred2))

accuracy_score =  0.6567164179104478
