In [37]:
import pandas as pd 
import numpy as np 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


In [38]:
df= sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [39]:
df.drop(columns=['class','who','adult_male','deck','embark_town','alive','alone'],inplace=True)

In [40]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [41]:
# Step 1 : Train/Test Split
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['survived']),
                                               df['survived'],
                                               test_size=0.2,
                                               random_state=42)

In [42]:
X_train.head(5)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [43]:
y_train.head(5)

331    0
733    0
382    0
704    0
813    0
Name: survived, dtype: int64

In [44]:
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [45]:
#Applying Imputation
si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')
X_train_age=si_age.fit_transform(X_train[['age']])
X_train_embarked=si_embarked.fit_transform(X_train[['embarked']])

X_test_age=si_age.transform(X_test[['age']])
X_test_embarked=si_embarked.transform(X_test[['embarked']])





In [46]:
X_train_age

array([[45.5       ],
       [23.        ],
       [32.        ],
       [26.        ],
       [ 6.        ],
       [24.        ],
       [45.        ],
       [29.        ],
       [29.49884615],
       [29.49884615],
       [42.        ],
       [36.        ],
       [33.        ],
       [17.        ],
       [29.        ],
       [50.        ],
       [35.        ],
       [38.        ],
       [34.        ],
       [17.        ],
       [11.        ],
       [61.        ],
       [30.        ],
       [ 7.        ],
       [63.        ],
       [20.        ],
       [29.49884615],
       [29.        ],
       [36.        ],
       [29.49884615],
       [50.        ],
       [27.        ],
       [30.        ],
       [33.        ],
       [29.49884615],
       [29.49884615],
       [ 2.        ],
       [25.        ],
       [51.        ],
       [25.        ],
       [29.49884615],
       [29.49884615],
       [24.        ],
       [18.        ],
       [29.49884615],
       [25

In [47]:
X_train_embarked

array([['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
      

In [48]:
ohe_sex=OneHotEncoder(sparse_output=False,handle_unknown="ignore")
ohe_embarked= OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [49]:
X_train_sex=ohe_sex.fit_transform(X_train[['sex']])
X_train_embarked_ohe=ohe_embarked.fit_transform(X_train_embarked)



In [50]:
X_test_sex=ohe_sex.transform(X_test[['sex']])
X_test_embarked_ohe=ohe_embarked.transform(X_test_embarked)

In [51]:
X_train_embarked_ohe

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], shape=(712, 3))

In [52]:
X_train_embarked

array([['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
      

In [53]:
X_train_rem= X_train.drop(columns=['sex','embarked','age'])
X_test_rem= X_test.drop(columns=['sex','embarked','age'])

In [None]:
X_train_rem = X_train_rem.values   # If DataFrame
X_train_embarked_ohe = np.array(X_train_embarked_ohe)  # If from OHE
X_train_age = np.array(X_train_age)
X_train_sex = np.array(X_train_sex)

X_test_rem = X_test_rem.values   # If DataFrame
X_test_embarked_ohe = np.array(X_test_embarked_ohe)  # If from OHE
X_test_age = np.array(X_test_age)
X_test_sex = np.array(X_test_sex)



In [58]:
X_train_transformed = np.concatenate(
    (X_train_rem, X_train_embarked_ohe, X_train_age, X_train_sex),
    axis=1
)

X_test_transformed=np.concatenate(
    (
        X_test_rem, X_test_embarked_ohe, X_test_age, X_test_sex

    ), axis=1
)

In [60]:
clf= DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [64]:
y_pred=clf.predict(X_test_transformed)
y_pred

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1])

In [65]:
from sklearn.metrics import accuracy_score
accuracy_score= accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy_score)

Accuracy: 0.7821229050279329


In [66]:
import pickle

In [69]:
pickle.dump(ohe_embarked,open('models/ohe_embarked.pkl','wb'))


pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb') )
pickle.dump(clf,open('models/clf.pkl','wb'))
