In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data and preview five randomly chosen rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
770,771,0,3,"Lievens, Mr. Rene Aime",male,24.0,0,0,345781,9.5,,S
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
275,276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S
684,685,0,2,"Brown, Mr. Thomas William Solomon",male,60.0,1,1,29750,39.0,,S


In [3]:
# Drop the columns that are not used as features in the rest of the notebook.
# The result is re-assigned instead of using inplace=True, and errors='ignore'
# lets the cell be re-run after the columns are already gone without raising
# a KeyError (the in-place version failed on a second execution).
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'], errors='ignore')
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
275,1,1,female,63.0,1,0,77.9583,S
745,0,1,male,70.0,1,1,71.0,S
877,0,3,male,19.0,0,0,7.8958,S
608,1,2,female,22.0,1,2,41.5792,C
564,0,3,female,,0,0,8.05,S


In [4]:
# Frequency of each Embarked category
df.loc[:, 'Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,644
C,168
Q,77


In [5]:
# Number of missing values per column
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


# Train-Test-Split

In [6]:
# Separate the feature columns (X) from the target column (y)
X, y = df.drop(columns='Survived'), df['Survived']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Impute missing values in the train and test data

In [8]:
from sklearn.impute import SimpleImputer

# Age uses SimpleImputer's default strategy (the column mean);
# Embarked is filled with its most frequent category.
si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')

age_col, embarked_col = ['Age'], ['Embarked']

# Age: fit on the training split only, then apply to both splits
X_train_age = si_age.fit_transform(X_train[age_col])
X_test_age = si_age.transform(X_test[age_col])

# Embarked: fit on the training split only, then apply to both splits
X_train_embarked = si_embarked.fit_transform(X_train[embarked_col])
X_test_embarked = si_embarked.transform(X_test[embarked_col])

In [9]:
# Sanity check: (rows, columns) of the imputed Age array
np.shape(X_train_age)

(712, 1)

In [10]:
# First row of the imputed Embarked array
X_train_embarked[0, :]

array(['S'], dtype=object)

# One Hot Encoding Sex and Embarked
1. `Sex` is Nominal Data
2. `Embarked` is Nominal Data

In [11]:
# Preview five random rows before encoding the categorical columns
df.sample(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
79,1,3,female,30.0,0,0,12.475,S
773,0,3,male,,0,0,7.225,C
774,1,2,female,54.0,1,3,23.0,S
114,0,3,female,17.0,0,0,14.4583,C
150,0,2,male,51.0,0,0,12.525,S


In [12]:
from sklearn.preprocessing import OneHotEncoder

# One encoder per nominal column. handle_unknown='ignore' is set so that a
# category not seen during fit does not make transform() raise.
ohe_sex = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

# Embarked has missing values, so it is imputed before being encoded. The
# imputed column is re-derived from X_train / X_test with the already-fitted
# si_embarked instead of being read back from X_train_embarked /
# X_test_embarked: those names are overwritten below, so reading them made a
# second execution of this cell fit the encoder on already-encoded data.

# Train data
X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked = ohe_embarked.fit_transform(
    si_embarked.transform(X_train[['Embarked']])
)

# Test data
X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_embarked = ohe_embarked.transform(
    si_embarked.transform(X_test[['Embarked']])
)

In [13]:
# Sanity check: (rows, columns) of the one-hot encoded Sex array
np.shape(X_train_sex)

(712, 2)

In [14]:
# Sanity check: (rows, columns) of the one-hot encoded Embarked array
np.shape(X_train_embarked)

(712, 3)

# Remove the imputed and encoded columns from the feature DataFrames

In [15]:
# These columns are replaced by the imputed / one-hot encoded arrays built above
replaced_cols = ['Age', 'Sex', 'Embarked']
X_train_rem, X_test_rem = (
    split.drop(columns=replaced_cols) for split in (X_train, X_test)
)

X_train_rem.sample(n=2)

Unnamed: 0,Pclass,SibSp,Parch,Fare
307,1,1,0,108.9
804,3,0,0,6.975


In [16]:
# Column order: remaining raw features, imputed Age, one-hot Sex, one-hot Embarked
train_blocks = (X_train_rem, X_train_age, X_train_sex, X_train_embarked)
test_blocks = (X_test_rem, X_test_age, X_test_sex, X_test_embarked)

X_train_transformed = np.concatenate(train_blocks, axis=1)
X_test_transformed = np.concatenate(test_blocks, axis=1)

In [17]:
# Sanity check: (rows, columns) of the assembled training matrix
np.shape(X_train_transformed)

(712, 10)

# Train Model

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Fix the estimator's seed (same value as the train/test split) so that the
# fitted tree, its predictions and the reported accuracy are repeatable across
# runs; without it the result can change from one execution to the next.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_transformed,y_train)

## Prediction

In [19]:
# Predict the target for every row of the held-out test matrix
y_pred = clf.predict(X=X_test_transformed)
y_pred

array([0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1])

## Check Model Accuracy

In [20]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7821229050279329

# Model Export using pickle

In [21]:
import pickle

In [22]:
import os

# Fitted objects to persist, keyed by file stem. The two imputers are exported
# alongside the encoders and the classifier because the model was trained on
# imputed Age / Embarked values; without them the saved artifacts cannot
# reproduce the preprocessing applied in this notebook.
artifacts = {
    'si_age': si_age,
    'si_embarked': si_embarked,
    'ohe_sex': ohe_sex,
    'ohe_embarked': ohe_embarked,
    'clf': clf,
}

# Create the output directory if needed so the export works on a fresh checkout.
os.makedirs('models', exist_ok=True)

# A `with` block closes each file deterministically, so the pickles are fully
# flushed to disk (the bare open(...) calls never closed their handles).
for artifact_name, artifact in artifacts.items():
    with open(f'models/{artifact_name}.pkl', 'wb') as fh:
        pickle.dump(artifact, fh)