# loading data

In [1]:
import pandas as pd

data = pd.read_csv('../data/raw/titanic-dataset.csv')

dropped_columns = ["PassengerId", "Cabin", "Name", "Ticket"]

df = (
    data
    .drop(dropped_columns, axis=1)
    .dropna()
)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


# feature engineering

In [2]:
from sklearn.preprocessing import label_binarize

fe_df = df.copy()

fe_df['Sex'] = label_binarize(fe_df['Sex'], classes=["male", "female"])
fe_df['Embarked'] = label_binarize(fe_df['Embarked'], classes=["Q", "S", "C"])

fe_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,34.5,0,0,7.8292,1
1,1,3,1,47.0,1,0,7.0,0
2,0,2,0,62.0,0,0,9.6875,1
3,0,3,0,27.0,0,0,8.6625,0
4,1,3,1,22.0,1,1,12.2875,0


# preparing train/test data

In [3]:
label_column = "Survived"
train_split = 0.9

train_size = int(len(fe_df) * 0.9)
train, test = fe_df.iloc[:train_size], fe_df.iloc[train_size:]

X_train = train.drop(label_column, axis=1)
y_train = train[label_column]

X_test = test.drop(label_column, axis=1)
y_test = test[label_column]

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

msg = "Size of train is {} and size of test is {}".format(len(X_train), len(X_test))

print(msg)

Size of train is 297 and size of test is 34


# training model

In [4]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

predictions

array([1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0])

# model evaluation

In [5]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, predictions)

acc

1.0