# **Question 1**

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Locations of the Kaggle Titanic CSV files, relative to the notebook's
# working directory.
train_path = './titanic/train.csv'
test_path = './titanic/test.csv'
submission_path = './titanic/submission.csv'

# Fraction of the rows in train.csv used for fitting; the remaining rows
# form the hold-out validation split.
train_split = 0.8

In [None]:
def preprocess(ds):
    """Convert a raw Titanic DataFrame into numeric numpy arrays.

    The input frame is left untouched; all work happens on a copy.

    Feature engineering performed:
      * Age      - missing values replaced by the column mean, then bucketed
                   into 5 ordinal bins (<=16, <=32, <=48, <=64, >64).
      * Fare     - missing values replaced by the column mean, then bucketed
                   into 4 ordinal bins (<=7.91, <=14.454, <=31, >31).
      * Embarked - missing values replaced by 'S', then S/C/Q -> 0/1/2.
      * FamilyMems - SibSp + Parch + 1 (the passenger themself).
      * Sex      - male/female -> 0/1.

    Args:
        ds: DataFrame with the columns PassengerId, Name, Ticket, Cabin,
            SibSp, Parch, Age, Fare, Embarked, Sex and, optionally,
            Survived.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a 2-D array of the remaining
        feature columns and ``y`` is a 1-D array holding ``Survived`` when
        that column is present, otherwise ``PassengerId``.
    """
    # Work on a copy: the caller may pass a slice of a larger frame, and
    # writing into it would either warn or silently fail to stick.
    ds = ds.copy()

    # Age: mean-impute, then bin. Intervals are right-closed, e.g. (16, 32].
    age = ds['Age'].fillna(ds['Age'].mean())
    age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
    ds['Age'] = pd.cut(age, bins=age_edges, labels=False).astype(int)

    # Fare: mean-impute, then bin on the fare itself. (The upper bounds
    # were previously compared against the already-binned Age column,
    # which collapsed every fare above 7.91 into bin 1.)
    fare = ds['Fare'].fillna(ds['Fare'].mean())
    fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
    ds['Fare'] = pd.cut(fare, bins=fare_edges, labels=False).astype(int)

    # Embarked: missing ports default to 'S' before integer encoding.
    ds['Embarked'] = (
        ds['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    )

    # FamilyMems: siblings/spouses + parents/children + the passenger.
    ds['FamilyMems'] = ds['SibSp'] + ds['Parch'] + 1

    # Sex
    ds['Sex'] = ds['Sex'].map({'male': 0, 'female': 1}).astype(int)

    # Pick the target: labels when available, otherwise the passenger ids
    # (needed to build a submission for the unlabeled test set).
    y = ds['PassengerId']
    if 'Survived' in ds.columns:
        y = ds[['Survived']]
        ds = ds.drop(['Survived'], axis=1)

    # Drop identifiers, free-text columns and the columns already folded
    # into FamilyMems.
    x = ds.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'], axis=1)
    print('Number of rows:', len(x))
    return x.values, y.values.reshape((-1, ))

# def validate()

In [None]:
# Load the labeled data and split it positionally: the first `train_split`
# fraction of rows is used for fitting, the rest for validation.
dataset = pd.read_csv(train_path)
num_train = int(train_split*len(dataset))
# Take explicit copies so each split is an independent frame; handing a
# slice of `dataset` to code that writes columns triggers pandas'
# chained-assignment warnings and may not update the slice at all.
train_ds = dataset.iloc[:num_train].copy()
val_ds = dataset.iloc[num_train:].copy()
train_x, train_y = preprocess(train_ds)
val_x, val_y = preprocess(val_ds)

In [None]:
# Maximum depth passed to every DecisionTreeClassifier built below.
max_depth = 5

In [None]:
# Fit a depth-limited tree on the training split, then score it on the
# hold-out split as the fraction of predictions that match the labels.
dtc = DecisionTreeClassifier(max_depth=max_depth).fit(train_x, train_y)
pred_y = dtc.predict(val_x)
val_acc = np.mean(pred_y == val_y)
print('Validation accuracy:', val_acc)

In [None]:
# Re-run preprocessing on the complete training file (no hold-out split)
# so the final model can be fit on every labeled row.
train_x, train_y = preprocess(pd.read_csv(train_path))
# When the frame has no 'Survived' column, preprocess returns the
# PassengerId values as its second result; presumably that is the case
# for test.csv, hence the name `test_idx`.
test_x, test_idx = preprocess(pd.read_csv(test_path))

In [None]:
# Refit on the full training data and predict the test set.
dtc = DecisionTreeClassifier(max_depth=max_depth)
dtc = dtc.fit(train_x, train_y)
pred_y = dtc.predict(test_x)
# Pair each passenger id with its prediction, one row per passenger.
submission = np.stack((test_idx, pred_y), axis=1)
submission = pd.DataFrame(data=submission, columns=['PassengerId', 'Survived'])
# Write to the configured location instead of repeating the path literal,
# so changing `submission_path` is enough to redirect the output.
submission.to_csv(submission_path, index=False)

In [None]:
# IPython shell escape: upload titanic/submission.csv to the "titanic"
# competition through the Kaggle command-line tool, with the given message.
!kaggle competitions submit -c titanic -f titanic/submission.csv -m "Decision tree with feature engineering"

# **Question 2**