# **Question 1**

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Input/output CSV locations for the Titanic task; all files live in one folder.
_titanic_dir = './titanic'
train_path = _titanic_dir + '/train.csv'
test_path = _titanic_dir + '/test.csv'
submission_path = _titanic_dir + '/submission.csv'

# Fraction of the labelled rows passed to train_test_split as the held-out part.
test_split = 0.2

In [None]:
def preprocess(ds):
    """Convert raw Titanic passenger rows into an integer feature matrix.

    The input frame is not modified; all work happens on a copy.

    Steps:
      * ``Age``: missing values are replaced by the column mean, then the
        value is bucketed into 5 ordinal bands
        (<=16, (16, 32], (32, 48], (48, 64], >64) coded 0..4.
      * ``Fare``: missing values are replaced by the column mean, then the
        value is bucketed into 4 ordinal bands
        (<=7.91, (7.91, 14.454], (14.454, 31], >31) coded 0..3.
      * ``Embarked``: missing values default to ``'S'``; S/C/Q -> 0/1/2.
      * ``FamilyMems``: ``SibSp + Parch + 1`` (the passenger included).
      * ``Sex``: male/female -> 0/1.
      * ``PassengerId``, ``Name``, ``SibSp``, ``Parch``, ``Ticket`` and
        ``Cabin`` are dropped.

    Args:
        ds: pandas DataFrame containing at least the columns named above.
            Any other columns are passed through unchanged.

    Returns:
        ``ds.values`` of the transformed frame: the surviving columns in
        their original order with ``FamilyMems`` appended last.

    Raises:
        ValueError: from ``astype(int)`` if a column still holds NaN after
            imputation/mapping (e.g. an unexpected ``Embarked``/``Sex``
            label, or an entirely empty ``Age``/``Fare`` column).
    """
    # Work on a private copy so the caller's frame is left untouched and
    # no chained in-place assignment on a column view is needed.
    ds = ds.copy()

    # Age: mean-impute, then cut into right-inclusive bands coded 0..4.
    age = ds['Age'].fillna(ds['Age'].mean())
    age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
    ds['Age'] = pd.cut(age, bins=age_edges, labels=False).astype(int)

    # Fare: mean-impute, then cut into right-inclusive bands coded 0..3.
    # The thresholds are applied to Fare itself, not to the already-binned Age.
    fare = ds['Fare'].fillna(ds['Fare'].mean())
    fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
    ds['Fare'] = pd.cut(fare, bins=fare_edges, labels=False).astype(int)

    # Embarked: missing port defaults to 'S' before label encoding.
    ds['Embarked'] = (
        ds['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    )

    # FamilyMems: siblings/spouses + parents/children + the passenger.
    ds['FamilyMems'] = ds['SibSp'] + ds['Parch'] + 1

    # Sex
    ds['Sex'] = ds['Sex'].map({'male': 0, 'female': 1}).astype(int)

    ds = ds.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
    return ds.values

# def validate()

In [None]:
# Load the labelled Titanic data and hold out a validation split.
dataset = pd.read_csv(train_path)
train_ds, val_ds = train_test_split(dataset, test_size=test_split, random_state=0)

# Features come from preprocess(); the 'Survived' column is the label.
train_x = preprocess(train_ds.drop(columns=['Survived']))
train_y = train_ds['Survived'].values
val_x = preprocess(val_ds.drop(columns=['Survived']))
val_y = val_ds['Survived'].values

# Report shapes; the held-out arrays are the validation split, so label them as such.
print('train x shape:', train_x.shape)
print('train y shape:', train_y.shape)
print('val x shape:', val_x.shape)
print('val y shape:', val_y.shape)

In [None]:
# Depth limit passed to DecisionTreeClassifier for both the validation run
# and the final model used for the submission.
max_depth = 5

In [None]:
# Fit a depth-limited decision tree on the training split and measure
# the fraction of validation rows it classifies correctly.
dtc = DecisionTreeClassifier(max_depth=max_depth).fit(train_x, train_y)
pred_y = dtc.predict(val_x)
correct = pred_y == val_y
val_acc = correct.mean()
print('Validation accuracy:', val_acc)

In [None]:
# Reload both CSVs so the final model is trained on every labelled row.
dataset, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Labels and features for the full training set.
train_y = dataset['Survived'].to_numpy()
train_x = preprocess(dataset.drop(columns=['Survived']))

# Keep the passenger ids for the submission file; preprocess() drops that
# column from the feature matrix it returns.
test_idx = test_data['PassengerId'].to_numpy()
test_x = preprocess(test_data)

In [None]:
# Train the final tree on the full training set and predict the test set.
dtc = DecisionTreeClassifier(max_depth=max_depth).fit(train_x, train_y)
pred_y = dtc.predict(test_x)

# One row per test passenger: its id and the predicted 'Survived' label.
submission = pd.DataFrame({'PassengerId': test_idx, 'Survived': pred_y})

# Write to the configured location instead of repeating the path literal,
# so changing submission_path is enough to redirect the output.
submission.to_csv(submission_path, index=False)

In [None]:
# Shell escape: upload titanic/submission.csv to the "titanic" competition
# with the Kaggle command-line tool, attaching the given message.
!kaggle competitions submit -c titanic -f titanic/submission.csv -m "Decision tree with feature engineering"

# **Question 2**

In [None]:
# Location of the heart-disease CSV and the fraction of rows to hold out.
_heart_dir = './heart_disease_uci'
ds_path = _heart_dir + '/heart.csv'
test_split = 0.2

In [None]:
def preprocess(df, columns=('age', 'trestbps', 'chol')):
    """Standardise selected numeric columns and return the frame as an array.

    Each listed column is replaced by ``(x - mean) / std`` using that
    column's own mean and sample standard deviation (pandas default,
    ``ddof=1``). All other columns are passed through unchanged. The
    input frame is not modified.

    Args:
        df: pandas DataFrame containing every column named in ``columns``.
        columns: names of the columns to standardise. Defaults to the three
            columns the original implementation scaled.

    Returns:
        ``.values`` of the transformed copy, columns in their original order.

    Notes:
        A column whose standard deviation is zero or undefined (constant
        column, or fewer than two rows) is set to 0.0 instead of NaN/inf,
        since it carries no information and NaN would make downstream
        estimators fail.
    """
    # Copy so the caller's frame is not rewritten as a side effect.
    scaled = df.copy()

    for col in columns:
        spread = scaled[col].std()
        if spread > 0:
            scaled[col] = (scaled[col] - scaled[col].mean()) / spread
        else:
            # Covers both spread == 0 and spread being NaN.
            scaled[col] = 0.0

    return scaled.values

In [None]:
# Read the heart-disease table and split it into a training part and a
# held-out part with a fixed seed.
dataset = pd.read_csv(ds_path)
train_ds, val_ds = train_test_split(
    dataset,
    random_state=0,
    test_size=test_split,
)

In [None]:
# Build feature matrices and label vectors; 'target' is the label column.
train_x = preprocess(train_ds.drop(columns=['target']))
train_y = train_ds['target'].values

# The held-out frame produced by train_test_split is named val_ds; no
# test_ds exists, so referencing it would raise NameError. The test_x /
# test_y names are kept because the evaluation cells below use them.
test_x = preprocess(val_ds.drop(columns=['target']))
test_y = val_ds['target'].values

print('Train X shape:', train_x.shape)
print('Train Y shape:', train_y.shape)
print('Test X shape:', test_x.shape)
print('Test Y shape:', test_y.shape)

In [None]:
# Number of neighbours consulted by the k-NN classifier.
k = 4

# Train both classifiers on the same training matrix.
nn = KNeighborsClassifier(n_neighbors=k).fit(train_x, train_y)
nb = GaussianNB()
nb.fit(train_x, train_y)

In [None]:
# Accuracy of the k-NN model: share of held-out rows predicted correctly.
y_pred_nn = nn.predict(test_x)
nn_hits = y_pred_nn == test_y
acc_nn = nn_hits.mean()
print('K Nearest Neighbor Accuracy:', acc_nn)

In [None]:
# Accuracy of the Gaussian naive Bayes model on the held-out rows.
y_pred_nb = nb.predict(test_x)
acc_nb = (y_pred_nb == test_y).mean()
# Label spelling corrected ("Bayse" -> "Bayes").
print('Naive Bayes Accuracy:', acc_nb)