# ** Question 8 **

In [98]:
import time
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [99]:
data_path = './titanic/train.csv'

In [100]:
data = pd.read_csv(data_path)
train, test = train_test_split(data, test_size=0.2, random_state=100)

train_y = train[['Survived']]
train_x = train.drop(['Survived'], axis=1)
test_y = test[['Survived']]
test_x = test.drop(['Survived'], axis=1)

print('train x shape:', train_x.shape)
print('train y shape:', train_y.shape)
print('test x shape:', test_x.shape)
print('test y shape:', test_y.shape)

train x shape: (712, 11)
train y shape: (712, 1)
test x shape: (179, 11)
test y shape: (179, 1)


In [101]:
def preprocess(ds):

    # Age
    ds['Age'].fillna(ds['Age'].mean(), inplace=True)
    ds.loc[ds['Age'] <= 16, 'Age'] = 0
    ds.loc[(ds['Age'] > 16) & (ds['Age'] <= 32), 'Age'] = 1
    ds.loc[(ds['Age'] > 32) & (ds['Age'] <= 48), 'Age'] = 2
    ds.loc[(ds['Age'] > 48) & (ds['Age'] <= 64), 'Age'] = 3
    ds.loc[ds['Age'] > 64, 'Age'] = 4
    ds['Age'] = ds['Age'].astype(int)

    # Fare
    ds['Fare'].fillna(ds['Fare'].mean(), inplace=True)
    ds.loc[ds['Fare'] <= 7.91, 'Fare'] = 0
    ds.loc[(ds['Fare'] > 7.91) & (ds['Age'] <= 14.454), 'Fare'] = 1
    ds.loc[(ds['Fare'] > 14.454) & (ds['Age'] <= 31), 'Fare'] = 2
    ds.loc[ds['Fare'] > 31, 'Fare'] = 3
    ds['Fare'] = ds['Fare'].astype(int)

    # Embark
    ds['Embarked'].fillna('S', inplace=True)
    ds['Embarked'] = ds['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    
    # FamilyMems & IsAlone
    ds['FamilyMems'] = ds['SibSp'] + ds['Parch'] + 1

    # Sex
    ds['Sex'] = ds['Sex'].map({'male': 0, 'female': 1}).astype(int)

    ds = ds.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
    return ds.values

In [102]:
train_x = preprocess(train_x)
test_x = preprocess(test_x)

In [111]:
dep = 30
criterion = 'gini'
num_trees = 100
rnd_state = 0

In [112]:
rfc = RandomForestClassifier(max_depth=dep, criterion=criterion,  n_estimators=num_trees)
dtc = DecisionTreeClassifier(max_depth=dep, criterion=criterion)

start = time.time()
rfc.fit(train_x, train_y)
rfc_time = time.time() - start

start =  time.time()
dtc.fit(train_x, train_y)
dtc_time = time.time() - start

rfc_pred_y = rfc.predict(test_x)
dtc_pred_y = dtc.predict(test_x)

rfc_test_acc = (rfc_pred_y == test_y.squeeze()).mean()
dtc_test_acc = (dtc_pred_y == test_y.squeeze()).mean()

print('Random Forest')
print('\tTest accuracy:', rfc_test_acc)
print('\tTrain duration:', rfc_time)

print('\nDecision Tree')
print('\tTest accuracy:', dtc_test_acc)
print('\tTrain duration:', dtc_time)


Random Forest
	Test accuracy: 0.8379888268156425
	Train duration: 0.10272598266601562

Decision Tree
	Test accuracy: 0.8044692737430168
	Train duration: 0.0019948482513427734


# ** Question 9 **

In [117]:
from sklearn.svm import SVC

In [118]:
data = pd.read_csv(data_path)
train, test = train_test_split(data, test_size=0.2, random_state=100)

train_y = train[['Survived']]
train_x = train.drop(['Survived'], axis=1)
test_y = test[['Survived']]
test_x = test.drop(['Survived'], axis=1)

print('train x shape:', train_x.shape)
print('train y shape:', train_y.shape)
print('test x shape:', test_x.shape)
print('test y shape:', test_y.shape)

train x shape: (712, 11)
train y shape: (712, 1)
test x shape: (179, 11)
test y shape: (179, 1)


In [119]:
train_x = preprocess(train_x)
test_x = preprocess(test_x)

In [120]:
linear_svm = SVC(kernel='linear')
non_linear_svm = SVC(kernel='poly', degree=2)

linear_svm.fit(train_x, train_y)
non_linear_svm.fit(train_x, train_y)

lin_pred = linear_svm.predict(test_x)
nlin_pred = non_linear_svm.predict(test_x)

lin_acc = (lin_pred == test_y.squeeze()).mean()
nlin_acc = (nlin_pred == test_y.squeeze()).mean()

print('Linear Kernel')
print('\tTest accuracy:', lin_acc)

print('\nNon-linear Kernel')
print('\tTest accuracy:', nlin_acc)


Linear Kernel
	Test accuracy: 0.7877094972067039

Non-linear Kernel
	Test accuracy: 0.7932960893854749
