# Comprehensive Model for Titanic Dataset

In [355]:
# basic imports
import numpy as np
import pandas as pd

In [356]:
# load both train and test sets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Want:
- pclass
- sex
- age
- sibsp (?)
- fare
- embarked

In [357]:
# choose features to use
pclass = np.append(train.Pclass.values, test.Pclass.values).reshape(-1, 1)
sex = np.append(train.Sex.values, test.Sex.values).reshape(-1, 1)
age = np.append(train.Age.values, test.Age.values).reshape(-1, 1)
sibsp = np.append(train.SibSp.values, test.SibSp.values).reshape(-1, 1)
fare = np.append(train.Fare.values, test.Fare.values).reshape(-1, 1)
embarked = np.append(train.Embarked.values, test.Embarked.values).reshape(-1, 1)

# save labels
survived = train.Survived.values

In [358]:
# one-hot encode pclass
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
pclass = labelencoder.fit_transform(pclass.ravel())
onehotencoder = OneHotEncoder()
pclass = onehotencoder.fit_transform(pclass.reshape(-1, 1)).toarray()

In [359]:
# one-hot encode sex
sex = np.asarray([1 if i == 'male' else 0 for i in sex]).reshape(-1, 1)

In [360]:
# fill in missing values in age
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean')
age = imputer.fit_transform(age)

In [361]:
# one-hot encode sibsp
sibsp = onehotencoder.fit_transform(sibsp).toarray()

In [362]:
# fill in missing values in fare
fare = imputer.fit_transform(fare)

In [363]:
# one hot encode embarked
embarked = labelencoder.fit_transform(embarked.ravel().astype(str))
embarked = onehotencoder.fit_transform(embarked.reshape(-1, 1)).toarray()

In [364]:
# some missing values in embarked, forget about
embarked = embarked[:, :-1]

In [365]:
# combine features
X = np.hstack((pclass, 
              sex,
              age,
              sibsp,
              fare,
              embarked))

# rename labels
y = survived

# count number of training points
m = y.size

## Create Model

In [366]:
# create random forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=500)
forest.fit(X[:m], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [367]:
# make predictions on training set
y_pred = forest.predict(X[:m])

# see performance on training set
from sklearn.metrics import accuracy_score
accuracy_score(y, y_pred)

0.98092031425364756

In [368]:
# make predictions on test set
predictions = forest.predict(X[m:])

In [369]:
# include predictions and passenger ID
predictions = np.hstack((test.PassengerId.values.reshape(-1, 1),
                       predictions.reshape(-1, 1)))

predictions = np.vstack((np.array([['PassengerID', 'Survived']]),
                       predictions))

# save predictions to csv file (for Kaggle)
np.savetxt('titanic_random_forest_predictions.csv', 
           predictions, 
           delimiter=',', 
           fmt='%s')