### Random Forests on the Titanic

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Titanic training data from the working directory.
titanic_train = pd.read_csv("titanic_train.csv")

# Impute missing Age values with the median of the recorded ages.
# The median is computed from the column itself instead of being hard-coded
# (the previous literal was 28 -- presumably this file's median; confirm if
# exact numeric parity with earlier runs matters). Series.median() skips
# missing values by default, so only observed ages contribute.
median_age = titanic_train["Age"].median()

# fillna() replaces only the missing entries and leaves recorded ages as-is.
new_age_var = titanic_train["Age"].fillna(median_age)

titanic_train["Age"] = new_age_var

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [6]:
# Single seed value shared by the global NumPy RNG and the estimator.
SEED = 12

# Seed NumPy's global generator for any code that draws from it implicitly.
np.random.seed(SEED)

# Initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# Convert the "Sex" column to integer codes so it can be used as a numeric
# feature. Note that this overwrites the original column in place.
titanic_train["Sex"] = label_encoder.fit_transform(titanic_train["Sex"])

# Predictor columns, defined before the model so the fit call below reads
# top-down.
features = ["Sex","Pclass","SibSp","Age","Fare"]

# Initialize the model. random_state is passed explicitly so the fitted
# forest does not depend on how much of the global NumPy random stream was
# consumed between the seed call above and fit() (e.g. by a cell inserted
# later or by running cells out of order).
rf_model = RandomForestClassifier(n_estimators=1000, # Number of trees
                                  max_features=2,    # Num features considered
                                  oob_score=True,    # Use OOB scoring
                                  random_state=SEED) # Reproducible forest

# Train the model on the selected features against the "Survived" target.
rf_model.fit(X=titanic_train[features],
             y=titanic_train["Survived"])

# oob_score_ is populated by fit() because oob_score=True was requested.
print("OOB accuracy: ")
print(rf_model.oob_score_)

OOB accuracy: 
0.819304152637


Check the feature importances for our random forest model:

In [7]:
# Pair each predictor name with the importance the fitted forest assigned to
# it, then print one "name importance" line per feature in training order.
importance_pairs = list(zip(features, rf_model.feature_importances_))
for name_and_score in importance_pairs:
    print(*name_and_score)

Sex 0.273466442475
Pclass 0.0900259300159
SibSp 0.0486868588702
Age 0.276683473829
Fare 0.311137294809
