# Titanic Survival Predictions

We will make use of the Random Forest model for regression. For each passenger, we will predict whether they survived or not.

In [167]:
import pandas as pd
# Random Forest ensemble to get more accurate estimates
from sklearn.ensemble import RandomForestRegressor
# Evaluate with ROC curve and AUC scoring
from sklearn.metrics import roc_auc_score

In [168]:
# Import dataset
X = pd.read_csv("datasets/titanic_train.csv")
# Extract the feature we're predicting
y = X.pop("Survived")
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [169]:
# Clean Age's NULL values by replacing them with the feature's mean
X.Age = X.fillna(X.Age.mean(), inplace=True)
# Don't really need the name, passenger id, or ticket number - all seem irrelevant
X.drop(labels=["Name","PassengerId","Ticket"], axis=1, inplace=True)
# Remove non-numericals columns from the model
X_numerics = X[list(X.dtypes[X.dtypes != "object"].index)]

In [170]:
model = RandomForestRegressor(n_estimators=100, oob_score=True)
model.fit(X_numerics, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=True,
                      random_state=None, verbose=0, warm_start=False)

In [171]:
# See how good the model is with out-of-the-bag training, with less predictable data
model.oob_score_

0.10053303282260784

In [172]:
oob_pred = model.oob_prediction_
print("AUC Score:", roc_auc_score(y, oob_pred))

AUC Score: 0.7351857177856603


Let's try something else.

In [173]:
# GridSearchCV lets us tune the model and get a better measure with K-fold cross validation
from sklearn.model_selection import GridSearchCV, cross_val_score
# We can change up values for the arguments like cv (number of folds), and the arguments for the estimator.
# We might even decide to change the estimator for the sake of getting a higher AUC score
gs = GridSearchCV(estimator=RandomForestRegressor(),
                 param_grid={
                     "max_depth": range(2,10),
                     "n_estimators": (50,100,150,200,250,300,350,400,500)
                 },
                 n_jobs=-1,
                 cv=5,
                 verbose=10,
                 scoring="neg_mean_squared_error")

gs_result = gs.fit(X_numerics, y)
gs_top_params = gs_result.best_params_
print("Our best parameters were", gs_top_params)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1440s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0576s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 153 tasks      | elapsed:   14.4s
[Parallel(n_jo

Our best parameters were {'max_depth': 6, 'n_estimators': 350}


In [174]:
# Make a new Random Forest with the parameters we got from the grid search
gs_rf_model = RandomForestRegressor(max_depth=gs_top_params["max_depth"], 
                                    n_estimators=gs_top_params["n_estimators"], 
                                    oob_score=True)
gs_rf_model.fit(X_numerics, y)
gs_rf_model.oob_score_

0.19987582773840984

Yay! We got a better OOB score!

In [175]:
gs_rf_oob_pred = gs_rf_model.oob_prediction_
print("AUC Score:", roc_auc_score(y, gs_rf_oob_pred))

AUC Score: 0.764441461881784


And, indeed, we have a better ROC score. Both the OOB score and AUC score will oscillate around these values on different executions.