### 1. Package imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

%matplotlib inline

from utility import * # custom functions that are used throughout this script

### 2. Data read-in

In [2]:
path_input_file = os.path.join(os.getcwd(), "01_data", "data_prepared.csv")

data = pd.read_csv(path_input_file)
data = data.set_index("PassengerId", drop=True)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,TrainTest,FamilyAllSurvived,FamilyAllDied,Fsize,...,"AgeBin_(64.0, 72.0]","AgeBin_(72.0, 80.0]",Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_None,Deck_T
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3,22,1,0,1.98,Train,0,1,2,...,0,0,0,0,0,0,0,0,1,0
2,1.0,1,38,1,0,4.27,Train,0,0,2,...,0,0,0,1,0,0,0,0,0,0
3,1.0,3,26,0,0,2.07,Train,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1.0,1,35,1,0,3.97,Train,0,0,2,...,0,0,0,1,0,0,0,0,0,0
5,0.0,3,35,0,0,2.09,Train,0,0,1,...,0,0,0,0,0,0,0,0,1,0


### 3. Data prep

#### Split data

In [3]:
train, test = split_data(data)

#### Apply StandardScaler

In [4]:
columns_to_scale = ["Age", "Fare", "Pclass", "SibSp", "Parch","Fsize"]
train[columns_to_scale] = train[columns_to_scale].astype(float)
scaler = StandardScaler().fit(train[columns_to_scale])

In [5]:
train[columns_to_scale] = scaler.transform(train[columns_to_scale])
test[columns_to_scale] = test[columns_to_scale].astype(float)
test[columns_to_scale] = scaler.transform(test[columns_to_scale])

### 4. GridSearch

In [6]:
def run_grid_search(X, y):
    n_estimators = [1000]
    criterion = ["gini"]
    max_depth = range(10, 101, 10)
    min_samples_split = range(2, 6)
    min_samples_leaf = range(8, 13)
    max_features = [1, 2, 3, "sqrt", "log2", None]
    bootstrap = [True]
    
    parameter_grid = {"n_estimators": n_estimators,
                      "criterion": criterion,
                      "max_depth": max_depth,
                      "min_samples_split": min_samples_split,
                      "min_samples_leaf": min_samples_leaf,
                      "max_features": max_features,
                      "bootstrap": bootstrap}
    
    rfc = RandomForestClassifier()
    grid_search = GridSearchCV(estimator=rfc, param_grid=parameter_grid, cv=10, n_jobs=-1, verbose=2)
    grid_search.fit(X, y)
    
    return grid_search

In [7]:
# grid_search = run_grid_search(X=train.drop("Survived", axis=1),
#                               y=train["Survived"])

Fitting 10 folds for each of 1200 candidates, totalling 12000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 4893 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 5824 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done 6837 tasks      | elapsed: 38.9min
[Parallel(n_jobs=-1)]: Done 7930 tasks      | elapsed: 45.5min
[Parallel(n_jobs=-1)]: Done 9105 tasks      | 

In [8]:
# best_params = grid_search.best_params_
# best_params

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 8,
 'min_samples_split': 4,
 'n_estimators': 1000}

In [None]:
best_params = {'bootstrap': True,
             'criterion': 'gini',
             'max_depth': 30,
             'max_features': 'sqrt',
             'min_samples_leaf': 8,
             'min_samples_split': 4,
             'n_estimators': 1000}

### 5. Train classifier & predict

In [9]:
rfc = RandomForestClassifier(**best_params)
rfc.fit(train.drop("Survived", axis=1), train["Survived"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
path_output_folder = "02_predictions"
predict_and_save_in_kaggle_format(clf=rfc, test_df=test, path_output_folder=path_output_folder,
                                 name_output_file="predictions_rfc_optimized.csv")

![](99_misc/rfc_score.jpg)