Using random forest is stepping into the realm of model ensambles, but it is one of the most straightforward ones. Regardless it is a very important algorithm that combines both bootrapping and bagging to improve predictive power. I won't be going into much theory and will be reusing a lot of the code from 07_descision_trees.ipynb. However I'll define main concepts.

So what is a random forest? First of all it as an ensamble of decision trees (this is where the 'forest' comes from). Secondly, both features and observation are randomly sampled to build the decision trees. Finally, the predictions from individual trees are aggregated to get the final prediction. So the algorithm looks like:

1. Sample observations with replacement
2. When building a tree select number of randomly selected features to build the tree
3. Make the split according to a selected measure such as information gain

Repeat steps 1-3 until stopping conditions are met


### Step 1

Sampling with replacement is often called bootstrapping which in turn has an interesting property: not all of the samples will be selected will building given deceision tree in fact the number of unselected observations will be $\underset{n \to \infty}{lim}(1-\frac{1}{n})^n=e^{-1}\approx 0.0367$ where n is sampling size. We can use the the unused samples as out-of-sample (OOS) set. The error rate is usually called the out-of-bag error (OOB).

### Step 2
We take m out of K (total available features) (usually m is $m=\sqrt{K}$) This will create trees that dont share some of the features and probably some trees that don't share any features thus producing uncorrelated trees (predictions) 

### Step 3
Step 3 is just the descision tree algorithm.


In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
import pandas as pd
from trees_func import *
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

  warn(


In [26]:
def get_bootstrap_samples(X, y):
    boot_idx = list(np.random.choice(range(len(X)), len(X), replace = True))
    all_idx = set(range(0, len(X)))
    oob_idx = list(all_idx - set(boot_idx))
    X_boot = X[boot_idx]
    y_boot = y[boot_idx]
    X_oob = X[oob_idx]
    y_oob = y[oob_idx]
    return X_boot, y_boot, X_oob, y_oob

In [3]:
X_b, y_b, X_oob, y_oob = get_bootstrap_samples(X, y)

We'll need to splightly modify `get_best_split` to incorporate the feature parameter

In [4]:
def get_random_feats(X_b, n_feats = None):
    total_feats = X_b.shape[1]
    if n_feats is None:
        feat_idx = np.random.choice(range(total_feats), int(np.sqrt(total_feats)), replace = False)
    else:
        feat_idx = np.random.choice(range(total_feats), n_feats, replace = False)
    X_feats = X_b.iloc[:,feat_idx]
    return X_feats

In [5]:
n_estimators = 100
max_features = 3
max_depth = 10
min_samples_split = 2

In [6]:
feats=['age', 'sibsp', 'parch', 'fare', 'pclass']

In [7]:
X=X[feats].reset_index(drop = True)
y=y.reset_index(drop = True)

In [8]:
X_b, y_b, X_oob, y_oob = get_bootstrap_samples(X, y)
X_np=X_b.to_numpy()
y_np=y_b.to_numpy()
my_tree=grow_tree(X_np, y_np, depth = 0, min_samples_split=2, max_depth=4)

In [9]:
def oob_score(tree, X_test, y_test):
    mis_label = 0
    for i in range(len(X_test)):
        pred = predict_value(X_test[i], tree)
        if pred != y_test[i]:
            mis_label += 1
    return mis_label / len(X_test)

In [10]:
def traverse_tree(tree):
    if tree:
        print(vars(tree))
        traverse_tree(tree.data_left)
        traverse_tree(tree.data_right)

In [11]:
traverse_tree(my_tree)

{'feature': 0, 'threshold': 5.0, 'data_left': <trees_func.Node object at 0x7f22c86b8a60>, 'data_right': <trees_func.Node object at 0x7f22c86b9960>, 'gain': 0.19491318231521504, 'value': None}
{'feature': 1, 'threshold': 2.0, 'data_left': <trees_func.Node object at 0x7f22c86babf0>, 'data_right': <trees_func.Node object at 0x7f22c86baf50>, 'gain': 0.22883259093217168, 'value': None}
{'feature': 3, 'threshold': 14.4, 'data_left': <trees_func.Node object at 0x7f239877a170>, 'data_right': <trees_func.Node object at 0x7f2398778160>, 'gain': 0.17605802720794045, 'value': None}
{'feature': 3, 'threshold': 12.475, 'data_left': <trees_func.Node object at 0x7f23988de3b0>, 'data_right': <trees_func.Node object at 0x7f23988de5f0>, 'gain': 0.6617049208213364, 'value': None}
{'feature': None, 'threshold': None, 'data_left': None, 'data_right': None, 'gain': None, 'value': '1'}
{'feature': None, 'threshold': None, 'data_left': None, 'data_right': None, 'gain': None, 'value': '0'}
{'feature': 3, 'thres

In [12]:
predict_value(X_oob.iloc[0], my_tree)

'1'

In [13]:
def build_tree(X_bootstrap, y_bootstrap, max_depth, min_samples_split, max_features):
    root_node = find_split_point(X_bootstrap, y_bootstrap, max_features)
    split_node(root_node, max_features, min_samples_split, max_depth, 1)
    return root_node

def random_forest(X_train, y_train, n_estimators, max_features, max_depth, min_samples_split):
    tree_ls = list()
    oob_ls = list()
    for i in range(n_estimators):
        X_bootstrap, y_bootstrap, X_oob, y_oob = draw_bootstrap(X_train, y_train)
        tree = build_tree(X_bootstrap, y_bootstrap, max_features, max_depth, min_samples_split)
        tree_ls.append(tree)
        oob_error = oob_score(tree, X_oob, y_oob)
        oob_ls.append(oob_error)
    print("OOB estimate: {:.2f}".format(np.mean(oob_ls)))
    return tree_ls

In [14]:
tree_ls = list()
oob_ls = list()
my_tree=grow_tree(X_np, y_np, depth = 0, min_samples_split=2, max_depth=4)
tree_ls.append(my_tree)

In [15]:
predict_value(X_oob.to_numpy()[0], my_tree)
oob_error = oob_score(my_tree, X_oob.to_numpy(), y_oob.to_numpy())

In [24]:
for i in range(n_estimators):
    X_bootstrap, y_bootstrap, X_oob, y_oob = draw_bootstrap(X_train, y_train)
    tree = grow_tree(X_bootstrap, y_bootstrap, max_features, max_depth, min_samples_split)
    tree_ls.append(tree)
    oob_error = oob_score(tree, X_oob, y_oob)
    oob_ls.append(oob_error)
# print("OOB estimate: {:.2f}".format(np.mean(oob_ls)))
# return tree_ls

NameError: name 'draw_bootstrap' is not defined

In [39]:
def random_forest(X_train, y_train, n_estimators, max_features, max_depth, min_samples_split):
    tree_ls = list()
    oob_ls = list()
    n_estimators=10
    for i in range(n_estimators):
        X_bootstrap, y_bootstrap, X_oob, y_oob = get_bootstrap_samples(X_train, y_train)
        tree = grow_tree(X_bootstrap, y_bootstrap, max_features, max_depth, min_samples_split)
        tree_ls.append(tree)
        oob_error = oob_score(tree, X_oob, y_oob)
        oob_ls.append(oob_error)
    
    print("OOB estimate: {:.2f}".format(np.mean(oob_ls)))
    return(tree_ls)

In [99]:
X_np

array([[27.    ,  0.    ,  0.    ,  7.8958,  3.    ],
       [16.    ,  0.    ,  0.    , 86.5   ,  1.    ],
       [30.    ,  0.    ,  0.    , 12.475 ,  3.    ],
       ...,
       [32.    ,  0.    ,  0.    ,  7.8542,  3.    ],
       [    nan,  1.    ,  0.    , 14.4542,  3.    ],
       [60.    ,  0.    ,  0.    , 76.2917,  1.    ]])

In [43]:
res=random_forest(X_train=X_np, y_train=y_np, n_estimators = 10, max_features=2, max_depth=5, min_samples_split=5)

OOB estimate: 0.45


In [36]:
oob_ls

[0.36363636363636365,
 0.3524590163934426,
 0.3775933609958506,
 0.4024640657084189,
 0.3908523908523909,
 0.3917748917748918,
 0.35639412997903563,
 0.3656565656565657,
 0.3550420168067227,
 0.3199152542372881]

In [44]:
def predict_rf(tree_ls, X_test):
    pred_ls = list()
    for i in range(len(X_test)):
        ensemble_preds = [predict_tree(tree, X_test.values[i]) for tree in tree_ls]
        final_pred = max(ensemble_preds, key = ensemble_preds.count)
        pred_ls.append(final_pred)
    return np.array(pred_ls)

In [61]:
tree_preds = list()

In [62]:
for tr in res:
    rr = [predict_value(X_np[i], tr) for i in range(X_np.shape[0])]
    tree_preds.append(rr)

In [75]:
for i in range(0, len(tree_preds))tree_preds[0][0]

'0'

In [92]:
ensamble_preds=list()
for i in range(0, len(tree_preds[0])):
    ensamble_preds.append(st.mode(list(map(lambda l: l[i], tree_preds))))

In [97]:
np.mean(y==ensamble_preds)

0.440794499618029

In [98]:
5000*12

60000