In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"

def image_path(fig_id):
    """Return the path of figure ``fig_id`` inside this chapter's image folder.

    The folder is ``PROJECT_ROOT_DIR/images/CHAPTER_ID``. No file extension is
    appended and nothing is created on disk.
    """
    chapter_images_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    return os.path.join(chapter_images_dir, fig_id)

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as ``<image_path(fig_id)>.png`` at 300 dpi.

    Parameters
    ----------
    fig_id : str
        Base name of the figure; ``.png`` is appended to ``image_path(fig_id)``.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    """
    print("Saving figure", fig_id)
    target_file = image_path(fig_id) + ".png"
    # savefig does not create missing folders, so make sure the destination
    # exists first (isdir check rather than exist_ok keeps Python 2 working).
    target_dir = os.path.dirname(target_file)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target_file, format='png', dpi=300)

In [2]:
from sklearn.datasets import make_moons

# 10,000 noisy two-moons samples; the fixed random_state keeps the data reproducible
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

In [3]:
X.shape

(10000, 2)

### 7. Train a decision tree on the moons data

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set (fixed seed for a repeatable split)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
from sklearn.tree import DecisionTreeClassifier
# Untuned base tree that is handed to the grid search; random_state is fixed
# so that repeated fits give the same result.
tree = DecisionTreeClassifier(random_state=0)

In [13]:
from sklearn.model_selection import GridSearchCV

# Search space: 98 leaf-node limits x 3 split thresholds = 294 candidates
params = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': [2, 3, 4],
}
grid = GridSearchCV(estimator=tree, param_grid=params, cv=3, n_jobs=2, verbose=1)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 882 out of 882 | elapsed:    6.8s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=0, splitter='best'),
             iid='deprecated', n_jobs=2,
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14

In [14]:
grid.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=17,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [16]:
from sklearn.metrics import accuracy_score

# Accuracy of the grid search's predictions on the held-out test set
y_pred = grid.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8695

### 8. Build a random forest

##### a). Draw 1,000 random subsets of the training set, each containing 100 instances

In [19]:
len(X_train)

8000

In [21]:
from sklearn.model_selection import ShuffleSplit

# Splitter producing 1000 random draws of 100 training indices each
idx = ShuffleSplit(
    n_splits=1000,
    train_size=100,
    random_state=0,
)

In [28]:
# One (features, labels) pair per split; only the training indices of each
# split are used, the complementary indices are ignored.
mini_sets = [
    (X_train[subset_idx], y_train[subset_idx])
    for subset_idx, _unused_idx in idx.split(X_train)
]

In [32]:
mini_sets[0][0].shape

(100, 2)

In [33]:
len(mini_sets)

1000

##### b). Train one Decision Tree on each subset, using the best grid params above

In [37]:
from sklearn.base import clone

# One unfitted copy of the tuned tree per mini training set. The count is taken
# from mini_sets instead of repeating the literal 1000, so the forest always
# has exactly one tree for every subset.
forest = [clone(grid.best_estimator_) for _ in range(len(mini_sets))]

In [44]:
# Fit every tree on its own mini training set and record its individual
# accuracy on the full test set.
accuracy_scores = []

# Loop names are chosen so they neither shadow the builtin `set` nor overwrite
# the `tree` estimator that the grid search was built from.
for member_tree, (X_mini, y_mini) in zip(forest, mini_sets):
    member_tree.fit(X_mini, y_mini)
    y_hat = member_tree.predict(X_test)
    score = accuracy_score(y_test, y_hat)
    accuracy_scores.append(score)

# Average test accuracy of the individual trees
np.mean(accuracy_scores)

0.8047835

##### c). For each test set instance, generate the predictions of the 1,000 Decision Trees, and keep only the most frequent prediction

In [45]:
test = forest[0].predict(X_test)

In [46]:
test

array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

In [48]:
test2 = forest[1].predict(X_test)

In [51]:
# Fraction of test instances on which the first two trees make the same
# prediction (this compares two prediction arrays, not predictions vs. y_test).
accuracy_score(test, test2)

0.7805

In [110]:
# Preallocate one row of test-set predictions per tree. The row count comes
# from the forest itself rather than a literal 1000, so the array can never be
# larger or smaller than the number of trees that fill it.
Y_hats = np.empty([len(forest), len(X_test)], dtype=np.uint8)

for i, trained in enumerate(forest):
    y_hat = trained.predict(X_test)
    Y_hats[i] = y_hat

In [111]:
from scipy.stats import mode

# Mode along axis 0 (across trees): for every test instance, the most frequent
# predicted label and how many times it occurred.
vote_result = mode(Y_hats, axis=0)
y_pred_majority_votes, n_votes = vote_result[0], vote_result[1]

In [112]:
y_pred_majority_votes.shape

(1, 2000)

In [113]:
# Ground truth goes first, as accuracy_score expects (y_true, y_pred). The votes
# are flattened so the call works whether mode() returned shape (1, n) or (n,).
accuracy_score(y_test, y_pred_majority_votes.reshape(-1))

0.87