# Training and Visualizing a Decision Trees

In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris(as_frame=True)
X_iris = iris.data[["petal length (cm)", "petal width (cm)"]].values
y_iris = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_iris, y_iris)

In [3]:
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file='iris_tree.dot',
    feature_names=["petal length (cm)", "petal width (cm)"],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [5]:
from graphviz import Source

Source.from_file("iris_tree.dot")

ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x13c4cd430>

In [7]:
help(tree_clf.tree_)

Help on Tree object:

class Tree(builtins.object)
 |  Array-based representation of a binary decision tree.
 |  
 |  The binary tree is represented as a number of parallel arrays. The i-th
 |  element of each array holds information about the node `i`. Node 0 is the
 |  tree's root. You can find a detailed description of all arrays in
 |  `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split
 |  nodes, resp. In this case the values of nodes of the other type are
 |  arbitrary!
 |  
 |  Attributes
 |  ----------
 |  node_count : int
 |      The number of nodes (internal nodes + leaves) in the tree.
 |  
 |  capacity : int
 |      The current capacity (i.e., size) of the arrays, which is at least as
 |      great as `node_count`.
 |  
 |  max_depth : int
 |      The depth of the tree, i.e. the maximum depth of its leaves.
 |  
 |  children_left : array of int, shape [node_count]
 |      children_left[i] holds the node id of the left child of node i.
 |      For leave

# Estimating Class Probabilities

In [8]:
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [9]:
tree_clf.predict([[5, 1.5]])

array([1])

# Regularization Hyperparameters

In [11]:
from sklearn.datasets import make_moons

X_moons, y_moons = make_moons(n_samples=150, noise=0.2, random_state=42)

tree_clf1 = DecisionTreeClassifier(random_state=42)
tree_clf2 = DecisionTreeClassifier(min_samples_leaf=5, random_state=42)

tree_clf1.fit(X_moons, y_moons)
tree_clf2.fit(X_moons, y_moons)

In [12]:
X_moons_test, y_moons_test = make_moons(n_samples=1000, noise=0.2, random_state=43)

print(tree_clf1.score(X_moons_test, y_moons_test))
print(tree_clf2.score(X_moons_test, y_moons_test))

0.898
0.92


# Regression

In [13]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)
X_quad = np.random.rand(200, 1) - 0.5
y_quad = X_quad ** 2 + 0.025 * np.random.randn(200, 1)

tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(X_quad, y_quad)

# Sensitivity to Axis Orientation

In [14]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pca_pipeline = make_pipeline(StandardScaler(), PCA())
X_iris_rotated = pca_pipeline.fit_transform(X_iris)
tree_clf_pca = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf_pca.fit(X_iris_rotated, y_iris)

# Exercises

## 7.

In [23]:
from sklearn.model_selection import train_test_split

train, target = make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=42)

In [28]:
from sklearn.model_selection import GridSearchCV

model = DecisionTreeClassifier(random_state=42)
parameters = {
    "max_depth": list(range(1, 11)),
    "max_leaf_nodes": list(range(1, 11)),
    "min_samples_leaf": [10, 20, 40, 60, 80, 100]
}
grid = GridSearchCV(model, parameters, cv=10)
grid_search = grid.fit(X_train, y_train)

600 fits failed out of a total of 6000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
600 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maelfosso/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maelfosso/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/Users/maelfosso/Library/Python/3.9/lib/python/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/Users/maelfosso/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 581, in _validate_params
    va

In [29]:
grid_search.best_params_

{'max_depth': 2, 'max_leaf_nodes': 4, 'min_samples_leaf': 10}

In [30]:
grid_search.best_score_

0.85925

In [31]:
grid_search.best_estimator_

## 8. Grow a forest

In [35]:
import numpy as np
from sklearn.model_selection import ShuffleSplit

n_subsets = 1000
n_instances = 100

sets = []

ss = ShuffleSplit(n_splits=n_subsets, test_size=len(X_train) - n_instances, random_state=0)
for ix_train, ix_test in ss.split(X_train):
    X_min_train = X_train[ix_train]
    y_min_train = y_train[ix_train]
    
    sets.append((X_min_train, y_min_train))

In [36]:
from sklearn.base import clone

forest = [clone(grid_search.best_estimator_) for _ in range(n_subsets)]

for tree, (X_min_train, y_min_train) in zip(forest, sets):
    tree.fit(X_min_train, y_min_train)
    
    # y_pred = tree.predict(X_test)

In [37]:
Y_pred = np.empty([n_subsets, len(X_test)], dtype=np.uint8)

for ix, tree in enumerate(forest):
    Y_pred[ix] = tree.predict(X_test)

In [39]:
from scipy.stats import mode

y_pred_majority_vote, n_votes = mode(Y_pred, axis=0)

  y_pred_majority_vote, n_votes = mode(Y_pred, axis=0)


In [40]:
n_votes

array([[642, 595, 982, ..., 995, 856, 999]])

In [41]:
y_pred_majority_vote

array([[0, 1, 0, ..., 1, 0, 0]], dtype=uint8)

In [43]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_majority_vote.reshape(-1))

0.8225