[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncerdan/HandsOnML/blob/master/Ch_06_Decision_Trees.ipynb)

# Training and Visualizing a Decision Tree

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:]  # peral length and width
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# for visualizing:
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file='images/iris_tree.dot',
    feature_names=iris.feature_names[2:],
    class_names =iris.target_names,
    rounded=True,
    filled=True
)

# then use CLI:
# $ dot -Tpng images/iris_tree.dot -o images/iris_tree.png

# Estimating Class Probablities

In [3]:
# can get the probabilities of each class
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [4]:
# you can see that predict just picks the larges proba
tree_clf.predict([[5, 1.5]])

array([1])

# Regression

In [5]:
# Make a noisy quadratic dataset
import numpy as np
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)

In [6]:
# Define your X, y
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X, y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
# visualize
export_graphviz(
    tree_reg,
    out_file='images/iris_tree_reg.dot',
    rounded=True,
    filled=True
)

# Exercises

## 7.) Train and fine-tune a Decision Tree for the moons dataset

In [21]:
# get the dataset
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=10000, noise=0.4)

In [22]:
# split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [28]:
# search for hyperparams
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'max_leaf_nodes': [2, 3, 5], 'max_depth': [2, 4]}
]

grid_search_cv = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3)
grid_search_cv.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_depth': [2, 4], 'max_leaf_nodes': [2, 3, 5]}],


In [30]:
# look at what it found
print(grid_search_cv.best_estimator_)
print(grid_search_cv.best_score_)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=5,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
0.8533333333333334


In [32]:
# now let's test it!
from sklearn.metrics import accuracy_score

pred = grid_search_cv.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)    # not too bad!

0.8596