In [25]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Load the iris data set; the classifier only sees the last two feature
# columns (petal length and petal width).
iris = load_iris()
y = iris.target
X = iris.data[:, 2:]

# Cap the tree at two levels of splits.
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [26]:
import os

from sklearn.tree import export_graphviz

# Figures are stored under <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"

def image_path(fig_id):
    """Return the path of the figure file ``fig_id`` inside this chapter's images folder."""
    chapter_images_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    return os.path.join(chapter_images_dir, fig_id)

# Export the fitted iris tree as a Graphviz .dot file.
# The output directory is created first: on a fresh checkout
# ./images/decision_trees/ does not exist, and writing the file would fail.
dot_path = image_path("iris_tree.dot")
dot_dir = os.path.dirname(dot_path)
if not os.path.isdir(dot_dir):
    os.makedirs(dot_dir)

export_graphviz(
    tree_clf,
    out_file=dot_path,
    feature_names=iris.feature_names[2:],  # same two petal columns used for X
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [27]:
# make_moons practice
from sklearn.datasets import make_moons

# Fix the seed so the generated data set (and every result derived from it)
# is the same on each Restart & Run All.
moons = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [28]:
# make training set and test set
from sklearn.model_selection import train_test_split

# make_moons returned a (features, labels) pair; hold out 20% of it for testing.
X, y = moons
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# time to do a grid search
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Two separate sub-grids: one varies max_leaf_nodes alone, the other varies
# max_depth together with min_samples_split.
param_grid = [
    dict(max_leaf_nodes=[2, 4, 6, 8]),
    dict(max_depth=[2, 4, 6], min_samples_split=[10, 100, 500, 1000]),
]

decision_tree = DecisionTreeClassifier()
grid_search = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring='average_precision',
    cv=5,
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'max_leaf_nodes': [2, 4, 6, 8]}, {'max_depth': [2, 4, 6], 'min_samples_split': [10, 100, 500, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='average_precision', verbose=0)

In [30]:
# Show the parameter combination selected by the grid search fitted above.
grid_search.best_params_

{'max_depth': 6, 'min_samples_split': 100}

In [31]:
# Build the final tree from the search result rather than hand-copied values,
# so this cell stays correct if the grid search selects different parameters.
best_tree = DecisionTreeClassifier(**grid_search.best_params_)

best_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=100, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [32]:
# Class probabilities for the first training sample.
# predict_proba expects a 2-D (n_samples, n_features) array, so slice with
# [:1] to keep a single-row matrix instead of passing a 1-D vector.
best_tree.predict_proba(X_train[:1])

array([[ 0.02598425,  0.97401575]])

In [33]:
# Predicted class for the second training sample.
# predict expects a 2-D (n_samples, n_features) array, so slice with [1:2]
# to keep a single-row matrix instead of passing a 1-D vector.
best_tree.predict(X_train[1:2])[0]

1

In [34]:
# Score the tuned tree on the held-out test set.
# predict expects a 2-D (n_samples, n_features) array, so classify the whole
# test set in one call instead of feeding it 1-D rows one at a time.
predictions = best_tree.predict(X_test)

# Element-wise comparison of the two label arrays; summing counts the matches.
right = int((predictions == y_test).sum())
wrong = len(y_test) - right

print("Right: %d" % right)
print("Wrong: %d" % wrong)

accuracy = float(right) / len(X_test)
print("Accuracy: %f" % accuracy)

Right: 1694
Wrong: 306
Accuracy: 0.847000


In [35]:
# Celebratory marker only; this cell performs no computation.
print("WOOOT")

1