## Decision Trees
 - DecisionTreeClassifier (multi-class classification)

- !pip install --upgrade scikit-learn
- !pip install --upgrade graphviz

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, plot_tree


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as img 
import seaborn as sns

In [None]:
# Minimal toy problem: two 2-D samples, one per class label.
X, y = [[0, 0], [1, 1]], [0, 1]
# Build the tree and train it in one chained expression.
clf = DecisionTreeClassifier().fit(X, y)

In [None]:
# Predict class labels for four points, none of which were in the toy training set.
clf.predict([[-1,0],[0,1],[2,2], [3,3]])

In [None]:
# Per-class probability estimates for the same four points.
clf.predict_proba([[-1,0],[0,1],[2,2], [3,3]])

#### Iris Dataset

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load the Iris dataset from scikit-learn and show the first ten feature rows.
iris = load_iris()
iris.data[:10]

In [None]:
# Train a dedicated classifier on the full Iris data.
# Fit `clf1` itself rather than `clf`: fitting `clf` would retrain the toy
# model from the earlier cells and leave `clf1` as a mere alias of it.
clf1 = DecisionTreeClassifier()
clf1 = clf1.fit(iris.data, iris.target)

In [None]:
# Draw the fitted tree with matplotlib on an 8x8 inch figure.
plt.figure(figsize=(8,8))
# The trailing semicolon suppresses the notebook's echo of the return value.
plot_tree(clf1, fontsize=8);

In [None]:
import graphviz

In [None]:
# Export the tree as DOT source text (out_file=None returns it as a string).
dot_data = export_graphviz(clf1, out_file=None)
graph = graphviz.Source(dot_data) 
# Render to disk under the base name "iris"; the next cell opens iris.pdf.
graph.render("iris")

In [None]:
!open iris.pdf

In [None]:
# Export the tree again, this time labelled with the dataset's feature and
# class names, using filled and rounded node boxes.
dot_data = export_graphviz(
    clf1,
    out_file=None,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Request PNG output; as the cell's last expression the graph is shown inline.
graph = graphviz.Source(dot_data, format="png")
graph

#### Model Evaluation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Feature matrix and target vector taken from the loaded dataset.
X, y = iris.data, iris.target

In [None]:
# Hold out 30% of the samples for testing and train on the remaining 70%.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Show the shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fresh tree trained on the training split only.
clf = DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# Label the held-out samples with the trained tree, then display the labels.
y_pred = clf.predict(X=X_test)
y_pred

In [None]:
# Per-class probability estimates for every held-out sample.
clf.predict_proba(X_test)

In [None]:
# Accuracy of the predictions against the true labels of the test split.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Visualise the tree trained on the training split, labelled with the
# dataset's feature and class names.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

# Last expression of the cell: the notebook displays the graph inline.
graph

### Pandas Data Frame

In [None]:
# Reload Iris through seaborn; this rebinds `iris`, replacing the
# scikit-learn dataset object used in the cells above.
iris = sns.load_dataset("iris")
iris.head()

In [None]:
# Every column except the last one is used as a feature.
feature_names = iris.columns[:-1]
feature_names

In [None]:
# The last column is used as the class label.
class_column = iris.columns[-1]
class_column

In [None]:
# Distinct class labels, sorted ascending.
# export_graphviz pairs class_names with the classifier's classes, which are
# kept in sorted order. unique() alone returns first-appearance order, which
# would mislabel the tree whenever the rows are not already sorted by class.
class_names = sorted(iris[class_column].unique())
class_names

In [None]:
# Feature table: all rows, restricted to the feature columns.
X = iris.loc[:, feature_names]
X.head()

In [None]:
# Target values: all rows of the class column.
y = iris.loc[:, class_column]
y.head()

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Peek at the first rows of the training features.
X_train.head()

In [None]:
# Peek at the first rows of the test features.
X_test.head()

In [None]:
# Fresh tree trained on the training rows only.
clf = DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# Label the held-out rows with the trained tree, then display the labels.
y_pred = clf.predict(X=X_test)
y_pred

In [None]:
# Accuracy of the predictions against the true labels of the test rows.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Visualise the tree trained on the training rows, labelled with the
# feature and class names derived from the table's columns.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

# Last expression of the cell: the notebook displays the graph inline.
graph

- The default splitting criterion is Gini impurity.
- Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

In [None]:
# Shallow tree (depth at most 2) that splits on entropy instead of the
# default criterion, trained on the training split.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2).fit(X_train, y_train)

# Predict labels for the test split.
y_pred = clf.predict(X_test)

# Report the accuracy of those predictions against the true test labels.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Visualise the current `clf`, labelled with the feature and class names.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

# Last expression of the cell: the notebook displays the graph inline.
graph

### Decision Surface

In [None]:
# Rebind `iris` to the scikit-learn dataset object; the plotting cell below
# reads its .data, .target, .feature_names and .target_names attributes.
iris = load_iris()

In [None]:
# Plot settings: number of classes, one colour letter per class, and the
# spacing of the grid on which the decision surface is evaluated.
n_classes, plot_colors, plot_step = 3, "ryb", 0.02

In [None]:
# Decision surface of a tree trained on each pair of Iris features.
# One panel per feature pair, laid out on a 2x3 grid.
fig, ax = plt.subplots(2, 3, figsize=(12,8))

# The target vector is the same for every feature pair.
y = iris.target

# ax.flat walks the grid row by row, i.e. in the same order as
# plt.subplot(2, 3, k) for k = 1..6.
for panel, pair in zip(ax.flat, [[0, 1], [0, 2], [0, 3],
                                 [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Evaluate the tree on a dense grid covering the data range, padded by 1
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the decision boundary
    panel.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    panel.set_xlabel(iris.feature_names[pair[0]])
    panel.set_ylabel(iris.feature_names[pair[1]])

    # Plot the training points, one colour per class.  No cmap is passed:
    # with a literal colour in `c` there is nothing to colour-map, and
    # matplotlib ignores the argument (newer versions warn about it).
    for i, color in zip(range(n_classes), plot_colors):
        in_class = y == i
        panel.scatter(X[in_class, 0], X[in_class, 1], c=color,
                      label=iris.target_names[i],
                      edgecolor='black', s=15)

# Lay the panels out once, after all of them have their labels.
fig.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

plt.suptitle("Decision surface of a decision tree using paired features")
# As before, the legend and the tight axis limits apply to the last panel only.
ax[-1, -1].legend(loc='lower right', borderpad=0, handletextpad=0)
ax[-1, -1].axis("tight")
plt.show()
