# Data Science Bootcamp
# <center> **Aula 20 -- Decision Trees** </center>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

## Load data

In [None]:
# Fetch the Iris dataset and unpack its feature matrix and target vector.
iris = load_iris()
X, y = iris['data'], iris['target']

In [None]:
# Map each class label to a display colour; `c` holds one colour per sample,
# in the same order as `y`, for use in the scatter plot.
colors = {0:'red', 1:'blue', 2:'green'}
c = [colors[label] for label in y]

In [None]:
# Scatter the first two feature columns of every sample, coloured by class via `c`.
plt.figure()
plt.scatter(X[:,0], X[:,1], c=c)
plt.show()

In [None]:
# Split the sample indices by class label (0, 1, 2).
# np.argwhere returns column vectors of indices, so X0/X1/X2 carry an extra
# singleton axis; it is flattened away when the binary subsets are built below.
y0 = np.argwhere(y==0)
y1 = np.argwhere(y==1)
y2 = np.argwhere(y==2)

X0 = X[y0]
X1 = X[y1]
X2 = X[y2]

# Binary subset made of classes 0 and 1 (labelled 0 and 1).
# Sample and feature counts are derived from the data instead of being hard-coded.
X01 = np.vstack([X0,X1]).reshape(-1, X.shape[1])
y01 = np.repeat([0, 1], [len(X0), len(X1)])

# Binary subset made of classes 1 and 2.
# NOTE: labels are re-encoded here, so in y12 the value 0 stands for class 1
# and the value 1 stands for class 2.
X12 = np.vstack([X1,X2]).reshape(-1, X.shape[1])
y12 = np.repeat([0, 1], [len(X1), len(X2)])

In [None]:
# Sanity check: print the dimensions of the full dataset and of both binary subsets.
for features, labels in ((X, y), (X01, y01), (X12, y12)):
    print(features.shape, labels.shape)

## Árboles de Decisión

### Experimento 1: Clase 0 y 1, Profundidad 1

In [None]:
# Experiment 1 classifier: Gini criterion, tree depth capped at 1.
model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=1, random_state=None)

In [None]:
# Fit on the first two feature columns of the class 0 / class 1 subset.
model.fit(X01[:,[0,1]], y01)

In [None]:
# Draw the structure of the fitted tree.
plt.figure()
plot_tree(model)
plt.show()

In [None]:
# Predict on the same rows used for fitting, so everything reported below
# is measured on the training data.
yhat = model.predict(X01[:,[0,1]])
print(yhat)

In [None]:
# Compare the true labels y01 against the predictions.
cfmatrix = confusion_matrix(y01, yhat)
print(cfmatrix)

In [None]:
# Show the same confusion matrix as an annotated heatmap.
plt.figure()
sns.heatmap(cfmatrix, annot=True)
plt.show()

In [None]:
# Text summary of the classification metrics for these predictions.
print(classification_report(y01, yhat))

## Regiones de clasificación

In [None]:
# Plotting decision regions
h = 0.01
x_min, x_max = X01[:,0].min() - 1, X01[:,0].max() + 1
y_min, y_max = X01[:,1].min() - 1, X01[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

In [None]:
plt.figure(figsize=(6,6))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X01[:,0], X01[:,1], c=y01, s=20, edgecolor='k')
plt.title('Árbol de Decisión')
#plt.h([0,0], w, 'k-')
plt.xlim([x_min,x_max])
plt.ylim([y_min,y_max])
plt.show()

### Experimento 2: Clase 0 y 1, Profundidad 2

In [None]:
# Experiment 2 classifier: Gini criterion, tree depth capped at 2.
model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=2, random_state=None)

In [None]:
# Fit on the first two feature columns of the class 0 / class 1 subset.
model.fit(X01[:,[0,1]], y01)

In [None]:
# Draw the structure of the fitted tree.
plt.figure()
plot_tree(model)
plt.show()

In [None]:
# Predict on the same rows used for fitting, so everything reported below
# is measured on the training data.
yhat = model.predict(X01[:,[0,1]])
print(yhat)

In [None]:
# Compare the true labels y01 against the predictions.
cfmatrix = confusion_matrix(y01, yhat)
print(cfmatrix)

In [None]:
# Show the same confusion matrix as an annotated heatmap.
plt.figure()
sns.heatmap(cfmatrix, annot=True)
plt.show()

In [None]:
# Text summary of the classification metrics for these predictions.
print(classification_report(y01, yhat))

## Regiones de clasificación

In [None]:
# Plotting decision regions
h = 0.01
x_min, x_max = X01[:,0].min() - 1, X01[:,0].max() + 1
y_min, y_max = X01[:,1].min() - 1, X01[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

In [None]:
plt.figure(figsize=(6,6))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X01[:,0], X01[:,1], c=y01, s=20, edgecolor='k')
plt.title('Árbol de Decisión')
#plt.h([0,0], w, 'k-')
plt.xlim([x_min,x_max])
plt.ylim([y_min,y_max])
plt.show()

### Experimento 3: Clase 0 y 1, Profundidad 4

In [None]:
# Experiment 3 classifier: Gini criterion, tree depth capped at 4.
model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=4, random_state=None)

In [None]:
# Fit on the first two feature columns of the class 0 / class 1 subset.
model.fit(X01[:,[0,1]], y01)

In [None]:
# Draw the structure of the fitted tree.
plt.figure()
plot_tree(model)
plt.show()

In [None]:
# Predict on the same rows used for fitting, so everything reported below
# is measured on the training data.
yhat = model.predict(X01[:,[0,1]])
print(yhat)

In [None]:
# Compare the true labels y01 against the predictions.
cfmatrix = confusion_matrix(y01, yhat)
print(cfmatrix)

In [None]:
# Show the same confusion matrix as an annotated heatmap.
plt.figure()
sns.heatmap(cfmatrix, annot=True)
plt.show()

In [None]:
# Text summary of the classification metrics for these predictions.
print(classification_report(y01, yhat))

## Regiones de clasificación

In [None]:
# Plotting decision regions
h = 0.01
x_min, x_max = X01[:,0].min() - 1, X01[:,0].max() + 1
y_min, y_max = X01[:,1].min() - 1, X01[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

In [None]:
plt.figure(figsize=(6,6))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X01[:,0], X01[:,1], c=y01, s=20, edgecolor='k')
plt.title('Árbol de Decisión')
#plt.h([0,0], w, 'k-')
plt.xlim([x_min,x_max])
plt.ylim([y_min,y_max])
plt.show()

### Experimento 4: Clase 1 y 2, Profundidad 7

In [None]:
# Experiment 4 classifier: Gini criterion, tree depth capped at 7.
model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=7, random_state=None)

In [None]:
# Fit on the first two feature columns of the class 1 / class 2 subset.
model.fit(X12[:,[0,1]], y12)

In [None]:
# Draw the structure of the fitted tree on a larger canvas.
plt.figure(figsize=(15,10))
plot_tree(model)
plt.show()

In [None]:
# Predict on the same rows used for fitting, so everything reported below
# is measured on the training data.
yhat = model.predict(X12[:,[0,1]])
print(yhat)

In [None]:
# Compare the true labels y12 against the predictions.
cfmatrix = confusion_matrix(y12, yhat)
print(cfmatrix)

In [None]:
# Show the same confusion matrix as an annotated heatmap.
plt.figure()
sns.heatmap(cfmatrix, annot=True)
plt.show()

In [None]:
# Report against y12, the labels of the class 1 / class 2 subset that this
# experiment's predictions (`yhat`) were made for.
print(classification_report(y12, yhat))

## Regiones de clasificación

In [None]:
# Plotting decision regions
h = 0.01
x_min, x_max = X12[:,0].min() - 1, X12[:,0].max() + 1
y_min, y_max = X12[:,1].min() - 1, X12[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

In [None]:
plt.figure(figsize=(6,6))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X12[:,0], X12[:,1], c=y12, s=20, edgecolor='k')
plt.title('Árbol de Decisión')
#plt.h([0,0], w, 'k-')
plt.xlim([x_min,x_max])
plt.ylim([y_min,y_max])
plt.show()

## Todos los datos

In [None]:
# Three-class classifier: entropy criterion, tree depth capped at 4.
model = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=4, random_state=None)

In [None]:
# Fit on the first two feature columns of the full dataset.
model.fit(X[:,[0,1]], y)

In [None]:
# Draw the structure of the fitted tree on a larger canvas.
plt.figure(figsize=(15,8))
plot_tree(model)
plt.show()

In [None]:
# Predict on the same rows used for fitting, so everything reported below
# is measured on the training data.
yhat = model.predict(X[:,[0,1]])
print(yhat)

In [None]:
# Compare the true labels y against the predictions.
cfmatrix = confusion_matrix(y, yhat)
print(cfmatrix)

In [None]:
# Show the same confusion matrix as an annotated heatmap.
plt.figure()
sns.heatmap(cfmatrix, annot=True)
plt.show()

In [None]:
# Text summary of the classification metrics for these predictions.
print(classification_report(y, yhat))

## Regiones de clasificación

In [None]:
# Plotting decision regions
h = 0.01
x_min, x_max = X[:,0].min() - 1, X[:,0].max() + 1
y_min, y_max = X[:,1].min() - 1, X[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

In [None]:
plt.figure(figsize=(6,6))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X[:,0], X[:,1], c=y, s=20, edgecolor='k')
plt.title('Árbol de Decisión')
#plt.h([0,0], w, 'k-')
plt.xlim([x_min,x_max])
plt.ylim([y_min,y_max])
plt.show()

## Importancias

In [None]:
# Bare expression: shown as the cell's output, it displays the importance
# scores of the most recently fitted `model`.
model.feature_importances_

#### Todas las variables

In [None]:
# Classifier trained on every feature: entropy criterion, tree depth capped at 4.
model = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=4, random_state=None)

In [None]:
# Fit on all feature columns of the full dataset.
model.fit(X, y)

In [None]:
# Draw the structure of the fitted tree on a larger canvas.
plt.figure(figsize=(15,8))
plot_tree(model)
plt.show()

In [None]:
# Predict on the same rows used for fitting, so everything reported below
# is measured on the training data.
yhat = model.predict(X)
print(yhat)

In [None]:
# Compare the true labels y against the predictions.
cfmatrix = confusion_matrix(y, yhat)
print(cfmatrix)

In [None]:
# Show the same confusion matrix as an annotated heatmap.
plt.figure()
sns.heatmap(cfmatrix, annot=True)
plt.show()

In [None]:
# Text summary of the classification metrics for these predictions.
print(classification_report(y, yhat))

In [None]:
names = iris.feature_names
importances = model.feature_importances_

In [None]:
print(names)

In [None]:
# Rank the features from most to least important and print one line per
# feature as "rank: importance - name". Iterating over `idx` itself covers
# every feature instead of assuming there are exactly four.
idx = np.argsort(importances)[::-1]
for rank, feature_idx in enumerate(idx, start=1):
    print ('{}: {} - {}'.format(rank, importances[feature_idx], names[feature_idx]))

In [None]:
def plot_importances(model, features, modelname='Model', fsz=(5,5)):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Bars are placed in ascending order of importance and labelled with the
    matching entries of ``features``.

    Parameters
    ----------
    model : object exposing a ``feature_importances_`` array.
    features : sequence of labels, indexable by feature position.
    modelname : text used as the prefix of the figure title.
    fsz : figure size passed to ``plt.figure(figsize=...)``.
    """
    scores = model.feature_importances_
    order = np.argsort(scores)
    positions = range(len(order))
    labels = [features[pos] for pos in order]

    plt.figure(figsize=fsz)
    plt.title('{} Feature Importances'.format(modelname))
    plt.barh(positions, scores[order], color='b', align='center')
    plt.yticks(positions, labels)
    plt.xlabel('Relative Importance')
    plt.show()

In [None]:
names = iris.feature_names

In [None]:
plot_importances(model, names, modelname='Decision Tree', fsz=(4,3))