# Decision Trees

In [None]:
!pip install scikit-learn==0.23.2

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

## Loading data

In [None]:
# Load the Iris dataset. as_frame=True requests pandas objects, which the
# later cells rely on (.iloc, .head(), .value_counts()).
iris_dataset = load_iris(as_frame=True)

In [None]:
# List the fields exposed by the loaded dataset object.
iris_dataset.keys()

## Exploring

In [None]:
# Feature table of the dataset; the trailing expression displays its
# (rows, columns) dimensions.
X = iris_dataset.data
X.shape

In [None]:
# Preview the first rows of the feature table.
X.head()

In [None]:
# Summary statistics for each feature column.
X.describe()

In [None]:
# Class labels of the dataset; the trailing expression displays how many
# labels there are.
y = iris_dataset.target
y.shape

In [None]:
# Preview the first labels.
y.head()

In [None]:
# Number of samples per class label.
y.value_counts()

In [None]:
# Human-readable class names; later cells index this by the integer label.
iris_dataset.target_names

## Visualization

In [None]:
# Scatter of sepal length (column 0) against petal width (column 3) for every
# sample in the dataset, coloured by class label.
sepal_length, petal_width = X.iloc[:, 0], X.iloc[:, 3]

# Axis limits: the data range padded by half a unit on each side.
x_min = sepal_length.min() - .5
x_max = sepal_length.max() + .5
y_min = petal_width.min() - .5
y_max = petal_width.max() + .5

# Reuse figure number 2 and wipe anything previously drawn on it.
plt.figure(2, figsize=(11, 7))
plt.clf()

plt.scatter(
    sepal_length,
    petal_width,
    c=y,
    cmap=plt.cm.Set1,
    edgecolor='k',
)

# Horizontal axis: label, limits, and no tick marks.
plt.xlabel('Sepal length')
plt.xlim(x_min, x_max)
plt.xticks(())

# Vertical axis: same treatment. The trailing semicolon suppresses the
# notebook's echo of the last call's return value.
plt.ylabel('Petal width')
plt.ylim(y_min, y_max)
plt.yticks(());

## Training a decision tree

### Only two features

In [None]:
# Split using only columns 0 and 3 (sepal length, petal width): 70% of the
# samples go to training, class proportions are preserved via stratification,
# and the shuffle is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[:, [0, 3]],
    y,
    train_size=0.7,
    stratify=y,
    random_state=0,
)

In [None]:
# Preview the first rows of the two-feature training set.
X_train.head()

In [None]:
# Dimensions of the training set.
X_train.shape

In [None]:
# Dimensions of the held-out test set.
X_test.shape

In [None]:
# Entropy-based tree capped at depth 3. random_state is pinned because
# scikit-learn permutes the candidate features at every split, so equally good
# splits could otherwise be resolved differently from run to run.
model = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)

In [None]:
# Fit the tree on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test samples.
y_pred = model.predict(X_test)

In [None]:
# Compare the predictions against the true test labels.
accuracy_score(y_test, y_pred)

In [None]:
# Draw the fitted two-feature tree. The model only saw the columns held in
# X_train, and plot_tree looks names up by the model's feature index, so the
# names must come from those columns. Passing the dataset's full feature-name
# list would label the second feature (petal width) as "sepal width (cm)".
fig, ax = plt.subplots(figsize=(12, 12))
plot_tree(model, ax=ax, feature_names=list(X_train.columns), filled=True);

In [None]:
# Visualise the region the two-feature tree assigns to each class and overlay
# the training samples on top.

# Parameters
n_classes = 3
plot_colors = "ryb"  # one marker colour per class, in label order
plot_step = 0.02     # spacing of the grid used to sample the feature plane

fig, ax = plt.subplots(figsize=(11, 7))

# Build a dense grid covering the data range, padded by 1 on each side.
sepal_length = X.iloc[:, 0]
petal_width = X.iloc[:, 3]
x_min, x_max = sepal_length.min() - 1, sepal_length.max() + 1
y_min, y_max = petal_width.min() - 1, petal_width.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

# Predict a class for every grid point. The grid is wrapped in a DataFrame
# carrying the training column names so the input has the same layout as the
# data the model was fitted on.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train.columns)
Z = model.predict(grid).reshape(xx.shape)
cs = ax.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

ax.set_xlabel(iris_dataset.feature_names[0])
ax.set_ylabel(iris_dataset.feature_names[3])

# Plot the training points, one scatter call per class so that each class gets
# its own legend entry. No cmap is passed: with an explicit colour string for
# `c`, a colormap has nothing to map and would be ignored.
for i, color in zip(range(n_classes), plot_colors):
    class_points = X_train.loc[y_train == i]
    ax.scatter(class_points.iloc[:, 0], class_points.iloc[:, 1], c=color,
               label=iris_dataset.target_names[i], edgecolor='black', s=60)

# The scatter labels only become visible once a legend is requested.
ax.legend()

# Adjust the layout last so it accounts for the axis labels drawn above.
fig.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

### Training with all features

In [None]:
# Split again, this time keeping every feature column: 70% of the samples go
# to training, class proportions are preserved via stratification, and the
# shuffle is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=0,
)

In [None]:
# Preview the first rows of the all-feature training set.
X_train.head()

In [None]:
# Dimensions of the training set.
X_train.shape

In [None]:
# Dimensions of the held-out test set.
X_test.shape

In [None]:
# Entropy-based tree capped at depth 3. random_state is pinned because
# scikit-learn permutes the candidate features at every split, so equally good
# splits could otherwise be resolved differently from run to run.
model = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)

In [None]:
# Fit the tree on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test samples.
y_pred = model.predict(X_test)

In [None]:
# Compare the predictions against the true test labels.
accuracy_score(y_test, y_pred)

In [None]:
# Render the tree trained on the complete feature set, labelling split nodes
# with the dataset's feature names and shading the nodes.
fig, ax = plt.subplots(figsize=(12, 12))
plot_tree(
    model,
    feature_names=iris_dataset.feature_names,
    filled=True,
    ax=ax,
);