___
<h1> Machine Learning </h1>
<h2> M. Sc. in Electrical and Computer Engineering </h2>
<h3> Instituto Superior de Engenharia / Universidade do Algarve </h3>

[MEEC](https://ise.ualg.pt/en/curso/1477) / [ISE](https://ise.ualg.pt) / [UAlg](https://www.ualg.pt)

Pedro J. S. Cardoso (pcardoso@ualg.pt)
___

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedShuffleSplit

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

from sklearn import tree

# Decision Trees
## Classification

Let us start, again, with the iris dataset. Following the same flow, it is now easy to train a Decision Tree Model

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [None]:
# Load the iris data and keep 75% of the samples for training (seeded split).
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    train_size=0.75,
    random_state=42,
)

# Seeded decision tree classifier, fitted on the training split only.
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

With a perfect score on training

In [None]:
# Score the fitted classifier on the same samples it was trained on.
score = dtc.score(X=X_train, y=y_train)
score

and also in test...!

In [None]:
# Score the fitted classifier on the held-out test samples.
score = dtc.score(X=X_test, y=y_test)
score

After this we can plot the tree

In [None]:
# Large square canvas for the classifier's tree diagram.
fig, axes = plt.subplots(figsize=(30, 30))

# Draw the fitted tree, labelling nodes with the iris feature and class names.
tree.plot_tree(
    dtc,
    filled=True,
    class_names=iris.target_names,
    feature_names=iris.feature_names,
)

Use the tree to predict some of the test examples...

In [None]:
# Predict the class of the first five held-out samples with the fitted tree.
# The original cell printed iris.target_names[:5], which is only the list of
# class names and involves no prediction at all.
sample = X_test[:5]
predicted = dtc.predict(sample)

# Show the feature names, the feature values, and the predicted vs. actual
# species names (class indices are mapped through iris.target_names).
print(iris.feature_names, '\n', sample, '\n',
      'predicted:', iris.target_names[predicted], '\n',
      'actual:', iris.target_names[y_test[:5]])

If we do some cross validation
(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html) using a stratified shuffle split (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html), the result is not so perfect!

In the first example, the minimum number of samples required to be at a leaf node is 1.

In [None]:
# Ten seeded stratified shuffle splits.
skf = StratifiedShuffleSplit(n_splits=10, random_state=1)

# Seeded tree; no leaf-size constraint is passed in this experiment.
clf = DecisionTreeClassifier(random_state=1)

# One score per split, computed over the full iris data.
scores = cross_val_score(clf, iris.data, iris.target, cv=skf)

print(f'scores:{scores}\nmu:{scores.mean()}')

In the second example, we increase the minimum number of samples required to be at a leaf node to 3, giving a slight increase in performance

In [None]:
# Same seeded tree, but every leaf must now hold at least 3 samples.
clf = DecisionTreeClassifier(min_samples_leaf=3, random_state=1)

# Uses the splitter already bound to `skf`.
scores = cross_val_score(estimator=clf, X=iris.data, y=iris.target, cv=skf)

print(f'scores:{scores}\nmu:{scores.mean()}')

## Regression
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

In this example we'll use the Boston dataset.

In [None]:
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# installed version still provides it before running this cell.
boston = load_boston()

X, y = boston.data, boston.target

# Shuffled, seeded split that holds out 10% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=42
)

Now, define the regressor and do the training

In [None]:
# Seeded regression tree, fitted on the training rows.
dtr = DecisionTreeRegressor(random_state=0).fit(X=X_train, y=y_train)

Finally, we score it

In [None]:
# Score the fitted regression tree on the held-out rows.
score = dtr.score(X=X_test, y=y_test)
score

In [None]:
# Predict the held-out targets with the fitted regression tree.
y_pred = dtr.predict(X_test)

plt.figure(figsize=(15, 10))

# True values (blue), predictions (green) and absolute error per sample (red).
plt.plot(y_test, c='b')
plt.plot(y_pred, c='g')
plt.plot(np.abs(y_pred - y_test), c='r')

# Raw string: "\D" and "\h" are not valid escapes in a normal string literal.
# The raw form yields the same text without the invalid-escape warning.
plt.legend(["test", "pred", r"$\Delta = |y_i-\hat{y_i}|$"])

# Actually call show(); the bare name `plt.show` only evaluates to the
# function object and renders nothing by itself.
plt.show()

In [None]:
# Very wide canvas (800 x 100 inches) for the regression tree diagram.
fig, axes = plt.subplots(figsize=(800, 100))

# Draw the fitted regressor, labelling splits with the Boston feature names.
tree.plot_tree(
    dtr,
    fontsize=12,
    filled=True,
    feature_names=boston.feature_names,
)

To be more sure, let us do a cross validation experiment

In [None]:
# Save the figure currently bound to `fig` as tree.png, using the figure's own DPI.
fig.savefig("tree.png", dpi=fig.dpi)

In [None]:
# Five shuffled, seeded folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Seeded regression tree whose leaves must keep at least 3 samples.
reg = DecisionTreeRegressor(min_samples_leaf=3, random_state=1)

# One score per fold over the full Boston data.
scores = cross_val_score(reg, boston.data, boston.target, cv=kfold)

print(f'scores:{scores}\nmu:{scores.mean()}')

## Slide's example

In [None]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt


# Sample sin(x) at n evenly spaced points on the interval [a, b].
n = 25
a = 0
b = 10
x = np.linspace(start=a, stop=b, num=n)
y = np.sin(x)

# Show the samples as individual points.
plt.plot(x, y, ".")

In [None]:
# Fit a regression tree on the samples; x is passed as a single-feature column.
model = DecisionTreeRegressor(min_samples_leaf=1)
model.fit(x[:, np.newaxis], y)

# Evaluate the fitted tree on a denser grid of 100 points over [a, b].
x_hat = np.linspace(a, b, 100)[:, np.newaxis]
y_hat = model.predict(x_hat)

# Original samples as points, tree approximation as a line.
plt.plot(x, y, ".")
plt.plot(x_hat, y_hat)

plt.title("Decision regression tree (min_samples_leaf=1)")
plt.legend(["data", "approx."])
plt.xlabel('data')
plt.ylabel('target')

In [None]:
# Same experiment, but each leaf must now contain at least 2 samples.
model = DecisionTreeRegressor(min_samples_leaf=2)
model.fit(x[:, np.newaxis], y)

# Evaluate the fitted tree on a denser grid of 100 points over [a, b].
x_hat = np.linspace(a, b, 100)[:, np.newaxis]
y_hat = model.predict(x_hat)

# Original samples as points, tree approximation as a line.
plt.plot(x, y, ".")
plt.plot(x_hat, y_hat)

plt.title("Decision regression tree (min_samples_leaf=2)")
plt.legend(["data", "approx."])
plt.xlabel('data')
plt.ylabel('target')

In [None]:
# Wide canvas for the diagram of whichever estimator `model` currently holds.
fig, axes = plt.subplots(figsize=(30, 10))

# The single input feature is labelled "x" in the drawn nodes.
tree.plot_tree(model, fontsize=12, filled=True, feature_names=["x"])

plt.show()

# Random Forest
## Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Reload iris and rebuild the seeded 75/25 train/test split.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    train_size=.75,
    random_state=42,
)

# Forest of 100 trees, each limited to 5 leaf nodes, built on all cores.
# random_state is set so repeated runs give the same forest, matching the
# seeded estimators used everywhere else in this notebook.
rfc = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    min_samples_leaf=1,
    max_leaf_nodes=5,
    random_state=42,
).fit(X_train, y_train)

In [None]:
# score() yields one value for the whole training split, not a collection of
# scores, so there is nothing to average: report the single value.
score = rfc.score(X_train, y_train)
print(f'score:{score}')

In [None]:
# score() yields one value for the whole test split, not a collection of
# scores, so there is nothing to average: report the single value.
score = rfc.score(X_test, y_test)
print(f'score:{score}')

In [None]:
# Five seeded stratified shuffle splits.
sss = StratifiedShuffleSplit(
    n_splits=5,
    random_state=42,
)

# Smaller forest (20 trees) with no cap on the number of leaf nodes.
# random_state is set so the cross-validation scores are reproducible, as
# with the other seeded estimators in this notebook.
rfc = RandomForestClassifier(
    n_estimators=20,
    n_jobs=-1,
    min_samples_leaf=1,
    random_state=42,
)

# One score per split over the full iris data.
scores = cross_val_score(
    rfc,
    iris.data,
    iris.target,
    cv=sss,
)

print(f'scores:{scores}\nmu:{scores.mean()}')

## Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# installed version still provides it before running this cell.
boston = load_boston()

X, y = boston.data, boston.target

# Shuffled, seeded split that holds out 10% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=1
)

# Seeded forest of 10 regression trees, built on all cores.
rfr = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=0)
rfr.fit(X_train, y_train)

# Predictions for the held-out rows, then the forest's score on them.
y_pred = rfr.predict(X_test)

score = rfr.score(X_test, y_test)
score

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=(15, 10))

# True values (blue), predictions (green) and absolute error per sample (red).
plt.plot(y_test, c='b')
plt.plot(y_pred, c='g')
plt.plot(np.abs(y_pred - y_test), c='r')

# Raw string: "\D" and "\h" are not valid escapes in a normal string literal.
# The raw form yields the same text without the invalid-escape warning.
plt.legend(["test", "pred", r"$\Delta = |y_i-\hat{y_i}|$"])

# Actually call show(); the bare name `plt.show` only evaluates to the
# function object and renders nothing by itself.
plt.show()

In [None]:
# Five shuffled, seeded folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# One score per fold for the forest regressor over the full Boston data.
scores = cross_val_score(
    estimator=rfr,
    X=boston.data,
    y=boston.target,
    cv=kfold,
)
scores

In [None]:
# Mean of the values currently held in `scores`.
scores.mean()