# Partition of feature space by random forest

In [5]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import load_boston, load_iris, load_wine, load_digits, \
                             load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score

import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
#%config InlineBackend.figure_format = 'svg'
%config InlineBackend.figure_format = 'retina'

from rfpimp import *   # pip install rfpimp

from sklearn import tree

import dtreeviz
print(dtreeviz.__version__)
from dtreeviz.trees import *
from dtreeviz import clfviz_bivar

1.3


## Decision tree and random forest for wine data set

In [None]:
# Load the wine data and keep only two columns so the feature space is 2D.
wine = load_wine()
X = wine.data[:, [12, 6]]  # columns 12 and 6 (labelled 'proline'/'flavanoid' below)
y = wine.target
len(X), len(y)  # not the cell's last expression, so nothing is displayed

# Background palettes indexed by number of classes (entries 0 and 1 are unused).
colors = {
    'classes': [
        None,                                # 0 classes
        None,                                # 1 class
        ["#FEFEBB", "#a1dab4"],              # 2 classes
        ["#FEFEBB", "#D9E6F5", '#a1dab4'],   # 3
    ]
}

# Labels handed to the bivariate tree visualization.
feature_c_bivar, target_c_bivar, class_name_bivar = (
    ['proline', 'flavanoid'],
    "wine",
    list(wine.target_names),
)
feature_c_bivar, target_c_bivar, class_name_bivar

In [None]:
# Depth-2 tree on the two wine features, then draw its partition of feature space.
dtc_bivar = DecisionTreeClassifier(max_depth=2).fit(X, y)

ctreeviz_bivar(
    dtc_bivar, X, y,
    feature_names=feature_c_bivar,
    target_name=target_c_bivar,
    class_names=class_name_bivar,
    show={'splits', 'legend'},
    colors={'scatter_edge': 'black'},
)

In [None]:
# Fit a 50-tree forest on the two wine features and draw it twice, side by side:
# once with show_proba=True and once with show_proba=False.
rf = RandomForestClassifier(n_estimators=50, min_samples_leaf=2, n_jobs=-1).fit(X, y)

fig, axes = plt.subplots(1, 2, figsize=(8, 3.8), dpi=300)
panels = [(axes[0], .9, True), (axes[1], .85, False)]
for panel_ax, fraction, proba in panels:
    clfviz_bivar(rf, X, y, ntiles=50, ax=panel_ax, tile_fraction=fraction,
                 show_proba=proba, feature_names=['proline', 'flavanoid'])
plt.show()

In [None]:
# Small single-axes figure of the same wine forest on a coarser 20-tile grid.
fig, ax = plt.subplots(figsize=(3.5, 3))
crfviz_bivar(rf, X, y,
             ntiles=20,
             ax=ax,
             tile_fraction=.95,
             feature_names=['proline', 'flavanoid'])

In [None]:
import pltvid

# Animate how a decision tree's partition of the wine feature space changes
# as max_depth grows from 1 to max_depth.
dpi = 300
camera = pltvid.Capture(dpi=dpi)
max_depth = 10  # deepest tree to draw; not named `max` so the builtin max() stays usable
for depth in range(1, max_depth + 1):
    t = DecisionTreeClassifier(max_depth=depth)
    t.fit(X, y)

    fig, ax = plt.subplots(1, 1, figsize=(4, 3.5), dpi=dpi)
    ctreeviz_bivar(t, X, y,
                   feature_names=['proline', 'flavanoid'], target_name="wine",
                   show={'splits'},
                   colors={'scatter_edge': 'black',
                           'tesselation_alpha': .4},
                   ax=ax)
    plt.title(f"Wine tree depth {depth}")
    plt.tight_layout()
    if depth >= max_depth:
        # NOTE(review): presumably holds the final frame longer -- confirm against pltvid
        camera.snap(8)
    else:
        camera.snap()

camera.save("/tmp/wine-dtree-maxdepth.png", duration=500)

## Titanic

In [None]:
# Read the Titanic passengers and encode Sex as 0 for 'male', 1 for anything else.
df = pd.read_csv("../data/titanic/titanic.csv")
is_male = df['Sex'].eq('male')
df['Sex'] = np.where(is_male, 0, 1)
df.head(2).transpose()

In [None]:
# Target is Survived; drop it and the free-text/categorical columns from the features.
y = df['Survived']
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked'])
# Record which ages were missing before filling them with the median age
# (median() skips NaN by default).
X['Age_na'] = X['Age'].isna()
X['Age'] = X['Age'].fillna(X['Age'].median())

In [None]:
# Fit a forest on all remaining Titanic features and plot rfpimp's importances for it.
rf = RandomForestClassifier(n_estimators=20, min_samples_leaf=3, n_jobs=-1).fit(X, y)
I = importances(rf, X, y)
plot_importances(I)

In [None]:
# Narrow the features to Age and Fare so the forest can be drawn in 2D.
X = X.loc[:, ['Age', 'Fare']]
rf = RandomForestClassifier(n_jobs=-1, n_estimators=10)
rf.fit(X, y)

In [None]:
# Overlay the partitions of every tree in the Age/Fare forest.
# The target here is the Titanic 'Survived' column (not "wine", which was a
# leftover from the wine section above).
ctreeviz_bivar_fusion(rf.estimators_, X, y,
                      feature_names=['Age','Fare'], target_name="Survived",
                      class_names=None,
                      alpha=.1)

In [None]:
# Draw the Age/Fare forest; the features are passed as a raw array (X.values)
# rather than as the DataFrame.
crfviz_bivar(rf, X.values, y, ntiles=50, feature_names=['Age','Fare'])

## Cancer

In [None]:
# Put the breast-cancer features in a frame with generic column names f0, f1, ...
# and the target in column 'y'.
cancer = load_breast_cancer()

df = pd.DataFrame(
    cancer.data,
    columns=[f'f{i}' for i in range(cancer.data.shape[1])],
)
df['y'] = cancer.target
df.head(3)

In [None]:
# Split the frame into the feature columns and the target column.
X = df.drop(columns='y')
y = df['y']

In [None]:
# Fit a forest on every cancer feature and keep the first five rows of the
# importances table returned by rfpimp.
rf = RandomForestClassifier(n_estimators=20, min_samples_leaf=3, n_jobs=-1).fit(X, y)

I = importances(rf, X, y)[:5]

In [None]:
# Plot the five importance rows kept in I by the previous cell.
# NOTE(review): presumably these are the five largest, if importances() returns
# rows sorted by importance -- confirm against rfpimp.
plot_importances(I)

In [None]:
# Keep only features f27 and f22 so the forest can be drawn in 2D.
X = df.loc[:, ['f27', 'f22']]

rf = RandomForestClassifier(n_jobs=-1, n_estimators=30, min_samples_leaf=5)
rf.fit(X, y)

In [None]:
# Overlay the partitions of every tree in the f27/f22 forest.
# load_breast_cancer encodes target 0 as malignant and 1 as benign, so the
# name for class 0 must be the cancer label and class 1 the non-cancer label.
# NOTE(review): assumes class_names is indexed by class value -- confirm against dtreeviz.
ctreeviz_bivar_fusion(rf.estimators_, X, y,
                      feature_names=['f27', 'f22'], target_name="cancer",
                      class_names=['cancer','not-cancer'],
                      alpha=.05)

In [None]:
# Draw the f27/f22 forest; the features are passed as a raw array (X.values)
# rather than as the DataFrame.
crfviz_bivar(rf, X.values, y, feature_names=['f27', 'f22'], ntiles=40)

## Synthetic data sets

In [None]:
def smiley(n = 1000, seed=None):
    """
    Generate a synthetic 2-feature, 4-class "smiley face" data set.

    Each class contributes n points, stacked in class order:

      class 0 (mouth):        x1 ~ N(1.0, .2),  x2 ~ N(0.4, .05)
      class 1 (left eye):     x1 ~ N(0.7, .2),  x2 = x1 + .3 + N(0, .1)
      class 2 (right eye):    x1 ~ N(1.3, .2),  x2 = x1 - .5 + .3 + N(0, .1)
      class 3 (face outline): x1 evenly spaced on [0, 2],
                              x2 = (x1 - 1)**2 + N(0, .1)

    Parameters
    ----------
    n : int
        Number of points per class; the result has 4*n rows.
    seed : int or None
        If None (the default) samples come from numpy's global random state,
        so np.random.seed() still controls the output. Otherwise a private
        np.random.RandomState(seed) is used and the global state is untouched.

    Returns
    -------
    pandas.DataFrame with columns 'x1', 'x2', 'class'. Because the labels are
    stacked next to float features with np.hstack, 'class' holds the floats
    0.0 through 3.0.
    """
    # Both the np.random module and a RandomState instance expose normal().
    rng = np.random if seed is None else np.random.RandomState(seed)

    def normal_col(mean, std):
        # n normal samples as an (n, 1) column
        return rng.normal(mean, std, n).reshape(-1, 1)

    def labeled(x1, x2, label):
        # (n, 3) array: the two feature columns plus a constant class column
        cl = np.full(shape=(n, 1), fill_value=label, dtype=int)
        return np.hstack([x1, x2, cl])

    parts = []

    # mouth
    x1 = normal_col(1.0, .2)
    x2 = normal_col(0.4, .05)
    parts.append(labeled(x1, x2, 0))

    # left eye
    x1 = normal_col(.7, .2)
    x2 = x1 + .3 + normal_col(0, .1)
    parts.append(labeled(x1, x2, 1))

    # right eye
    x1 = normal_col(1.3, .2)
    x2 = x1 - .5 + .3 + normal_col(0, .1)
    parts.append(labeled(x1, x2, 2))

    # face outline: a noisy parabola over evenly spaced x1
    noise = normal_col(0, .1)
    x1 = np.linspace(0, 2, n).reshape(-1, 1)
    x2 = (x1 - 1)**2 + noise
    parts.append(labeled(x1, x2, 3))

    return pd.DataFrame(np.vstack(parts), columns=['x1','x2','class'])

Check that we get 4 classes arranged as a smiley face:

In [None]:
# Scatter the synthetic points, colored by their class label.
df = smiley(n=150)
plt.scatter(df['x1'], df['x2'], s=3, c=df['class'])

In [None]:
# Fit a 10-tree forest to a fresh smiley sample and draw it on a 7x6 figure.
df = smiley(n=300)
X, y = df[['x1', 'x2']], df['class']
rf = RandomForestClassifier(n_estimators=10, min_samples_leaf=1, n_jobs=-1).fit(X, y)

fig, ax = plt.subplots(figsize=(7, 6))
crfviz_bivar(rf, X.values, y,
             feature_names=['x1', 'x2'],
             ntiles=70, dot_w=15, ax=ax)

In [None]:
# Depth-4 decision tree on the same smiley sample, drawn with its split lines.
t = DecisionTreeClassifier(max_depth=4).fit(X, y)
ctreeviz_bivar(
    t, X, y,
    feature_names=['x1', 'x2'],
    target_name="class",
    show={'splits'},
    colors={'scatter_edge': 'black'},
)

### Animate num trees in RF

In [None]:
import pltvid

# Animate how the random forest's partition changes with the number of trees.
df = smiley(n=100)
X = df[['x1','x2']]
y = df['class']
# (No forest is fit here: the loop builds a fresh one for every frame.)

dpi = 300
camera = pltvid.Capture(dpi=dpi)
max_trees = 100  # largest forest to draw; not named `max` so the builtin max() stays usable
# 1..9 trees one at a time, then 10..max_trees in steps of 5
tree_sizes = [*range(1, 10), *range(10, max_trees + 1, 5)]
for nt in tree_sizes:
    np.random.seed(1) # use same bagging sets for animation
    rf = RandomForestClassifier(n_estimators=nt, min_samples_leaf=1, n_jobs=-1)
    rf.fit(X, y)

    fig, ax = plt.subplots(1, 1, figsize=(3, 2.8), dpi=dpi)
    crfviz_bivar(rf, X.values, y, feature_names=['x1', 'x2'],
                 ntiles=70, dot_w=15, boundary_markersize=.4, ax=ax)
    plt.title(f"Synthetic dataset, {nt} trees")
    plt.tight_layout()
    if nt >= tree_sizes[-1]:
        # NOTE(review): presumably holds the final frame longer -- confirm against pltvid
        camera.snap(5)
    else:
        camera.snap()

camera.save("/tmp/smiley-numtrees.png", duration=500)

### Animate decision tree max depth

In [None]:
import pltvid

# Animate how a decision tree's partition of the smiley data changes as
# max_depth grows from 1 to max_depth.
df = smiley(n=100) # more stark changes with fewer
X = df[['x1','x2']]
y = df['class']

dpi = 300
camera = pltvid.Capture(dpi=dpi)
max_depth = 10  # deepest tree to draw; not named `max` so the builtin max() stays usable
for depth in range(1, max_depth + 1):
    t = DecisionTreeClassifier(max_depth=depth)
    t.fit(X, y)

    fig, ax = plt.subplots(1, 1, figsize=(4, 3.5), dpi=dpi)
    ctreeviz_bivar(t, X, y,
                   feature_names=['x1', 'x2'], target_name="class",
                   show={'splits'},
                   colors={'scatter_edge': 'black',
                           'tesselation_alpha': .6},
                   ax=ax)
    plt.title(f"Synthetic dataset, tree depth {depth}")
    plt.tight_layout()
    if depth >= max_depth:
        # NOTE(review): presumably holds the final frame longer -- confirm against pltvid
        camera.snap(8)
    else:
        camera.snap()

camera.save("/tmp/smiley-dtree-maxdepth.png", duration=500)

### Animate decision tree min samples per leaf

In [None]:
import pltvid

# Animate how a decision tree's partition of the smiley data changes as
# min_samples_leaf grows from 1 to max_leaf_size.
df = smiley(n=100)
X = df[['x1','x2']]
y = df['class']

dpi = 300
camera = pltvid.Capture(dpi=dpi)
max_leaf_size = 20  # largest min_samples_leaf to draw; not named `max` so the builtin max() stays usable
for leafsz in range(1, max_leaf_size + 1):
    t = DecisionTreeClassifier(min_samples_leaf=leafsz)
    t.fit(X, y)

    fig, ax = plt.subplots(1, 1, figsize=(4, 3.5), dpi=dpi)
    ctreeviz_bivar(t, X, y,
                   feature_names=['x1', 'x2'], target_name="class",
                   show={'splits'},
                   colors={'scatter_edge': 'black',
                           'tesselation_alpha': .4},
                   ax=ax)
    plt.title(f"Synthetic dataset, {leafsz} samples/leaf")
    plt.tight_layout()
    if leafsz >= max_leaf_size:
        # NOTE(review): presumably holds the final frame longer -- confirm against pltvid
        camera.snap(8)
    else:
        camera.snap()

camera.save("/tmp/smiley-dtree-minsamplesleaf.png", duration=500)