# Tutorial

> Using `Optimizer` to post-process a Random Forest classifier and extract an optimal actionable plan that changes a given input to a desired class at minimum cost.

**Dataset** [Breast Cancer Wisconsin (Original)](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original))

In [None]:
#hide
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to library code take effect without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import re
import IPython, graphviz

import numpy as np
import pandas as pd

from oae.core import *
from oae.tree import *
from oae.optimizer import *
from sklearn.tree import export_graphviz

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, roc_auc_score

# Single seed reused for NumPy's global RNG, the train/test split and the forest,
# so the numbers recorded in this notebook are reproducible.
SEED = 41
np.random.seed(seed=SEED)

In [None]:
# The UCI file ships without a header row, so the column names are supplied here:
# a sample identifier, nine feature columns and the class label.
columns = [
    'code_number',
    'clump_thickness',
    'cell_size_uniformity',
    'cell_shape_uniformity',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'target',
]
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    header=None,
    names=columns,
)
data.head()

Unnamed: 0,code_number,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,target
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [None]:
# Encode the raw class labels as consecutive integer codes. Codes follow order of
# first appearance, so the label of the first row (2) becomes 0; `lbl_map` keeps
# the original labels so codes can be mapped back.
lbls, lbl_map = pd.factorize(data.target)

In [None]:
# `bare_nuclei` is read as strings because some rows contain '?' (presumably the
# dataset's missing-value marker). Encode those as -1 so the column can be cast
# to integers.
# - regex=False: '?' is a regex metacharacter, so match it literally regardless of
#   the pandas version's default for `regex`.
# - builtin `int` instead of `np.int`: the alias was deprecated in NumPy 1.20 and
#   removed in 1.24; both map to the same integer dtype.
data = data.assign(bare_nuclei=data.bare_nuclei.str.replace('?', '-1', regex=False).astype(int))
# Swap the raw class labels for the integer codes produced by pd.factorize.
data = data.assign(target=lbls); data.head()

Unnamed: 0,code_number,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,target
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [None]:
# Relative frequency of each encoded class, to see how imbalanced the data is.
data['target'].value_counts(normalize=True)

0    0.655222
1    0.344778
Name: target, dtype: float64

In [None]:
# Drop the first column (sample identifier) and the last column (label);
# everything in between is a model feature.
features = data.columns[1:-1]

# Hold out 20% of the rows for evaluation, seeded for reproducibility.
Xtr, Xte, ytr, yte = tts(
    data[features],
    data['target'],
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Confirm every feature column is integer-typed (bare_nuclei started out as strings).
Xtr.dtypes

clump_thickness                int64
cell_size_uniformity           int64
cell_shape_uniformity          int64
marginal_adhesion              int64
single_epithelial_cell_size    int64
bare_nuclei                    int64
bland_chromatin                int64
normal_nucleoli                int64
mitoses                        int64
dtype: object

We verify that our classifier is able to learn the characteristics of the `benign` and `malignant` classes.

In [None]:
# A small 10-tree forest, trained on all cores with a fixed seed.
clf = RandomForestClassifier(n_estimators=10, random_state=SEED, n_jobs=-1)
clf.fit(Xtr, ytr)

# `score` on a classifier is the mean accuracy of `predict` against the labels,
# i.e. the same number accuracy_score(y, clf.predict(X)) reports.
print(f'train accuracy: {clf.score(Xtr, ytr)}')
print(f'holdout accuracy: {clf.score(Xte, yte)}')

train accuracy: 0.998211091234347
holdout accuracy: 0.9714285714285714


Let's select an instance from the holdout set and look at its ground-truth label. The classifier marks it as `malignant`, and we want to know which features could be changed so that the classifier would mark it as `benign`.

In [None]:
# Positional index of the holdout row used for the rest of the tutorial.
instanceidx = 4
# Print the true label next to the forest's class probabilities for that row;
# the list index keeps the row as a one-row frame, as predict_proba expects 2-D input.
print(yte.iloc[instanceidx], ' ', clf.predict_proba(Xte.iloc[[instanceidx]]))

1   [[0. 1.]]


In [None]:
# Build the `oae` representation of the fitted forest from the classifier and the
# full feature matrix (all rows of `data`, not just the training split).
atm        = ATMSKLEARN(clf, data.loc[:, features].values)
# Wrap the selected holdout row; the second argument is a list of nine
# 'categorical' entries, presumably one declared type per feature — confirm in oae.
instance   = Instance(Xte.iloc[instanceidx], ['categorical'] * 9)
# Presumably the per-feature value partitions derived from the forest — confirm in oae.tree.
partitions = atm.v_i_j(instance)
# NOTE(review): `pi_t_k` is not used again in the visible cells.
pi_t_k     = atm.pi_t_k()


# NOTE(review): `combine` is not defined in this notebook; presumably it is
# provided by one of the `oae` star-imports — confirm.
h_t_k   = atm.h_t_k(combine, class_=0)
# Same row as `instance`, passed as a one-row 2-D array.
instance_phi_t_k = atm.phi_t_k(Xte.iloc[instanceidx:instanceidx+1].values)
w_t     = atm.calculate_tree_weights()

# Mask of `partitions` for the unmodified instance; the assertions in the next
# cell require each of its rows to sum to 1.
orig_mask  = atm.v_i_j_mask(partitions, instance)

In [None]:
# Every row returned by phi_t_k for this instance must sum to exactly 1.
assert all(sum(row) == 1 for row in atm.phi_t_k(Xte.iloc[instanceidx:instanceidx+1].values))

# The weighted sum of h * phi over all (t, k) entries must be (numerically) zero
# for this instance, which the forest assigns probability 0 for class 0.
tolerance = 1e-6
assert np.abs(np.sum([
    h_t_k[t][k] * instance_phi_t_k[t][k] * w_t[t]
    for t in range(len(h_t_k))
    for k in range(len(h_t_k[t]))
])) < tolerance

# Every row of the original-instance mask must also sum to exactly 1.
assert all(np.sum(mask_row) == 1 for mask_row in orig_mask)

In [None]:
# NOTE(review): `cost_matrix` and `combine` are not defined in this notebook;
# presumably they come from the `oae` star-imports — confirm.
# class_=0 is presumably the desired target class and z=0.45 a decision
# threshold — confirm in oae.optimizer.
opt = Optimizer(cost_matrix, combine, z=0.45, class_=0)
# Solve for the cheapest change plan; per the recorded output this prints the
# number of solutions and the objective value.
v_i_j_sol, phi_t_k_sol = opt.solve(atm, instance)

num_solutions: 4
objective value: 2.0


In [None]:
# Render the solution as one human-readable message per feature: either
# "no change" or the current value with the proposed replacement values.
atm.suggest_changes(v_i_j_sol, instance)

['no change, current value: 5',
 'no change, current value: 3',
 'no change, current value: 5',
 'no change, current value: 1',
 'no change, current value: 8',
 'current value: 10, proposed change: [-1, 1]',
 'current value: 5, proposed change: [3, 4]',
 'no change, current value: 3',
 'no change, current value: 1']

In [None]:
# Apply the solved plan to the instance to obtain the modified feature row.
X_transformed = atm.transform(v_i_j_sol, instance)
X_transformed

Unnamed: 0,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
0,5,3,5,1,8,-1,3,3,1


In [None]:
# Re-score the modified row with the original forest; in the recorded output
# class 0 now receives the majority of the probability (0.6 vs 0.4).
clf.predict_proba(X_transformed)

array([[0.6, 0.4]])

## Export

In [None]:
#hide
# Run nbdev's notebook export; the recorded output lists each notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_Representation.ipynb.
Converted 02_Optimizer.ipynb.
Converted 03_tutorial_breast_cancer.ipynb.
Converted index.ipynb.
