<a href="https://colab.research.google.com/github/okayode/HIP-IMO_23/blob/main/feat_sel_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz


# Feature Selection

In [15]:
def draw_tree(x_train,y_train,x_test,y_test,feat_labels,target_labels,filename):
    """Fit a decision tree, report feature importances, and render the tree.

    A ``DecisionTreeClassifier`` (``max_depth=None``, fixed
    ``random_state=1234``) is fitted on the training data. Each
    ``(feature label, importance)`` pair is printed, the fitted tree is
    exported to DOT text and rendered through graphviz using ``filename``.

    Parameters
    ----------
    x_train, y_train : training features and targets passed to ``fit``.
    x_test : features passed to ``predict`` after fitting.
    y_test : test targets; returned unchanged next to the predictions.
    feat_labels : display names paired with ``feature_importances_`` and
        passed to the exporter as ``feature_names``.
    target_labels : display names passed to the exporter as ``class_names``.
    filename : name handed to ``graphviz.Source.render``.

    Returns
    -------
    tuple
        ``(predictions for x_test, y_test)``.
    """
    classifier = DecisionTreeClassifier(max_depth=None, random_state=1234)
    classifier.fit(x_train, y_train)

    # One printed tuple per feature: (label, importance score).
    importances = classifier.feature_importances_
    for label_and_score in zip(feat_labels, importances):
        print(label_and_score)

    dot_source = export_graphviz(
        classifier,
        out_file=None,  # return the DOT text instead of writing it to a file
        feature_names=feat_labels,
        class_names=target_labels,
        leaves_parallel=False,
        rounded=True,
        filled=False,
    )

    # Swap the font name inside the DOT text before rendering.
    dot_source = dot_source.replace('helvetica', 'Microsoft JhengHei')
    rendered = graphviz.Source(dot_source)
    rendered.render(filename)

    predictions = classifier.predict(x_test)
    return predictions, y_test

In [16]:
## Build the small demonstration data set: three features plus the target.

data = dict(
    # surgeon identifier codes
    surgeon=[4, 4, 1, 1, 2, 1, 4, 2, 4, 4],
    # age at surgery
    AgeatSurgery=[68, 57, 61, 91, 79, 71, 73, 78, 68, 69],
    # body-mass index
    BMI=[21.05, 33.76, 26.28, 19.22, 26.71, 20.19, 33.95, 26.33, 21.32, 34.21],
    # target column: '1' = yes, '2' = no
    Target=[2, 1, 1, 1, 1, 2, 2, 2, 1, 1],
)

# Create the DataFrame (one row per observation) and display it.
df = pd.DataFrame(data)

df

Unnamed: 0,surgeon,AgeatSurgery,BMI,Target
0,4,68,21.05,2
1,4,57,33.76,1
2,1,61,26.28,1
3,1,91,19.22,1
4,2,79,26.71,1
5,1,71,20.19,2
6,4,73,33.95,2
7,2,78,26.33,2
8,4,68,21.32,1
9,4,69,34.21,1


In [17]:
# Features: first ten rows, every column except the last one.
X_3F = df.iloc[:10, :-1]
# Target: first ten rows of the last column, as a plain array.
y_3F = df.iloc[:10, -1].values

# Human-readable names used when printing importances and drawing trees.
feat_labels = ["Surgeon", "Age at Surgery", "BMI"]
# NOTE(review): presumably ordered to match target codes 1 (yes) and 2 (no) - confirm.
target_labels = ["Grew TILs", "No TILs"]


In [18]:
# Two stratified 70/30 splits of the same data; only the seed differs,
# so each split yields a different training sub-sample.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_3F,
    y_3F,
    test_size=0.30,
    random_state=1,
    stratify=y_3F,
)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_3F,
    y_3F,
    test_size=0.30,
    random_state=123,
    stratify=y_3F,
)

In [19]:
# Show the full feature matrix (every column of df except the last).
print(X_3F)

   surgeon  AgeatSurgery    BMI
0        4            68  21.05
1        4            57  33.76
2        1            61  26.28
3        1            91  19.22
4        2            79  26.71
5        1            71  20.19
6        4            73  33.95
7        2            78  26.33
8        4            68  21.32
9        4            69  34.21


In [20]:
# Show the target array (the last column of df).
print(y_3F)

[2 1 1 1 1 2 2 2 1 1]


In [21]:
# Show the training features from the first split (random_state=1).
print(X_train1)

   surgeon  AgeatSurgery    BMI
8        4            68  21.32
1        4            57  33.76
0        4            68  21.05
6        4            73  33.95
2        1            61  26.28
7        2            78  26.33
3        1            91  19.22


In [22]:
# Show the training targets from the first split (random_state=1).
print(y_train1)

[1 1 2 2 1 2 1]


In [23]:
# Tree 1: fit and draw a decision tree on the first training sub-sample.

feat_labels = ["Surgeon", "Age at Surgery", "BMI"]
draw_tree(
    x_train=X_train1,
    y_train=y_train1,
    x_test=X_test1,
    y_test=y_test1,
    feat_labels=feat_labels,
    target_labels=target_labels,
    filename="demo_tree_1",
)

('Surgeon', 0.3)
('Age at Surgery', 0.3111111111111111)
('BMI', 0.3888888888888889)


(array([2, 1, 1]), array([1, 1, 2]))

In [24]:
# X_train2 is a different sub-sample of X_3F (second split, random_state=123)

print(X_train2)

   surgeon  AgeatSurgery    BMI
0        4            68  21.05
1        4            57  33.76
8        4            68  21.32
7        2            78  26.33
2        1            61  26.28
6        4            73  33.95
4        2            79  26.71


In [25]:
# y_train2 is a different sub-sample of y_3F (second split, random_state=123)

print(y_train2)

[2 1 1 2 1 2 1]


In [26]:
# Tree 2: same procedure, but fitted on the second training sub-sample.
# Using a different sub-sample (X_train2 in place of X_train1), we obtain
# different feature importance scores (FIS) for each feature.

feat_labels = ["Surgeon", "Age at Surgery", "BMI"]
draw_tree(
    x_train=X_train2,
    y_train=y_train2,
    x_test=X_test2,
    y_test=y_test2,
    feat_labels=feat_labels,
    target_labels=target_labels,
    filename="demo_tree_2",
)

('Surgeon', 0.0)
('Age at Surgery', 0.5625)
('BMI', 0.4375)


(array([2, 1, 2]), array([2, 1, 1]))

# Exercises

## 1. Generate a new tree using a new sub-sample from `X_3F` and `y_3F`, as done above.

## 2. Explain the role of setting 'random_state = #num'

## 3. Explain the role of 'stratify'

## 4. I mentioned to you that the random forest algorithm aggregates the FIS from multiple decision trees. Can you use the sklearn class `RandomForestClassifier` (see the link below for its documentation page) to build a function similar to `draw_tree` (see above)? Name this function `draw_forest`, and use 500 decision trees in it.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier