# Desion Tree Classifiers

In [1]:
import numpy as no
import pandas as pd
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('iris.csv')
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Train/Test Sets

In [3]:
train, test = train_test_split(df, test_size = 0.2)

# train sets
train_x = train.drop('Name', 1)
train_y = train['Name']

# test sets
test_x = test.drop('Name', 1)
test_y = test['Name']

# Decision Tree

In [4]:
# model
tree = DecisionTreeClassifier(criterion = "entropy")

# train
tree.fit(train_x, train_y)

# predict
tree_predictions = tree.predict(test_x)

In [9]:
cross_val_score(tree, df.drop('Name', 1), df['Name'], cv = 5)

array([ 0.96666667,  0.96666667,  0.9       ,  0.93333333,  1.        ])

### Feature Importances

The ".feature\_importances\_" attribute of the DecisionTreeClassifier() object gives us the information gain of each attribute as a measure of importance.

In [5]:
# I put the attribute names and their respective information gains in a data frame for readability.
pd.DataFrame({'Gain': tree.feature_importances_}, index = train_x.columns).sort_values('Gain', ascending = False)

Unnamed: 0,Gain
PetalWidth,0.943897
PetalLength,0.04156
SepalLength,0.014543
SepalWidth,0.0


### Confusion Matrix

A confusion matrix is a good way to check the accuracy of your model and to see in what ways your model may be predicting incorrectly.

We do this ising the Pandas crosstab() functionâ€¦

In [6]:
pd.crosstab(test_y, tree_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

Predicted:,Iris-setosa,Iris-versicolor,Iris-virginica,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,15,0,0,15
Iris-versicolor,0,8,1,9
Iris-virginica,0,0,6,6
All,15,8,7,30


# Bagging

![](bagging.png)

In [7]:
# model
bag = BaggingClassifier()

# train
bag.fit(train_x, train_y)

# predict
bag_predictions = bag.predict(test_x)

# confusion matrix
pd.crosstab(test_y, bag_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

Predicted:,Iris-setosa,Iris-versicolor,Iris-virginica,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,15,0,0,15
Iris-versicolor,0,8,1,9
Iris-virginica,0,0,6,6
All,15,8,7,30


# Random Forest

In [12]:
# model
forest = RandomForestClassifier(criterion = 'entropy')

# train
forest.fit(train_x, train_y)

# predict
forest_predictions = forest.predict(test_x)

# feature importances
print(pd.DataFrame({'Importance': forest.feature_importances_}, index = train_x.columns).sort_values('Importance', ascending = False))

# confusion matrix
pd.crosstab(test_y, forest_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

             Importance
PetalWidth     0.496852
PetalLength    0.377932
SepalLength    0.104263
SepalWidth     0.020954


Predicted:,Iris-setosa,Iris-versicolor,Iris-virginica,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,15,0,0,15
Iris-versicolor,0,8,1,9
Iris-virginica,0,0,6,6
All,15,8,7,30


# Out of Bag Error

In [9]:
# model
oob_forest = RandomForestClassifier(criterion = 'entropy', oob_score = True, n_estimators = 500)

# train
oob_forest.fit(df.drop('Name', 1), df['Name'])

# Out of bag score
oob_forest.oob_score_

0.95999999999999996