# Tree-based models for classification

## Decision trees

### Building a decision tree classification model

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Location of the iris CSV in the Explore-AI public data repository
IRIS_CSV_URL = 'https://raw.githubusercontent.com/Explore-AI/Public-Data/master/Data/classification_sprint/iris.csv'

# Download the file into a DataFrame, then preview its first rows
df = pd.read_csv(IRIS_CSV_URL)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


#### Preprocessing

In [3]:
# The species column is the label to predict; every other column is a feature
target_column = 'species'
X = df.drop(columns=[target_column])
y = df[target_column]

In [4]:
# Standardise the feature columns.
# Note: the scaling statistics are learned from every row of X, including rows
# that later end up in the test split.
standard_scaler = StandardScaler().fit(X)
X_transformed = standard_scaler.transform(X)

In [5]:
# Split into train and test sets.
# The split is made on the raw features so that the held-out rows are never
# seen by a fitted preprocessing step. X_transformed is deliberately not used
# here: its scaler was fitted on all rows, which leaks test-set statistics
# into the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=50
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits.
standard_scaler = StandardScaler().fit(X_train_raw)
X_train = standard_scaler.transform(X_train_raw)
X_test = standard_scaler.transform(X_test_raw)

#### Training

In [6]:
# Decision tree classifier with library-default hyperparameters
tree = DecisionTreeClassifier(
    random_state=42,  # fixed seed so that refitting gives the same tree
)

In [7]:
# Fit the tree on the training split (the fitted estimator is the cell output)
tree.fit(X=X_train, y=y_train)

#### Testing

In [8]:
# Predicted species for every row of the held-out split
y_pred = tree.predict(X=X_test)

In [9]:
# Number of test rows per species, most frequent first
y_test.value_counts(sort=True, ascending=False)

species
Iris-versicolor    17
Iris-setosa        14
Iris-virginica     14
Name: count, dtype: int64

In [10]:
# Species names, in the order used for both axes of the table
labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

# Pass labels= explicitly so the matrix rows/columns are guaranteed to follow
# the list above; without it the order is whatever confusion_matrix picks by
# default, and the hand-written axis names could silently mislabel the counts.
# Rows are the true species, columns are the predicted species.
tree_cm = confusion_matrix(y_test, y_pred, labels=labels)
pd.DataFrame(data=tree_cm, index=labels, columns=labels)

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,14,0,0
Iris-versicolor,0,16,1
Iris-virginica,0,1,13


In [11]:
# Per-class precision / recall / F1 for the decision tree.
# labels= and target_names= are given the same list so each display name is
# tied to its own class instead of relying on the default label ordering.
class_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
print(classification_report(y_test, y_pred, labels=class_names, target_names=class_names))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       0.94      0.94      0.94        17
 Iris-virginica       0.93      0.93      0.93        14

       accuracy                           0.96        45
      macro avg       0.96      0.96      0.96        45
   weighted avg       0.96      0.96      0.96        45



### Tuning parameters to improve the model

### Building a random forest classification model

In [12]:
from sklearn.ensemble import RandomForestClassifier

#### Training

In [13]:
# Ensemble of 100 trees; the fixed seed keeps repeated fits identical
forest = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
# Fit on the training split (the fitted estimator is the cell output)
forest.fit(X=X_train, y=y_train)

#### Testing

In [14]:
# Predicted species from the random forest for every held-out row
pred_forest = forest.predict(X=X_test)

In [15]:
# Species names, in the order used for both axes of the table
labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

# Pass labels= explicitly so the matrix rows/columns are guaranteed to follow
# the list above; without it the order is whatever confusion_matrix picks by
# default, and the hand-written axis names could silently mislabel the counts.
# Rows are the true species, columns are the predicted species.
forest_cm = confusion_matrix(y_test, pred_forest, labels=labels)
pd.DataFrame(data=forest_cm, index=labels, columns=labels)

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,14,0,0
Iris-versicolor,0,16,1
Iris-virginica,0,1,13


In [16]:
# Per-class precision / recall / F1 for the random forest.
# labels= and target_names= are given the same list so each display name is
# tied to its own class instead of relying on the default label ordering.
class_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
print(classification_report(y_test, pred_forest, labels=class_names, target_names=class_names))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       0.94      0.94      0.94        17
 Iris-virginica       0.93      0.93      0.93        14

       accuracy                           0.96        45
      macro avg       0.96      0.96      0.96        45
   weighted avg       0.96      0.96      0.96        45

