### Import Necessary Libraries 

In [2]:
import graphviz
import pydotplus
import numpy as np
import pandas as pd
import seaborn as sns
from six import StringIO
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from decisiontree import DecisionTree__Classifier

### Load the Iris Dataset 

The famous **Iris** dataset consists of 150 samples of iris flowers divided into three species: **Setosa**, **Versicolor**, and **Virginica**.  

The dataset contains four features (attributes) for each sample:
- **Sepal length**, **Sepal width**, **Petal length**, and **Petal width**

The **target** variable (class label) is the species of the iris flower: Label **0** for **Setosa**, **1** for **Versicolor**, and **2** for **Virginica**.

Here is a picture of the three Iris flower species.
<img src="Iris.png" alt="Iris Dataset Screenshot" width="600" height="400"/>

Here is a dataset sample.
<img src="iris2.png" alt="Iris Dataset Screenshot" width="600" height="400"/>

In [3]:
# Keep the full dataset object around: its feature_names / target_names are
# used later to label the exported tree.
iris = load_iris()
X, y = iris.data, iris.target

In [4]:
# Hold out half of the samples for testing; the fixed random_state keeps the
# split (and therefore the accuracies reported below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=40,
)

### Function to create Decision Tree 

In [5]:
def train_and_visualize_and_test(criterion, max_depth = None, min_samples_split = 2, min_samples_leaf = 1):
    """Fit a decision tree on the training split, print its accuracy, and return its graph.

    The data is not passed in: this function reads the notebook-level globals
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``iris``, so the cells
    that load and split the dataset must have been run first.

    Parameters
    ----------
    criterion
        Split criterion forwarded to ``DecisionTree__Classifier``
        (this notebook uses ``'entropy'`` and ``'gini'``).
    max_depth, min_samples_split, min_samples_leaf
        Forwarded unchanged to ``DecisionTree__Classifier``.

    Returns
    -------
    graphviz.Source
        Wraps whatever ``export__graphviz`` returns (expected to be DOT
        source), labelled with the Iris feature and class names.

    Side effects
    ------------
    Prints the training and test accuracy to stdout.
    """
    # Build and fit the classifier with the requested hyper-parameters.
    model = DecisionTree__Classifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
    )
    model.fit(X_train, y_train)

    # Predict on both splits so training and held-out accuracy can be compared.
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    train_score = accuracy_score(y_train, train_predictions)
    test_score = accuracy_score(y_test, test_predictions)

    print(f"Training accuracy: {train_score}")
    print(f"Test accuracy: {test_score}")

    # Export the fitted tree and hand it to graphviz for rendering by the caller.
    tree_dot = model.export__graphviz(
        feature_names=iris.feature_names,
        class_names=iris.target_names,
    )
    return graphviz.Source(tree_dot)

### Overfit Decision Tree 

In [6]:
# Entropy criterion with the helper's default limits (max_depth=None,
# min_samples_split=2, min_samples_leaf=1); the rendered tree is written
# under Temp/.
graph_entropy_en_overfit = train_and_visualize_and_test(
    criterion='entropy',
)
graph_entropy_en_overfit.render("Temp/iris_entropy_overfit")

Training accuracy: 1.0
Test accuracy: 0.9466666666666667


'Temp\\iris_entropy_overfit.pdf'

In [7]:
# Gini criterion with the helper's default limits.
# NOTE: the variable name says "entropy", but this tree is trained with Gini.
graph_entropy_g_overfit = train_and_visualize_and_test(
    criterion='gini',
)
graph_entropy_g_overfit.render("Temp/iris_gini_overfit")

Training accuracy: 1.0
Test accuracy: 0.9333333333333333


'Temp\\iris_gini_overfit.pdf'

### Underfit Decision Tree 

In [8]:
# Entropy criterion with the tree capped at max_depth=1 to demonstrate underfitting.
graph_entropy_en_underfit = train_and_visualize_and_test(
    criterion='entropy',
    max_depth=1,
)
graph_entropy_en_underfit.render("Temp/iris_entropy_underfit")

Training accuracy: 0.68
Test accuracy: 0.6533333333333333


'Temp\\iris_entropy_underfit.pdf'

In [9]:
# Gini criterion with the tree capped at max_depth=1 to demonstrate underfitting.
# NOTE: the variable name says "entropy", but this tree is trained with Gini.
graph_entropy_g_underfit = train_and_visualize_and_test(
    criterion='gini',
    max_depth=1,
)
graph_entropy_g_underfit.render("Temp/iris_gini_underfit")

Training accuracy: 0.68
Test accuracy: 0.6533333333333333


'Temp\\iris_gini_underfit.pdf'

### Reduce Overfitting to improve the accuracy 

**Pruning**

In [10]:
# Pruning by limiting the tree to max_depth=4 (Gini criterion).
# NOTE: the variable name says "entropy", but this tree is trained with Gini.
graph_entropy_g_pruning_maxdepth = train_and_visualize_and_test(
    criterion='gini',
    max_depth=4,
)
graph_entropy_g_pruning_maxdepth.render("Temp/iris_gini_pruning_maxdepth")

Training accuracy: 0.9733333333333334
Test accuracy: 0.96


'Temp\\iris_gini_pruning_maxdepth.pdf'

In [11]:
# Pruning by requiring a minimum number of samples at each leaf node
# (min_samples_leaf=4, Gini criterion).
# NOTE: the variable name says "entropy", but this tree is trained with Gini.
graph_entropy_g_pruning_min_samples_leaf = train_and_visualize_and_test(
    criterion='gini',
    min_samples_leaf=4,
)
graph_entropy_g_pruning_min_samples_leaf.render("Temp/iris_gini_pruning_min_samples_leaf")

Training accuracy: 0.9466666666666667
Test accuracy: 0.9733333333333334


'Temp\\iris_gini_pruning_min_samples_leaf.pdf'

In [12]:
# Pruning by requiring a minimum number of samples before an internal node
# may be split (min_samples_split=4, Gini criterion).
# NOTE: the variable name says "entropy", but this tree is trained with Gini.
graph_entropy_g_pruning_min_samples_split = train_and_visualize_and_test(
    criterion='gini',
    min_samples_split=4,
)
graph_entropy_g_pruning_min_samples_split.render("Temp/iris_gini_pruning_min_samples_split")

Training accuracy: 0.9733333333333334
Test accuracy: 0.96


'Temp\\iris_gini_pruning_min_samples_split.pdf'