### ID3 Algorithm

In [57]:
import numpy as np
import pandas as pd 
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix

#### Entropy:
#### $H(S) = -\sum_{i=1}^{c} p_i \log_2(p_i)$

In [58]:
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    The result is ``-sum(p_i * log2(p_i))`` where ``p_i`` is the relative
    frequency of each distinct value found in ``target_col``.
    """
    # Only the per-class counts are needed; the distinct labels are discarded.
    _, counts = np.unique(target_col, return_counts=True)
    probabilities = counts / len(target_col)
    return -(probabilities * np.log2(probabilities)).sum()

#### Information Gain = Entropy(Parent) - Weighted average entropy(Children)

In [59]:
def InfoGain(data, split_attribute_name, target_name="class"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the ``target_name`` column before the split
    minus the size-weighted entropy of that column within each group of rows
    sharing the same value of ``split_attribute_name``.
    """
    parent_entropy = entropy(data[target_name])

    # Distinct attribute values and how many rows carry each of them.
    attribute_values, value_counts = np.unique(
        data[split_attribute_name], return_counts=True
    )
    total_rows = np.sum(value_counts)

    weighted_child_entropies = []
    for value, count in zip(attribute_values, value_counts):
        child_labels = data[data[split_attribute_name] == value][target_name]
        weighted_child_entropies.append((count / total_rows) * entropy(child_labels))

    return parent_entropy - np.sum(weighted_child_entropies)

In [60]:
def _majority_class(labels):
    """Return the most frequent value in ``labels`` (smallest value wins ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def ID3(data, originaldata, features, target_attribute_name="class", parent_node_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        data: DataFrame holding the rows that reach the current node.
        originaldata: DataFrame used to pick a label when ``data`` is empty
            (recursive calls pass the parent node's rows).
        features: sequence of column labels still available for splitting.
        target_attribute_name: name of the label column.
        parent_node_class: majority label of the parent node; returned when
            no features are left to split on.

    Returns:
        A label (leaf), or ``{feature: {value: subtree_or_label, ...}}`` with
        one branch per distinct value of the chosen feature in ``data``.
    """
    # The empty check has to come first: an empty frame has zero distinct
    # labels, so testing "at most one distinct label" first would try to
    # index an empty array and raise IndexError.
    if len(data) == 0:
        return _majority_class(originaldata[target_attribute_name])

    labels = data[target_attribute_name]
    distinct_labels = np.unique(labels)

    # Pure node: every remaining row carries the same label.
    if len(distinct_labels) == 1:
        return distinct_labels[0]

    # No feature left to split on: fall back to the parent's majority label.
    if len(features) == 0:
        return parent_node_class

    # Majority label of this node, handed to the children as their fallback.
    parent_node_class = _majority_class(labels)

    # Pick the feature with the largest information gain (first one on ties).
    gains = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    tree = {best_feature: {}}

    remaining_features = [feature for feature in features if feature != best_feature]

    # Grow one subtree per distinct value of the chosen feature.
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = ID3(
            sub_data, data, remaining_features, target_attribute_name, parent_node_class
        )

    return tree

### Loading and splitting the data

In [107]:
def load_and_prepare_data(test_size=0.2, random_state=100):
    """Load the Iris dataset, split it into train/test parts and print their shapes.

    Args:
        test_size: forwarded to ``train_test_split``; the default ``0.2``
            reproduces the previously hard-coded split.
        random_state: seed forwarded to ``train_test_split``; the default
            ``100`` reproduces the previously hard-coded seed.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target

    print('Dimension of X: ', X.shape)
    print('Dimension of y: ', y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print('Dimension of X_train: ', X_train.shape)
    print('Dimension of X_test: ', X_test.shape)
    print('Dimension of y_train: ', y_train.shape)
    print('Dimension of y_test: ', y_test.shape)

    return X_train, X_test, y_train, y_test

### Training and prediction of ID3 Algorithm

In [108]:
# Train the ID3 model on the Iris dataset
def train_id3_model(X_train, y_train, features):
    """Fit an ID3 tree on the given training samples and labels.

    The samples and labels are joined column-wise into one DataFrame whose
    label column is named ``"class"``; that frame is passed to ``ID3`` both as
    the data to split and as the fallback data.
    """
    feature_frame = pd.DataFrame(X_train)
    label_column = pd.Series(y_train, name="class")
    training_frame = pd.concat([feature_frame, label_column], axis=1)
    return ID3(
        training_frame,
        training_frame,
        features,
        target_attribute_name="class",
        parent_node_class=None,
    )


# Predict the class of samples using the trained ID3 model
def predict_id3(tree, instance, default=None):
    """Classify one sample by walking a tree produced by ``ID3``.

    Args:
        tree: either a label (leaf) or a nested dictionary of the form
            ``{feature: {value: subtree_or_label, ...}}``.
        instance: object indexable by the feature keys stored in the tree.
        default: label returned when the sample's value for a feature has no
            branch in the tree. When omitted, the function keeps its previous
            behaviour and uses the most frequent label of the notebook-level
            ``y_train``.

    Returns:
        The label stored at the leaf that was reached, or the fallback label.
    """
    node = tree
    while isinstance(node, dict):
        # Each internal node has a single key: the feature it splits on.
        feature = next(iter(node))
        branches = node[feature]
        value = instance[feature]
        if value not in branches:
            if default is not None:
                return default
            # Legacy fallback: depends on a global `y_train` being defined.
            # Pass `default` explicitly to avoid this hidden dependency.
            return np.argmax(np.bincount(y_train))
        node = branches[value]
    return node

### Evaluation metrics of ID3 Algorithm

In [109]:
def evaluate_id3_model(tree, X_test, y_test):
    """Predict every row of ``X_test`` with the ID3 tree and print test metrics.

    Prints the shape of the prediction array, the confusion matrix, and the
    accuracy plus macro-averaged precision, recall and F1 score as percentages
    rounded to two decimals. Nothing is returned.
    """
    predicted_labels = []
    for sample in X_test:
        predicted_labels.append(predict_id3(tree, sample))
    y_test_pred = np.array(predicted_labels)

    accuracy = accuracy_score(y_test, y_test_pred)
    precision, recall, f1 = [
        scorer(y_test, y_test_pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    confusion_mat = confusion_matrix(y_test, y_test_pred)

    print('Dimension of y_test_pred: ', y_test_pred.shape)
    print('-------------------------------------')
    print("Confusion Matrix: \n", confusion_mat)

    report_rows = (
        ("ID3 Test Accuracy:", accuracy),
        ("ID3 Test Precision:", precision),
        ("ID3 Test Recall:", recall),
        ("ID3 Test F1 Score:", f1),
    )
    for label, score in report_rows:
        print(label, round(score * 100, 2), "%")
    
# Train and evaluate ID3 model.
# NOTE: `y_train` has to remain a notebook-level name: predict_id3 reads it as
# a global when a test sample has a feature value with no branch in the tree.
X_train, X_test, y_train, y_test = load_and_prepare_data()
# Features are identified by integer column position (0 .. n_features - 1).
id3_tree = train_id3_model(X_train, y_train, list(range(X_train.shape[1])))
evaluate_id3_model(id3_tree, X_test, y_test)

Dimension of X:  (150, 4)
Dimension of y:  (150,)
Dimension of X_train:  (120, 4)
Dimension of X_test:  (30, 4)
Dimension of y_train:  (120,)
Dimension of y_test:  (30,)
Dimension of y_test_pred:  (30,)
-------------------------------------
Confusion Matrix: 
 [[11  0  0]
 [ 0  5  1]
 [ 0  3 10]]
ID3 Test Accuracy: 86.67 %
ID3 Test Precision: 84.47 %
ID3 Test Recall: 86.75 %
ID3 Test F1 Score: 84.92 %


## C4.5 Algorithm

In [110]:
def entropy(target_col):
    """Compute the Shannon entropy (base 2) of the values in ``target_col``.

    Each distinct value contributes ``-p * log2(p)``, with ``p`` being the
    share of entries equal to that value.

    Note: this redefines the identical ``entropy`` helper declared earlier in
    the notebook for the ID3 section.
    """
    class_frequencies = np.unique(target_col, return_counts=True)[1]
    class_probabilities = class_frequencies / len(target_col)
    weighted_log_terms = class_probabilities * np.log2(class_probabilities)
    return -np.sum(weighted_log_terms)


def info_gain(data, split_attribute_name, target_name="class"):
    """Compute the information gain of splitting ``data`` on one attribute.

    This is the entropy of the ``target_name`` column over all rows, minus the
    entropy of that column inside each group of rows sharing a value of
    ``split_attribute_name``, each weighted by the group's share of the rows.
    """
    entropy_before_split = entropy(data[target_name])

    split_column = data[split_attribute_name]
    distinct_values, occurrences = np.unique(split_column, return_counts=True)
    row_total = np.sum(occurrences)

    # One weighted entropy term per distinct value of the split attribute.
    terms = []
    for position, value in enumerate(distinct_values):
        subset_labels = data[data[split_attribute_name] == value][target_name]
        weight = occurrences[position] / row_total
        terms.append(weight * entropy(subset_labels))
    entropy_after_split = np.sum(terms)

    return entropy_before_split - entropy_after_split

#### Gain ratio = Information gain / Split information

In [111]:
def gain_ratio(data, split_attribute_name, target_name="class"):
    """Return the gain ratio of splitting ``data`` on ``split_attribute_name``.

    The gain ratio is the information gain divided by the split information,
    i.e. the entropy of the split attribute's own values. When the split
    information is zero, ``0`` is returned instead of dividing.
    """
    information_gain = info_gain(data, split_attribute_name, target_name)
    split_information = entropy(data[split_attribute_name])

    # Guard against division by zero (attribute with a single distinct value).
    if split_information == 0:
        return 0
    return information_gain / split_information


def best_split(data, features, target_name="class"):
    """Return the entry of ``features`` with the highest gain ratio on ``data``.

    When several features share the highest ratio, the earliest one in
    ``features`` is returned.
    """
    ratios = []
    for feature in features:
        ratios.append(gain_ratio(data, feature, target_name))
    return features[np.argmax(ratios)]

In [112]:
def _most_common_label(labels):
    """Return the most frequent value in ``labels`` (smallest value wins ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def C45(data, features, target_name="class", parent_node_class=None):
    """Recursively build a gain-ratio decision tree as nested dictionaries.

    Args:
        data: DataFrame holding the rows that reach the current node.
        features: sequence of column labels still available for splitting.
        target_name: name of the label column.
        parent_node_class: fallback label inherited from the parent node. It
            may be a single label or a collection of labels; for empty
            ``data`` the most frequent one is returned.

    Returns:
        A label (leaf), or ``{feature: {value: subtree_or_label, ...}}`` with
        one branch per distinct value of the chosen feature in ``data``.
        ``None`` is returned only when a fallback is needed and no
        ``parent_node_class`` was supplied.
    """
    # The empty check has to come first: an empty frame has zero distinct
    # labels, so testing "at most one distinct label" first would try to
    # index an empty array and raise IndexError.
    if len(data) == 0:
        if parent_node_class is None:
            return None
        return _most_common_label(parent_node_class)

    labels = data[target_name]
    distinct_labels = np.unique(labels)

    # Pure node: every remaining row carries the same label.
    if len(distinct_labels) == 1:
        return distinct_labels[0]

    # No feature left to split on: use the label inherited from the parent.
    if len(features) == 0:
        return parent_node_class

    # Majority label of this node. It is handed to the children so that their
    # "no features left" leaves get a real label instead of the initial None.
    node_majority = _most_common_label(labels)

    # Select the best feature to split on based on the gain ratio.
    best_feature = best_split(data, features, target_name)
    tree = {best_feature: {}}

    # Remove the selected feature from the feature space of the subtrees.
    remaining_features = [f for f in features if f != best_feature]

    # Grow one subtree per distinct value of the selected feature.
    for val in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == val]
        tree[best_feature][val] = C45(sub_data, remaining_features, target_name, node_majority)

    return tree

### Loading and splitting the data

In [113]:
def load_and_prepare_data(test_size=0.2, random_state=100):
    """Load the Iris dataset, split it into train/test parts and print their shapes.

    Note: this definition replaces the ``load_and_prepare_data`` declared
    earlier in the notebook for the ID3 section.

    Args:
        test_size: forwarded to ``train_test_split``; the default ``0.2``
            reproduces the previously hard-coded split.
        random_state: seed forwarded to ``train_test_split``; the default
            ``100`` reproduces the previously hard-coded seed.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target

    print('Dimension of X: ', X.shape)
    print('Dimension of y: ', y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print('Dimension of X_train: ', X_train.shape)
    print('Dimension of X_test: ', X_test.shape)
    print('Dimension of y_train: ', y_train.shape)
    print('Dimension of y_test: ', y_test.shape)

    return X_train, X_test, y_train, y_test

### Training and prediction of C4.5 Algorithm

In [114]:
# Train the C4.5 model on the Iris dataset
def train_C45_model(X_train, y_train, features):
    """Fit a C4.5-style tree on the given training samples and labels.

    The samples and labels are joined column-wise into one DataFrame whose
    label column is named ``"class"``, which is then handed to ``C45``.
    """
    columns_to_join = [pd.DataFrame(X_train), pd.Series(y_train, name="class")]
    training_frame = pd.concat(columns_to_join, axis=1)
    return C45(training_frame, features, target_name="class")

# Predict the class of samples using the trained C4.5 model
def predict_C45(tree, instance, default=None):
    """Classify one sample by walking a tree produced by ``C45``.

    Args:
        tree: either a label (leaf) or a nested dictionary of the form
            ``{feature: {value: subtree_or_label, ...}}``.
        instance: object indexable by the feature keys stored in the tree.
        default: label returned when the sample's value for a feature has no
            branch in the tree. When omitted, the function keeps its previous
            behaviour and uses the most frequent label of the notebook-level
            ``y_train``.

    Returns:
        The label stored at the leaf that was reached, or the fallback label.
    """
    node = tree
    while isinstance(node, dict):
        # Each internal node has a single key: the feature it splits on.
        feature = next(iter(node))
        branches = node[feature]
        value = instance[feature]
        if value not in branches:
            if default is not None:
                return default
            # Legacy fallback: depends on a global `y_train` being defined.
            # Pass `default` explicitly to avoid this hidden dependency.
            return np.argmax(np.bincount(y_train))
        node = branches[value]
    return node

### Evaluation metrics of C4.5 Algorithm

In [115]:
def evaluate_C45_model(tree, X_test, y_test):
    """Predict every row of ``X_test`` with the C4.5 tree and print test metrics.

    Prints the shape of the prediction array, the confusion matrix, and the
    accuracy plus macro-averaged precision, recall and F1 score as percentages
    rounded to two decimals. Nothing is returned.
    """
    y_test_pred = np.array(list(map(lambda sample: predict_C45(tree, sample), X_test)))

    def as_percent(score):
        # Metrics are fractions in [0, 1]; report them as rounded percentages.
        return round(score * 100, 2)

    accuracy = accuracy_score(y_test, y_test_pred)
    macro_kwargs = {"average": 'macro'}
    precision = precision_score(y_test, y_test_pred, **macro_kwargs)
    recall = recall_score(y_test, y_test_pred, **macro_kwargs)
    f1 = f1_score(y_test, y_test_pred, **macro_kwargs)
    confusion_mat = confusion_matrix(y_test, y_test_pred)

    print('Dimension of y_test_pred: ', y_test_pred.shape)
    print('-------------------------------------')
    print("Confusion Matrix: \n", confusion_mat)
    print("C4.5 Test Accuracy:", as_percent(accuracy), "%")
    print("C4.5 Test Precision:", as_percent(precision), "%")
    print("C4.5 Test Recall:", as_percent(recall), "%")
    print("C4.5 Test F1 Score:", as_percent(f1), "%")
    

# Load and prepare the dataset.
# NOTE: `y_train` has to remain a notebook-level name: predict_C45 reads it as
# a global when a test sample has a feature value with no branch in the tree.
X_train, X_test, y_train, y_test = load_and_prepare_data()

# Use every feature column, identified by integer position (0 .. n_features - 1)
features = list(range(X_train.shape[1]))

# Train the C4.5 model
c45_tree = train_C45_model(X_train, y_train, features)

# Predict on the test split and print the evaluation metrics
evaluate_C45_model(c45_tree, X_test, y_test)

Dimension of X:  (150, 4)
Dimension of y:  (150,)
Dimension of X_train:  (120, 4)
Dimension of X_test:  (30, 4)
Dimension of y_train:  (120,)
Dimension of y_test:  (30,)
Dimension of y_test_pred:  (30,)
-------------------------------------
Confusion Matrix: 
 [[11  0  0]
 [ 0  5  1]
 [ 0  2 11]]
C4.5 Test Accuracy: 90.0 %
C4.5 Test Precision: 87.7 %
C4.5 Test Recall: 89.32 %
C4.5 Test F1 Score: 88.31 %
