In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
# Load iris with as_frame=True and build the working DataFrame from its feature table.
dataset = load_iris(as_frame=True)
df = pd.DataFrame(data=dataset.data)

# Preview the first rows; at this point df holds only the feature columns.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Add the integer class code and its human-readable name to the dataframe.
# enumerate() pairs each class code with the name at the same position in
# dataset.target_names, so the mapping no longer depends on the iteration
# order of a set built from the target column.
target_zip = dict(enumerate(dataset.target_names))
df["target"] = dataset.target
df["target_names"] = df["target"].map(target_zip)

# Two columns were added, so the frame is now (n_rows, n_features + 2).
print(df.shape)
df.head()

(150, 6)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [5]:
# Features: every column except the two label columns added to df earlier.
# Selecting by name keeps this correct even if columns are reordered or added.
x = df.drop(columns=["target", "target_names"])
# Labels: the class name (string) for each row.
y = df["target_names"]

In [6]:
# First ten labels, by position.
y.iloc[:10]

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
5    setosa
6    setosa
7    setosa
8    setosa
9    setosa
Name: target_names, dtype: object

In [9]:
# Split into training (75%) and test sets; rows are shuffled with a fixed seed
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, shuffle=True, random_state=24)

# Sanity check: features and labels must have the same number of rows per split.
print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)

print("x_test shape: ", x_test.shape)
print("y_test shape: ", y_test.shape)

x_train shape:  (112, 4)
y_train shape:  (112,)
x_test shape:  (38, 4)
y_test shape:  (38,)


In [12]:
# First ten training labels, by position (the index is shuffled by the split).
y_train.iloc[:10]

53     versicolor
58     versicolor
95     versicolor
22         setosa
15         setosa
20         setosa
69     versicolor
141     virginica
88     versicolor
37         setosa
Name: target_names, dtype: object

In [13]:
def partition(data, column, value):
    """
    Split the rows of `data` with the decision rule `data[column] <= value`.

    Inputs
    data: DataFrame holding the rows to split
    column: name of the column the rule is tested on
    value: threshold the column is compared against

    Outputs
    left: index labels of the rows where the rule is True (column <= value)
    right: index labels of the rows where column > value
    """

    feature = data[column]

    # Each side is selected with its own comparison, so a row for which both
    # comparisons are False ends up in neither side.
    goes_left = feature <= value
    goes_right = feature > value

    return data.loc[goes_left].index, data.loc[goes_right].index

In [14]:
# Try one candidate rule on the training features.
left_index, right_index = partition(data=x_train, column="petal length (cm)", value=2.45)

print("petal length (cm) <= 2.45")
# Number of training rows on the True (left) and False (right) side of the rule.
print(left_index.shape)
print(right_index.shape)

petal length (cm) <= 2.45
(38,)
(74,)


In [15]:
# Class distribution ({label: count}) on each side of the split.
# np.unique(..., return_counts=True) returns (labels, counts); unpacking that
# pair straight into zip() needs one np.unique call per side instead of two.
left_idx = dict(zip(*np.unique(y_train.loc[left_index], return_counts=True)))
right_idx = dict(zip(*np.unique(y_train.loc[right_index], return_counts=True)))

print(f"left index: {left_idx}")
print(f"right index: {right_idx}")

left index: {np.str_('setosa'): np.int64(38)}
right index: {np.str_('versicolor'): np.int64(42), np.str_('virginica'): np.int64(32)}


In [16]:
def gini_impurity(label, label_index):
    """
    A measure of how often a randomly chosen element from the set would be incorrectly labelled
    if it was randomly labelled according to the distribution of labels in the subset.

    Computed as 1 - sum(p_i ** 2), where p_i is the fraction of the selected
    rows that belong to class i.

    Inputs
    label: Series of class labels (looked up with .loc)
    label_index: index labels of the rows that belong to the current node

    Outputs
    impurity: The gini impurity of the node. 0.0 when all selected rows share
              one class. If label_index selects no rows, nothing is subtracted
              and 1.0 is returned.
    """

    # Only the per-class counts are needed; the label values themselves are not.
    _, class_counts = np.unique(label.loc[label_index], return_counts=True)

    # The node size is the same for every class, so compute it once rather
    # than re-summing the counts on every loop iteration.
    n_samples = class_counts.sum()

    impurity = 1.0
    for class_count in class_counts:
        p_i = class_count / n_samples
        impurity -= p_i ** 2

    return impurity

In [17]:
# Gini impurity over every training label, i.e. before any split is made.
impurity = gini_impurity(label=y_train, label_index=y_train.index)
impurity

np.float64(0.6626275510204082)

In [23]:
def information_gain(label, left_index, right_index, impurity):
    """
    Reduction in gini impurity obtained by splitting a node into a left and a right child.

    The gain is the impurity of the node minus the impurities of the two
    children, each weighted by the fraction of the node's rows it receives.

    Inputs
    label: Series of class labels
    left_index: index labels of the rows that met the condition of the current node
    right_index: index labels of the rows that failed to meet the condition
    impurity: the gini impurity of the current node before the split

    Outputs
    info_gain: Information Gain at the node
    """

    n_left = len(left_index)
    n_total = n_left + len(right_index)

    # Fraction of the node's rows that go to the left child.
    p = float(n_left) / n_total

    left_impurity = gini_impurity(label, left_index)
    right_impurity = gini_impurity(label, right_index)

    return impurity - p * left_impurity - (1 - p) * right_impurity

In [24]:
# Gain of the candidate split computed above, measured against the root impurity.
info_gain = information_gain(
    label=y_train,
    left_index=left_index,
    right_index=right_index,
    impurity=impurity,
)
info_gain

np.float64(0.33830322669608387)

In [25]:
def find_best_split(df, label, index):
    """
    Find the column and threshold whose `column <= value` rule gives the
    highest information gain for the rows at the current node.

    Input
    df: training data (feature DataFrame)
    label: target label (Series looked up with .loc)
    index: index labels of the rows that belong to the current node

    Output:
    best_gain: max information gain (0 if no split improves on the node)
    best_col: the column that produced best information gain (None if no split was found)
    best_value: the value of the column that produced best information gain
                (None if no split was found)

    Columns are visited in DataFrame order and candidate values in ascending
    order; a later candidate replaces the current best only when its gain is
    strictly higher, so ties go to the first column and smallest value.
    """

    best_gain = 0
    best_col = None
    best_value = None

    node_data = df.loc[index]  # restrict the features to the rows at this node
    label_index = label.loc[index].index  # get the index of the labels

    impurity = gini_impurity(label, label_index)  # determine the impurity at the current node

    for col in node_data.columns:
        # Test each distinct value once (no point testing the same value twice).
        # Sorting fixes the visiting order; iterating a bare set would make
        # the winner among equal-gain candidates depend on hash-table layout.
        candidate_values = sorted(set(node_data[col]))

        # Partition the data into True (left_index) and False (right_index) for each value.
        for value in candidate_values:
            left_index, right_index = partition(node_data, col, value)

            # Skip rules that put every row on one side: they do not split the node.
            if len(left_index) == 0 or len(right_index) == 0:
                continue

            info_gain = information_gain(label, left_index, right_index, impurity)

            if info_gain > best_gain:
                best_gain, best_col, best_value = info_gain, col, value

    return best_gain, best_col, best_value

In [26]:
# Best first split over all training rows: (gain, column, threshold).
find_best_split(df=x_train, label=y_train, index=y_train.index)

(np.float64(0.33830322669608387), 'petal length (cm)', 1.9)

In [27]:
# helper function to count values 
def count(label, index):
    """
    Tally how many of the selected rows carry each label.

    Input
    label: target labels (Series looked up with .loc)
    index: index labels of the rows to count

    Output
    dict_label_count: Dictionary mapping each distinct label to its count
    """

    # np.unique returns (labels, counts); unpack the pair directly into zip().
    labels_and_counts = np.unique(label.loc[index], return_counts=True)

    return dict(zip(*labels_and_counts))

In [28]:
# Class distribution of the full training set.
count(label=y_train, index=y_train.index)

{np.str_('setosa'): np.int64(38),
 np.str_('versicolor'): np.int64(42),
 np.str_('virginica'): np.int64(32)}