# Purpose 

the purpose of this notebook is to demonstrate the working of the from scratch implementation of a decision tree

In [2]:
from metrics import *

import pandas as pd
import numpy as np

In [3]:
from MakeDataset import X_train,X_test,y_train,y_test

Training data shape:  (126, 500, 3)
Testing data shape:  (54, 500, 3)


# metrics

## Accuracy 

In [4]:
def accuracy(y_hat: pd.Series, y: pd.Series) -> float:
    """
    Function to calculate the accuracy.
    """
    # Assert that y_hat and y have the same length
    assert y_hat.size == y.size, "Size of y_hat and y must be equal."

    
    numerator = (y_hat == y).sum()
    #print(f"Numerator (correct predictions): {numerator}")

   
    denominator = y.size
    #print(f"Denominator (total predictions): {denominator}")

    
    accuracy = numerator / denominator
    return accuracy * 100

In [5]:
# Define y_hat and y
y_hat = pd.Series(['yes', 1, 1,1])
y = pd.Series(['yes', 1, 1,1])

# Correctly compute accuracy using the function from mn
accuracy_result = accuracy(y_hat, y)

# Display the result
print(f"Accuracy: {accuracy_result}%")

Accuracy: 100.0%


In [6]:
'''
Variations of the problem:
1) Discrete I/P, Discrete O/P => classficiation 
2) Discrete I/P, Real O/P => regression 
3) Real I/P, Discrete O/P => classification
3) Real I/P, Real O/P => regression
'''

'\nVariations of the problem:\n1) Discrete I/P, Discrete O/P => classficiation \n2) Discrete I/P, Real O/P => regression \n3) Real I/P, Discrete O/P => classification\n3) Real I/P, Real O/P => regression\n'

## Precision

In [7]:
def precision(y_hat: pd.Series, y: pd.Series, cls: Union[int, str]) -> float:
    """
    Function to calculate the precision. Defined only for 1) and 3). 
    """
    true_positives = ((y_hat == cls) & (y == cls)).sum()
    
    # Calculate predicted positives
    gnd_positives = (y == cls).sum()
    
    return true_positives / gnd_positives if gnd_positives != 0 else 0.0

In [8]:
y_hat = pd.Series(['yes', 1, 1, 1])
y = pd.Series(['yes', 1, 0, 1])
precision(y_hat, y, 1)

1.0

## recall 

In [9]:
def recall(y_hat: pd.Series, y: pd.Series, cls: Union[int, str]) -> float:
    """
    Function to calculate the precision. Defined only for 1) and 3). 
    """
    true_positives = ((y_hat == cls) & (y == cls)).sum()
    
    # Calculate predicted positives
    gnd_positives = (y == cls).sum()
    
    return true_positives / gnd_positives if gnd_positives != 0 else 0.0


In [10]:
y_hat = pd.Series(['yes', 1, 1, 1])
y = pd.Series(['yes', 1, 0, 1])
recall(y_hat, y, 1)

1.0

## rmse 

In [11]:
def rmse(y_hat: pd.Series, y: pd.Series) -> float:
    """
    Function to calculate the root-mean-squared-error(rmse). Defined only for 2) and 4).
    """

    assert y_hat.size == y.size, "Size of y_hat and y must be equal."

    y_hat = np.array(y_hat)
    y = np.array(y)
    numerator = np.sum((y_hat-y)**2)
    denominator = y.size

    return np.sqrt(numerator/denominator)


In [12]:
y_hat =pd.Series([10,20])
y = pd.Series([20,30])

rmse(y_hat,y)

10.0

## mae

In [13]:
def mae(y_hat: pd.Series, y: pd.Series) -> float:
    """
    Function to calculate the mean-absolute-error(mae). Defined only for 2) and 4).
    """
    assert y_hat.size == y.size, "Size of y_hat and y must be equal."
    assert y.size, "Ground Truth array is 0"
    assert y_hat.size, "Predicition array is 0"


    y_hat = np.array(y_hat)
    y = np.array(y)
    numerator = np.sum((np.abs(y_hat-y)))

    denominator = y.size

    return numerator/denominator


In [14]:
y_hat =pd.Series([11,20])
y = pd.Series([20,30])

mae(y_hat,y)

9.5

# trying decision tree

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

In [16]:
import pandas as pd
import numpy as np

In [17]:
from tree.base import DecisionTree

In [18]:
# Real Input and Discrete Output

In [19]:
y

0    20
1    30
dtype: int64

In [20]:
#{'1_': {'attribute': 0.0, 'split_value': 0.16919126215699776, 'right_label': '1_1_', 'left_label': '1_2_'}, '1_1_': {'attribute': 0.0, 'split_value': -1.4394428450912664, 'right_label': '1_1_1_', 'left_label': '1_1_2_'}, '1_1_1_': 3, '1_1_2_': {'attribute': 0.0, 'split_value': -0.5320222864127546, 'right_label': '1_1_2_1_', 'left_label': '1_1_2_2_'}, '1_1_2_1_': {'attribute': 0.0, 'split_value': -0.6607754103120528, 'right_label': '1_1_2_1_1_', 'left_label': '1_1_2_1_2_'}, '1_1_2_1_1_': 0, '1_1_2_1_2_': {'attribute': 0.0, 'split_value': -0.5819970707351848, 'right_label': '1_1_2_1_2_1_', 'left_label': '1_1_2_1_2_2_'}, '1_1_2_1_2_1_': 2, '1_1_2_1_2_2_': 0, '1_1_2_2_': 3, '1_2_': 2}


In [21]:

N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randint(P, size=N), dtype="category")

for criteria in ["entropy"]:
    tree_object = DecisionTree(criterion=criteria)  # Split based on Inf. Gain
    tree_object.fit(X, y)
    y_hat = tree_object.predict(X)
    print(y_hat)
    # tree.plot()
    print("Criteria :", criteria)
    print("Accuracy: ", accuracy(y_hat, y))
    # for cls in y.unique():
    #     print("Precision: ", precision(y_hat, y, cls))
    #     print("Recall: ", recall(y_hat, y, cls))

{'1_': {'attribute': 0.0, 'split_value': 0.16919126215699776, 'right_label': '1_1_', 'left_label': '1_2_'}, '1_1_': {'attribute': 0.0, 'split_value': -1.4394428450912664, 'right_label': '1_1_1_', 'left_label': '1_1_2_'}, '1_1_1_': 3, '1_1_2_': {'attribute': 0.0, 'split_value': -0.5320222864127546, 'right_label': '1_1_2_1_', 'left_label': '1_1_2_2_'}, '1_1_2_1_': {'attribute': 0.0, 'split_value': -0.6607754103120528, 'right_label': '1_1_2_1_1_', 'left_label': '1_1_2_1_2_'}, '1_1_2_1_1_': 0, '1_1_2_1_2_': {'attribute': 0.0, 'split_value': -0.5819970707351848, 'right_label': '1_1_2_1_2_1_', 'left_label': '1_1_2_1_2_2_'}, '1_1_2_1_2_1_': 2, '1_1_2_1_2_2_': 0, '1_1_2_2_': 3, '1_2_': 2}
[3, 2, 2, 2, 3, 2, 2, 2, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3]
Criteria : entropy
Accuracy:  0.1


In [22]:
N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))


for criteria in ["squared_error"]:
    tree = DecisionTree(criterion=criteria)  # Split based on Inf. Gain
    tree.fit(X, y)
    y_hat = tree.predict(X)
    print(y_hat)
    # tree.plot()
    # print("Criteria :", criteria)
    # print("RMSE: ", rmse(y_hat, y))
    # print("MAE: ", mae(y_hat, y))


{'1_': {'attribute': 0.0, 'split_value': 1.581876991537185, 'right_label': '1_1_', 'left_label': '1_2_'}, '1_1_': {'attribute': 0.0, 'split_value': 0.21735301410996216, 'right_label': '1_1_1_', 'left_label': '1_1_2_'}, '1_1_1_': {'attribute': 0.0, 'split_value': -0.050830976481100236, 'right_label': '1_1_1_1_', 'left_label': '1_1_1_2_'}, '1_1_1_1_': {'attribute': 0.0, 'split_value': -1.7831435655549073, 'right_label': '1_1_1_1_1_', 'left_label': '1_1_1_1_2_'}, '1_1_1_1_1_': {'attribute': 0.0, 'split_value': -2.042665895283238, 'right_label': '1_1_1_1_1_1_', 'left_label': '1_1_1_1_1_2_'}, '1_1_1_1_1_1_': 0.8261904736542662, '1_1_1_1_1_2_': -1.0176165563536546, '1_1_1_1_2_': 0.12492188813852269, '1_1_1_2_': 1.8136665758789074, '1_1_2_': -0.9339787505041518, '1_2_': 0.7521957128156073}
[0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.7521957128156073, 0.752

In [32]:
# Test case 3
# Discrete Input and Discrete Output

N = 30
P = 5
X = pd.DataFrame({i: pd.Series(np.random.randint(P, size=N), dtype="category") for i in range(5)})
y = pd.Series(np.random.randint(P, size=N), dtype="category")

for criteria in ["entropy", "gini_index"]:
    tree = DecisionTree(criterion=criteria)  # Split based on Inf. Gain
    tree.fit(X, y)
    y_hat = tree.predict(X)
    # tree.plot()
    # print("Criteria :", criteria)
    print("Accuracy: ", accuracy(y_hat, y))
    # for cls in y.unique():
    #     print("Precision: ", precision(y_hat, y, cls))
    #     print("Recall: ", recall(y_hat, y, cls))

{'1_': {'attribute': '1_1', 'right_label': '1_2_', 'left_label': '1_1_'}, '1_1_': {'attribute': '3_1', 'right_label': '1_1_2_', 'left_label': '1_1_1_'}, '1_1_1_': {'attribute': '1_0', 'right_label': '1_1_1_2_', 'left_label': '1_1_1_1_'}, '1_1_1_1_': {'attribute': '0_1', 'right_label': '1_1_1_1_2_', 'left_label': '1_1_1_1_1_'}, '1_1_1_1_1_': {'attribute': '1_3', 'right_label': '1_1_1_1_1_2_', 'left_label': '1_1_1_1_1_1_'}, '1_1_1_1_1_1_': 3, '1_1_1_1_1_2_': 0, '1_1_1_1_2_': 2, '1_1_1_2_': 1, '1_1_2_': 1, '1_2_': 1}
Accuracy:  0.5333333333333333
{'1_': {'attribute': '1_1', 'right_label': '1_2_', 'left_label': '1_1_'}, '1_1_': {'attribute': '0_1', 'right_label': '1_1_2_', 'left_label': '1_1_1_'}, '1_1_1_': {'attribute': '1_3', 'right_label': '1_1_1_2_', 'left_label': '1_1_1_1_'}, '1_1_1_1_': {'attribute': '1_2', 'right_label': '1_1_1_1_2_', 'left_label': '1_1_1_1_1_'}, '1_1_1_1_1_': {'attribute': '2_4', 'right_label': '1_1_1_1_1_2_', 'left_label': '1_1_1_1_1_1_'}, '1_1_1_1_1_1_': 1, '1_1_

In [24]:

N = 30
P = 5
X = pd.DataFrame({i: pd.Series(np.random.randint(P, size=N), dtype="category") for i in range(5)})
y = pd.Series(np.random.randn(N))

for criteria in ["mse"]:
    tree = DecisionTree(criterion=criteria)  # Split based on Inf. Gain
    tree.fit(X, y)
    y_hat = tree.predict(X)
    print(y_hat)
    tree.plot()
    # print("Criteria :", criteria)
    # print("RMSE: ", rmse(y_hat, y))
    # print("MAE: ", mae(y_hat, y))

{'1_': {'attribute': '0_0', 'right_label': '1_2_', 'left_label': '1_1_'}, '1_1_': {'attribute': '0_4', 'right_label': '1_1_2_', 'left_label': '1_1_1_'}, '1_1_1_': {'attribute': '1_2', 'right_label': '1_1_1_2_', 'left_label': '1_1_1_1_'}, '1_1_1_1_': {'attribute': '3_3', 'right_label': '1_1_1_1_2_', 'left_label': '1_1_1_1_1_'}, '1_1_1_1_1_': {'attribute': '2_0', 'right_label': '1_1_1_1_1_2_', 'left_label': '1_1_1_1_1_1_'}, '1_1_1_1_1_1_': 0.9628530278630554, '1_1_1_1_1_2_': 1.7092924947296217, '1_1_1_1_2_': 0.6148939578869408, '1_1_1_2_': -0.39837339017694856, '1_1_2_': -0.4248790464559558, '1_2_': -0.5655991060977654}
[0.9628530278630554, -0.4248790464559558, -0.4248790464559558, 0.6148939578869408, 0.6148939578869408, 0.9628530278630554, -0.4248790464559558, -0.4248790464559558, -0.5655991060977654, -0.4248790464559558, -0.5655991060977654, 0.9628530278630554, 1.7092924947296217, 0.9628530278630554, -0.5655991060977654, -0.39837339017694856, -0.4248790464559558, 0.9628530278630554, -0

In [25]:
import numpy as np

# Input list
data = ["yes", "yes", "no"]

# Use np.unique to get the unique elements and their counts
values, counts = np.unique(data, return_counts=True)

# Find the index of the maximum count
most_occurring = values[np.argmax(counts)]

# Print the most occurring value
print(most_occurring)


yes


In [26]:
dict_ = {}
type(dict_)==dict

True

In [27]:
print(y)

0     0.668101
1    -0.189208
2     0.579303
3     0.497292
4     0.762847
5     0.637967
6    -0.442432
7     0.458710
8     0.653400
9    -1.291124
10   -0.274044
11    0.608114
12    1.709292
13    0.824900
14    1.772378
15   -0.398373
16    0.250190
17    0.805735
18   -2.326352
19    1.510803
20   -1.860910
21    0.996940
22    0.686832
23   -2.339593
24   -2.289061
25    1.432482
26    1.214722
27   -0.880864
28   -0.881879
29    0.512605
dtype: float64


In [28]:
print(y_hat)

[0.9628530278630554, -0.4248790464559558, -0.4248790464559558, 0.6148939578869408, 0.6148939578869408, 0.9628530278630554, -0.4248790464559558, -0.4248790464559558, -0.5655991060977654, -0.4248790464559558, -0.5655991060977654, 0.9628530278630554, 1.7092924947296217, 0.9628530278630554, -0.5655991060977654, -0.39837339017694856, -0.4248790464559558, 0.9628530278630554, -0.5655991060977654, 0.9628530278630554, -0.5655991060977654, -0.5655991060977654, 0.6148939578869408, -0.4248790464559558, -0.5655991060977654, 0.9628530278630554, 0.9628530278630554, -0.5655991060977654, -0.5655991060977654, 0.6148939578869408]
