In [26]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [27]:
iris = load_iris()
df = pd.DataFrame(iris.data)
df['species'] = iris.target 

In [28]:
df.head()

Unnamed: 0,0,1,2,3,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [29]:
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
train, test = df[df['is_train']==True], df[df['is_train']==False]
train = train.drop(['is_train'], axis = 1)
test = test.drop(['is_train'], axis = 1)

In [30]:
def entropy(data):
    counter = {}
    total = 0
    for x in data.iloc[:,data.shape[1]-1]:
        counter[x] = counter.get(x,0) + 1
    for count in counter.values():
        p = count/data.shape[0]
        total += -(p*np.log(p)/.693)
    return total

In [31]:
class Node:
    def __init__(self, data):
        self.data = data
    
    def train(self):
        if entropy(self.data) == 0:
            self.leaf = True
            self.value = self.data.iloc[0, self.data.shape[1] - 1]
            return
        self.leaf = False
        self.value = 420
        min_entropy = -1
        split_row = 0
        split_col = 0
        left = self.data
        right = self.data
        for col in range(self.data.shape[1] - 1):
            for row in range(self.data.shape[0]):
                l, r = self.split_on_row_column(row, col)
                e = entropy(l)*l.shape[0] + entropy(r)*r.shape[0]
                if min_entropy == -1 or e < min_entropy:
                    min_entropy = e
                    split_row = row
                    split_col = col
                    left = l
                    right = r
                    
        self.split_row = split_row
        self.split_col = split_col
        self.left = Node(left)
        self.right = Node(right)
        self.left.train()
        self.right.train()
            
    def split_on_row_column(self, row, column):
        left = self.data.loc[self.data.iloc[row, column] >= self.data.iloc[:, column]]
        right = self.data.loc[self.data.iloc[row, column] < self.data.iloc[:, column]]
        return left, right
        
    def inference(self, x):
        if self.leaf:
            return self.value
        if self.data.iloc[self.split_row, self.split_col] >= x.iloc[self.split_col]:
            return self.left.inference(x)
        return self.right.inference(x)

In [32]:
tree = Node(train)
tree.train()

In [33]:
tree.inference(df.iloc[60])

1

In [36]:
def validate(model, data):
    ct = 0
    corr = 0
    for i in range(test.shape[0]):
        data = test.iloc[i]
        ct += 1
        if tree.inference(data) == data['species']:
            corr += 1
    return corr/ct

validate(tree, test)

0.9767441860465116