In [9]:
import numpy as np
import pandas as pd
from collections import Counter

In [10]:
def IG_(y, y_left, y_right):
    """Score a split of ``y`` into ``y_left`` and ``y_right``.

    The score is ``S(y) - S(y_left) - S(y_right)``, where ``S`` is the
    count-weighted entropy ``-sum_v n_v * log(p_v + 1e-10)`` taken over the
    distinct values ``v`` of the array (``n_v`` occurrences, ``p_v`` share).
    Larger scores mean purer children. An empty array contributes 0.

    The arguments are compared elementwise with ``==``, so they are expected
    to be NumPy arrays of labels.
    """
    def weighted_entropy(labels):
        # Accumulate n_v * log(p_v + eps) per distinct label, then negate.
        total = 0
        for val in set(labels):
            hits = labels == val
            total += np.sum(hits) * np.log(np.mean(hits) + 1e-10)
        return -total

    parent = weighted_entropy(y)
    return parent - weighted_entropy(y_left) - weighted_entropy(y_right)


def most_common(y):
    """Return the value that occurs most often in ``y``.

    Ties go to the value encountered first. An empty ``y`` raises
    ``IndexError``. For a pandas Series, ``y.mode()[0]`` is a comparable
    alternative.
    """
    ranked = Counter(y).most_common()  # (value, count) pairs, most frequent first
    winner, _count = ranked[0]
    return winner

In [82]:
class DecisionTree():
    """Greedy binary classification tree; every instance is one node.

    After ``fit`` a node is either

    * a leaf: ``is_leaf`` is True and ``target`` holds the predicted label, or
    * an internal node: samples with ``x[feature] < threshold`` are routed to
      ``left``, all other samples to ``right``.

    Parameters
    ----------
    max_depth : int
        Depth budget of this node. A node whose budget equals 1 becomes a
        leaf and each child receives ``max_depth - 1``. The check is an
        equality test, so a budget below 1 never stops growth by depth.
    min_samples_split : int
        A node that receives fewer samples than this becomes a leaf.
    """

    def __init__(self, max_depth=3, min_samples_split=5):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        """Grow the subtree rooted at this node from the samples ``X``, ``y``.

        Parameters
        ----------
        X : array-like, two-dimensional
            One row per sample, one column per feature.
        y : array-like, one-dimensional
            Label of each row of ``X``.

        Returns
        -------
        None

        Raises
        ------
        IndexError
            If ``y`` is empty (raised by ``most_common``).
        """
        X = np.asarray(X)
        y = np.asarray(y)

        stop = (
            X.shape[0] < self.min_samples_split
            or self.max_depth == 1
            or len(set(y)) == 1
        )
        split = None if stop else self._best_split(X, y)

        if split is None:
            # Either a stopping rule fired or no split improves purity.
            # Splitting anyway could hand a child zero samples, for which no
            # majority label exists, so the node predicts the majority here.
            self.is_leaf = True
            self.target = most_common(y)
            return

        self.is_leaf = False
        self.feature, self.threshold = split
        goes_left = X[:, self.feature] < self.threshold

        self.left = DecisionTree(self.max_depth - 1, self.min_samples_split)
        self.right = DecisionTree(self.max_depth - 1, self.min_samples_split)
        self.left.fit(X[goes_left], y[goes_left])
        # The complement mask mirrors the `else` branch of `predict_item`.
        self.right.fit(X[~goes_left], y[~goes_left])

    def _best_split(self, X, y):
        """Return the ``(feature, threshold)`` pair with the largest positive gain.

        Candidate thresholds are the distinct values of each column, visited
        in ascending order; on equal gain the first candidate visited wins.
        Returns None when every candidate leaves one side empty or has a gain
        that is not strictly positive.
        """
        best = None
        best_gain = 0
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                goes_left = column < threshold
                y_left = y[goes_left]
                y_right = y[~goes_left]
                if y_left.size == 0 or y_right.size == 0:
                    continue  # degenerate: all samples end up on one side
                gain = IG_(y, y_left, y_right)
                if gain > best_gain:
                    best, best_gain = (feature, threshold), gain
        return best

    def predict_item(self, x):
        """Return the predicted label for one sample ``x`` (indexable by feature)."""
        if self.is_leaf:
            return self.target
        if x[self.feature] < self.threshold:
            return self.left.predict_item(x)
        else:
            return self.right.predict_item(x)

    def predict(self, X):
        """Return a NumPy array with one predicted label per row of ``X``."""
        X = np.asarray(X)
        return np.array([self.predict_item(row) for row in X])

    def print(self, feature_names, prefix=''):
        """Print the subtree as nested ``if``/``else`` pseudo-code.

        ``feature_names`` is indexed by feature position; ``prefix`` is the
        indentation placed before every line and grows by four spaces per
        level.
        """
        if self.is_leaf:
            print(f'{prefix}target = {self.target}')
        else:
            print(f'{prefix}if {feature_names[self.feature]} < {self.threshold}:')
            self.left.print(feature_names, prefix=prefix+' '*4)
            print(f'{prefix}else:')
            self.right.print(feature_names, prefix=prefix+' '*4)

In [83]:
from sklearn.datasets import load_wine

# Load the wine data set and keep features and class label in one frame;
# the label is appended as the last column, `target`.
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [84]:
# Train on every column except the last one, which holds the class label.
# `X.values` is one float array, so the labels reach the tree as floats.
t = DecisionTree(max_depth=4, min_samples_split=50)
t.fit(X.values[:, :-1], X.values[:, -1])

In [85]:
# Predict a single sample. The label column is sliced off so the vector has
# the same layout as the rows the tree was trained on.
t.predict_item(X.values[10, :-1])

0.0

In [90]:
# Training rows whose label differs from the tree's prediction.
X[X['target'].ne(t.predict(X.values[:, :-1]))]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
43,13.24,3.98,2.29,17.5,103.0,2.64,2.63,0.32,1.66,4.36,0.82,3.0,680.0,0
61,12.64,1.36,2.02,16.8,100.0,2.02,1.41,0.53,0.62,5.75,0.98,1.59,450.0,1


In [91]:
# Show the fitted tree as nested if/else rules, labelled with column names.
t.print(feature_names=X.columns)

if flavanoids < 1.58:
    if color_intensity < 3.85:
        target = 1.0
    else:
        target = 2.0
else:
    if proline < 735.0:
        if alcohol < 13.24:
            target = 1.0
        else:
            target = 1.0
    else:
        if color_intensity < 3.52:
            target = 1.0
        else:
            target = 0.0
