In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
def split(array):
    """Group the positions of ``array`` by the value stored at each position.

    Parameters
    ----------
    array : np.ndarray
        Values to group. The notebook only passes 1-D arrays.

    Returns
    -------
    dict
        One entry per distinct value, inserted in the sorted order produced
        by ``np.unique``. Each entry maps the value to the array of indices
        (along the first axis) at which it occurs.
    """
    return {
        value: np.where(array == value)[0]
        for value in np.unique(array)
    }

In [5]:
# Sanity-check split() on three small vectors; each line printed is the
# value -> indices mapping for one vector.
for sample in ([0,1,2], [1,0,1,0,0,1,0], [1,0,3,2,0,1,1]):
    print(split(np.array(sample)))

{0: array([0]), 1: array([1]), 2: array([2])}
{0: array([1, 3, 4, 6]), 1: array([0, 2, 5])}
{0: array([1, 4]), 1: array([0, 5, 6]), 2: array([3]), 3: array([2])}


In [6]:
import math
def entropy(array):
    """Shannon entropy, in bits, of the value distribution in ``array``.

    Parameters
    ----------
    array : array-like
        Observed labels/values. Plain Python sequences are accepted as well
        as numpy arrays.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values, where ``p`` is the
        count of a value divided by ``len(array)``. Returns ``0.0`` for an
        empty input and for an input with a single distinct value.
    """
    total = len(array)
    if total == 0:
        # No observations: define the entropy as zero instead of relying on
        # the sum of an empty list (which would yield the int 0).
        return 0.0

    # One pass over the data yields every class count; the counts come back
    # in the same sorted-value order np.unique uses for the values.
    _, counts = np.unique(array, return_counts=True)
    result = -sum(
        (count / total) * math.log2(count / total)
        for count in counts.tolist()
    )
    # A single class gives -(1.0 * 0.0) == -0.0; report a plain 0.0 instead.
    return 0.0 if result == 0 else result

In [7]:
# Spot-check entropy() on balanced, two-class, pure, skewed and multi-class
# label vectors; each value is rounded to 4 decimals before printing.
for labels in (
    [0,1,0,1,1,0],
    [1,2],
    [1,1],
    [1,0,0,0,0,0,0,0,0,0,0],
    [0,0,0],
    [1,1,1,0,1,4,4,2,1],
):
    print(round(entropy(np.array(labels)),4))

1.0
1.0
-0.0
0.4395
-0.0
1.6577


In [8]:
def IG(x,y):
    """Information gain obtained by partitioning labels ``y`` on feature ``x``.

    Parameters
    ----------
    x : np.ndarray
        Feature values, one per example.
    y : np.ndarray
        Labels aligned with ``x``; must support integer-array indexing.

    Returns
    -------
    The entropy of ``y`` minus, for every distinct value of ``x``, the
    entropy of the labels in that group weighted by the group's share of
    ``len(x)``.
    """
    gain = entropy(y)
    total = len(x)
    # split() already scans x once per distinct value, so call it a single
    # time and walk its groups (it yields them in np.unique order).
    for indices in split(x).values():
        weight = len(indices) / total
        # The index arrays from split() can index y directly.
        gain = gain - weight * entropy(y[indices])
    return gain

In [9]:
# Three worked examples for IG(): the labels stay the same while the feature
# vector changes. x and y end up bound to the last pair, as before.
for x, y in (
    (np.array([0,1,0,1,0,1]), np.array([0,1,0,1,1,1])),
    (np.array([0,0,1,1,2,2]), np.array([0,1,0,1,1,1])),
    (np.array([0,1,0,1,2,1]), np.array([0,1,0,1,1,1])),
):
    print(round(IG(x,y),4))

0.4591
0.2516
0.9183


In [10]:
def make_tree(X,y,attribute,min_gain=1e-05):
    """Recursively build a decision tree by greedy information-gain splits.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix; rows are examples, columns are features.
    y : np.ndarray
        Labels, one per row of ``X``.
    attribute : np.ndarray
        Feature names, positionally aligned with the columns of ``X``.
    min_gain : float, optional
        A node becomes a leaf when the best available information gain does
        not exceed this threshold. Defaults to ``1e-05``.

    Returns
    -------
    np.ndarray or dict
        A leaf is the array of training labels that reached it. An internal
        node is a dict keyed by ``"<attribute> = <value>"`` whose values are
        the subtrees built from the matching rows, with the chosen column
        removed.
    """
    # Leaf: at most one example left, or no feature columns left to test.
    if y.shape[0] <= 1 or X.shape[1] == 0:
        return y

    # Score every remaining column; iterating X.T yields 1-D columns even
    # when only one feature is left.
    gains = [IG(column, y) for column in X.T]
    best_feature = int(np.argmax(gains))

    # Stop only when the *best* feature is uninformative; a single weak
    # feature must not prevent a split on a stronger one.
    if gains[best_feature] <= min_gain:
        return y

    best_name = attribute[best_feature]
    # Drop the chosen name by position so names stay aligned with the
    # columns removed below, even if two attributes share a name.
    child_attribute = np.delete(attribute, best_feature)

    results = {}
    for feature_value, train_example_indices in split(X[:, best_feature]).items():
        child_x_subset = np.delete(X[train_example_indices], best_feature, 1)
        child_y_subset = y[train_example_indices]
        results["%s = %s" % (best_name, feature_value)] = make_tree(
            child_x_subset, child_y_subset, child_attribute, min_gain
        )

    return results

In [15]:
# Read the CSV once and derive features, labels and feature names from the
# same frame. The first column is skipped (presumably a row/day identifier;
# confirm against the CSV) and the last column is used as the label.
tennis_df=pd.read_csv("dataset/tennis.csv")
feature_frame=tennis_df.iloc[:,1:-1]
x=feature_frame.values
y=tennis_df.iloc[:,-1].values
attribute=feature_frame.columns.values

In [16]:
# Echo the feature names and the feature matrix, then the labels, then the
# tree learned from them.
for shown in (attribute, x):
    print(shown)
print("label\n",y)
fitted_tree=make_tree(x,y,attribute)
print("decision tree:\n",fitted_tree)

['outlook' 'temp' 'humidity' 'wind']
[['Sunny' 'Hot' 'High' 'Weak']
 ['Sunny' 'Hot' 'High' 'Strong']
 ['Overcast' 'Hot' 'High' 'Weak']
 ['Rain' 'Mild' 'High' 'Weak']
 ['Rain' 'Cool' 'Normal' 'Weak']
 ['Rain' 'Cool' 'Normal' 'Strong']
 ['Overcast' 'Cool' 'Normal' 'Strong']
 ['Sunny' 'Mild' 'High' 'Weak']
 ['Sunny' 'Cool' 'Normal' 'Weak']
 ['Rain' 'Mild' 'Normal' 'Weak']
 ['Sunny' 'Mild' 'Normal' 'Strong']
 ['Overcast' 'Mild' 'High' 'Strong']
 ['Overcast' 'Hot' 'Normal' 'Weak']
 ['Rain' 'Mild' 'High' 'Strong']]
label
 ['No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'No']
decision tree:
 {'outlook = Overcast': array(['Yes', 'Yes', 'Yes', 'Yes'], dtype=object), 'outlook = Rain': {'wind = Strong': array(['No', 'No'], dtype=object), 'wind = Weak': array(['Yes', 'Yes', 'Yes'], dtype=object)}, 'outlook = Sunny': {'humidity = High': array(['No', 'No', 'No'], dtype=object), 'humidity = Normal': array(['Yes', 'Yes'], dtype=object)}}


In [17]:
def _traverse(x,d,attribute):
    if isinstance(d,np.ndarray):
        return d
    for key in d:
        name,value=key.split("=")
        feature_idx=attribute.tolist().index(name.strip())
        if x[feature_idx]==value.strip():
            return _traverse(x,d[key],attribute)

In [18]:
# Classify one example by walking a freshly built tree; the bare final
# expression is what the cell displays.
query=np.array(['Rain','Mild','High','Weak'])
_traverse(query,make_tree(x,y,attribute),attribute)

array(['Yes', 'Yes', 'Yes'], dtype=object)