## Decision Tree Classifier

Tag: ID3,C4.5,C5.0, CART, Gini Index, Impurity, Information Gain and Entropy

In [1]:
#CART(Classification and regression tree) use Gini index
#ID3 use entropy and information gain

In [1]:
import numpy as np
from math import log

#### Gini index and Entropy
Impurity of data (mix of class) can be calculated using Gini or Entropy.
Information gain: If we split the data in two parts how good the split is.  
 <img src='imgs/ginientropy.jpg' width=60%>

#### Entropy vs gini
<img src='imgs/entropy_vs_gini.png' width=40%>

In [2]:
def gini(rows):
    classes=count_class_freq(rows)
    impurity = 1
    for c in classes:
        prob_of_c = classes[c] / float(len(rows))
        impurity -= prob_of_c**2
    return impurity

In [3]:
def entropy(rows):
    classes=count_class_freq(rows)
    impurity = 0
    for c in classes:
        prob_of_c = classes[c] / float(len(rows))
        impurity -= prob_of_c* log(prob_of_c, 2)
    return impurity

In [4]:
def info_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

In [5]:
dt=[
    [10.0,'F'],
    [0.0, 'F'],
    [80.0, 'P'],
    [100.0,'P'],
    [35.0, 'F'],
    [37.0, 'P'],
    [34.0, 'F'],
    [25.0, 'F']
]

In [6]:
def count_class_freq(rows):
    #last column is the class
    classes={} #dictionary
    for row in rows:
        c=row[-1]
        if c not in classes:
            classes[c]=1
        else:
            classes[c]+=1
    return classes

In [7]:
def split_data(dt, th):
    lb=[]
    rb=[]
    for row in dt:
    #     print(row)
        v=row[0]
    #     print(v)
        if v<=th:
            lb.append(row)
        else:
            rb.append(row)
    return lb,rb

In [8]:
ct=gini(dt)
print(ct)

0.46875


In [9]:
lb,rb=split_data(dt, 50)
print(lb)
print(rb)

[[10.0, 'F'], [0.0, 'F'], [35.0, 'F'], [37.0, 'P'], [34.0, 'F'], [25.0, 'F']]
[[80.0, 'P'], [100.0, 'P']]


In [10]:
import numpy as np
# ndt=np.array(dt)

In [11]:
print(lb)
print('')
print(rb)

[[10.0, 'F'], [0.0, 'F'], [35.0, 'F'], [37.0, 'P'], [34.0, 'F'], [25.0, 'F']]

[[80.0, 'P'], [100.0, 'P']]


In [12]:
lg=gini(lb)
rg=gini(rb)
print(lg,rg)
td=len(lb)+len(rb)

ig=ct- (len(lb)/td)*lg - (len(rb)/td)*rg
print(ig)

0.2777777777777777 0.0
0.26041666666666674


In [13]:
td=len(lb)+len(rb)

In [14]:
(len(lb)/td)*lg

0.20833333333333326

In [15]:
dt=[
    [10.0,1],
    [30.0, 2],
    [60.0, 3],
    [50.0,2],
    [20.0, 1],
    [95.0, 3],
    [85.0, 1]
]

In [16]:
dt=[
    [30.0, 2],
    [60.0, 3],
    [50.0,2],
    [95.0, 3],
    [85.0, 1]
]

In [17]:
dt=[
    [60.0, 3],
    [95.0, 3],
    [85.0, 1]
]

In [18]:
dt=[
    [17,0],
    [25, 0],
    [38, 0],
    [42,1],
    [44, 1],
    [47, 2],
    [49, 2],
    [50, 3],
    [54, 3],
    [53, 3]
]

In [19]:
data=np.array(dt)

In [20]:
ndt=np.array(dt)

### Experiment

In [21]:
from sklearn.tree import DecisionTreeClassifier

X = [[8,4],[50,40],[8,9],[15,12],[9,9.8], [11, 23], [44, 56] ]
y = [1,1,1,0,0,1,0]

data=[]
for ix, iy in zip(X,y):
    ix.append(iy)
    data.append(ix)
data=np.array(data)
print(data)

[[ 8.   4.   1. ]
 [50.  40.   1. ]
 [ 8.   9.   1. ]
 [15.  12.   0. ]
 [ 9.   9.8  0. ]
 [11.  23.   1. ]
 [44.  56.   0. ]]


In [22]:
tree = DecisionTreeClassifier() 
model = tree.fit(X,y)

In [23]:
model.predict(X)

array([1, 1, 1, 0, 0, 1, 0])

## Decision Tree

In [24]:
def split_rows(dt, col, th):
    lb=[]
    rb=[]
    for row in dt:
    #     print(row)
        v=row[col]
    #     print(v)
        if v<=th:
            lb.append(row)
        else:
            rb.append(row)
    return lb,rb

In [25]:
lb,rb=split_rows(data, 0, 12)
print(lb)
print('')
print(rb)

[array([8., 4., 1.]), array([8., 9., 1.]), array([9. , 9.8, 0. ]), array([11., 23.,  1.])]

[array([50., 40.,  1.]), array([15., 12.,  0.]), array([44., 56.,  0.])]


In [26]:
ct=gini(data)
print(ct)

0.48979591836734704


In [27]:
ig=info_gain(lb, rb, ct)
print(ig)

0.08503401360544224


In [28]:
'''
For multi column data. 
'''
def find_best_split(rows):
    imp=gini(rows)
    ncol=len(rows[0])-1
    nrow=len(rows)
    best_col=0
    best_val=0
    best_ig=0
    for ic in range(ncol):
        for ir in range ( nrow ):
            th=rows[ir][ic]
            lb,rb=split_rows(rows, ic, th)
            ig=info_gain(lb, rb, imp)
    #         print(ig)
            if ig > best_ig:
                best_ig=ig
                best_col=ic
                best_val=th
    return best_col, best_val, best_ig

In [29]:
def decide_class(rows):
    mc=count_class_freq(rows)
    ss= sorted(mc.items(), key=lambda kv: kv[1])
    return ss[-1][0]

In [30]:
best_col, best_val, best_ig=find_best_split(data)
print('best_col=', best_col,' best_val=',best_val,' best_ig=',best_ig)

best_col= 0  best_val= 8.0  best_ig= 0.14693877551020418


In [31]:
lb,rb=split_rows(data, best_col, best_val)
print(lb)
print(rb)

[array([8., 4., 1.]), array([8., 9., 1.])]
[array([50., 40.,  1.]), array([15., 12.,  0.]), array([9. , 9.8, 0. ]), array([11., 23.,  1.]), array([44., 56.,  0.])]


In [32]:
q={best_col, best_val}
print(q)

{0, 8.0}


In [33]:
tree={'col': best_col, 'val':best_val}
print(tree)

{'col': 0, 'val': 8.0}


In [34]:
best_col, best_val, best_ig=find_best_split(lb)
print('best_col=', best_col,' best_val=',best_val,' best_ig=',best_ig)

best_col= 0  best_val= 0  best_ig= 0


In [35]:
mc=count_class_freq(data)
ss= sorted(mc.items(), key=lambda kv: kv[1])
print(ss[-1][0])

1.0


In [38]:
tree['left']=decide_class(lb)
print(tree)

{'col': 0, 'val': 8.0, 'left': 1.0}


In [39]:
best_col, best_val, best_ig=find_best_split(rb)
print('best_col=', best_col,' best_val=',best_val,' best_ig=',best_ig)

best_col= 1  best_val= 12.0  best_ig= 0.21333333333333332


In [40]:
tree['right']={'col': best_col, 'val':best_val}
print(tree)

{'col': 0, 'val': 8.0, 'left': 1.0, 'right': {'col': 1, 'val': 12.0}}


In [44]:
def make_tree(data, max_depth=-1, ndepth=1):
    best_col, best_val, best_ig=find_best_split(data)
    if best_ig==0 or ndepth==max_depth: #leaf node.
        return decide_class(data)
    
    lb,rb=split_rows(data, best_col, best_val)
    lt=make_tree(lb, max_depth, ndepth+1)
    rt=make_tree(rb, max_depth, ndepth+1)
    return {'col': best_col, 'val':best_val, 'left':lt, 'right':rt}

In [45]:
# tree=make_tree(data, max_depth=10)
tree=make_tree(data)
print(tree)

{'col': 0, 'val': 8.0, 'left': 1.0, 'right': {'col': 1, 'val': 12.0, 'left': 0.0, 'right': {'col': 1, 'val': 40.0, 'left': 1.0, 'right': 0.0}}}


In [46]:
print(tree['left'], type(tree['right']))
print(tree['right'])

1.0 <class 'dict'>
{'col': 1, 'val': 12.0, 'left': 0.0, 'right': {'col': 1, 'val': 40.0, 'left': 1.0, 'right': 0.0}}


In [47]:
def predict(tree, row):
    if not isinstance(tree, dict):
        return tree
    col=tree['col']
    val=tree['val']
    if row[col]<=val:
        return predict(tree['left'], row)
    else:
        return predict(tree['right'], row)

In [48]:
if isinstance(tree['right'], dict):
    print('dict')
else:
    print('not')

dict


In [49]:
pr=[ predict(tree, dt) for dt in data]
print(pr)

[1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]


### Iris Dataset

In [67]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 

In [68]:
iris =  datasets.load_iris() 
X = iris.data
y = iris.target

In [69]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2)
print(xtrain.shape, xtest.shape)

(120, 4) (30, 4)


In [70]:
data=[]
for ix, iy in zip(xtrain,ytrain):
    ix=np.append(ix, iy)
    data.append(ix)
data=np.array(data)
print(data[:4])

[[5.  3.4 1.6 0.4 0. ]
 [6.5 3.  5.2 2.  2. ]
 [6.5 2.8 4.6 1.5 1. ]
 [6.6 3.  4.4 1.4 1. ]]


In [71]:
tree=make_tree(data)
print(tree)

{'col': 2, 'val': 1.9, 'left': 0.0, 'right': {'col': 3, 'val': 1.7, 'left': {'col': 2, 'val': 4.9, 'left': {'col': 3, 'val': 1.6, 'left': 1.0, 'right': 2.0}, 'right': {'col': 3, 'val': 1.5, 'left': 2.0, 'right': {'col': 0, 'val': 6.7, 'left': 1.0, 'right': 2.0}}}, 'right': {'col': 2, 'val': 4.8, 'left': {'col': 0, 'val': 5.9, 'left': 1.0, 'right': 2.0}, 'right': 2.0}}}


In [72]:
pr=[ predict(tree, dt) for dt in xtest]
print(pr)

[2.0, 0.0, 2.0, 0.0, 2.0, 1.0, 2.0, 0.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 2.0, 0.0, 2.0, 0.0, 1.0, 2.0]


In [73]:
accuracy_score(ytest, pr)

1.0

In [74]:
print(ytest)

[2 0 2 0 2 1 2 0 2 2 2 1 1 2 1 2 0 1 0 0 0 1 1 1 2 0 2 0 1 2]


In [79]:
#scikit-learn

dtc = DecisionTreeClassifier() 
model = dtc.fit(xtrain, ytrain)
prd=model.predict(xtest)
print(prd)
accuracy_score(ytest, prd)

[2 0 2 0 2 1 2 0 2 2 2 1 1 2 1 2 0 1 0 0 0 1 1 1 2 0 2 0 1 2]


1.0