Decision_Tree_ID3.py
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np


def createDataset(filepath):
    # Read the space-separated data file; the last column is the class label (u'好瓜').
    data = pd.read_csv(filepath, encoding='gbk', sep=' ')
    labels = data.columns[:-1]
    return data, labels

def cal_entropy(data):
    # Information entropy of the class-label column: ent = -sum(p_k * log2(p_k)).
    # Iterating over the counts directly avoids indexing the value_counts Series
    # by position, which is fragile because its index holds the label values.
    labelcounts = data[u'好瓜'].value_counts()
    numEntries = len(data)
    ent = 0.0
    for count in labelcounts:
        prob = count / float(numEntries)
        ent -= prob * np.log2(prob)
    return ent
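
# A quick hedged sanity check of cal_entropy (not part of the original script): a node
# with 8 positive and 9 negative samples has entropy
# -(8/17)*log2(8/17) - (9/17)*log2(9/17) ≈ 0.998, while a pure node has entropy 0.
# The demo only assumes that the label column is named u'好瓜', as in createDataset.
#
#   demo = pd.DataFrame({u'好瓜': [u'是'] * 8 + [u'否'] * 9})
#   print(cal_entropy(demo))   # ≈ 0.998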

def split_data(data, feature, value):
    # Keep only the rows where the given feature column equals the given value.
    # Splitting by column name (instead of a positional axis index) stays correct
    # after createTree drops already-used features from the candidate label list.
    subData = data[data[feature] == value].reset_index(drop=True)
    return subData

def choose_BestFeature(data, labels):
    baseEnt = cal_entropy(data)
    num_features = len(labels)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(num_features):
        values = pd.unique(data[labels[i]])  # distinct values of this feature
        featEnt = 0.0
        for value in values:
            subData = split_data(data, labels[i], value)
            prob = len(subData) / float(len(data))
            featEnt += prob * cal_entropy(subData)
        infoGain = baseEnt - featEnt
        # The difference between ID3 and C4.5: ID3 ranks features by information
        # gain, while C4.5 ranks them by the information gain ratio.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
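
# The comment above mentions C4.5. As a hedged sketch (gain_ratio is a hypothetical
# helper, not used by the rest of this script), the C4.5 criterion divides the
# information gain by the split information of the feature, which penalises
# features that take many distinct values.
def gain_ratio(data, labels, feature_index, base_entropy):
    """Sketch of the C4.5 criterion for one feature; assumes the same data layout
    as choose_BestFeature (feature columns named by `labels`)."""
    feature = labels[feature_index]
    values = pd.unique(data[feature])
    feat_ent = 0.0    # conditional entropy H(D | feature)
    split_info = 0.0  # split information IV(feature)
    for value in values:
        subData = split_data(data, feature, value)
        prob = len(subData) / float(len(data))
        feat_ent += prob * cal_entropy(subData)
        split_info -= prob * np.log2(prob)
    info_gain = base_entropy - feat_ent
    # Guard against a feature with a single value (split_info == 0).
    return info_gain / split_info if split_info > 0 else 0.0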

def get_mostlabel(labellist):
    # Majority vote: return the most frequent label in the list.
    labeldict = {}
    for i in labellist:
        labeldict[i] = labeldict.get(i, 0) + 1
    sorted_label = sorted(labeldict.items(), key=lambda item: item[1], reverse=True)
    return sorted_label[0][0]

def createTree(data, labels):
    label_list = data.iloc[:, -1]
    # Case 1: all samples share the same label -> return that label as a leaf.
    if len(label_list.value_counts()) == 1:
        return label_list.values[0]
    # Case 2: no candidate feature left, or every remaining feature has a single
    # value -> return the majority label as a leaf.
    if len(labels) == 0 or all([len(data[feat].value_counts()) == 1 for feat in labels]):
        return get_mostlabel(label_list)
    bestFeat = choose_BestFeature(data, labels)
    bestFeatLabel = labels[bestFeat]
    Tree = {bestFeatLabel: {}}
    labels = labels.drop(labels[bestFeat])  # this feature is no longer a candidate
    values = pd.unique(data[bestFeatLabel])
    for value in values:
        subLabels = labels[:]
        Tree[bestFeatLabel][value] = createTree(split_data(data, bestFeatLabel, value), subLabels)
    return Tree
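
# The tree returned by createTree is a nested dict of the form
# {feature: {value: subtree_or_label}}. A minimal traversal sketch for classifying
# one sample (classify is a hypothetical helper, not part of the original script);
# it assumes `sample` maps feature names to values seen during training, otherwise
# the lookup raises KeyError.
def classify(tree, sample):
    while isinstance(tree, dict):
        feature = next(iter(tree))             # the feature this node splits on
        tree = tree[feature][sample[feature]]  # descend along the sample's value
    return tree                                # a leaf, i.e. a class label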

if __name__ == '__main__':
    filepath = 'F:\\Mymaterial\\dataset\\Decision_Tree.txt'
    data, labels = createDataset(filepath)
    tree = createTree(data, labels)
    print(tree)