In [20]:
import pandas as pa
from math import log
from collections import OrderedDict

In [21]:
def ID3(data, label):
    rootNode = getRootNode(data, label)
    tree = (rootNode, getAttribute(data, label, rootNode))
    print("tree :", tree)
    return tree

In [22]:
def getAttribute(data, label, rootNode):
    tree = {}
    unique_values = data[rootNode].unique()
    defaultData = data[label].value_counts().idxmax()
    tree.update({'default': defaultData})
    for unique_value in unique_values:
        df = data[data[rootNode] == unique_value]
        info = information(df[label])
        if info > 0:
            df = df.drop([rootNode], axis=1)
            aNode = getRootNode(df, label)
            tup = (aNode, getAttribute(df, label, aNode))
            tree.update({str(unique_value): tup})
        else:
            d = df.filter(items=[label, rootNode]).values[0]
            key = str(d[1])
            value = d[0]
            tree.update({key: value})
    return dict(OrderedDict(sorted(tree.items(), key=lambda d: d[0])))

In [23]:
def getRootNode(data, label):
    gainDict = dict()
    info = information(data[label])
    if info > 0:
        for key in data.keys():
            if key != label:
                df = data.filter(items=[label, key])
                gainDict.update({key: gain(df, key, label, info)})
    root = max(gainDict.keys(), key=(lambda x: gainDict[x]))
    print("root: ", root)
    return root

In [24]:
def gain(data, key, label, I_total):
    data = pa.DataFrame(data=data)
    entropy_total = 0.0
    unique_values = data[key].unique()
    for value in unique_values:
        df = data[data[key] == value]
        info_value = information(df[label])
        entropy_value = entropy(s=len(df), s_total=len(data), info=info_value)
        entropy_total += entropy_value
    print("I_total - entropy_total: ", float(format((I_total - entropy_total), '.5f')))
    return float(format((I_total - entropy_total), '.5f'))

In [25]:
def entropy(s, s_total, info):
    s = abs(s)
    s_total = abs(s_total)
    if s != 0 and s_total != 0:
        result = (s / s_total) * info
    else:
        result = 0
    print("result: ", float(format(result, '.5f')))
    return float(format(result, '.5f'))

In [26]:
def information(data):
    info = 0.0
    uniqueValues = data.value_counts()
    for count in uniqueValues:
        p = probability(count, len(data))
        if p != 0:
            temp1 = p * (log(p, 2))
        else:
            temp1 = 0
        info -= temp1
    print("info: ",float(format(info, '.5f')) )
    return float(format(info, '.5f'))

In [27]:
def probability(s1, s):
    s1 = abs(s1)
    s = abs(s)
    if s1 != 0 and s != 0:
        result = s1 / s
    else:
        result = 0
    print("result: ", float(format(result, '.5f')))
    return float(format(result, '.5f'))

In [28]:
def classify(data, input):
    if isinstance(data, tuple):
        if data[0] in input:
            attribute_data = input[data[0]]
            if attribute_data in data[1]:
                value = data[1].get(attribute_data)
                result = classify(value, input)
            else:
                value = data[1].get('default')
                result = classify(value, input)
        else:
            value = data[1].get('default')
            result = classify(value, input)
    else:
        result = data
    print("Result: ", result)
    return result

In [29]:
def dataPreprocesing(data, label):
    dicList = list()
    for row in data:
        temp = {label: row[1]}
        row[0].update(temp)
        dicList.append(row[0])
    dataFrame = pa.DataFrame(data=dicList)
    return dataFrame

In [30]:
training_data = [
    ({'a1': 'True', 'a2': 'Hot', 'a3': 'High'}, "No"),
    ({'a1': 'True', 'a2': 'Hot', 'a3': 'High'}, "No"),
    ({'a1': 'False', 'a2': 'Hot', 'a3': 'High'}, "Yes"),
    ({'a1': 'False', 'a2': 'Cool', 'a3': 'Normal'}, "Yes"),
    ({'a1': 'False', 'a2': 'Cool', 'a3': 'Normal'}, "Yes"),
    ({'a1': 'True', 'a2': 'Cool', 'a3': 'High'}, "No"),
    ({'a1': 'True', 'a2': 'Hot', 'a3': 'High'}, "No"),
    ({'a1': 'True', 'a2': 'Hot', 'a3': 'Normal'}, "Yes"),
    ({'a1': 'False', 'a2': 'Cool', 'a3': 'Normal'}, "Yes"),
    ({'a1': 'False', 'a2': 'Cool', 'a3': 'High'}, "Yes")
]

label = 'Result'
dataFrame = dataPreprocesing(training_data, label)
dt = ID3(dataFrame, label)

print('\nID3 classification result : \n\n')
c1 = {"a1": "True", "a2": "Cool", "a3": "Normal"}
c2 = {"a1": "False", "a2": "Hot", "a3": "High"}

print("\nClassify1 = ", c1, '\n')
print("Classify1 Result = ", classify(dt, c1), '\n')
print("Classify2 = ", c2, '\n')
print("Classify2 Result = ", classify(dt, c2), '\n')


result:  0.6
result:  0.4
info:  0.97095
result:  0.8
result:  0.2
info:  0.72193
result:  0.36096
result:  1.0
info:  0.0
result:  0.0
I_total - entropy_total:  0.60999
result:  0.6
result:  0.4
info:  0.97095
result:  0.48547
result:  0.8
result:  0.2
info:  0.72193
result:  0.36096
I_total - entropy_total:  0.12452
result:  0.66667
result:  0.33333
info:  0.91829
result:  0.55097
result:  1.0
info:  0.0
result:  0.0
I_total - entropy_total:  0.41998
root:  a1
result:  0.8
result:  0.2
info:  0.72193
result:  0.8
result:  0.2
info:  0.72193
result:  0.75
result:  0.25
info:  0.81128
result:  0.64902
result:  1.0
info:  0.0
result:  0.0
I_total - entropy_total:  0.07291
result:  1.0
info:  0.0
result:  0.0
result:  1.0
info:  0.0
result:  0.0
I_total - entropy_total:  0.72193
root:  a3
result:  1.0
info:  0.0
result:  1.0
info:  0.0
result:  1.0
info:  0.0
tree : ('a1', {'False': 'Yes', 'True': ('a3', {'High': 'No', 'Normal': 'Yes', 'default': 'No'}), 'default': 'Yes'})

ID3 classific