In [4]:
import numpy as np
import pandas as pd
import json
import pydot

class ID3:
    """ID3 decision-tree classifier for categorical features.

    The fitted tree is a nested dict of the form
    ``{feature: {feature_value: subtree_or_class_label}}`` and is stored in
    ``self.t`` by :meth:`generate_tree`.
    """

    def __init__(self, train_data, class_column):
        """Split the training frame into features and labels.

        Args:
            train_data: pandas DataFrame holding the feature columns and the
                class column.
            class_column: name of the column in ``train_data`` with the labels.
        """
        self.X = train_data.drop(columns=class_column)
        self.Y = train_data[class_column]

    @staticmethod
    def _majority_class(labels):
        """Return the most frequent value in ``labels``.

        Ties are resolved in favour of the value that sorts first, because
        ``np.unique`` returns sorted values and ``np.argmax`` picks the first
        maximum.
        """
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def generate_tree(self):
        """Fit the decision tree on the training data and store it in ``self.t``."""
        target = self.Y.name
        features = list(self.X.columns)
        data = self.X.copy()
        data[target] = self.Y
        self.t = self.create_decision_tree(data, data, features, target)

    def report_accuracy(self, test, pred):
        """Print a short accuracy report comparing ``test`` with ``pred``.

        Args:
            test: iterable of true labels (e.g. a pandas Series).
            pred: iterable of predicted labels, in the same order.
        """
        # Compare by position: a Series may carry an index that is not
        # 0..n-1, in which case label-based ``test[i]`` would raise KeyError.
        expected = list(test)
        predicted = list(pred)
        total_samples = len(expected)
        correctly_classified = sum(
            1 for truth, guess in zip(expected, predicted) if truth == guess
        )
        # Avoid ZeroDivisionError on an empty test set.
        if total_samples:
            accuracy = correctly_classified/total_samples * 100
        else:
            accuracy = 0.0
        print("Accuracy Result")
        print("Total Test Samples : ", total_samples)
        print("Correctly Classified : ", correctly_classified)
        print("Miss Classified: ", total_samples-correctly_classified)
        print("Accuracy : ", accuracy, "%")

    def predict(self, d, class_name):
        """Classify every row of ``d``, print the accuracy, return the predictions.

        Args:
            d: pandas DataFrame with the feature columns and the true labels.
            class_name: name of the label column in ``d``.

        Returns:
            list of predicted labels, one per row of ``d``.
        """
        y = d[class_name]
        samples = d.drop(columns=class_name).to_dict(orient='records')
        # Fall back to the most common training label when a sample carries a
        # feature value the tree has never seen, so the fallback is always a
        # real class label.
        default = self._majority_class(self.Y)
        predictions = [self.make_prediction(s, self.t, default) for s in samples]
        self.report_accuracy(y, predictions)
        return predictions

    def find_IG(self, data, feature, target):
        """Return the information gain of splitting ``data`` on ``feature``.

        Args:
            data: pandas DataFrame containing ``feature`` and ``target``.
            feature: name of the column to split on.
            target: name of the label column.

        Returns:
            Entropy of ``target`` minus the size-weighted entropy of ``target``
            within each distinct value of ``feature``.
        """
        total = self.find_entropy(data[target])
        values, counts = np.unique(data[feature], return_counts=True)
        n_rows = counts.sum()
        # Boolean-mask selection keeps rows intact; it does not discard rows
        # that merely have a missing value in some unrelated column.
        weighted = sum(
            (count/n_rows) * self.find_entropy(data.loc[data[feature] == value, target])
            for value, count in zip(values, counts)
        )
        return total - weighted

    def print_tree(self, t=None, indent=None):
        """Print the tree as tab-indented text.

        Args:
            t: (sub)tree to print; defaults to the fitted tree ``self.t``.
            indent: number of leading tabs; defaults to 0.
        """
        if t is None:
            t = self.t
        if indent is None:
            indent = 0
        # A tree fitted on single-class data is just a bare label.
        if not isinstance(t, dict):
            print('\t' * indent + str(t))
            return
        for key, value in t.items():
            print('\t' * indent + str(key))
            if isinstance(value, dict):
                self.print_tree(value, indent+1)
            else:
                print('\t' * (indent+1) + str(value))

    def find_entropy(self, column_name):
        """Return the Shannon entropy (base 2) of the values in ``column_name``.

        Args:
            column_name: array-like of labels (despite the name, the values
                themselves, not a column name).

        Returns:
            Sum over distinct values of ``-p * log2(p)``; 0 for empty input.
        """
        _, counts = np.unique(column_name, return_counts=True)
        n_rows = counts.sum()
        total = 0
        for count in counts:
            p = count/n_rows
            total += (-1)*p*np.log2(p)
        return total

    def create_decision_tree(self,
                             d,
                             original,
                             features,
                             target,
                             parent=None):
        """Recursively build the ID3 tree for the rows in ``d``.

        Args:
            d: pandas DataFrame with the rows that reach this node.
            original: the full training DataFrame (fallback for empty subsets).
            features: sequence of feature column names still available.
            target: name of the label column.
            parent: majority class of the parent node, used as the prediction
                when ``d`` is empty.

        Returns:
            A class label for a leaf, otherwise ``{feature: {value: subtree}}``.
        """
        # An empty subset must be handled before looking at its classes,
        # otherwise indexing the (empty) class array raises IndexError.
        if len(d) == 0:
            if parent is not None:
                return parent
            return self._majority_class(original[target])

        u_classes = np.unique(d[target])
        if len(u_classes) == 1:
            return u_classes[0]

        majority = self._majority_class(d[target])
        # No features left but labels still disagree: predict the majority.
        if len(features) == 0:
            return majority

        IGs = [self.find_IG(d, f, target) for f in features]
        best = features[int(np.argmax(IGs))]
        # Drop the chosen feature so every level makes progress; without this
        # rows with identical features and different labels recurse forever.
        remaining = [f for f in features if f != best]

        t = {best: {}}
        for v in np.unique(d[best]):
            sub_d = d[d[best] == v]
            t[best][v] = self.create_decision_tree(sub_d, original, remaining, target, majority)
        return t

    def make_prediction(self, s, t, default=1):
        """Walk the tree ``t`` for sample ``s`` and return the predicted label.

        Args:
            s: dict mapping feature name to the sample's value.
            t: (sub)tree as produced by :meth:`create_decision_tree`.
            default: value returned when the sample cannot be routed to a leaf
                (unseen feature value, or the node's feature is missing from
                ``s``).

        Returns:
            The leaf label reached, or ``default``.
        """
        # A bare label is already the answer.
        if not isinstance(t, dict):
            return t
        for feature in s:
            if feature not in t:
                continue
            try:
                res = t[feature][s[feature]]
            except (KeyError, TypeError):
                # Unseen (or unhashable) value for this feature.
                return default
            if isinstance(res, dict):
                # Keep the caller's default all the way down the tree.
                return self.make_prediction(s, res, default)
            return res
        return default
                
def draw_tree(graph, node_dict, parent_node=None, _next_id=None):
    """Add the nodes and edges of a nested-dict tree to a pydot graph.

    Args:
        graph: pydot graph that receives the nodes and edges.
        node_dict: nested dict describing the tree; dict values are subtrees,
            any other value is drawn as a leaf.
        parent_node: pydot node to attach this level to; ``None`` for the
            top-level call, in which case the first dict-valued key becomes
            the (red) root node.
        _next_id: internal one-element list holding the next numeric node id.
            It is shared across the recursion so ids stay unique without
            touching any module-level variable; callers normally omit it.
    """
    if _next_id is None:
        _next_id = [0]

    def new_node(label, colour):
        # Create a filled node with the next free id and register it.
        node = pydot.Node(_next_id[0], label=label, style="filled", fillcolor=colour)
        _next_id[0] += 1
        graph.add_node(node)
        return node

    for key, value in node_dict.items():
        if isinstance(value, dict):
            if parent_node is None:
                # Top of the tree: this key is the root.
                parent_node = new_node(key, "red")
                draw_tree(graph, value, parent_node, _next_id)
            else:
                node = new_node(key, "green")
                graph.add_edge(pydot.Edge(parent_node, node))
                draw_tree(graph, value, node, _next_id)
        else:
            # key -> leaf value; the key node hangs off the parent if any.
            key_node = new_node(key, "green")
            leaf_node = new_node(value, "yellow")
            graph.add_edge(pydot.Edge(key_node, leaf_node))
            if parent_node is not None:
                graph.add_edge(pydot.Edge(parent_node, key_node))

            
# Fit an ID3 tree on the PlayTennis data ("PlayTennis" is the label column)
# and print it as tab-indented text.
df = pd.read_csv("PlayTennis.csv")
model = ID3(df, "PlayTennis")
model.generate_tree()
model.print_tree()

# Render the fitted tree (the nested dict in model.t) to a PNG via pydot.
graph = pydot.Dot(graph_type='graph')
draw_tree(graph, model.t)
# NOTE(review): 'PlayeTennis.png' looks like a typo for 'PlayTennis.png' —
# confirm nothing else references this filename before renaming.
graph.write_png('PlayeTennis.png')

Outlook
	Overcast
		Yes
	Rain
		Wind
			Strong
				No
			Weak
				Yes
	Sunny
		Humidity
			High
				No
			Normal
				Yes


In [2]:
# Load the mushroom train/test splits; the label column is "class".
df_train = pd.read_csv("mushrooms-train.csv")
df_test = pd.read_csv("mushrooms-test.csv")

# NOTE(review): this rebinds `model` (and `graph` below) from the PlayTennis
# cell, and relies on ID3 / draw_tree / pd / pydot defined in that cell.
# predict() prints an accuracy report and returns the predicted labels.
model = ID3(df_train, "class")
model.generate_tree()
predictions = model.predict(df_test, "class")

# Render the mushroom tree to a PNG via pydot.
graph = pydot.Dot(graph_type='graph')
draw_tree(graph, model.t)
graph.write_png('Mushrooms.png')

Accuracy Result
Total Test Samples :  1125
Correctly Classified :  1121
Miss Classified:  4
Accuracy :  99.64444444444445 %
