In [1]:
import pandas as pd
import numpy as np

In [53]:
data = pd.read_csv("data.csv", index_col="RID")
discrete_atts = ["age", "income", "student", "credit rating"]

In [114]:
data

Unnamed: 0_level_0,age,income,student,credit rating,buys computer
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,young,high,no,excellent,yes
2,young,high,no,good,yes
3,middle-aged,high,no,excellent,no
4,senior,medium,no,excellent,no
5,senior,low,yes,excellent,no
6,senior,low,yes,good,yes
7,middle-aged,low,yes,good,yes
8,young,medium,no,excellent,yes
9,young,low,yes,excellent,yes
10,senior,medium,yes,excellent,no


In [55]:
row.to_frame().T

Unnamed: 0,age,income,student,credit rating,buys computer
1,young,high,no,excellent,yes


In [56]:
attribute_vals = {att: set() for att in data.columns}
for _, row in data.iterrows():
    for att, att_val in row.items():
        if att_val not in attribute_vals[att]:
            attribute_vals[att].add(att_val)

In [57]:
attribute_vals

{'age': {'middle-aged', 'senior', 'young'},
 'income': {'high', 'low', 'medium'},
 'student': {'no', 'yes'},
 'credit rating': {'excellent', 'good'},
 'buys computer': {'no', 'yes'}}

In [115]:
class Node:        
    def __init__(self, parent, criterion, criterion_vals=None, discrete=True, leaf=False): 
        # branch constructor 
        self.parent = parent
        self.is_leaf = leaf
        if self.is_leaf:
            self.label = criterion
        else:
            self.discrete = discrete
            self.criterion = criterion 
            if discrete:
                self.children = {val: None for val in criterion_vals}
            else:
                self.children = {True: None, False: None}
            
    def add_child(self, branch, n):
        self.children[branch] = n
    
    def apply_criterion(self, row):       
        if self.discrete:
            return self.children[row[self.criterion]]
        else:
            return self.children[row[self.criterion] <= self.criterion_vals]
        
    def __str__(self):
        if self.is_leaf:
            return  f"parent: {self.parent}\n label: {self.label}"
        else:
            return f"parent: {self.parent}\n criterion: {self.criterion}"
            

In [127]:
def pure_leaf(D, cls):
    cls_val = D.iloc[0][cls] 
    # if all tuples in D has same cls_val then its pure
    for _, row in D.iterrows():
        if row[cls] != cls_val:
            return False
    return True

def majority_voting(D, cls):
    cls_vals = {}
    for _, row in D.iterrows():
        cls_val = row[cls]
        if cls_val not in cls_vals:
            cls_vals[cls_val] = 1
        else:
            cls_vals[cls_val] += 1
    return max(cls_vals, key=cls_vals.get)

def compute_info(D, cls): # Entropy
    n = len(D)
    cls_vals = {}
    for _, row in D.iterrows():
        cls_val = row[cls]
        if cls_val not in cls_vals:
            cls_vals[cls_val] = 1
        else:
            cls_vals[cls_val] += 1
    
    info = 0
    for _, val in cls_vals.items():
        p = float(val/n)
        info += p*np.log2(p + 1e-6)
    
    return -info

def make_partitions_discrete(D, attribute, attribute_values):
    # Make partitions of D based on each value of attribute
    partitions = {k: None for k in attribute_values[attribute]}
    # print(partitions)
    for _, row in D.iterrows():
        att_val = row[attribute]
        if partitions[att_val] is None:
            partitions[att_val] = row.copy().to_frame().T
        else:
            partitions[att_val] = pd.concat([partitions[att_val], row.copy().to_frame().T], ignore_index=False)
    
    return partitions
    
def get_split_point(D, attribute, atrribute_values):
    values = sorted(atrribute_values[attribute])
    split_point = (values[0] + values[1])/2.
    
    
    
def attribute_selection_method(D, attribute_list, attribute_values, discrete_atts, cls):
    info = compute_info(D, cls)
    max_info_gain = -1e6
    partitioning_att = attribute_list[0]
    final_partitions = {}
    split_val = 0
    for attribute in attribute_list:
        info_att = 0
        if attribute in discrete_atts:
            partitions = make_partitions_discrete(D, attribute, attribute_values)
            for partition in partitions.values():
                if partition is not None:
                    info_part = compute_info(partition, cls)
                    info_att += (len(partition)/len(D))*info_part
                
            gain_A = info - info_att
            if gain_A > max_info_gain:
                # print("{} selected".format(attribute))
                partitioning_att = attribute
                final_partitions = partitions
                max_info_gain = gain_A
        else:
            # For continous-valued
            split_point, partitions = get_split_point(D, attribute, attribute_values)
            for partition in partitions.values():
                if partition is not None:
                    info_part = compute_info(partition, cls)
                    info_att += (len(partition)/len(D))*info_part
                
    
    return partitioning_att, final_partitions


def generate_decision_tree(root,
                           D, 
                           attribute_list, 
                           attribute_values,
                           discrete_atts,
                           cls,
                           multiway = True):
    
    # print("cur att_list:", attribute_list)
    
    if pure_leaf(D, cls): 
        label = D.iloc[0][cls]
        print("[] pure leaf with label {} created".format(label))
        N = Node(root, label, leaf=True)
        return N
    
    if len(attribute_list) == 0:
        
        label = majority_voting(D, cls)
        N = Node(root, label, leaf=True)
        print("[] leaf with label {} created through majority voting".format(label))
        return N
    
    attribute, partitions = attribute_selection_method(D, attribute_list, attribute_values, discrete_atts, cls)
    print("{} selected as criterion".format(attribute))

    N = Node(root, attribute, partitions.keys())
    
    if attribute in discrete_atts and multiway:
        attribute_list.remove(attribute)
        
    for condition, outcome in partitions.items():
        print("{} conditional {}".format(attribute, condition))
        # print(outcome)
        Dj = outcome
        if Dj is None:
            label = majority_voting(D, cls)
            leaf = Node(N, label, leaf=True)
            print("[] leaf with label {} created after exhaustion".format(label))
            N.add_child(condition, leaf)
        else:
            child = generate_decision_tree(N, Dj, attribute_list, attribute_values, cls)
            N.add_child(condition, child)
    
    return N
    

In [128]:
root = generate_decision_tree(None, data, ["age", "income", "student", "credit rating"], attribute_vals, discrete_atts, "buys computer")

age selected as criterion
age conditional senior
credit rating selected as criterion
credit rating conditional excellent
[] pure leaf with label no created
credit rating conditional good
student selected as criterion
student conditional no
[] pure leaf with label no created
student conditional yes
[] pure leaf with label yes created
age conditional young
income selected as criterion
income conditional high
[] pure leaf with label yes created
income conditional low
[] pure leaf with label yes created
income conditional medium
[] leaf with label yes created through majority voting
age conditional middle-aged
[] leaf with label yes created through majority voting


In [124]:
root.children

{'senior': <__main__.Node at 0x7f0a11a8e3a0>,
 'young': <__main__.Node at 0x7f0a11a8e370>,
 'middle-aged': <__main__.Node at 0x7f0a11ab5970>}

In [129]:
num_correct = 0

def classify_tuple(root, row):
    cur_node = root
    while not cur_node.is_leaf:
        cur_node = cur_node.apply_criterion(row)
    return cur_node.label

for _, row in data.iterrows():
    label = classify_tuple(root, row[:-1])
    if label == row[-1]:
        num_correct += 1

In [130]:
num_correct / len(data)

0.8333333333333334