In [None]:
import math
import pandas as pd
import queue
import random

In [None]:
random.seed(200)

mushroom = pd.read_csv('data/agaricus-lepiota.data')

# '?' is used in this dataset as the marker for a missing value; count it per column.
missing_values = mushroom.isin(['?']).sum()

# Impute every '?' with the most common observed value of its own column,
# rather than writing one hard-coded letter into the whole frame.
for column in missing_values[missing_values > 0].index:
    observed = mushroom.loc[mushroom[column] != '?', column]
    if observed.empty:
        # The column holds nothing but '?', so there is no mode to fall back on.
        continue
    mushroom[column] = mushroom[column].replace('?', observed.mode().iloc[0])

# Hold out 30% of the rows as the testing set; the remaining 70% is the training set.
mushroom_testing = mushroom.sample(frac=0.3, random_state=200)
print(mushroom_testing)
mushroom_training = mushroom.drop(mushroom_testing.index)
print(mushroom_training)
mushroom_testing = mushroom_testing.reset_index(drop=True)
mushroom_training = mushroom_training.reset_index(drop=True)

# From here on `mushroom` refers to the training split only.
mushroom = mushroom_training
print(len(mushroom))
print(mushroom)
# Every column after the first is treated as a candidate split feature
# (assumes the first column is the 'class' label -- TODO confirm against the CSV header).
features = mushroom.columns[1:]
print(features)

# Distinct values each feature takes in the training split.
variable_info = {feature: mushroom[feature].unique() for feature in features}

print(variable_info)

In [None]:
# entropy calculation
def entropy(num_pos, num_neg):
    """Binary entropy, in bits, of a sample described by its two class counts.

    Args:
        num_pos: number of positive examples.
        num_neg: number of negative examples.

    Returns:
        0 when either count is zero (the sample is pure or empty), otherwise
        -(p*log2(p) + n*log2(n)) where p and n are the class proportions.
    """
    if 0 in (num_pos, num_neg):
        return 0
    total = num_pos + num_neg
    frac_pos, frac_neg = num_pos / total, num_neg / total
    weighted_logs = frac_pos * math.log2(frac_pos) + frac_neg * math.log2(frac_neg)
    return -weighted_logs

# printing binary trees, with child node names no and yes
def printTree(node, level=0):
    """Print a binary tree sideways, one node per line.

    The ``yes`` subtree is printed first (above the node) and the ``no``
    subtree last (below it); each level of depth adds 20 spaces of indent.

    Args:
        node: object exposing ``yes`` and ``no`` children, or None for an
            empty subtree (nothing is printed).
        level: depth of ``node``; callers normally leave this at 0.
    """
    # Identity check: a node type with custom equality must never be mistaken for None.
    if node is not None:
        printTree(node.yes, level+1)
        print('     '* 4 * level + '-> ' + str(node))
        printTree(node.no, level+1)


In [None]:
# Binary-split decision tree: each node tests whether one feature equals one value

class binary_tree_node():
    """Node of a binary decision tree.

    An internal node stores the test "``feature`` equals ``value``" together
    with its ``yes`` and ``no`` children. The tree builder in this notebook
    creates leaves by putting the predicted label in ``feature`` and leaving
    ``value`` and both children as None.
    """

    def __init__(self, feature, value, no, yes):
        """Store the split test and both children (note the order: no, then yes)."""
        self.feature = feature
        self.value = value
        self.no = no
        self.yes = yes

    def add_no(self, no):
        """Set the child followed when the test fails."""
        self.no = no

    def add_yes(self, yes):
        """Set the child followed when the test succeeds."""
        self.yes = yes

    def __str__(self):
        """Return 'feature' for a node without a value, else 'feature,value'."""
        # `is None` rather than `== None`: an equality test can misfire (or
        # raise) on values that overload ==, such as array-like objects.
        if self.value is None:
            return f"{self.feature}"
        return f"{self.feature},{self.value}"

class DecisionTreeBinary():
    """Binary decision tree over the mushroom data, grown greedily by information gain.

    Each internal node asks "is <feature> equal to <value>?": matching rows go
    to the ``yes`` child, all other rows to the ``no`` child. Rows are labelled
    through their ``'class'`` column (``'e'`` or ``'p'``) and leaves are
    ``binary_tree_node`` objects named ``'Edible'`` or ``'Poisonous'``.

    Relies on the module-level ``entropy`` function, ``binary_tree_node`` class
    and ``variable_info`` dict (feature name -> candidate values).
    """

    # Root of the fitted tree; None until greedy_recrusive_splitting has run.
    root = None

    @staticmethod
    def information_gain(entropy_prev, expected_entropy):
        """Return the entropy reduction achieved by a split."""
        return entropy_prev - expected_entropy

    @staticmethod
    def expected_entropy(num_in_feature, num_pos_in_feature, num_neg_in_feature, num_not_in_feature, num_pos_not_in_feature, num_neg_not_in_feature, total):
        """Return the size-weighted entropy of the two sides of a split.

        The ``*_in_feature`` counts describe the rows matching the tested
        value, the ``*_not_in_feature`` counts the rest; ``total`` is the
        number of rows before the split and must be non-zero.
        """
        return (num_in_feature/total) * entropy(num_pos_in_feature, num_neg_in_feature) + (num_not_in_feature/total) * entropy(num_pos_not_in_feature, num_neg_not_in_feature)

    def _attach(self, parent, split_type, node):
        """Hook ``node`` into the tree as the root or as ``parent``'s yes/no child; return it."""
        if split_type == 'root':
            self.root = node
        elif split_type == 'yes':
            parent.add_yes(node)
        else:
            parent.add_no(node)
        return node

    def greedy_recrusive_splitting(self, data, remaining_features):
        """Grow the tree breadth-first and store it in ``self.root``.

        At every node the (feature, value) pair with the largest strictly
        positive information gain is chosen. A branch ends in a leaf when its
        rows are pure, when no candidate has positive gain, or when every
        feature has already been used on the path from the root; impure leaves
        take the majority class (ties go to 'Poisonous').

        Args:
            data: DataFrame with a ``'class'`` column holding 'e' / 'p'.
            remaining_features: feature names that may be split on. The list
                is copied, so the caller's list is left untouched, and a
                feature is only retired along the path that used it rather
                than for every branch of the tree.
        """
        Queue = queue.Queue()

        # Placeholder root; it is kept only when `data` is empty.
        self.root = binary_tree_node(None, None, None, None)

        # Work item: (rows reaching the node, link type to the parent,
        #             parent node, features still unused on this path).
        Queue.put((data, 'root', None, list(remaining_features)))

        while not Queue.empty():
            data, split_type, current_node, branch_features = Queue.get()
            if len(data) == 0:
                print('No data')
                continue

            # Class masks are computed once per node and reused for every candidate.
            is_pos = data['class'] == 'e'
            is_neg = data['class'] == 'p'
            total = len(data)
            total_pos = int(is_pos.sum())
            total_neg = int(is_neg.sum())
            majority = 'Edible' if total_pos > total_neg else 'Poisonous'

            entropy_prev = entropy(total_pos, total_neg)
            if entropy_prev == 0:
                # Pure node (this also covers a pure root, which has no parent).
                self._attach(current_node, split_type, binary_tree_node(majority, None, None, None))
                print('Entropy is 0')
                continue

            if len(branch_features) == 0:
                # Impure, but nothing is left to split on: fall back to the majority class.
                self._attach(current_node, split_type, binary_tree_node(majority, None, None, None))
                print('No features left')
                continue

            max_gain = 0
            best_feature = None
            best_split = None
            for feature in branch_features:
                column = data[feature]
                for value in variable_info[feature]:
                    in_mask = column == value
                    num_in_feature = int(in_mask.sum())
                    num_pos_in_feature = int((in_mask & is_pos).sum())
                    num_neg_in_feature = int((in_mask & is_neg).sum())
                    # The "not equal" side is the complement, so its counts follow by subtraction.
                    num_not_in_feature = total - num_in_feature
                    num_pos_not_in_feature = total_pos - num_pos_in_feature
                    num_neg_not_in_feature = total_neg - num_neg_in_feature
                    expected_ent = DecisionTreeBinary.expected_entropy(num_in_feature, num_pos_in_feature, num_neg_in_feature, num_not_in_feature, num_pos_not_in_feature, num_neg_not_in_feature, total)
                    gain = DecisionTreeBinary.information_gain(entropy_prev, expected_ent)
                    if gain > max_gain:
                        max_gain = gain
                        best_feature = feature
                        best_split = value
            print(best_feature, best_split, max_gain)

            if best_feature is None:
                # No candidate improves purity: close the branch with a majority leaf.
                self._attach(current_node, split_type, binary_tree_node(majority, None, None, None))
                continue

            current_node = self._attach(current_node, split_type, binary_tree_node(best_feature, best_split, None, None))
            # Retire the feature for this node's descendants only.
            child_features = [name for name in branch_features if name != best_feature]

            Queue.put(((data[data[best_feature] == best_split]), 'yes', current_node, child_features))
            Queue.put(((data[data[best_feature] != best_split]), 'no', current_node, child_features))


# Fit the binary-split tree on the training rows and print it sideways.
DTB = DecisionTreeBinary()
DTB.greedy_recrusive_splitting(
    data=mushroom,
    remaining_features=list(features),
)
printTree(node=DTB.root)


In [None]:
# Binary-split decision tree with a maximum-depth limit

class binary_tree_node():
    """One node of a binary decision tree.

    Holds a split test (``feature`` compared against ``value``) and the two
    subtrees reached when the test passes (``yes``) or fails (``no``). The
    tree builder in this cell makes leaves by storing the predicted label in
    ``feature`` with ``value`` and both children set to None.
    """

    def __init__(self, feature, value, no, yes):
        """Record the split test and the children; ``no`` comes before ``yes``."""
        self.feature = feature
        self.value = value
        self.no = no
        self.yes = yes

    def add_no(self, no):
        """Replace the subtree taken when the test fails."""
        self.no = no

    def add_yes(self, yes):
        """Replace the subtree taken when the test passes."""
        self.yes = yes

    def __str__(self):
        """Render as 'feature' when there is no value, otherwise as 'feature,value'."""
        # Identity comparison: values that overload == (for example array-like
        # objects) must not be mistaken for a missing value.
        if self.value is None:
            return f"{self.feature}"
        return f"{self.feature},{self.value}"

class DecisionTreeBinary():
    """Depth-limited binary decision tree grown greedily by information gain.

    Each internal node asks "is <feature> equal to <value>?": matching rows go
    to the ``yes`` child, all other rows to the ``no`` child. Leaves are
    ``binary_tree_node`` objects whose ``feature`` is the predicted label
    (``pos`` or ``neg``).

    Relies on the module-level ``entropy`` function and ``binary_tree_node`` class.
    """

    def __init__(self, training_data_set, testing_data_set, traget_feature, pos='p', neg='n'):
        """Store the data sets and collect the candidate values of every feature.

        Args:
            training_data_set: DataFrame used to grow the tree.
            testing_data_set: DataFrame kept in ``self.test_data`` (not read by ``train``).
            traget_feature: name of the label column (parameter name kept as is
                for backward compatibility).
            pos: label value counted as positive.
            neg: label value counted as negative.
        """
        self.root = None
        self.data = training_data_set
        self.test_data = testing_data_set
        # Every column except the label is a candidate feature, wherever the
        # label column sits in the frame.
        columns = training_data_set.columns
        self.data_features = columns[columns != traget_feature]
        self.feature_values = {
            feature: self.data[feature].unique() for feature in self.data_features
        }
        self.target_feature=traget_feature
        self.feat_is_positive = pos
        self.feat_is_negative = neg

    @staticmethod
    def information_gain(entropy_prev, expected_entropy):
        """Return the entropy reduction achieved by a split."""
        return entropy_prev - expected_entropy

    @staticmethod
    def expected_entropy(num_in_feature, num_pos_in_feature, num_neg_in_feature, num_not_in_feature, num_pos_not_in_feature, num_neg_not_in_feature, total):
        """Return the size-weighted entropy of the two sides of a split.

        The ``*_in_feature`` counts describe the rows matching the tested
        value, the ``*_not_in_feature`` counts the rest; ``total`` is the
        number of rows before the split and must be non-zero.
        """
        return (num_in_feature/total) * entropy(num_pos_in_feature, num_neg_in_feature) + (num_not_in_feature/total) * entropy(num_pos_not_in_feature, num_neg_not_in_feature)

    def _attach(self, parent, split_type, node):
        """Hook ``node`` into the tree as the root or as ``parent``'s yes/no child; return it."""
        if split_type == 'root':
            self.root = node
        elif split_type == 'yes':
            parent.add_yes(node)
        else:
            parent.add_no(node)
        return node

    def train(self, max_depth=10):
        """Grow the tree breadth-first on ``self.data`` and store it in ``self.root``.

        Nodes at depth 0..max_depth (root is depth 0) may be split on the
        (feature, value) pair with the largest strictly positive information
        gain. A branch ends in a leaf when its rows are pure, when it lies
        deeper than ``max_depth``, when no candidate has positive gain, or
        when every feature was already used on the path from the root; impure
        leaves take the majority label (ties go to the negative label).

        Args:
            max_depth: deepest level at which a split node may be created.
        """
        Queue = queue.Queue()

        # Placeholder root; it is kept only when the training data is empty.
        self.root = binary_tree_node(None, None, None, None)

        # Work item: (rows reaching the node, link type to the parent, parent
        #             node, depth, features still unused on this path).
        Queue.put((self.data, 'root', None, 0, list(self.data_features)))

        while not Queue.empty():
            data, split_type, current_node, depth, branch_features = Queue.get()
            if len(data) == 0:
                continue

            # Class masks are computed once per node and reused for every candidate.
            is_pos = data[self.target_feature] == self.feat_is_positive
            is_neg = data[self.target_feature] == self.feat_is_negative
            total = len(data)
            total_pos = int(is_pos.sum())
            total_neg = int(is_neg.sum())
            majority = self.feat_is_positive if total_pos > total_neg else self.feat_is_negative

            entropy_prev = entropy(total_pos, total_neg)

            if entropy_prev == 0:
                # Pure node (this also covers a pure root, which has no parent).
                self._attach(current_node, split_type, binary_tree_node(majority, None, None, None))
                print('Entropy is 0')
                continue

            if depth > max_depth or len(branch_features) == 0:
                # Out of depth budget or of features: close the branch with a
                # majority leaf instead of leaving the child empty.
                self._attach(current_node, split_type, binary_tree_node(majority, None, None, None))
                continue

            max_gain = 0
            best_feature = None
            best_split = None
            for feature in branch_features:
                column = data[feature]
                for value in self.feature_values[feature]:
                    in_mask = column == value
                    num_in_feature = int(in_mask.sum())
                    num_pos_in_feature = int((in_mask & is_pos).sum())
                    num_neg_in_feature = int((in_mask & is_neg).sum())
                    # The "not equal" side is the complement, so its counts follow by subtraction.
                    num_not_in_feature = total - num_in_feature
                    num_pos_not_in_feature = total_pos - num_pos_in_feature
                    num_neg_not_in_feature = total_neg - num_neg_in_feature

                    expected_ent = DecisionTreeBinary.expected_entropy(num_in_feature, num_pos_in_feature, num_neg_in_feature, num_not_in_feature, num_pos_not_in_feature, num_neg_not_in_feature, total)

                    gain = DecisionTreeBinary.information_gain(entropy_prev, expected_ent)

                    if gain > max_gain:
                        max_gain = gain
                        best_feature = feature
                        best_split = value

            print(best_feature, best_split, max_gain)

            if best_feature is None:
                # No candidate improves purity; without this guard the feature
                # bookkeeping below would fail on a missing feature name.
                self._attach(current_node, split_type, binary_tree_node(majority, None, None, None))
                continue

            current_node = self._attach(current_node, split_type, binary_tree_node(best_feature, best_split, None, None))
            # Retire the feature for this node's descendants only, not for the whole tree.
            child_features = [name for name in branch_features if name != best_feature]

            Queue.put(((data[data[best_feature] == best_split]), 'yes', current_node, depth+1, child_features))
            Queue.put(((data[data[best_feature] != best_split]), 'no', current_node, depth+1, child_features))

    def test_accuracy(self):
        """Placeholder: not implemented yet, returns None."""
        pass

    def test_accuracy_at_depth(self):
        """Placeholder: not implemented yet, returns None."""
        pass
        

# Fit the depth-limited tree on the training split and print it sideways.
DTB = DecisionTreeBinary(
    training_data_set=mushroom_training,
    testing_data_set=mushroom_testing,
    traget_feature='class',
    pos='e',
    neg='p',
)
DTB.train(max_depth=2)

printTree(node=DTB.root)