# Introduction


# Pre-Modelling

## Imports

Firstly we will need to import all required libraries for this task

In [48]:
import math
import pandas as pd
import queue
import random

random.seed(200)

## Load Data

Load the data in our directory, this will be different based on where you put the data file.

On examining the file I see that the .data file is actually comma separated files. During this examination I also see that the data doesn't have a feature row, but there is information about which coloumn is which feature so I created a row above all the data with each feature's name

### Missing data

The data sheet say's that the data is missing some values and all the cells that are '?' is missing data. I need find these values and replace with the average of the column, I will use the mode of the column as the data is categorical. I found that the feature stalk-root has the missing data and the mode in that feature was 'b' - bulbous, so I replaced all the '?' with 'b'

### Separating Training/Validation/Test data

For making this decision tree model and optimising it, I need to be able to test the model on data it has not seen so we have to separate the testing data.

The ultimate goal is to create a model based on all the data but we can't test how the model will react to unseen data if we don't have any more data to test on.

The validation data will be within the training data set, this way we can split the data later for doing different splits (30/70, 90/10, 2-fold cross validation, 10-fold cross validation).

After all the training/validating we will test the seperate models on the separated test data.


In [49]:

mushroom = pd.read_csv('agaricus-lepiota.data')

# check for missing values that are ? in the dataset
missing_values = mushroom.isin(['?']).sum()
# print(missing_values)
for column in list(mushroom.loc[:,]):
    # print(mushroom[column].value_counts())
    pass

# replace values with most common in that column
mushroom = mushroom.replace('?', 'b')
# print(mushroom)

# using 30% of data points as a training dataset
mushroom_testing = mushroom.sample(frac=0.3, random_state=200)
# print(mushroom_testing)
mushroom_training = mushroom.drop(mushroom_testing.index)
# print(mushroom_training)

# reset index so that loops won't get stuck and crash the program
mushroom_testing = mushroom_testing.reset_index(drop=True)
mushroom_training = mushroom_training.reset_index(drop=True)

## Entropy Calculations



In [50]:
# entropy calculation
def entropy(num_pos, num_neg):
    if num_pos == 0 or num_neg == 0:
        return 0
    p = num_pos / (num_pos + num_neg)
    n = num_neg / (num_pos + num_neg)
    return -(p * math.log2(p) + n * math.log2(n))


def information_gain(entropy_prev, expected_entropy):
    return entropy_prev - expected_entropy


def expected_entropy(num_in_feature, num_pos_in_feature, num_neg_in_feature, num_not_in_feature, num_pos_not_in_feature, num_neg_not_in_feature, total):
    return (num_in_feature/total) * entropy(num_pos_in_feature, num_neg_in_feature) + (num_not_in_feature/total) * entropy(num_pos_not_in_feature, num_neg_not_in_feature)

# modified printing non-binary trees


def printNonBinaryTree(node, level=0, split_type=None):
    if node != None:
        keys = list(node.split.keys())
        num_of_values = len(keys)
        for i in range(int(num_of_values/2)):
            printNonBinaryTree(node.split[keys[i]], level+1, keys[i])
        if split_type == None:
            print('     ' * 4 * level + '-> ' + str(node))
        else:
            print('     ' * 4 * level + str(split_type) + ' -> ' + str(node))
        for i in range(int(num_of_values - (num_of_values/2))):
            printNonBinaryTree(
                node.split[keys[i+int(num_of_values/2)]], level+1, keys[i+int(num_of_values/2)])

## Tree Nodes


## Dataset_targets



In [51]:
# For non-binary tree approach

class Decision_tree_node():
    def __init__(self, feature):
        self.feature = feature
        self.split = {}

    def add(self, split_name, node):
        self.split[split_name] = node

    def __str__(self):
        return f"{self.feature}"


class Dataset_targets():

    def __init__(self, target_feature, feature_positive, feature_negative):
        self.target_feature = target_feature
        self.feature_positive = feature_positive
        self.feature_negative = feature_negative

    def load(self):
        return self.target_feature, self.feature_positive, self.feature_negative

# Decision Tree Model - Final Implementation

In [75]:
class DecisionTree():

    def __init__(self, training_data, validation_data, targets=None, predict_on_stopping_depth=True):
        self.root = None
        # If split is not set just use dataset as training and testing
        self.training_data = training_data
        self.validation_data = validation_data
        self.data_features = training_data.columns[1:]
        self.feature_values = {}
        for feature in self.data_features:
            self.feature_values[feature] = self.training_data[feature].unique()

        self.target_feature, self.feat_is_positive, self.feat_is_negative = targets.load()
        self.predict_on_stopping_depth = predict_on_stopping_depth
        self.tree_depth = 0

    def train(self, stopping_depth=1000, do_output=False):

        if do_output:
            print(f"Training Decision Tree to depth of {stopping_depth} -->")
        remaining_features = list(self.data_features)
        Queue = queue.Queue()

        split_type = 'root'
        self.root = Decision_tree_node(None)
        current_node = None
        depth = 0

        Queue.put((self.training_data, split_type, None, depth))

        while Queue.empty() == False:
            data, split_type, current_node, depth = Queue.get()
            if do_output:
                print(f'depth: {depth}', end=f":  {split_type} -> ")
            if len(data) == 0 or len(remaining_features) == 0:
                if do_output:
                    print(" everything classified")
                continue
            max_gain = 0
            best_feature = None
            total = len(data)
            total_pos = len(data[data[self.target_feature]
                            == self.feat_is_positive])
            total_neg = len(data[data[self.target_feature]
                            == self.feat_is_negative])

            entropy_prev = entropy(total_pos, total_neg)

            if self.predict_on_stopping_depth:
                if depth > stopping_depth-1:
                    if total_pos > total_neg:
                        current_node.add(
                            split_type, Decision_tree_node(self.feat_is_positive))
                    else:
                        current_node.add(
                            split_type, Decision_tree_node(self.feat_is_negative))
                    if do_output:
                        print(" reached desired depth")
                    self.tree_depth = depth+1
                    continue
            else:
                if depth > stopping_depth:
                    if do_output:
                        print(" reached desired depth")
                    continue

            if entropy_prev == 0:
                if total_pos > total_neg:
                    current_node.add(
                        split_type, Decision_tree_node(self.feat_is_positive))
                else:
                    current_node.add(
                        split_type, Decision_tree_node(self.feat_is_negative))
                self.tree_depth = depth
                if do_output:
                    print(' Entropy is 0')
                continue

            for feature in remaining_features:
                for value in self.feature_values[feature]:

                    num_in_feature = len(data[data[feature] == value])
                    num_not_in_feature = len(data[data[feature] != value])
                    num_pos_in_feature = len(data[(data[feature] == value) & (
                        data[self.target_feature] == self.feat_is_positive)])
                    num_neg_in_feature = len(data[(data[feature] == value) & (
                        data[self.target_feature] == self.feat_is_negative)])
                    num_pos_not_in_feature = len(data[(data[feature] != value) & (
                        data[self.target_feature] == self.feat_is_positive)])
                    num_neg_not_in_feature = len(data[(data[feature] != value) & (
                        data[self.target_feature] == self.feat_is_negative)])

                    expected_ent = expected_entropy(num_in_feature, num_pos_in_feature, num_neg_in_feature,
                                                    num_not_in_feature, num_pos_not_in_feature, num_neg_not_in_feature, total)

                    gain = information_gain(entropy_prev, expected_ent)

                    if gain > max_gain:
                        max_gain = gain
                        best_feature = feature

            if split_type == 'root':
                self.root = Decision_tree_node(best_feature)
                current_node = self.root
            else:
                current_node.add(split_type, Decision_tree_node(best_feature))
                current_node = current_node.split[split_type]

            for value in self.feature_values[best_feature]:
                Queue.put(((data[data[best_feature] == value]),
                          value, current_node, depth+1))

            if do_output:
                print(best_feature, max_gain)
            remaining_features.remove(best_feature)
            self.tree_depth = depth

        if do_output:
            printNonBinaryTree(self.root)

    def printTree(self):
        printNonBinaryTree(self.root)

    def classify_item(self, item):
        current_node = self.root
        while current_node != None:
            feature = current_node.feature
            value = item[feature]
            if value not in list(current_node.split.keys()):
                break
            current_node = current_node.split[value]

            if current_node.feature == self.feat_is_positive:
                return self.feat_is_positive
            elif current_node.feature == self.feat_is_negative:
                return self.feat_is_negative

        # if the tree couldn't identify the item randomly select positive or negative
        # If we did this it will skew the testing as the 50/50 prediction may get too many wrong or right based on pure chance
        # rand_num = random.randint(0,1).
        #
        # if rand_num == 0:
        #     return self.feat_is_negative
        # else:
        #     return self.feat_is_positive

        # if the tree couldn't identify the item return 'does not know'
        return 'dnk'

    def training_accuracy_at_depth(self, depth):
        self.train(stopping_depth=depth)
        total = len(self.training_data)
        correct = 0
        training_test_answers = self.training_data.loc[:, [
            self.target_feature]]
        predictions = []
        train_data_test = self.training_data.drop(self.target_feature, axis=1)
        for i in range(len(train_data_test)):
            item = train_data_test.loc[i]
            predictions.append(self.classify_item(item))
        for i in range(total):
            if training_test_answers.iloc[i, 0] == predictions[i]:
                correct += 1
        return correct/total

    def validate(self):
        if self.tree_depth == 0:
            print("Train the tree first before you validate")
            return
        predictions = []
        validation_data = self.validation_data.drop(
            self.target_feature, axis=1)
        for i in range(len(validation_data)):
            item = validation_data.loc[i]
            predictions.append(self.classify_item(item))
        return predictions

    def validation_accuracy(self):
        if self.tree_depth == 0:
            print("Train the tree first before you validate")
            return
        test_answers = self.validation_data.loc[:, [self.target_feature]]
        total = len(test_answers)
        correct = 0
        test_predictions = self.validate()
        for i in range(total):
            if test_answers.iloc[i, 0] == test_predictions[i]:
                correct += 1
        return correct, total

    def validation_error(self):
        return 1-self.validation_accuracy()

    def validation_accuracy_at_depth(self, depth, do_output=False):
        if self.tree_depth != depth:
            # print(f"re-training tree to depth {depth}")
            self.train(stopping_depth=depth)
        correct, total = self.validation_accuracy()
        if do_output:
            print(
                f'Validation accuracy: {correct}/{total} -> {(correct/total)*100:.2f}%\n')
        return correct/total

    def testing_accuracy_at_depth(self, depth=1000, testing_data=[]):
        if len(testing_data) == 0:
            raise ValueError("No testing data")
        if self.tree_depth != depth:
            # print(f"re-training tree to depth {depth}")
            self.train(stopping_depth=depth)
        total = len(testing_data)
        correct = 0
        test_answers = testing_data.loc[:, [self.target_feature]]
        predictions = []
        testing_data = testing_data.drop(self.target_feature, axis=1)
        for i in range(len(testing_data)):
            item = testing_data.loc[i]
            predictions.append(self.classify_item(item))
        for i in range(total):
            if test_answers.iloc[i, 0] == predictions[i]:
                correct += 1
        return correct/total


mushroom_targets = Dataset_targets(
    target_feature='class', feature_positive='e', feature_negative='p')

                    p -> p
                    a -> e
                    l -> e
                                        k -> e
                                        n -> e
                                        h -> e
                                                                                                    n -> e
                                                                                s ->  cap-color
                                                                                                    w -> p
                                                            n ->  stalk-surface-below-ring
                                                                                y -> p
                                        w ->  gill-size
                                                            b -> e
                    n ->  spore-print-color
                                        r -> p
                                        o -> e
                             

In [None]:
dt = DecisionTree(mushroom_training, mushroom_training,
                  mushroom_targets, predict_on_stopping_depth=True)
dt.train()
dt.printTree()
print(f'{dt.tree_depth} depth tree --> Test accuracy {dt.testing_accuracy_at_depth(depth=dt.tree_depth, testing_data=mushroom_testing)*100:.2f}%')

In [None]:
dt2 = DecisionTree(mushroom_training, mushroom_training,
                   mushroom_targets, predict_on_stopping_depth=False)
dt2.train(2)
dt2.printTree()
print(f'{dt2.tree_depth} depth tree (without final depth prediction) --> Test accuracy {dt2.testing_accuracy_at_depth(depth=dt2.tree_depth, testing_data=mushroom_testing)*100:.2f}%')

dt3 = DecisionTree(mushroom_training, mushroom_training,
                   mushroom_targets, predict_on_stopping_depth=True)
dt3.train(2)
dt3.printTree()
print(f'{dt3.tree_depth} depth tree (with final depth prediction) --> Test accuracy {dt3.testing_accuracy_at_depth(depth=dt3.tree_depth, testing_data=mushroom_testing)*100:.2f}%')

In [76]:
# Validation and testing -->
def single_split_validation(data_set, data_targets, training_validation_split=0.5, tree_depth=1, predict_on_stopping_depth=True):
    training_data = data_set.sample(
        frac=training_validation_split, random_state=200)
    validation_data = data_set.drop(training_data.index)
    training_data = training_data.reset_index(drop=True)
    validation_data = validation_data.reset_index(drop=True)
    print("\n-----------------------------------------------")
    print(
        f"Decision tree with {int(training_validation_split*100)}/{int((1-training_validation_split)*100)} split at depth {tree_depth}")
    print("-----------------------------------------------")
    dt = DecisionTree(training_data=training_data, validation_data=validation_data,
                      targets=data_targets, predict_on_stopping_depth=predict_on_stopping_depth)
    validation_accuracy = dt.validation_accuracy_at_depth(tree_depth)
    print(f'Validation Accuracy --> {validation_accuracy}')
    print("-----------------------------------------------\n")

    return validation_accuracy


def cross_validation(data_set, data_targets, folds=2, tree_depth=1, predict_on_stopping_depth=True):
    fold_frac = 1/folds
    data_folds = []
    validation_accuracies = []
    all_data = data_set
    rest_of_data = data_set
    for i in range(folds):
        if i == folds-1:
            data_folds.append(rest_of_data)
        else:
            data_fold = rest_of_data.sample(frac=fold_frac, random_state=200)
            rest_of_data = rest_of_data.drop(data_fold.index)
            data_folds.append(data_fold)
    print("\n-----------------------------------------------")
    print(
        f"Decision trees using {folds}-fold cross-validation at depth {tree_depth}")
    print("-----------------------------------------------")
    for i in range(folds):
        validation_data = data_folds[i]
        training_data = all_data.drop(validation_data.index)
        training_data = training_data.reset_index(drop=True)
        validation_data = validation_data.reset_index(drop=True)

        dt = DecisionTree(training_data=training_data, validation_data=validation_data,
                          targets=data_targets, predict_on_stopping_depth=predict_on_stopping_depth)
        validation_accuracies.append(
            dt.validation_accuracy_at_depth(tree_depth))
        print(
            f'Fold {i+1}: Validation Accuracy --> {validation_accuracies[i]}')
    print(f'Overall Validation Accuracy = {sum(validation_accuracies)/folds}')
    print("-----------------------------------------------\n")

    return sum(validation_accuracies)/folds


def leave_one_out(data_set, data_targets, tree_depth=1, predict_on_stopping_depth=False):
    folds = len(data_set)-1
    return cross_validation(data_set=data_set, data_targets=data_targets, folds=folds, tree_depth=tree_depth, predict_on_stopping_depth=predict_on_stopping_depth)


def find_best_hyper_parameters(data_set, data_targets, max_depth=10, predict_on_stopping_depth=True):
    decision_tree_accuracies = {}
    for depth in range(1, max_depth+1):
        for folds in range(2, 11):
            decision_tree_name = f'cross_validation_tree_depth_of_{depth}_with_{folds}_folds'
            decision_tree_accuracies[decision_tree_name] = cross_validation(
                data_set=data_set, data_targets=data_targets, folds=folds, tree_depth=depth, predict_on_stopping_depth=predict_on_stopping_depth)
        for training_validation_split in range(50, 91, 10):
            decision_tree_name = f'single_tree_depth_of_{depth}_training_validation_split_of_{training_validation_split:.2f}'
            decision_tree_accuracies[decision_tree_name] = single_split_validation(data_set=data_set, data_targets=data_targets, training_validation_split=(
                training_validation_split/100), tree_depth=depth, predict_on_stopping_depth=predict_on_stopping_depth)
    for decision_tree_name in decision_tree_accuracies:
        best = 0
        if decision_tree_accuracies[decision_tree_name] > best:
            best = decision_tree_accuracies[decision_tree_name]
            best_decision_tree = decision_tree_name
    print(f'{best_decision_tree} has the best accuracy of {best}')

# # Tree depth 1
# single_split_validation(mushroom_training, mushroom_targets,
#                         training_validation_split=0.7, tree_depth=1)
# cross_validation(mushroom_training, mushroom_targets,
#                  mushroom_testing, folds=10, tree_depth=1)

# # Tree depth 2
# single_split_validation(mushroom_training, mushroom_targets,
#                         training_validation_split=0.7, tree_depth=2)
# cross_validation(mushroom_training, mushroom_targets,
#                  folds=10, tree_depth=2)

# # Tree depth 3
# single_split_validation(mushroom_training, mushroom_targets,
#                         training_validation_split=0.7, tree_depth=3)
# cross_validation(mushroom_training, mushroom_targets,
#                  folds=10, tree_depth=3)

# # Tree depth 4
# single_split_validation(mushroom_training, mushroom_targets,
#                         training_validation_split=0.7, tree_depth=4)
# cross_validation(mushroom_training, mushroom_targets,
#                  folds=10, tree_depth=4)

# # Tree depth 5
# single_split_validation(mushroom_training, mushroom_targets,
#                         mushroom_testing, training_validation_split=0.7, tree_depth=5)
# cross_validation(mushroom_training, mushroom_targets,
#                  folds=10, tree_depth=5)

# Leave one out takes way too long for the entire dataset
# leave_one_out(mushroom, mushroom_targets)


find_best_hyper_parameters(data_set=mushroom_training,
                           data_targets=mushroom_targets, max_depth=4, predict_on_stopping_depth=False)
find_best_hyper_parameters(data_set=mushroom_training,
                           data_targets=mushroom_targets, max_depth=4)


-----------------------------------------------
Decision trees using 2-fold cross-validation at depth 1
-----------------------------------------------
Fold 1: Validation Accuracy --> 0.5562587904360057
Fold 2: Validation Accuracy --> 0.5747449876890609
Overall Validation Accuracy = 0.5655018890625333
-----------------------------------------------


-----------------------------------------------
Decision trees using 3-fold cross-validation at depth 1
-----------------------------------------------
Fold 1: Validation Accuracy --> 0.5385021097046413
Fold 2: Validation Accuracy --> 0.5909810126582279
Fold 3: Validation Accuracy --> 0.5730114760585675
Overall Validation Accuracy = 0.5674981994738122
-----------------------------------------------


-----------------------------------------------
Decision trees using 4-fold cross-validation at depth 1
-----------------------------------------------
Fold 1: Validation Accuracy --> 0.5541490857946554
Fold 2: Validation Accuracy --> 0.59474