In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Node class

In [3]:
class Node:
    """One node of a binary decision tree.

    A node is either a decision node or a leaf. The tree code in this file
    treats a node as a leaf when ``value`` is not ``None``; otherwise it
    reads the split description and follows ``left`` or ``right``.

    Attributes:
        left: child node followed when the split test succeeds.
        right: child node followed when the split test fails.
        information_gain: gain achieved by the split stored on this node.
        threshold: feature value the sample is compared against.
        column_name: display name of the feature used by the split.
        column_index: position of that feature in a sample row.
        operator: textual form of the split test (e.g. "<=" or "=").
        value: prediction stored on a leaf; ``None`` on decision nodes.
    """

    def __init__(self, left=None, right=None, information_gain=None,
                 threshold=None, column_name=None, column_index=None, operator=None, value=None):
        # Sub-trees (only meaningful on a decision node).
        self.left, self.right = left, right

        # Description of the split performed by a decision node.
        self.information_gain, self.threshold = information_gain, threshold
        self.column_name, self.column_index = column_name, column_index
        self.operator = operator

        # Prediction carried by a leaf node.
        self.value = value

# Tree class

In [4]:
class DecisionTree():
    """Binary decision-tree classifier grown greedily on information gain.

    Training rows are handled as a single 2-D array whose first
    ``n_features`` columns are the features and whose remaining column(s)
    hold the label. Every decision node tests one feature against one
    threshold; samples passing the test go to the left child, the others to
    the right child.

    Args:
        min_participant: minimum number of rows a node must hold to be split.
        max_depth: a node at depth ``d`` (root is 0) may be split only while
            ``d <= max_depth``.
    """

    # Feature count assumed by split_input_label() when the tree has not been
    # fitted yet. This is the width that used to be hard-coded.
    DEFAULT_FEATURE_COUNT = 22

    def __init__(self, min_participant=2, max_depth=2):
        self.column_names = {}
        self.root = None

        self.min_participant = min_participant
        self.max_depth = max_depth

        self.x_col_names = None
        # Number of feature columns; set by fit() from the shape of X.
        self.n_features = None

    def split_input_label(self, dataset):
        """Split ``dataset`` column-wise into ``(features, labels)``.

        The boundary is ``self.n_features`` once the tree has been fitted,
        otherwise ``DEFAULT_FEATURE_COUNT``.
        """
        n_features = getattr(self, "n_features", None)
        if n_features is None:
            n_features = self.DEFAULT_FEATURE_COUNT
        return dataset[:, :n_features], dataset[:, n_features:]

    def build_tree(self, dataset, depth=0):
        """Recursively grow the sub-tree for ``dataset`` and return its root.

        A node is split only if it holds at least ``min_participant`` rows,
        ``depth <= max_depth`` and the best split has a strictly positive
        information gain. In every other case a leaf carrying the majority
        label is returned.
        """
        count_participant = np.shape(dataset)[0]

        if count_participant >= self.min_participant and depth <= self.max_depth:
            split = self.get_split_value(dataset)
            # An empty dict means no feature offered a candidate threshold
            # (every feature is constant in this subset): fall through to a
            # leaf instead of failing on a missing key.
            if split and split['information_gain'] > 0:
                left = self.build_tree(split['left'], depth=depth + 1)
                right = self.build_tree(split['right'], depth=depth + 1)

                # Two leaves predicting the same label make the split
                # useless: collapse them into a single leaf.
                if (left.value is not None and right.value is not None
                        and left.value == right.value):
                    return Node(value=left.value)

                return Node(left, right, split['information_gain'],
                            split['threshold'], split['column_name'],
                            split['column_index'], split['operator'])

        return Node(value=self.compute_leaf_value(dataset))

    def get_split_value(self, dataset):
        """Return the split with the highest information gain.

        Returns:
            A dict with keys ``information_gain``, ``threshold``, ``left``,
            ``right``, ``column_name``, ``column_index`` and ``operator``,
            or an empty dict when no feature has at least two distinct
            values in ``dataset``.
        """
        max_gain = float('-inf')
        split = {}

        for i, column in enumerate(self.x_col_names):
            unique_values = np.unique(dataset[:, i])
            binary_col = len(unique_values) == 2

            if binary_col:
                # Two values only: a single equality test separates them.
                candidates = unique_values[:1]
            else:
                # "<= largest value" would leave the right side empty, so
                # only the largest value is excluded. The smallest value is
                # a valid threshold and must be tried.
                candidates = unique_values[:-1]

            for threshold in candidates:
                left, right = self.split(dataset, threshold, i, binary_col)
                gain = self.compute_information_gain(dataset, left, right)

                if gain > max_gain:
                    max_gain = gain
                    split = {
                        'information_gain': gain,
                        'threshold': threshold,
                        'left': left,
                        'right': right,
                        'column_name': column,
                        'column_index': i,
                        'operator': "=" if binary_col else "<=",
                    }
        return split

    def split(self, dataset, threshold, column_index, binary_col):
        """Partition the rows of ``dataset`` on one feature.

        Rows go left when the feature equals ``threshold`` (binary column)
        or is ``<= threshold`` (any other column); the remaining rows go
        right.
        """
        column = dataset[:, column_index]
        goes_left = column == threshold if binary_col else column <= threshold
        return dataset[goes_left], dataset[~goes_left]

    def compute_information_gain(self, dataset, left, right):
        """Return parent entropy minus the size-weighted entropy of the children."""
        _, labels = self.split_input_label(dataset)
        _, left_labels = self.split_input_label(left)
        _, right_labels = self.split_input_label(right)

        weight_left = len(left) / len(dataset)
        weight_right = len(right) / len(dataset)

        childs_entropy = (self.entropy(left_labels) * weight_left
                          + self.entropy(right_labels) * weight_right)

        return self.entropy(labels) - childs_entropy

    def entropy(self, labels):
        """Return the Shannon entropy (in bits) of ``labels``.

        Works for any set of class values, not only 0/1. An empty input has
        entropy 0.
        """
        total = np.size(labels)
        if total == 0:
            return 0.0

        _, counts = np.unique(labels, return_counts=True)
        res = 0.0
        # np.unique only reports classes that occur, so every proportion is
        # strictly positive and log2 is always defined.
        for value_proportion in counts / total:
            res -= value_proportion * np.log2(value_proportion)
        return res

    def compute_leaf_value(self, dataset):
        """Return the most frequent label in ``dataset``.

        Ties are resolved in favour of the smallest label, because
        ``np.unique`` returns sorted values and ``np.argmax`` keeps the
        first maximum.
        """
        _, labels = self.split_input_label(dataset)
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X, Y, x_col_names):
        """Grow the tree from features ``X`` and labels ``Y``.

        Args:
            X: 2-D array-like, one row per sample.
            Y: labels, either 1-D or 2-D with one row per sample.
            x_col_names: names of the feature columns, in column order.

        Raises:
            ValueError: if more column names than feature columns are given,
                which would make the label column usable as a feature.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        if len(x_col_names) > X.shape[1]:
            raise ValueError(
                f"got {len(x_col_names)} column names for {X.shape[1]} feature columns")

        self.x_col_names = x_col_names
        self.n_features = X.shape[1]
        self.root = self.build_tree(np.concatenate((X, Y), axis=1))

    @staticmethod
    def _goes_left(node, feature_value):
        """Apply the node's split test, mirroring the rule used by split()."""
        if node.operator == "=":
            return feature_value == node.threshold
        return feature_value <= node.threshold

    def evaluate(self, x, decision_tree):
        """Return the leaf value reached by sample ``x`` starting at ``decision_tree``."""
        node = decision_tree
        while node.value is None:
            if self._goes_left(node, x[node.column_index]):
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return the list of predictions for every row of ``X``."""
        return [self.evaluate(x, self.root) for x in X]

    def evaluate_with_list(self, x, decision_tree, evaluation_list=None, display_evaluation_flow=True):
        """Evaluate ``x`` while recording every test taken on the way down.

        Args:
            x: one sample row.
            decision_tree: node to start from.
            evaluation_list: list the textual tests are appended to; a new
                list is created for each call when omitted.
            display_evaluation_flow: when true, print the recorded tests
                before the predicted value.

        Returns:
            ``(evaluation_list, value)`` where ``value`` is the leaf value.
        """
        # A fresh list per call: a shared default list would keep the path
        # of every previous prediction.
        if evaluation_list is None:
            evaluation_list = []

        node = decision_tree
        while node.value is None:
            x_column_value = x[node.column_index]

            if self._goes_left(node, x_column_value):
                symbol = node.operator
                child = node.left
            else:
                symbol = "!=" if node.operator == "=" else ">"
                child = node.right

            evaluation_list.append(
                f"{node.column_name} : {x_column_value!s} {symbol} {node.threshold!s}")
            node = child

        if display_evaluation_flow:
            print(*evaluation_list, sep='\n')
        print("\x1b[6;30;42mValue :\x1b[0m", node.value)
        return evaluation_list, node.value

    def predict_one_element(self, x, display_evaluation_flow=True):
        """Predict a single sample and return ``(decision path, value)``."""
        return self.evaluate_with_list(x, self.root, display_evaluation_flow=display_evaluation_flow)

    def pretty_print(self, information_gain=True, tiret="|---"):
        """Print the fitted tree, or a hint when ``fit`` has not been called.

        Args:
            information_gain: forwarded to ``pretty_print_node``, which does
                not currently use it.
            tiret: prefix printed in front of the root line.
        """
        if self.root is None:
            print("No tree, please use the fit method")
            return

        self.pretty_print_node(self.root, information_gain=information_gain, tiret=tiret)

    def pretty_print_node(self, node, information_gain=True, tiret="|---"):
        """Print ``node`` and its descendants, one decision per line.

        Leaf children are printed inline on their parent's line; other
        children are printed on following lines with a longer prefix.
        ``information_gain`` is accepted for interface compatibility and is
        not used.
        """
        if node.value is not None:
            print(tiret, "value", node.value)
            return

        print(tiret, node.column_name, node.operator, node.threshold, end=' ')

        child_prefix = tiret + "|---"
        left_is_leaf = node.left.value is not None
        right_is_leaf = node.right.value is not None

        if left_is_leaf and right_is_leaf:
            print("value", node.left.value, "else", node.right.value)
        elif left_is_leaf:
            print("value", node.left.value)
            self.pretty_print_node(node.right, tiret=child_prefix)
        elif right_is_leaf:
            print("else ", node.right.value)
            self.pretty_print_node(node.left, tiret=child_prefix)
        else:
            print()
            self.pretty_print_node(node.left, tiret=child_prefix)
            self.pretty_print_node(node.right, tiret=child_prefix)

# Confusion matrix

In [5]:
# Confusion matrix 

def prediction_analyse(model, X_test, Y_test):
    """Print and plot binary-classification metrics for ``model``.

    Calls ``model.predict(X_test)``, prints the accuracy, shows the
    confusion matrix as an annotated heatmap and prints sensitivity,
    specificity, PPV and NPV.

    Args:
        model: object exposing ``predict(X)``.
        X_test: samples passed to ``model.predict``.
        Y_test: true labels; flattened to 1-D before scoring.

    Returns:
        The accuracy score.

    Note:
        The annotation grid and the metrics index the confusion matrix as a
        2x2 array, so both classes must appear in the true or predicted
        labels.
    """
    y_true = np.asarray(Y_test).flatten()
    Y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_true, Y_pred)
    print("accuracy ==>", accuracy)

    cm = confusion_matrix(y_true, Y_pred)
    cells = cm.flatten()

    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    group_counts = [f"{value:0.0f}" for value in cells]
    group_percentages = [f"{value:.2%}" for value in cells / np.sum(cm)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
              zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')
    plt.show()

    TP = cm[1, 1]  # true positives
    TN = cm[0, 0]  # true negatives
    FP = cm[0, 1]  # false positives
    FN = cm[1, 0]  # false negatives

    def _ratio(numerator, denominator):
        # A metric whose denominator is zero (e.g. PPV when nothing was
        # predicted positive) is undefined: report nan without dividing.
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    sensitivity = _ratio(TP, TP + FN)
    specificity = _ratio(TN, TN + FP)
    # positive predictive value
    PPV = _ratio(TP, TP + FP)
    # negative predictive value
    NPV = _ratio(TN, TN + FN)

    print(f'sensitivity : {sensitivity}, specificity : {specificity}, PPV : {PPV}, NPV : {NPV}')

    return accuracy