<a href="https://colab.research.google.com/github/nguyen-nhat-mai/ensemble_project/blob/main/Decision_tree_classifier_regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DECISION TREE CLASSIFIER and REGRESSOR**
By Haiwei FU, Mengyu LIANG, Nhat Mai NGUYEN, Jinji SHEN and Vanshika SHARMA

# **Setup**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy import stats
from collections import Counter
from sklearn.datasets import load_wine
from sklearn.datasets import load_breast_cancer,make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error,mean_absolute_error,r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# **Self-built Decision Tree**

In [2]:
#a single node of the binary decision tree
class Node:
    """Container for one tree node.

    An internal node stores a (split, feature) pair and two children;
    a leaf stores its prediction in the public attribute ``leafv``.
    """

    def __init__(self):
        """Create an empty node: no split, no children, no leaf value."""
        self.__Bs = self.__Bf = None
        self.__left = self.__right = None
        self.leafv = None

    def set_params(self,Bs,Bf):
        """Store the split value ``Bs`` and the feature index ``Bf``."""
        self.__Bs, self.__Bf = Bs, Bf

    def get_params(self):
        """Return the stored parameters as a ``(split, feature)`` tuple."""
        return self.__Bs, self.__Bf

    def set_children(self,left,right):
        """Attach the left and right child nodes."""
        self.__left, self.__right = left, right

    def get_left_node(self):
        """Return the left child (``None`` if it was never set)."""
        return self.__left

    def get_right_node(self):
        """Return the right child (``None`` if it was never set)."""
        return self.__right

#base class to encompass the decision tree algorithm
class DecisionTree:
    """Greedy binary decision tree shared by the classifier and the regressor.

    The tree is grown by trying every (feature, threshold) pair at a node and
    keeping the one with the lowest size-weighted impurity of the two
    children. Subclasses supply ``_impurity`` and ``_leaf_value``.

    Parameters:
        max_depth: maximum number of node levels, the root being level 1
            (so ``max_depth=1`` yields a single leaf and ``max_depth=k``
            allows at most ``k-1`` splits along any path). ``None`` means
            no limit.
        min_samples_split: minimum number of rows a node must hold before a
            split is attempted.
    """

    #initializer
    def __init__(self,max_depth=None,min_samples_split=2):
        self.tree              = None
        self.max_depth         = max_depth
        self.min_samples_split = min_samples_split

    #protected function to define the impurity
    def _impurity(self,D):
        """Return the impurity of ``D`` (last column = target); subclass hook."""
        raise NotImplementedError("subclasses must implement _impurity")

    #protected function to compute the value at a leaf node
    def _leaf_value(self,D):
        """Return the prediction stored at a leaf holding ``D``; subclass hook."""
        raise NotImplementedError("subclasses must implement _leaf_value")

    #private helper: exhaustive search for the best (feature,split) pair
    def __best_split(self,D):
        """Find the split of ``D`` with the lowest weighted child impurity.

        Returns:
            ``(feature, split, left_D, right_D)``; all four entries are
            ``None`` when no threshold separates ``D`` into two non-empty
            parts (e.g. every feature is constant over the rows of ``D``).
        """
        n_rows  = D.shape[0]
        ip_node = None
        best    = (None,None,None,None)
        #iterate through the possible feature/split combinations
        for f in range(D.shape[1]-1):
            col = D[:,f]
            for s in np.unique(col):
                #for the current (f,s) combination, split the dataset
                D_l = D[col<=s]
                D_r = D[col>s]
                #ensure we have non-empty arrays
                if D_l.size and D_r.size:
                    #size-weighted impurity of the two children
                    ip = (D_l.shape[0]/n_rows)*self._impurity(D_l) + (D_r.shape[0]/n_rows)*self._impurity(D_r)
                    #strict '<' keeps the first best candidate on ties
                    if (ip_node is None) or (ip < ip_node):
                        ip_node = ip
                        best    = (f,s,D_l,D_r)
        return(best)

    #private recursive function to grow the tree during training
    def __grow(self,node,D,level):
        """Recursively turn ``node`` into an internal node or a leaf for ``D``.

        ``level`` is the 1-based depth of ``node`` (the root is level 1).
        """
        #are we in a leaf node? let's do some check...
        depth = (self.max_depth is None) or (self.max_depth >= (level+1))
        msamp = (self.min_samples_split <= D.shape[0])
        n_cls = np.unique(D[:,-1]).shape[0] != 1

        if depth and msamp and n_cls:
            (feature,split,left_D,right_D) = self.__best_split(D)
            #a usable split exists only if some threshold yields two non-empty
            #children; otherwise fall through and make this node a leaf
            if feature is not None:
                #set the current node's parameters
                node.set_params(split,feature)
                #declare child nodes
                left_node  = Node()
                right_node = Node()
                node.set_children(left_node,right_node)
                #investigate child nodes
                self.__grow(left_node,left_D,level+1)
                self.__grow(right_node,right_D,level+1)
                return

        #is a leaf node: set the node value
        node.leafv = self._leaf_value(D)

    #private function to traverse the (trained) tree
    def __traverse(self,node,Xrow):
        """Walk from ``node`` down to a leaf for one sample and return its value."""
        #iterative descent: avoids Python's recursion limit on very deep trees
        while node.leafv is None:
            #get parameters at the node
            (s,f) = node.get_params()
            #decide to go left or right
            if (Xrow[f] <= s):
                node = node.get_left_node()
            else:
                node = node.get_right_node()
        #return the leaf value
        return(node.leafv)

    #train the tree model
    def train(self,Xin,Yin):
        """Fit the tree.

        Parameters:
            Xin: 2-D array-like of features, one row per sample.
            Yin: 1-D array-like of targets, one entry per row of ``Xin``.

        The features and targets are joined column-wise into one array
        (targets in the last column), so they end up sharing a single dtype.
        """
        #prepare the input data
        D = np.concatenate((np.asarray(Xin),np.asarray(Yin).reshape(-1,1)),axis=1)
        #set the root node of the tree
        self.tree = Node()
        #build the tree
        self.__grow(self.tree,D,1)

    #make predictions from the trained tree
    def predict(self,Xin):
        """Return a 1-D array with one prediction per row of ``Xin``."""
        Xin = np.asarray(Xin)
        #iterate through the rows of Xin
        p = [self.__traverse(self.tree,Xin[r,:]) for r in range(Xin.shape[0])]
        #return predictions
        return(np.array(p).flatten())

In [3]:
#Decision Tree Classifier
class DecisionTreeClassifier(DecisionTree):
    """Decision tree for classification.

    Parameters:
        max_depth: forwarded to ``DecisionTree`` (default 2).
        min_samples_split: forwarded to ``DecisionTree`` (default 5).
        loss: node impurity measure, ``'gini'`` or ``'entropy'``.

    Raises:
        ValueError: if ``loss`` is not one of the supported names.
    """

    #impurity measures accepted by the `loss` argument
    _LOSSES = ('gini','entropy')

    #initializer
    def __init__(self,max_depth=2,min_samples_split=5,loss='gini'):
        #fail early: an unknown loss would otherwise only surface during
        #training as an arithmetic error on a None impurity
        if loss not in self._LOSSES:
            raise ValueError("loss must be one of %s, got %r" % (self._LOSSES,loss))
        DecisionTree.__init__(self,max_depth,min_samples_split)
        self.loss = loss

    #private function returning the relative frequency of each class in D
    def __class_probs(self,D):
        #one pass over the labels instead of one boolean mask per class
        (_,counts) = np.unique(D[:,-1],return_counts=True)
        return(counts/D.shape[0])

    #private function to define the gini impurity
    def __gini(self,D):
        """Return the Gini impurity, sum over classes of p*(1-p)."""
        G = 0
        for p in self.__class_probs(D):
            G += p*(1-p)
        return(G)

    #private function to define the shannon entropy
    def __entropy(self,D):
        """Return the Shannon entropy in bits, -sum over classes of p*log2(p)."""
        H = 0
        #every p is > 0 because only classes present in D are counted
        for p in self.__class_probs(D):
            H -= p*np.log2(p)
        return(H)

    #protected function to define the impurity
    def _impurity(self,D):
        """Return the impurity of ``D`` (labels in the last column) under ``self.loss``."""
        if self.loss == 'gini':
            return(self.__gini(D))
        if self.loss == 'entropy':
            return(self.__entropy(D))
        #reached only if self.loss was changed after construction
        raise ValueError("loss must be one of %s, got %r" % (self._LOSSES,self.loss))

    #protected function to compute the value at a leaf node
    def _leaf_value(self,D):
        """Return the most frequent label in ``D``; ties go to the smallest label."""
        #np.unique returns sorted labels and argmax picks the first maximum;
        #the result is a scalar regardless of the installed SciPy version
        (labels,counts) = np.unique(D[:,-1],return_counts=True)
        return(labels[np.argmax(counts)])

In [4]:
#Decision Tree Regressor
class DecisionTreeRegressor(DecisionTree):
    """Decision tree for regression.

    Parameters:
        max_depth: forwarded to ``DecisionTree`` (default ``None``).
        min_samples_split: forwarded to ``DecisionTree`` (default 2).
        loss: node impurity measure, ``'mse'`` or ``'mae'``. Both are
            measured around the node's mean target, and the leaf prediction
            is always the mean.

    Raises:
        ValueError: if ``loss`` is not one of the supported names.
    """

    #impurity measures accepted by the `loss` argument
    _LOSSES = ('mse','mae')

    #initializer
    def __init__(self,max_depth=None,min_samples_split=2,loss='mse'):
        #fail early: an unknown loss would otherwise only surface during
        #training as an arithmetic error on a None impurity
        if loss not in self._LOSSES:
            raise ValueError("loss must be one of %s, got %r" % (self._LOSSES,loss))
        DecisionTree.__init__(self,max_depth,min_samples_split)
        self.loss = loss

    #private function to define the mean squared error
    def __mse(self,D):
        """Return the mean squared deviation of the targets from their mean."""
        y = D[:,-1]
        return(np.sum((y - np.mean(y))**2)/D.shape[0])

    #private function to define the mean absolute error
    def __mae(self,D):
        """Return the mean absolute deviation of the targets from their mean."""
        y = D[:,-1]
        return(np.sum(np.abs(y - np.mean(y)))/D.shape[0])

    #protected function to define the impurity
    def _impurity(self,D):
        """Return the impurity of ``D`` (targets in the last column) under ``self.loss``."""
        if self.loss == 'mse':
            return(self.__mse(D))
        if self.loss == 'mae':
            return(self.__mae(D))
        #reached only if self.loss was changed after construction
        raise ValueError("loss must be one of %s, got %r" % (self._LOSSES,self.loss))

    #protected function to compute the value at a leaf node
    def _leaf_value(self,D):
        """Return the mean target of the rows in ``D``."""
        return(np.mean(D[:,-1]))

# **Test and compare**

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html

#### Test with classification problem

In [5]:
# Load the wine dataset and hold out 20% of the samples for testing.
data = load_wine()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [6]:
# Fit the self-built classifier, then score it on the held-out split.
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
clf.train(X_train, y_train)
preds = clf.predict(X_test)

print("Predicted classes: ", preds)
print("True classes", y_test)
print("Test accuracy: ", accuracy_score(y_test, preds))

Predicted classes:  [0. 0. 2. 0. 1. 0. 1. 2. 1. 2. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 2.
 2. 2. 1. 1. 1. 0. 0. 1. 2. 0. 0. 0.]
True classes [0 0 2 0 1 0 1 2 1 2 0 2 0 1 0 1 1 1 0 1 0 1 1 2 2 2 1 1 1 0 0 1 2 0 0 0]
Test accuracy:  0.9166666666666666


#### Result of Decision Tree classifier from the scikit-learn library

In [12]:
# Import scikit-learn's classifier under an alias so that it does not rebind
# the name of the self-built DecisionTreeClassifier class; the earlier cells
# stay re-runnable after this one has executed.
from sklearn.tree import DecisionTreeClassifier as SkDecisionTreeClassifier

# random_state fixes scikit-learn's internal feature shuffling so the fit is reproducible
clf = SkDecisionTreeClassifier(min_samples_split=2, max_depth=5, random_state=42)  # Create a new classifier object
clf.fit(X_train, y_train)       # Train the classifier on the training data
pred = clf.predict(X_test)      # Use the classifier to make predictions on the test data

print("Predicted classes: ", pred)
print("True classes",y_test)
print("Test accuracy: ", accuracy_score(y_test, pred))

Predicted classes:  [0 0 2 0 1 0 1 2 1 2 1 0 0 1 0 1 1 1 0 1 0 1 1 2 2 2 1 1 1 0 0 1 2 0 0 0]
True classes [0 0 2 0 1 0 1 2 1 2 0 2 0 1 0 1 1 1 0 1 0 1 1 2 2 2 1 1 1 0 0 1 2 0 0 0]
Test accuracy:  0.9444444444444444


#### Test with regression problem

In [13]:
## synthetic regression data: 2000 samples, 5 features (all informative) ##
X, y = make_regression(
    n_samples=2000,
    n_features=5,
    n_informative=5,
    n_targets=1,
    noise=1,
    random_state=42,
)
## hold out 30% of the samples for testing ##
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [14]:
## build the self-built regressor (MAE impurity, max_depth=5) and fit it ##
dt = DecisionTreeRegressor(loss='mae', max_depth=5)
dt.train(X_train, y_train)
## predict the held-out samples ##
yp = dt.predict(X_test)

In [17]:
## report RMSE, MAE and R^2 of the self-built regressor on the test split ##
print(
    "rmse: %.2f" % np.sqrt(mean_squared_error(y_test, yp)),
    "mae: %.2f" % mean_absolute_error(y_test, yp),
    "r2: %.2f" % r2_score(y_test, yp),
    sep="\n",
)

rmse: 27.65
mae: 22.35
r2: 0.83


#### Result of Decision Tree regressor from the scikit-learn library

In [19]:
# Import scikit-learn's regressor under an alias so that it does not rebind
# the name of the self-built DecisionTreeRegressor class; the earlier cells
# stay re-runnable after this one has executed.
from sklearn.tree import DecisionTreeRegressor as SkDecisionTreeRegressor

# NOTE(review): the self-built regressor was run with max_depth=5 and
# loss='mae'; this baseline keeps scikit-learn's default depth and criterion,
# so the two runs are not like-for-like.
# random_state fixes scikit-learn's internal feature shuffling so the fit is reproducible
dt = SkDecisionTreeRegressor(random_state=42)  # Create a new regressor object
dt.fit(X_train, y_train)      # Train the regressor on the training data
pred = dt.predict(X_test)     # Use the regressor to make predictions on the test data

In [20]:
# report RMSE, MAE and R^2 of the scikit-learn regressor on the test split
print(
    "rmse: %.2f" % np.sqrt(mean_squared_error(y_test, pred)),
    "mae: %.2f" % mean_absolute_error(y_test, pred),
    "r2: %.2f" % r2_score(y_test, pred),
    sep="\n",
)

rmse: 18.43
mae: 14.33
r2: 0.93


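Based on the above results, the prebuilt decision tree models from the scikit-learn library outperform our self-built decision trees in both the classification and the regression problems. Note, however, that the regression comparison is not like-for-like: our regressor was limited to `max_depth=5` with the MAE criterion, whereas the scikit-learn regressor was run without a depth limit and with its default squared-error criterion.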