# Decision Tree Classifier

### 1 - Define the internal node

In [153]:
class Node:
    """Internal (decision) node of a binary decision tree."""

    def __init__(self, left=None, right=None, split_feature=None, split_value=None):
        """
        Store the children and the splitting rule of an internal node.

        Arguments:
        left -- child node on the left branch (None until attached)
        right -- child node on the right branch (None until attached)
        split_feature -- the feature chosen to split the dataset at this node
        split_value -- the value of that feature chosen to split the dataset

        """
        # Children first, then the rule that decides between them
        self.left, self.right = left, right
        self.split_feature, self.split_value = split_feature, split_value
    

### 2 - Define the leaf node

In [154]:
class Leaf:
    """Terminal node of a decision tree; it only carries a classification result."""

    def __init__(self, label=None):
        """
        Store the classification result held by this leaf.

        Argument:
        label -- the result of classification (None when not supplied)

        """
        self.label = label
    

### 3 - Decision Tree Classifier

In [174]:
import numpy as np

# CART Model 
class DecisionTree():
    """
    Binary decision tree classifier grown with the Gini index (CART style).

    Every internal node tests ``sample[split_feature] == split_value``:
    samples that match go to the left child, all others go to the right child.
    """

    def __init__(self, min_samples_leaf=1):
        """
        Initialize the parameters of Decision Tree

        Arguments:
        min_samples_leaf -- a node holding this many samples or fewer is not
                            split any further and becomes a leaf

        Attributes:
        root -- the root node of decision tree, None until fit() is called

        """
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def fit(self, X_train, Y_train):
        """
        Fit model with training data

        Arguments:
        X_train -- training dataset with the shape of [n_samples, n_features]
        Y_train -- corresponding labels of training dataset, with the length of n_samples

        """
        # Accept nested lists as well as arrays: tree growing relies on
        # .shape and on boolean-mask indexing
        features = np.asarray(X_train)
        labels = np.asarray(Y_train)
        # Create decision tree and get its root node
        self.root = self.generate_tree(features, labels)
        return None

    def predict(self, query):
        """
        Predict the unknown query data by model

        Argument:
        query -- unknown feature vector waiting to be predicted

        Return:
        pred -- prediction of input feature vector

        """
        # Start at the root and walk down until a leaf is reached
        current = self.root
        while isinstance(current, Node):
            # Matching the split value means "go left", anything else "go right"
            if query[current.split_feature] == current.split_value:
                current = current.left
            else:
                current = current.right
        # Leaves always store one scalar label, so it is returned unchanged
        # (no list()/indexing, which would truncate multi-character labels)
        pred = current.label
        return pred

    def generate_tree(self, features, labels):
        """
        Generate decision tree by gini index

        Arguments:
        features -- array of feature vectors, with the shape of [n_samples, n_features]
        labels -- array of the corresponding labels of feature vectors

        Returns:
        root -- a Node when the data was split, otherwise a Leaf holding a label

        Raises:
        ValueError -- if there are no samples to build a tree from

        """
        if len(labels) == 0:
            raise ValueError("cannot build a decision tree from an empty dataset")

        # Get distribution of labels
        probs = self.compute_prob(labels)
        majority = max(probs, key=probs.get)
        # Recursion baseline: pure node, or too few samples to keep splitting
        if len(probs) == 1 or len(labels) <= self.min_samples_leaf:
            return Leaf(majority)

        dim_data = features.shape[1]
        # Initialize gini index and the optimal splitting dimension and value
        min_gini = 1
        split_dim = None
        split_value = None
        # Traverse all the possible dimensions and values
        for dim in range(dim_data):
            column = features[:, dim]
            for value in np.unique(column):
                matches = column == value
                # A split that leaves one side empty separates nothing and
                # would recurse forever on the unchanged subset
                if matches.all() or not matches.any():
                    continue
                con_gini = self.compute_con_gini(features, labels, split_dim=dim, split_value=value)
                if con_gini <= min_gini:
                    min_gini = con_gini
                    split_dim = dim
                    split_value = value

        # No feature can separate the samples (identical rows with mixed
        # labels): fall back to the majority label
        if split_dim is None:
            return Leaf(majority)

        # Create current node by optimal splitting dimension and value
        root = Node(split_feature=split_dim, split_value=split_value)

        # Recursion call: create the child nodes under root
        go_left = features[:, split_dim] == split_value
        root.left = self.generate_tree(features[go_left], labels[go_left])
        root.right = self.generate_tree(features[~go_left], labels[~go_left])

        return root

    def compute_prob(self, data_list):
        """
        Compute probabilities of each kind of value

        Argument:
        data_list -- a list of labels, with the length of n_samples

        Return:
        probs -- dict mapping each distinct value in data_list to its relative
                 frequency (empty dict for empty input)

        """
        # Get the total number of data points
        num = len(data_list)
        # Count first and divide once: adding 1/num repeatedly accumulates
        # rounding error, so a pure node would not get probability exactly 1.0
        counts = {}
        for value in data_list:
            counts[value] = counts.get(value, 0) + 1

        probs = {value: count / num for value, count in counts.items()}
        return probs

    def compute_gini(self, data_list):
        """
        Compute the gini index

        Argument:
        data_list -- a list of labels, with the length of n_samples

        Return:
        gini -- gini index, 1 - sum(p_k^2) over the distinct labels

        """
        gini = 1
        probs = self.compute_prob(data_list)
        for p in probs.values():
            gini -= np.square(p)

        return gini

    def compute_con_gini(self, data_list, label_list, split_dim, split_value):
        """
        Compute the conditional gini index under splitted dataset

        Arguments:
        data_list -- array of feature vectors, with the shape of [n_samples, n_features]
        label_list -- array of the corresponding labels of feature vectors
        split_dim -- the dimension that dataset will be splitted by
        split_value -- the value that dataset will be splitted by

        Return:
        con_gini -- conditional gini index: the size-weighted sum of the gini
                    indexes of the two groups

        """
        # Split label_list into two groups by split_dim and split_value
        column = data_list[:, split_dim]
        subset_1 = label_list[np.where(column == split_value)]
        subset_2 = label_list[np.where(column != split_value)]
        # Calculate the weights of each splitted group
        total = len(label_list)
        w1 = len(subset_1) / total
        w2 = len(subset_2) / total
        # Get the conditional gini index
        con_gini = w1 * self.compute_gini(subset_1) + w2 * self.compute_gini(subset_2)

        return con_gini
    

### Test

*P58 例 5.1*  
如下表所示，是一个由15个样本组成的贷款申请训练数据。数据包括贷款申请人的4个特征（属性）：第1个特征是年龄，有3个可能值：青年，中年，老年；第2个特征是有工作，有2个可能值：是，否；第3个特征是有自己的房子，有2个可能值：是，否；第4个特征是信贷情况，有3个可能值：非常好，好，一般。表的最后一列是类别，是否同意贷款，取2个值：是，否。

|  ID | 年龄 | 有工作 | 有自己的房子 | 信贷情况 | 类别 |
| :--:| :--: | :----: | :----------: | :------: | :--: |
|  1  | 青年 |  否   |    否     |  一般   |  否  |
|  2  | 青年 |  否   |    否     |   好   |  否  |
|  3  | 青年 |  是   |    否     |   好   |  是  |
|  4  | 青年 |  是   |    是     |  一般   |  是  |
|  5  | 青年 |  否   |    否     |  一般   |  否  |
|  6  | 中年 |  否   |    否     |  一般   |  否  |
|  7  | 中年 |  否   |    否     |   好   |  否  |
|  8  | 中年 |  是   |    是     |   好   |  是  |
|  9  | 中年 |  否   |    是     |  非常好  |  是  |
| 10  | 中年 |  否   |    是     |  非常好  |  是  |
| 11  | 老年 |  否   |    是     |  非常好  |  是  |
| 12  | 老年 |  否   |    是     |   好   |  是  |
| 13  | 老年 |  是   |    否     |   好   |  是  |
| 14  | 老年 |  是   |    否     |  非常好  |  是  |
| 15  | 老年 |  否   |    否     |  一般   |  否  |

#### generate artificial samples

In [168]:
# Loan-application training set: one row per applicant, with the columns
# age, has a job, owns a house, credit rating.
# The category strings carry no alignment padding, so a query written
# naturally (e.g. '一般') compares equal to the training values.
x = np.array([
    ['青年', '否', '否', '一般'],
    ['青年', '否', '否', '好'],
    ['青年', '是', '否', '好'],
    ['青年', '是', '是', '一般'],
    ['青年', '否', '否', '一般'],
    ['中年', '否', '否', '一般'],
    ['中年', '否', '否', '好'],
    ['中年', '是', '是', '好'],
    ['中年', '否', '是', '非常好'],
    ['中年', '否', '是', '非常好'],
    ['老年', '否', '是', '非常好'],
    ['老年', '否', '是', '好'],
    ['老年', '是', '否', '好'],
    ['老年', '是', '否', '非常好'],
    ['老年', '否', '否', '一般'],
])

# Class of each applicant (loan approved or not), in the same row order as x
y = np.array([
    '否', '否', '是', '是', '否',
    '否', '否', '是', '是', '是',
    '是', '是', '是', '是', '否',
])

#### creating and training model

In [171]:
# Build a classifier with the default settings and grow it on the samples above
model = DecisionTree()
model.fit(X_train=x, Y_train=y)

#### predicting

In [172]:
# Applicant to classify: elderly, has a job, owns no house, very good credit
query = np.array(['老年', '是', '否', '非常好'])
pred = model.predict(query=query)
print('Prediction: class = ', pred)

Prediction: class =  是
