In [1]:
import numpy as np

In [2]:
class TreeNode():
    """Node of a multiway decision tree.

    A leaf stores a class label in ``_node_val``.  An internal node leaves
    ``_node_val`` as None and stores the split description instead
    (``_feature_idx``, ``_feature_name``, ``_child``).
    """

    def __init__(self, feature_idx=None, feature_val=None, feature_name=None,
                 node_val=None, child=None, default_val=None):
        """
        feature_idx: column index of the split feature, expressed in the
            coordinates of the full (original) feature matrix
        feature_val: value of the split feature (kept for API compatibility;
            DecisionTree does not set it)
        feature_name: name of the split feature
        node_val: class label stored by a leaf; None for internal nodes
        child: dict mapping each observed value of the split feature to the
            child TreeNode built from the matching samples
        default_val: majority label of the training samples that reached this
            node; used when a sample carries a feature value that was never
            seen during training
        """
        self._feature_idx = feature_idx
        self._feature_val = feature_val
        self._feature_name = feature_name
        self._node_val = node_val
        self._child = child
        self._default_val = default_val


class DecisionTree():
    """Decision tree classifier using information gain (ID3) or gain ratio (C4.5).

    Every distinct value of a feature is treated as its own category, so each
    split creates one branch per observed value.
    """

    def __init__(self, feature_name, etype="gain", epsilon=0.01):
        """
        feature_name: sequence with the name of every column of X
        etype: "gain" selects information gain; any other value (e.g. "ratio")
            selects the information gain ratio
        epsilon: if the best gain / gain ratio falls below this threshold the
            node is not split and becomes a leaf
        """
        self._root = None
        self._fea_name = feature_name
        self._etype = etype
        self._epsilon = epsilon

    def fit(self, X, y):
        """Train the model by building the tree from X (n_samples, n_features) and y.

        Raises ValueError when the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")
        self._root = self._build_tree(X, y)

    def predict(self, X):
        """Predict class labels.

        X may be a single 1-D sample (returns one label) or a 2-D batch of
        samples (returns a numpy array with one label per row).
        """
        if np.ndim(X) == 2:
            return np.array([self._predict(row, self._root) for row in X])
        return self._predict(X, self._root)

    def _build_tree(self, X, y, fea_idx=None):
        """Recursively build the (sub)tree for the samples X, y.

        fea_idx maps every column of the (possibly reduced) matrix X back to
        its column in the original feature matrix.  The stored split index and
        feature name must refer to the original columns because prediction
        indexes the full, unreduced sample.
        """
        if fea_idx is None:
            fea_idx = np.arange(X.shape[1])

        # A pure node is a leaf carrying that single class.
        if np.unique(y).shape[0] == 1:
            return TreeNode(node_val=y[0])

        majority = self._vote_label(y)

        # No feature left to split on: fall back to a majority vote.
        if X.shape[1] == 0:
            return TreeNode(node_val=majority)

        score = self._calc_gain if self._etype == "gain" else self._calc_gain_ration
        max_gain = -np.inf
        best = 0  # position (within the current X) of the best feature
        for i in range(X.shape[1]):
            gain = score(X[:, i], y)
            if gain > max_gain:  # strict '>' keeps the first of tied features
                max_gain = gain
                best = i

        # The best split is not informative enough: make this node a leaf.
        if max_gain < self._epsilon:
            return TreeNode(node_val=majority)

        column = X[:, best]
        orig_idx = int(fea_idx[best])
        # A feature is consumed once it has been used, so drop it from both
        # the data and the index map handed to the children.
        rest_X = np.delete(X, best, axis=1)
        rest_idx = np.delete(fea_idx, best)

        # One child per observed value of the split feature.
        child_tree = {}
        for fea_val in np.unique(column):
            mask = column == fea_val
            child_tree[fea_val] = self._build_tree(rest_X[mask], y[mask], rest_idx)

        return TreeNode(orig_idx, feature_name=self._fea_name[orig_idx],
                        child=child_tree, default_val=majority)

    def _predict(self, X, tree=None):
        """Route one sample X down the tree and return the label it reaches.

        If the sample has a value for a split feature that was not seen during
        training, the majority label of that node is returned.
        """
        if tree is None:
            tree = self._root

        node = tree
        while node._node_val is None:
            child = node._child.get(X[node._feature_idx])
            if child is None:
                # Unseen feature value: no branch to follow.
                return node._default_val
            node = child
        return node._node_val

    def _vote_label(self, y):
        """Return the most frequent label in y."""
        label, num_label = np.unique(y, return_counts=True)
        return label[np.argmax(num_label)]

    def _calc_entropy(self, y):
        """Return the empirical entropy (base 2) of the values in y."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return -(probs * np.log2(probs)).sum()

    def _calc_condition_entropy(self, X, y):
        """Return the conditional entropy H(y | X) for one feature column X."""
        cond_entropy = 0
        total = y.shape[0]
        # Each distinct feature value and how often it occurs.
        xval, num_x = np.unique(X, return_counts=True)
        for v, n in zip(xval, num_x):
            cond_entropy += (n / total) * self._calc_entropy(y[X == v])
        return cond_entropy

    def _calc_gain(self, X, y):
        """Return the information gain of splitting y on feature column X."""
        return self._calc_entropy(y) - self._calc_condition_entropy(X, y)

    def _calc_gain_ration(self, X, y):
        """Return the information gain ratio of splitting y on feature column X.

        A constant feature has zero split entropy and cannot separate
        anything, so its ratio is defined as 0.0 instead of dividing by zero.
        """
        split_info = self._calc_entropy(X)
        if split_info <= 0:
            return 0.0
        return self._calc_gain(X, y) / split_info

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Demo: fit the tree on 80% of the iris samples and report the accuracy on
# that same training split (the held-out part is discarded).
iris = load_iris()
feature_name = iris.feature_names
etype, epsilon = "gain", 0.01

xtrain, _, ytrain, _ = train_test_split(
    iris.data,
    iris.target,
    train_size=0.8,
    shuffle=True,
)

model = DecisionTree(feature_name, etype, epsilon)
model.fit(xtrain, ytrain)

# Count the training samples whose predicted label matches the true label.
n_train = xtrain.shape[0]
n_right = 0
for i in range(n_train):
    y_pred = model.predict(xtrain[i])
    n_right += int(y_pred == ytrain[i])

print("决策树（ID3/C4.5）在训练集上的准确率为：{}%".format((n_right * 100) / n_train))

决策树（ID3/C4.5）在训练集上的准确率为：100.0%
