In [1]:
import numpy as np
from random import seed, randrange, random

In [2]:
class Random_Forest():
    def __init__(self, max_depth = None, min_size = None, sample_size = None, n_trees = None, n_features = None):
        """
            max_depth   决策树的最大深度
            min_size    叶子结点的大小
            sample_size 训练数据集的样本采样比例
            n_trees     决策树的个数
            n_features  选取特征的个数
        """     
        self._trees = None
        self._max_depth = max_depth
        self._min_size = min_size
        self._sample_size = sample_size
        self._n_trees = n_trees
        self._n_features = n_features
        
    def fit(self, X, y):
        self._trees = list()
        for i in range(self._n_trees):  #以sample_size的比例对数据进行采样，并构建每一颗决策树
            sample_X, sample_y = self.subsample(X, y, self._sample_size)
            tree = self.build_tree(sample_X, sample_y, max_depth, min_size, n_features)
            self._trees.append(tree)        
    
    def predict(self, X):
        #对于每一个测试样本，bagging预测后的结果是
        return self.bagging_predict(X, self._trees)
        
    def subsample(self, X, y, ratio):
        """有放回的重采样"""
        sample_X = list()
        sample_y = list()
        n_sample = round(len(X) * ratio)
        while len(sample_X) < n_sample:
            index = randrange(len(X))
            sample_X.append(X[index])
            sample_y.append(y[index])
        return sample_X, sample_y   

    def test_split(self, index, value, X, y):
        left_X, left_y, right_X, right_y = list(), list(), list(), list()  # left和right中存储的是y值
        for idx in range(len(X)):
            if X[idx][index] <= value:
                left_X.append(X[idx])
                left_y.append(y[idx])
            else:
                right_X.append(X[idx])
                right_y.append(y[idx])
        return left_X, left_y, right_X, right_y
    
    def gini_index(self, groups, class_val):
        gini = 0
        D = len(groups[1]) + len(groups[3])
        for i in range(2):
            sub_gini = 1.0
            size = len(groups[i * 2 + 1])
            for cls in class_val:    
                if size == 0: continue
                p = groups[i * 2 + 1].count(cls) / float(size)
                sub_gini -= p ** 2
            gini += len(groups[i * 2 + 1]) / D * sub_gini
        return gini     
    
    def get_split(self, X, y, n_features):
        """找出分割数据集的最优特征index，最优特征值each[index以及分割完后的数据groups(left_X, left_y, right_X, right_y)"""
        class_val = list(set(y)) #list(set(each[-1] for each in train))
        b_index, b_value, b_score, b_groups = 100000, 100000, 100000, None
        features = list()
        while len(features) < n_features:  #有放回的重采样特征
            index = randrange(len(X[0])) # randrange(10)表示从0都9中选
            if index not in features:
                features.append(index)
        for index in features:
            for idx in range(len(X)):
                # 遍历每一个index索引下的每一个训练样本可能value值来作为分类值，找出最优的分类特征和特征值
                groups = self.test_split(index, X[idx][index], X, y)
                gini = self.gini_index(groups, class_val)
                if gini < b_score:
                    b_index, b_value, b_score, b_groups = index, X[idx][index], gini, groups
        return {'index': b_index, 'value': b_value, 'groups': b_groups}    

    def to_leaf(self, y):
        return max(set(y), key = y.count) # 在max函数中，以key的函数对象为判断标准（key相当于属性），这里是输出次数（属性）最大的标签

    def split(self, node, max_depth, min_size, n_features, depth):
        left_X, left_y, right_X, right_y = node['groups']
        del(node['groups'])

        if not left_y or not right_y:
            node['left'] = node['right'] = self.to_leaf(left_y + right_y)
            return
                              
        if depth >= max_depth:
            node['left'], node['right'] = self.to_leaf(left_y), self.to_leaf(right_y)
            return

        if len(left_y) <= min_size:
            node['left'] = self.to_leaf(left_y)
        else:
            node['left'] = self.get_split(left_X, left_y, n_features)  # node['left']是一个多层字典的形式
            self.split(node['left'], max_depth, min_size, n_features, depth+1)

        if len(right_y) <= min_size:
            node['right'] = self.to_leaf(right_y)
        else:
            node['right'] = self.get_split(right_X, right_y, n_features)  # node['right']是一个多层字典的形式
            self.split(node['right'], max_depth, min_size, n_features, depth+1)   
    
    def build_tree(self, X, y, max_depth, min_size, n_features):
        """创建一颗决策树
        输入：
            X, y：     训练集数据和标签
            max_depth:  最大深度
            min_size:   叶子结点的大小
            n_features: 选取特征的个数
        输出：
            root:       返回一颗决策树
        """
        root = self.get_split(X, y, n_features) # 找到这颗决策树分割的最优特征和最优特征值
        self.split(root, max_depth, min_size, n_features, 1) # 然后递归处理这颗决策树
        return root    
    
    def pred(self, node, X):
        """预测一个测试样本在决策树上所属类别"""
        if X[node['index']] <= node['value']:
            if isinstance(node['left'], dict): # isinstance判断该对象是否已知
                return self.pred(node['left'], X)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict): # isinstance判断该对象是否已知
                return self.pred(node['right'], X)
            else:
                return node['right']

    def bagging_predict(self, X, trees = None):
        """
        输入:
            trees: 树的集合
            each:  测试数据集的每一行数据
        输出：
            投票法得到所属类别        
        """
        if trees == None:
            trees = self._trees
        preds = [self.pred(tree, X) for tree in trees]
        return max(set(preds), key = preds.count)

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris_data, iris_y = load_iris(return_X_y=True) #return_X_y为True，表示因变量和自变量独立导出
xtrain, xtest, ytrain, ytest = train_test_split(iris_data, iris_y, train_size=0.8, shuffle=True)

max_depth = 3
min_size = 2      
sample_size = 1.0 
n_trees = 200
n_features = int(np.sqrt(len(iris_data[0])))
model = Random_Forest(max_depth, min_size, sample_size, n_trees, n_features)
model.fit(xtrain, ytrain)

n_test = xtest.shape[0]
n_right = 0
for i in range(n_test):
    y_pred = model.predict(xtest[i])
    if y_pred == ytest[i]:
        n_right += 1
print("随机森林在测试集上的准确率为：{}%".format((n_right * 100) / n_test))

随机森林在测试集上的准确率为：93.33333333333333%


In [4]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=n_trees, max_depth=max_depth, min_samples_split=min_size, max_features = n_features, bootstrap=True, criterion='gini')
clf.fit(xtrain, ytrain)
print("sklearn随机森林分类模型在测试集上准确率为：{}%".format(100 * clf.score(xtest, ytest)))

sklearn随机森林分类模型在测试集上准确率为：93.33333333333333%
