# Chapter 8 提升方法
## 梯度提升回归树模型
### 构建CART回归树（同Chapter5 BoostingTree）

In [1]:
import pandas as pd
import numpy as np
from graphviz import Digraph
class CartTree:
    """Binary CART tree (regression or classification) stored as nested dicts.

    Training data is a pandas DataFrame whose target column is named
    ``'labels'``; every other column is treated as a feature that can be
    compared with ``<=`` / ``>``.

    Each node is a dict with the keys ``id``, ``level``, ``parent``,
    ``label`` (the node's prediction) and ``loss``. Internal nodes
    additionally carry ``spt`` (``{'col': ..., 'idx': ...}``, the split
    column and threshold), ``left`` (rows with ``col <= idx``) and ``right``
    (rows with ``col > idx``). Leaves have no ``spt`` key.
    """

    def __init__(self, tree_type='regress', eps=0.01):
        """Create an empty tree builder.

        Args:
            tree_type: ``'regress'`` (squared-error loss) or ``'class'``
                (Gini loss).
            eps: a child whose loss is below this value is not split further.
        """
        self.tree_type = tree_type
        # Running counter used to hand out unique node ids.
        self.nid = 0
        self.eps = eps

    def get_gini_loss(self, data_df, total):
        """Return ``(majority_label, weighted_gini)`` for a subset of rows.

        Args:
            data_df: DataFrame with a ``'labels'`` column.
            total: number of rows in the node being split; the Gini impurity
                of ``data_df`` is weighted by ``len(data_df) / total``.
        """
        data_num = data_df.shape[0]
        label_count = data_df.groupby('labels')['labels'].count()
        label = label_count.idxmax()
        loss = data_num / float(total) * (1 - (label_count / data_num).pow(2).sum())
        return label, loss

    def get_mse_loss(self, data_ser):
        """Return ``(mean, sum_of_squared_deviations)`` of a label Series."""
        label = data_ser.mean()
        loss = (data_ser - label).pow(2).sum()
        return label, loss

    def _subset_loss(self, data_df, total):
        """Return ``(label, loss)`` for one side of a split, or ``(None, None)`` if it is empty."""
        if data_df.shape[0] == 0:
            return None, None
        if self.tree_type == 'regress':
            return self.get_mse_loss(data_df['labels'])
        return self.get_gini_loss(data_df, total)

    def get_split_point(self, data_df):
        """Search every feature column and every distinct value for the best split.

        Returns:
            A dict with keys ``col``, ``idx``, ``l_loss``, ``l_label``,
            ``r_loss``, ``r_label``, ``l_data`` and ``r_data`` describing the
            split with the smallest summed loss (the first one found wins
            ties). The dict is empty when ``data_df`` has no feature columns.
        """
        res = {}
        min_loss = None
        total = data_df.shape[0]
        for col in data_df.columns:
            if col == 'labels':
                continue
            for idx in data_df[col].unique():
                left_data = data_df[data_df[col] <= idx]
                right_data = data_df[data_df[col] > idx]
                l_label, l_loss = self._subset_loss(left_data, total)
                r_label, r_loss = self._subset_loss(right_data, total)
                # An empty side contributes nothing to the split's loss.
                loss = 0
                if l_loss is not None:
                    loss = loss + l_loss
                if r_loss is not None:
                    loss = loss + r_loss
                if min_loss is None or min_loss > loss:
                    res['col'] = col
                    res['idx'] = idx
                    res['l_loss'] = l_loss
                    res['l_label'] = l_label
                    res['r_loss'] = r_loss
                    res['r_label'] = r_label
                    res['l_data'] = left_data
                    res['r_data'] = right_data
                    min_loss = loss
        return res

    def _build_child(self, parent_df, child_df, child_label, child_loss, node, level):
        """Build one child of ``node``: a leaf dict, or a recursively built subtree.

        The child becomes a leaf when it is empty, when its loss is already
        below ``eps``, or when the split did not separate anything (the child
        holds every row of the parent).
        """
        is_leaf = (
            child_df.shape[0] == 0
            or child_loss < self.eps
            or child_df.shape == parent_df.shape
        )
        if not is_leaf:
            return self.build_tree(child_df, level + 1, node, child_label, child_loss)
        # An empty child has no statistics of its own; fall back to the parent's.
        label = node['label'] if child_label is None else child_label
        loss = node['loss'] if child_loss is None else child_loss
        leaf = {'id': self.nid, 'level': level + 1, 'parent': node, 'label': label, 'loss': loss}
        self.nid = self.nid + 1
        return leaf

    def build_tree(self, data_df, level=0, parent=None, label=None, loss=None):
        """Recursively grow a tree from ``data_df`` and return its root node.

        Args:
            data_df: DataFrame with feature columns and a ``'labels'`` column.
            level: depth of the node being built (0 for the root).
            parent: parent node dict, or ``None`` for the root.
            label: prediction for this node if already known; computed from
                ``data_df`` when ``None``.
            loss: loss for this node, used together with ``label``.

        Returns:
            The node dict, or ``None`` when ``data_df`` is empty or
            ``tree_type`` is not supported (a message is printed in the
            latter case). When ``data_df`` has no feature columns the node is
            returned as a leaf.
        """
        if data_df.shape[0] == 0:
            return None
        if label is None:
            if self.tree_type == 'regress':
                label, loss = self.get_mse_loss(data_df['labels'])
            elif self.tree_type == 'class':
                label, loss = self.get_gini_loss(data_df, data_df.shape[0])
            else:
                print('算法类型不支持')
                return None
        node = {'id': self.nid, 'level': level, 'parent': parent, 'label': label, 'loss': loss}
        self.nid = self.nid + 1
        spt = self.get_split_point(data_df)
        if not spt:
            # No feature column to split on: keep the node as a leaf instead
            # of failing on the missing split description.
            return node
        node['spt'] = {'col': spt['col'], 'idx': spt['idx']}
        # Left child is built first so node ids are assigned in pre-order.
        node['left'] = self._build_child(
            data_df, spt['l_data'], spt['l_label'], spt['l_loss'], node, level)
        node['right'] = self._build_child(
            data_df, spt['r_data'], spt['r_label'], spt['r_loss'], node, level)
        return node

    def predict(self, x_df, node):
        """Return the prediction for a single sample.

        Args:
            x_df: one sample, indexable by column name (e.g. a DataFrame row).
            node: root of the (sub)tree to evaluate.
        """
        # Walk down until a node without a split description (a leaf) is reached.
        while 'spt' in node:
            col = node['spt']['col']
            if x_df[col] <= node['spt']['idx']:
                node = node['left']
            else:
                node = node['right']
        return node['label']

    def format_dot(self, dot, node, edge_label=''):
        """Add ``node`` and its descendants to the graphviz graph ``dot``.

        Internal nodes are labelled ``level-column-loss-label`` and leaves
        ``level-loss-label``; loss and label are rounded to 2 decimals.
        ``edge_label`` is the text for the edge from the parent; when it is
        empty (the root) no edge is drawn.
        """
        if not node:
            return
        if 'spt' in node:
            node_label = "{}-{}-{}-{}".format(
                node['level'], node['spt']['col'], round(node['loss'], 2), round(node['label'], 2))
        else:
            node_label = "{}-{}-{}".format(
                node['level'], round(node['loss'], 2), round(node['label'], 2))
        dot.node(str(node['id']), label=node_label)
        if edge_label:
            dot.edge(str(node['parent']['id']), str(node['id']), label=edge_label)
        if 'left' in node:
            self.format_dot(dot, node['left'], '<=' + str(node['spt']['idx']))
        if 'right' in node:
            self.format_dot(dot, node['right'], '>' + str(node['spt']['idx']))

    def print_tree(self, root):
        """Return a graphviz ``Digraph`` rendering of the tree rooted at ``root``."""
        dot = Digraph(comment='Cart Tree')
        self.format_dot(dot, root)
        return dot

### 梯度提升回归树 

In [2]:
class GradBoostRegTree:
    """Gradient-boosted regression trees with squared-error loss.

    Each boosting round fits a ``CartTree`` regression tree to the residuals
    left by the previous rounds; the final prediction is the sum of the
    predictions of all fitted trees.
    """

    def __init__(self, G_num=5, eps=1):
        """Create an unfitted model.

        Args:
            G_num: number of boosting rounds (trees) to fit.
            eps: loss threshold passed to ``CartTree``; it controls when a
                branch stops being split.
        """
        self.G_num = G_num
        self.tree = CartTree('regress', eps)
        # Roots of the fitted trees; empty until fit() is called, so that
        # predict() on an unfitted model returns zeros instead of failing.
        self.G_lst = []

    def G(self, x, y):
        """Fit one regression tree on features ``x`` and targets ``y``.

        Args:
            x: feature DataFrame (left unmodified).
            y: target values, one per row of ``x``.

        Returns:
            The root node of the fitted tree.
        """
        # Work on a copy: attaching the temporary 'labels' column to the
        # caller's frame would mutate it (and clobber an existing 'labels'
        # column when it is removed again).
        df = x.copy()
        df['labels'] = y
        return self.tree.build_tree(df)

    def G_pre(self, root, x_df):
        """Return the predictions of the tree ``root`` for every row of ``x_df`` as an array."""
        y_pre = x_df.apply(self.tree.predict, axis=1, args=(root,))
        return y_pre.values

    def fit(self, train_x, train_y):
        """Fit ``G_num`` trees sequentially, printing the training MSE after each round.

        Args:
            train_x: feature DataFrame.
            train_y: array of target values, one per row of ``train_x``.
        """
        self.G_lst = []
        labels = train_y.copy()
        for i in range(self.G_num):
            root = self.G(train_x, labels)
            pre = self.G_pre(root, train_x)
            # With squared-error loss the negative gradient is the residual,
            # so the residual becomes the target of the next round.
            r = labels - pre
            labels = r
            self.G_lst.append(root)
            print('%dth model error is %.2f' % (i, np.average(r**2)))

    def predict(self, test_x):
        """Return the summed predictions of all fitted trees for every row of ``test_x``."""
        pre = np.zeros(test_x.shape[0])
        for root in self.G_lst:
            pre = pre + self.G_pre(root, test_x)
        return pre

    def cal_mse(self, gt_y, pre_y):
        """Return the mean squared error between ground truth ``gt_y`` and predictions ``pre_y``."""
        return np.average(np.power(gt_y - pre_y, 2))

## 模型训练与预测

In [3]:
%matplotlib widget
from sklearn import datasets
import matplotlib.pyplot as plt
data = datasets.load_boston()
x_data = data.data
y_data = data.target
x_names = data.feature_names
data_np = np.column_stack((x_data, y_data))
np.random.shuffle(data_np)
data_cols = list(x_names.copy())
data_cols.append('labels')
df = pd.DataFrame(data_np, columns=data_cols)
for col in x_names:
    if col != 'CHAS':
        df[col] = pd.cut(df[col], 5, labels=range(5))
    df[col].astype('category')
train_num = 400
train_x = df.iloc[:train_num, :len(x_names)]
train_y = df.iloc[:train_num, len(x_names):].values.reshape([train_num,])
test_x = df.iloc[train_num:, :len(x_names)]
test_y = df.iloc[train_num:, len(x_names):].values.reshape([df.shape[0] - train_num,])

reg_model=GradBoostRegTree(5, 10)
reg_model.fit(train_x, train_y)
pre_y = reg_model.predict(test_x)

print('predict mse: %.2f' % reg_model.cal_mse(test_y, pre_y))
show_df = pd.DataFrame(pre_y, columns=['pre_y'])
show_df['test_y'] = test_y
show_df.plot()

0th model error is 2.46
1th model error is 2.25
2th model error is 2.18
3th model error is 2.13
4th model error is 2.10
predict mse: 31.95


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x12301b7d0>