    L3 决策树模型

决策树模型是一个非线性的分类模型

决策树模型既可以做分类也可以做回归分析

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

经典案例：爱丽丝花的分类

In [2]:
iris = pd.read_csv('/Users/nanyueming/Documents/学习/python/机器学习与量化投资/机器学习模型/iris.data' , 
                   header = None , names = ['x1','x2','x3','x4','type'])

In [3]:
iris.head()

Unnamed: 0,x1,x2,x3,x4,type
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
#将type转为数值型分类
#pd.Categorical(iris.type).codes

用决策树对数据集进行分类或拟合回归可以使用sklearn包

参考：https://blog.csdn.net/healingwounds/article/details/83786729

In [5]:
from sklearn import tree

ps: sklearn分类算法默认使用CART算法

随机划分训练集和测试集

In [6]:
x = iris[['x1','x2','x3','x4']]
y = iris['type']
num = x.shape[0] # 样本总数
ratio = 7/3 # 划分比例，训练集数目:测试集数目
num_test = int(num/(1+ratio)) # 测试集样本数目
num_train = num -  num_test # 训练集样本数目
index = np.arange(num) # 产生样本标号
np.random.shuffle(index) # 洗牌
x_test = x.iloc[index[:num_test]] # 取出洗牌后前 num_test 作为测试集
y_test = y.iloc[index[:num_test]]
x_train = x.iloc[index[num_test:]] # 剩余作为训练集
y_train = y.iloc[index[num_test:]]

In [7]:
clf = tree.DecisionTreeClassifier()

In [8]:
clf.fit(x_test , y_test)

DecisionTreeClassifier()

In [9]:
clf.predict(x_test)

array(['Iris-setosa', 'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-setosa', 'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor'], dtype=object)

    现在来手撕一个决策树

计算熵

In [10]:
def CalEntropy(data):
    n = len(data)
    labels = list(set(data))
    
    entropy = 0.0
    for label in labels:
        prob = float( data[data==label].count() / n )
        entropy -= prob * np.log(prob)
    
    return entropy

计算信息增益

In [11]:
def CalInfogain(df , feature):
    ls = list(set(df[feature]))
    
    NewEntropy = 0.0
    for i in range(len(ls)):
        data = df[df[feature] == ls[i]].type
        NewEntropy += float(len(data) / len(df)) * CalEntropy(data) #条件熵
    
    infogain = CalEntropy(df.type) - NewEntropy  
    
    return infogain

信息增益比

In [12]:
def CalInfogainRatio(df , feature):
    
    infogain = CalInfogain(df , feature)
    entropy = CalEntropy(df[feature])
    
    gr = infogain / entropy
    
    return gr

获取最大信息增益的特征

In [13]:
def GetBestFeature(df , method):
    
    # method是 ‘infogain’ 或 ‘infogainratio’
    
    BestInfoGain = 0.0
    if method == 'infogain':
        for i in range(len(df.columns) - 1):
            feature = df.columns[i]
            infogain = CalInfogain(df , feature)
        
            if (infogain > BestInfoGain):
                BestInfoGain = infogain
                BestFeature = feature
    elif method == 'infogainratio':
        for i in range(len(df.columns) - 1):
            feature = df.columns[i]
            gr = CalInfogainRatio(df , feature)
        
            if (gr > BestInfoGain):
                BestInfoGain = gr
                BestFeature = feature
    else :print('error')
        
    return BestFeature

In [14]:
GetBestFeature(iris , 'infogain')

'x3'

In [15]:
GetBestFeature(iris , 'infogainratio')

'x4'

# ID3算法

ID3算法利用信息增益判断节点

In [17]:
def ID3Tree(df):
    
    #1、如果待分类的对象的type仅有一种时，停止分类
    #返回该对象的特征
    if len(set(df.type)) == 1 :
        return list(set(df.type))[0]
    
    #2、如果遍历所有特征后仍不能完全分类，停止分类,返回最多的type
    if len(df.columns) == 2 :
        return list(df.type.mode())[0]
    
    #3、获取最佳特征
    bestFeature = GetBestFeature(df , 'infogain')
    
    #4、种树
    DTree = {bestFeature:{}}
    bestFeatureValues = list(set(df[bestFeature]))
    
    for bfv in bestFeatureValues:
        subdf = df[df[bestFeature]==bfv]
        subdf = subdf.drop(bestFeature , axis = 1)
        DTree[bestFeature][bfv] = ID3Tree(subdf)
    
    return DTree

In [18]:
ID3Tree(iris)

{'x3': {1.7: 'Iris-setosa',
  1.4: 'Iris-setosa',
  1.6: 'Iris-setosa',
  1.3: 'Iris-setosa',
  1.5: 'Iris-setosa',
  1.1: 'Iris-setosa',
  1.2: 'Iris-setosa',
  1.0: 'Iris-setosa',
  1.9: 'Iris-setosa',
  4.7: 'Iris-versicolor',
  4.5: {'x1': {4.9: 'Iris-virginica',
    5.6: 'Iris-versicolor',
    6.0: 'Iris-versicolor',
    5.7: 'Iris-versicolor',
    6.4: 'Iris-versicolor',
    6.2: 'Iris-versicolor',
    5.4: 'Iris-versicolor'}},
  4.9: {'x2': {2.5: 'Iris-versicolor',
    3.0: 'Iris-virginica',
    3.1: 'Iris-versicolor',
    2.8: 'Iris-virginica',
    2.7: 'Iris-virginica'}},
  4.0: 'Iris-versicolor',
  5.0: {'x1': {6.3: 'Iris-virginica',
    5.7: 'Iris-virginica',
    6.7: 'Iris-versicolor',
    6.0: 'Iris-virginica'}},
  6.0: 'Iris-virginica',
  3.5: 'Iris-versicolor',
  3.0: 'Iris-versicolor',
  4.6: 'Iris-versicolor',
  4.4: 'Iris-versicolor',
  4.1: 'Iris-versicolor',
  5.1: {'x1': {5.8: 'Iris-virginica',
    6.9: 'Iris-virginica',
    6.3: 'Iris-virginica',
    6.0: 'Iris-ve

# C4.5算法

C4.5算法利用信息增益比判断节点

In [19]:
def C45Tree(df):
    
    #1、如果待分类的对象的type仅有一种时，停止分类
    #返回该对象的特征
    if len(set(df.type)) == 1 :
        return list(set(df.type))[0]
    
    #2、如果遍历所有特征后仍不能完全分类，停止分类,返回最多的type
    if len(df.columns) == 2 :
        return list(df.type.mode())[0]
    
    #3、获取最佳特征
    bestFeature = GetBestFeature(df , 'infogainratio')
    
    #4、种树
    DTree = {bestFeature:{}}
    bestFeatureValues = list(set(df[bestFeature]))
    
    for bfv in bestFeatureValues:
        subdf = df[df[bestFeature]==bfv]
        subdf = subdf.drop(bestFeature , axis = 1)
        DTree[bestFeature][bfv] = ID3Tree(subdf)
    
    return DTree

In [20]:
C45Tree(iris)

{'x4': {0.2: 'Iris-setosa',
  0.4: 'Iris-setosa',
  0.3: 'Iris-setosa',
  0.5: 'Iris-setosa',
  0.6: 'Iris-setosa',
  1.4: {'x2': {2.9: 'Iris-versicolor',
    3.1: 'Iris-versicolor',
    2.7: 'Iris-versicolor',
    3.2: 'Iris-versicolor',
    3.0: 'Iris-versicolor',
    2.8: 'Iris-versicolor',
    2.6: 'Iris-virginica'}},
  1.5: {'x3': {4.7: 'Iris-versicolor',
    4.2: 'Iris-versicolor',
    4.5: 'Iris-versicolor',
    4.9: 'Iris-versicolor',
    4.6: 'Iris-versicolor',
    5.0: 'Iris-virginica',
    5.1: 'Iris-virginica'}},
  1.3: 'Iris-versicolor',
  1.6: {'x1': {7.2: 'Iris-virginica',
    6.3: 'Iris-versicolor',
    6.0: 'Iris-versicolor'}},
  1.0: 'Iris-versicolor',
  1.1: 'Iris-versicolor',
  2.5: 'Iris-virginica',
  2.0: 'Iris-virginica',
  2.1: 'Iris-virginica',
  1.2: 'Iris-versicolor',
  1.7: {'x1': {4.9: 'Iris-virginica', 6.7: 'Iris-versicolor'}},
  0.1: 'Iris-setosa',
  2.2: 'Iris-virginica',
  2.3: 'Iris-virginica',
  1.8: {'x1': {5.9: {'x2': {3.0: 'Iris-virginica', 3.2: 'I