In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from math import log

In [2]:
def create_data():
    """Return the iris dataset as a DataFrame.

    Feature columns are named after ``feature_names`` of the loaded dataset;
    a final ``label`` column holds its ``target`` values.
    """
    bunch = load_iris()
    frame = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    return frame.assign(label=bunch.target)

# Build the iris frame once; later cells read this module-level `data`.
data = create_data()

In [19]:
# Preview the first ten rows of the iris frame.
data.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [4]:
class Node:
    """One node of a decision tree: either a leaf or a split on one feature.

    Attributes:
        isleaf: True when the node is a leaf carrying a prediction.
        label: Value returned by ``predict`` when the node is a leaf.
        feature_name: Name of the feature this node splits on.
        feature: Position of that feature in the list handed to ``predict``.
        tree: Child nodes keyed by the value of the split feature.
        result: Dict rendered by ``__repr__``. It shares the ``tree`` dict,
            so edges added after construction appear in the repr.
    """

    def __init__(self, isleaf=False, label=None, feature_name=None, feature=None):
        self.isleaf = isleaf
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}
        self.result = {
            'isleaf': self.isleaf,
            'label': self.label,
            'feature_name': self.feature_name,
            'tree': self.tree,
        }

    def __repr__(self):
        return "{}".format(self.result)

    def add_edge(self, key, val):
        """Attach child node ``val`` under feature value ``key``."""
        self.tree[key] = val

    def predict(self, features):
        """Return the label reached by following ``features`` down the tree.

        Args:
            features: List of feature values, ordered like the columns this
                node was built from. The list itself is not modified.

        Returns:
            The ``label`` of the leaf that is reached.

        Raises:
            KeyError: If the value of the split feature has no branch, i.e.
                it was never attached with ``add_edge``.
        """
        if self.isleaf:
            return self.label
        value = features[self.feature]
        if value not in self.tree:
            raise KeyError(
                "no branch for value {!r} of feature {!r}; known values: {}".format(
                    value, self.feature_name, list(self.tree)
                )
            )
        # Children index into the features left after removing the one used
        # here, so hand them a copy without it.
        remaining = features.copy()
        remaining.pop(self.feature)
        return self.tree[value].predict(remaining)

# 使用ID3算法实现决策树，挑选特征基于信息增益最大化

In [5]:
class DTree:
    """ID3 decision tree that splits on the feature with the largest information gain.

    Training data is a DataFrame whose last column is the class label and
    whose other columns are features. Every distinct value of the chosen
    feature becomes its own branch.
    """

    def __init__(self, eps=0.0):
        """
        Args:
            eps: Information-gain threshold. A node whose best gain is below
                it becomes a leaf labelled with the majority class.
        """
        self.eps = eps
        self.tree = {}  # replaced by the root Node once fit() has run

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value of ``labels`` (value_counts sorts descending)."""
        return labels.value_counts().index[0]

    @staticmethod
    def cal_ent(dataset):
        """Return the base-2 entropy of the last (label) column of ``dataset``."""
        total = len(dataset)
        ent = 0.0
        for count in dataset.iloc[:, -1].value_counts():
            prob = count / total
            ent -= prob * log(prob, 2)
        return ent

    def cal_expent(self, dataset, axis):
        """Return the empirical conditional entropy of the label given column ``axis``.

        Args:
            dataset: DataFrame with the label in the last column.
            axis: Positional index of the feature column to condition on.
        """
        total = len(dataset)
        column = dataset.iloc[:, axis]
        expent = 0.0
        for value in column.value_counts().index:
            subset = dataset[column == value]
            expent += len(subset) / total * self.cal_ent(subset)
        return expent

    @staticmethod
    def info_gain(ent, expent):
        """Return the information gain: entropy minus conditional entropy."""
        return ent - expent

    def max_info_gain(self, dataset):
        """Return ``(column_index, gain)`` for the feature with the largest gain.

        On ties the feature with the lowest column index is returned.
        """
        ent = self.cal_ent(dataset)
        gains = [
            (i, self.info_gain(ent, self.cal_expent(dataset, i)))
            for i in range(len(dataset.columns) - 1)
        ]
        return max(gains, key=lambda pair: pair[-1])

    def build(self, dataset):
        """Recursively build the tree for ``dataset``.

        Args:
            dataset: DataFrame; the last column is the label.

        Returns:
            The root Node of the (sub)tree.
        """
        y_train = dataset.iloc[:, -1]
        features = dataset.columns[:-1]

        # Case 1: every sample has the same class.
        if y_train.nunique() == 1:
            return Node(isleaf=True, label=self._majority_label(y_train))

        # Case 2: no feature is left to split on. Stopping while one feature
        # still remained would mean the last feature could never be used.
        if len(features) == 0:
            return Node(isleaf=True, label=self._majority_label(y_train))

        max_feature, max_info = self.max_info_gain(dataset)
        max_feature_name = features[max_feature]

        # Case 3: best gain is below eps, so fall back to the majority class.
        if max_info < self.eps:
            return Node(isleaf=True, label=self._majority_label(y_train))

        # Case 4: split on the best feature, one branch per observed value.
        node_tree = Node(isleaf=False, feature_name=max_feature_name, feature=max_feature)
        split_column = dataset.iloc[:, max_feature]
        for value in split_column.value_counts().index:
            # The used feature is dropped so children see only the remaining
            # columns; Node.predict removes the same position at query time.
            sub_data = dataset.loc[split_column == value].drop(columns=[max_feature_name])
            node_tree.add_edge(value, self.build(sub_data))

        return node_tree

    def fit(self, dataset):
        """Build the tree from ``dataset``, store it on ``self.tree`` and return it."""
        self.tree = self.build(dataset)
        return self.tree

    def predict(self, test_data):
        """Return the predicted label for one sample.

        Args:
            test_data: List of feature values in training-column order.
                Must be called after ``fit``; before that ``self.tree`` is
                still the empty dict set in ``__init__``.
        """
        return self.tree.predict(test_data)

In [6]:
# Fit the tree on the iris frame; `tree` is the root Node that fit() returns.
# build() makes one branch per distinct feature value.
dt = DTree()
tree = dt.fit(data)

In [7]:
# Display the fitted tree through Node.__repr__.
tree

{'isleaf': False, 'label': None, 'feature_name': 'petal length (cm)', 'tree': {1.5: {'isleaf': True, 'label': 0, 'feature_name': None, 'tree': {}}, 1.4: {'isleaf': True, 'label': 0, 'feature_name': None, 'tree': {}}, 5.1: {'isleaf': False, 'label': None, 'feature_name': 'sepal length (cm)', 'tree': {5.8: {'isleaf': True, 'label': 2, 'feature_name': None, 'tree': {}}, 6.3: {'isleaf': True, 'label': 2, 'feature_name': None, 'tree': {}}, 6.9: {'isleaf': True, 'label': 2, 'feature_name': None, 'tree': {}}, 5.9: {'isleaf': True, 'label': 2, 'feature_name': None, 'tree': {}}, 6.5: {'isleaf': True, 'label': 2, 'feature_name': None, 'tree': {}}, 6.0: {'isleaf': True, 'label': 1, 'feature_name': None, 'tree': {}}}}, 4.5: {'isleaf': False, 'label': None, 'feature_name': 'sepal length (cm)', 'tree': {6.0: {'isleaf': True, 'label': 1, 'feature_name': None, 'tree': {}}, 4.9: {'isleaf': True, 'label': 2, 'feature_name': None, 'tree': {}}, 6.2: {'isleaf': True, 'label': 1, 'feature_name': None, 'tree

In [8]:
# Print the tree's prediction, then the true label, for the rows labelled 0
# and 149. The label is read with `.iloc[-1]`: plain `row[-1]` relies on the
# deprecated integer-as-position fallback of Series.__getitem__.
for row_label in (0, 149):
    row = data.loc[row_label]
    print(dt.predict(row.iloc[:-1].tolist()))
    print(row.iloc[-1])

0
0.0
2
2.0


In [9]:
# Problem 5.1 from the book
def create_data():
    """Return the sample table and the name of each column.

    Returns:
        A ``(datasets, labels)`` tuple. ``datasets`` is a list of 15 rows,
        each a list of five strings; ``labels`` lists the five column names
        in the same order, the last one being the class column.
    """
    column_names = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    # Rows grouped by their first column; the remaining four values follow.
    grouped = [
        ('青年', [
            ['否', '否', '一般', '否'],
            ['否', '否', '好', '否'],
            ['是', '否', '好', '是'],
            ['是', '是', '一般', '是'],
            ['否', '否', '一般', '否'],
        ]),
        ('中年', [
            ['否', '否', '一般', '否'],
            ['否', '否', '好', '否'],
            ['是', '是', '好', '是'],
            ['否', '是', '非常好', '是'],
            ['否', '是', '非常好', '是'],
        ]),
        ('老年', [
            ['否', '是', '非常好', '是'],
            ['否', '是', '好', '是'],
            ['是', '否', '好', '是'],
            ['是', '否', '非常好', '是'],
            ['否', '否', '一般', '否'],
        ]),
    ]
    rows = [[first, *rest] for first, group in grouped for rest in group]
    # Return the rows together with the name of each dimension.
    return rows, column_names

In [10]:
# Raw rows and column names of the sample table (this create_data shadows the iris one above).
datasets, labels = create_data()

In [11]:
# Wrap the rows in a DataFrame; DTree expects the class in the last column.
train_data = pd.DataFrame(datasets, columns=labels)

In [12]:
# Display the string-valued training frame.
train_data

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


In [13]:
# Fit a fresh tree on the sample table; this rebinds `dt` and `tree` from the iris run.
dt = DTree()
tree = dt.fit(train_data)

In [14]:
# Display the fitted tree through Node.__repr__.
tree

{'isleaf': False, 'label': None, 'feature_name': '有自己的房子', 'tree': {'否': {'isleaf': False, 'label': None, 'feature_name': '有工作', 'tree': {'否': {'isleaf': True, 'label': '否', 'feature_name': None, 'tree': {}}, '是': {'isleaf': True, 'label': '是', 'feature_name': None, 'tree': {}}}}, '是': {'isleaf': True, 'label': '是', 'feature_name': None, 'tree': {}}}}

In [15]:
# Classify one sample; values follow the feature column order
# 年龄, 有工作, 有自己的房子, 信贷情况.
dt.predict(['老年', '否', '否', '一般'])

'否'

In [16]:
# Convert the non-numeric values to integer codes:
#   年龄:         青年 -> 0, 中年 -> 1, 老年 -> 2
#   有工作:       否 -> 0, 是 -> 1
#   有自己的房子: 否 -> 0, 是 -> 1
#   信贷情况:     一般 -> 0, 好 -> 1, 非常好 -> 2
#   类别:         否 -> 0, 是 -> 1
CATEGORY_CODES = {
    '年龄': {'青年': 0, '中年': 1, '老年': 2},
    '有工作': {'否': 0, '是': 1},
    '有自己的房子': {'否': 0, '是': 1},
    '信贷情况': {'一般': 0, '好': 1, '非常好': 2},
    '类别': {'否': 0, '是': 1},
}

# Rebuild from the raw rows rather than re-encoding `train_data` in place:
# encoding an already-encoded frame would collapse every value to one code,
# so this keeps the cell safe to run more than once.
train_data = pd.DataFrame(datasets, columns=labels)
for column, codes in CATEGORY_CODES.items():
    unknown = set(train_data[column]) - set(codes)
    if unknown:
        # Fail loudly instead of silently folding unexpected values into a code.
        raise ValueError("unexpected values in column {!r}: {}".format(column, sorted(unknown)))
    train_data[column] = train_data[column].map(codes)

In [17]:
# Display the integer-encoded training frame.
train_data

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,0,0,0,0,0
1,0,0,0,1,0
2,0,1,0,1,1
3,0,1,1,0,1
4,0,0,0,0,0
5,1,0,0,0,0
6,1,0,0,1,0
7,1,1,1,1,1
8,1,0,1,2,1
9,1,0,1,2,1


# 练习使用sklearn的决策树分类器

In [18]:
from sklearn.tree import DecisionTreeClassifier

feature_columns = train_data.columns[:-1]

# Fix random_state so ties between equally good splits are broken the same
# way on every run.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_data.iloc[:, :-1], train_data.iloc[:, -1])

# The model was fitted on a DataFrame, so query with the same column names;
# a bare list makes scikit-learn warn about missing feature names.
queries = pd.DataFrame([[1, 1, 1, 1], [2, 0, 0, 0]], columns=feature_columns)
clf.predict(queries)

array([1, 0])