# 使用 DictVectorizer 处理离散特征，即独热编码，这样 sklearn 的决策树模型才能运用

In [2]:
import pandas as pd

origin_data = pd.read_csv('./AllElectronics.csv')
origin_data

Unnamed: 0,RID,age,income,student,credit_rating,Class_bugs_computer
0,1,youth,high,no,fair,no
1,2,youth,high,no,excellent,no
2,3,middle_aged,high,no,fair,yes
3,4,senior,medium,no,fair,yes
4,5,senior,low,yes,fair,yes
5,6,senior,low,yes,excellent,no
6,7,middle_aged,low,yes,excellent,yes
7,8,youth,medium,no,fair,no
8,9,youth,low,yes,fair,yes
9,10,senior,medium,yes,fair,yes


In [3]:
labels = origin_data['Class_bugs_computer']
labels.value_counts()

yes    9
no     5
Name: Class_bugs_computer, dtype: int64

In [4]:
from sklearn.preprocessing import LabelBinarizer

# 二值化
lb = LabelBinarizer()
lb.fit_transform(labels)

array([[0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0]])

In [5]:
feature_data = origin_data[['age','credit_rating','income','student']]
feature_data

Unnamed: 0,age,credit_rating,income,student
0,youth,fair,high,no
1,youth,excellent,high,no
2,middle_aged,fair,high,no
3,senior,fair,medium,no
4,senior,fair,low,yes
5,senior,excellent,low,yes
6,middle_aged,excellent,low,yes
7,youth,fair,medium,no
8,youth,fair,low,yes
9,senior,fair,medium,yes


In [6]:
import csv


fr = open('./AllElectronics.csv')
reader = csv.reader(fr)
headers = next(reader)
headers

['RID', 'age', 'income', 'student', 'credit_rating', 'Class_bugs_computer']

In [7]:
feature_list = []

for row in reader:
    row_dict = {}
    for i in range(1,len(row)-1):
        row_dict[headers[i]] = row[i]
    feature_list.append(row_dict)

In [8]:
feature_list

[{'age': 'youth', 'credit_rating': 'fair', 'income': 'high', 'student': 'no'},
 {'age': 'youth',
  'credit_rating': 'excellent',
  'income': 'high',
  'student': 'no'},
 {'age': 'middle_aged',
  'credit_rating': 'fair',
  'income': 'high',
  'student': 'no'},
 {'age': 'senior',
  'credit_rating': 'fair',
  'income': 'medium',
  'student': 'no'},
 {'age': 'senior', 'credit_rating': 'fair', 'income': 'low', 'student': 'yes'},
 {'age': 'senior',
  'credit_rating': 'excellent',
  'income': 'low',
  'student': 'yes'},
 {'age': 'middle_aged',
  'credit_rating': 'excellent',
  'income': 'low',
  'student': 'yes'},
 {'age': 'youth',
  'credit_rating': 'fair',
  'income': 'medium',
  'student': 'no'},
 {'age': 'youth', 'credit_rating': 'fair', 'income': 'low', 'student': 'yes'},
 {'age': 'senior',
  'credit_rating': 'fair',
  'income': 'medium',
  'student': 'yes'},
 {'age': 'youth',
  'credit_rating': 'excellent',
  'income': 'medium',
  'student': 'yes'},
 {'age': 'middle_aged',
  'credit_rat

In [9]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()
dv.fit(feature_list)

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [11]:
dummy_X = dv.transform(feature_list).toarray()
dummy_X

array([[0., 0., 1., 0., 1., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 1., 1., 0., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 1., 1., 0.]])

In [12]:
dummy_X.shape

(14, 10)

In [13]:
dv.get_feature_names()

['age=middle_aged',
 'age=senior',
 'age=youth',
 'credit_rating=excellent',
 'credit_rating=fair',
 'income=high',
 'income=low',
 'income=medium',
 'student=no',
 'student=yes']

In [14]:
labels

0      no
1      no
2     yes
3     yes
4     yes
5      no
6     yes
7      no
8     yes
9     yes
10    yes
11    yes
12    yes
13     no
Name: Class_bugs_computer, dtype: object

In [15]:
len(labels)

14

In [16]:
len(dummy_X)

14

## 使用决策树算法进行训练

In [17]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(dummy_X,labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## 可视化决策树模型

+ 第 1 步：生成 dot 文件

In [18]:
import sklearn.tree as tree

with open('2018_08_04.dot','w') as fw:
    f = tree.export_graphviz(clf,feature_names=dv.get_feature_names(),out_file=fw)

In [19]:
ls

[31m2018_08_04.dot[m[m*
[31m2018_08_04.pdf[m[m*
[31mAllElectronics.csv[m[m*
[31mUntitled.ipynb[m[m*
[34m__pycache__[m[m/
[30m[43mfiles[m[m/
[31mloan_example.csv[m[m*
[31mtrees.py[m[m*
[31m《机器学习实战》第 3 章 决策树学习笔记.ipynb[m[m*
[31m使用 DictVectorizer 处理离散特征，即独热编码，这样 sklearn 的决策树模型才能运用.ipynb[m[m*
[31m使用决策树对鸢尾花数据分类.ipynb[m[m*
[30m[43m唐宇迪-决策树[m[m/


+ 第 2 步：使用 dot 命令生成 pdf 文件

In [20]:
!dot -Tpdf 2018_08_04.dot -o 2018_08_04.pdf