##### 决策树的原理
1. 信息增益：
特征A对训练数据集D的信息增益g(D,A),定义为集合D的信息熵H(D)与特征A给定条件下D的信息条件熵H(D|A)之差，即公式为：
$$g(D, A) = H(D) - H(D|A)$$
信息增益表达得知特征X的信息而使得Y的信息不确定性减少的程度

##### 决策树API
``````python
class sklearn.tree.DecisionTreeClassifier(criterion = "gini", max_depth = None, random_state = None)
    criterion: 判断优劣的算法，默认是“gini”
    max_depth: 树的深度
    random_state: 随机数种子
``````

##### 决策树的优劣
- 优点：直观
- 缺点：数据太过复杂会出现过拟合

In [9]:
import pandas as pd
# load titanic datasets
data = pd.read_csv("data/titanic/train.csv")
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [10]:
data.shape

(891, 12)

In [58]:
data = data[["Pclass", "Sex", "Embarked", "Survived"]]
data = data.dropna()
data = data.astype("category")
print(data.shape)
print(data.describe())

x = data[["Pclass", "Sex", "Embarked"]]
y = data[["Survived"]]
featureNames = list(data.columns.values)
#print(x.head(10))
#print(y.head(10))

# convert x to dict
x = x.to_dict(orient = "record")
y = y.to_dict(orient = "record")
print(x)


(889, 4)
        Pclass   Sex Embarked  Survived
count      889   889      889       889
unique       3     2        3         2
top          3  male        S         0
freq       491   577      644       549
[{'Pclass': 3, 'Sex': 'male', 'Embarked': 'S'}, {'Pclass': 1, 'Sex': 'female', 'Embarked': 'C'}, {'Pclass': 3, 'Sex': 'female', 'Embarked': 'S'}, {'Pclass': 1, 'Sex': 'female', 'Embarked': 'S'}, {'Pclass': 3, 'Sex': 'male', 'Embarked': 'S'}, {'Pclass': 3, 'Sex': 'male', 'Embarked': 'Q'}, {'Pclass': 1, 'Sex': 'male', 'Embarked': 'S'}, {'Pclass': 3, 'Sex': 'male', 'Embarked': 'S'}, {'Pclass': 3, 'Sex': 'female', 'Embarked': 'S'}, {'Pclass': 2, 'Sex': 'female', 'Embarked': 'C'}, {'Pclass': 3, 'Sex': 'female', 'Embarked': 'S'}, {'Pclass': 1, 'Sex': 'female', 'Embarked': 'S'}, {'Pclass': 3, 'Sex': 'male', 'Embarked': 'S'}, {'Pclass': 3, 'Sex': 'male', 'Embarked': 'S'}, {'Pclass': 3, 'Sex': 'female', 'Embarked': 'S'}, {'Pclass': 2, 'Sex': 'female', 'Embarked': 'S'}, {'Pclass': 3, 'Sex':

In [57]:
def DecisionTree(x, y):
    
    from sklearn.feature_extraction import DictVectorizer
    dict = DictVectorizer(sparse = False)
    x = dict.fit_transform(x)
    y = dict.fit_transform(y)
    
    # split test and train data
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)
    
    # import DecisionTreeClassifier
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier(criterion = "gini", max_depth = 5)
    
    # fit the model
    dt.fit(x_train, y_train)
    
    # from sklearn.tree import export_graphviz
    # export_graphviz(dt, out_file = "dt.dot")
    
    #print(dt.decision_path(x_test))
    print(dt.tree_.__getstate__())
    print("=" * 30)
    
    print(dt.score(x_test, y_test))
    
    from sklearn import tree
    dotfile = open("dt.dot", 'w')
    tree.export_graphviz(dt, out_file=dotfile, feature_names = featureNames)
    dotfile.close()

    return None


if __name__ == "__main__":
    DecisionTree(x, y)

{'max_depth': 5, 'node_count': 31, 'nodes': array([( 1, 16,  5,  0.5, 0.47185925, 666, 666.),
       ( 2, 11,  3,  2.5, 0.38132807, 234, 234.),
       ( 3,  8,  2,  0.5, 0.1171875 , 128, 128.),
       ( 4,  7,  3,  1.5, 0.04996713,  39,  39.),
       ( 5,  6,  0,  0.5, 0.06054688,  32,  32.),
       (-1, -1, -2, -2. , 0.        ,   1,   1.),
       (-1, -1, -2, -2. , 0.06243496,  31,  31.),
       (-1, -1, -2, -2. , 0.        ,   7,   7.),
       ( 9, 10,  3,  1.5, 0.1449312 ,  89,  89.),
       (-1, -1, -2, -2. , 0.10226443,  37,  37.),
       (-1, -1, -2, -2. , 0.17381657,  52,  52.),
       (12, 15,  2,  0.5, 0.499822  , 106, 106.),
       (13, 14,  0,  0.5, 0.44938017,  44,  44.),
       (-1, -1, -2, -2. , 0.4032    ,  25,  25.),
       (-1, -1, -2, -2. , 0.48753463,  19,  19.),
       (-1, -1, -2, -2. , 0.48126951,  62,  62.),
       (17, 22,  3,  1.5, 0.30178326, 432, 432.),
       (18, 21,  1,  0.5, 0.463391  ,  85,  85.),
       (19, 20,  2,  0.5, 0.46570295,  84,  84.),
      

ValueError: Length of feature_names, 4 does not match number of features, 6

NameError: name 'dt' is not defined