In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [110]:
# 获取数据
titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

In [111]:
titan.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [112]:
# 数据基本处理

In [113]:
# 1.缺失值处理
# 年龄存在缺失值
titan['age'].fillna(titan['age'].mean(),inplace=True)

In [114]:
# 2.确定特征值和目标值
x = titan[['pclass','age','sex']]
y = titan['survived']

In [115]:
# 3.数据切割
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)

In [116]:
# 特征工程

In [117]:
# 1.特征预处理，归一化，标准化
# 特征不是数字

In [118]:
# 2.特征抽取
# one-hot编码
# sparse=True返回sparse矩阵
feature_dict = DictVectorizer(sparse=False)
# orient="records" 一行一行处理
x_train = feature_dict.fit_transform(x_train.to_dict(orient="records"))

In [119]:
feature_dict.get_feature_names()

['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']

In [120]:
x_test = feature_dict.transform(x_test.to_dict(orient="records"))

In [121]:
print(x_train)

[[  0.3333       0.           0.           1.           0.           1.        ]
 [ 31.19418104   0.           0.           1.           0.           1.        ]
 [ 25.           1.           0.           0.           0.           1.        ]
 ..., 
 [ 18.           1.           0.           0.           1.           0.        ]
 [ 31.19418104   0.           0.           1.           0.           1.        ]
 [ 31.19418104   1.           0.           0.           0.           1.        ]]


In [122]:
# 3.特征降维

## 决策树法

In [123]:
# 建立模型，机器学习，选择算法
# 用决策树进行预测
estimator = DecisionTreeClassifier(max_depth=10)
estimator.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [124]:
# 模型评估

In [125]:
# 1.准确率
score = estimator.score(x_test,y_test)
print("准确率为",score)
# 2.平均绝对误差

准确率为 0.784194528875


In [49]:
# 决策树的本地保存
# 导包
# 装软件
sudo apt-get install graphviz
# 保存文件
export_graphviz(estimator,out_file="./titan_tree.dot",feature_names=['n年龄', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', '女性', '男性'])

# 转换图像
# dot -Tpng ./titan_tree.dot -o titan_tree.png

## 随机森林进行预测

In [126]:
# 建立模型，机器学习，选择算法
# 用随机森林预测
rf = RandomForestClassifier()
# 网格搜索与交叉验证
param = {'n_estimators':[10,20,30,50,80,120],'max_depth':[2,3,4,5,8]}
gc = GridSearchCV(rf,param_grid=param,cv=2)
# 进行训练
gc.fit(x_train,y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 30, 50, 80, 120], 'max_depth': [2, 3, 4, 5, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [127]:
# 模型评估
score = gc.score(x_test,y_test)
print("准确率",score)
print("最优参数模型:",gc.best_estimator_)

准确率 0.784194528875
最优参数模型: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
