In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [2]:
# 1、获取数据
titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

In [4]:
titan.head()

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [6]:
# 2. 数据基本处理
#    2.1 缺失值处理
np.any(pd.isnull(titan))

row.names    False
pclass       False
survived     False
name         False
age           True
embarked      True
home.dest     True
room          True
ticket        True
boat          True
sex          False
dtype: bool

In [8]:
titan['age'].fillna(titan['age'].mean(),inplace =True)

In [11]:
#    2.2 确定特征值目标值
x = titan[['pclass','age','sex']]
y = titan['survived']

In [12]:
#    2.3 切割数据
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state =22)

In [20]:
# 3. 特征工程
#    特征提取

transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train.to_dict(orient='records'))
x_test = transfer.fit_transform(x_test.to_dict(orient='records'))

In [36]:
# 4. 建立模型
#    决策树算法
estimator = DecisionTreeClassifier(criterion='entropy')

In [37]:
estimator.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [38]:
# 5. 模型评估
pre = estimator.predict(x_test)

In [39]:
print("预测值与真实值",pre == y_test)

预测值与真实值 831      True
261     False
1210     True
1155     True
255      True
762      True
615      True
507      True
1175     True
301      True
1134     True
177     False
183     False
125     False
1093     True
1304    False
1124     True
798     False
1101     True
1239    False
1153     True
1068    False
846      True
148      True
478      True
642      True
1298     True
540      True
28       True
130      True
        ...  
863      True
785     False
5       False
102      True
652      True
487     False
806      True
1309     True
309     False
248      True
655      True
198      True
230      True
1088     True
187      True
476      True
1122    False
599      True
1259     True
379     False
936      True
1181     True
1005    False
127     False
136      True
741      True
1253     True
128      True
605      True
271      True
Name: survived, Length: 263, dtype: bool


In [40]:
estimator.score(x_test,y_test)

0.779467680608365

# 实现模型可视化

In [41]:
export_graphviz(estimator,out_file='./python28.dot',feature_names=transfer.get_feature_names())

# 随机森林

In [44]:
#    2.2 确定特征值目标值
x = titan[['pclass','age','sex']]
y = titan['survived']
#    2.3 切割数据
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state =22)
# 3. 特征工程
#    特征提取

transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train.to_dict(orient='records'))
x_test = transfer.fit_transform(x_test.to_dict(orient='records'))
# 4. 建立模型
#    决策树算法
estimator = RandomForestClassifier(n_estimators=10,max_depth=5)
param = {'n_estimators':[10,50,100],"max_depth":[5,8,10]}
estimator = GridSearchCV(estimator,param_grid=param,cv=4)

In [45]:
estimator.fit(x_train,y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100], 'max_depth': [5, 8, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
# 5. 模型评估
pre = estimator.predict(x_test)

In [47]:
estimator.score(x_test,y_test)

0.7908745247148289

In [48]:
print("最优模型\n",estimator.best_estimator_)

最优模型
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [50]:
print("最优结果",estimator.best_score_)

最优结果 0.8304761904761905
