In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

#### 数据加载

In [2]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

#### 像之前一样，填充缺失值

In [3]:
# 使用平均年龄来填充年龄中的nan值
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(),inplace=True)

In [4]:
# 使用票价的均值填充票价中的nan值
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace=True)

In [5]:
print(train_data['Embarked'].value_counts())
# 使用登录最多的港口来填充登录港口的nan值
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S',inplace=True)

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [6]:
# 特征选择
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]
print('特征值')
print(train_features)

特征值
     Pclass     Sex        Age  SibSp  Parch     Fare Embarked
0         3    male  22.000000      1      0   7.2500        S
1         1  female  38.000000      1      0  71.2833        C
2         3  female  26.000000      0      0   7.9250        S
3         1  female  35.000000      1      0  53.1000        S
4         3    male  35.000000      0      0   8.0500        S
..      ...     ...        ...    ...    ...      ...      ...
886       2    male  27.000000      0      0  13.0000        S
887       1  female  19.000000      0      0  30.0000        S
888       3  female  29.699118      1      2  23.4500        S
889       1    male  26.000000      0      0  30.0000        C
890       3    male  32.000000      0      0   7.7500        Q

[891 rows x 7 columns]


#### 对特征作向量化处理

In [8]:
from sklearn.feature_extraction import DictVectorizer
dvec=DictVectorizer(sparse=False)
train_features=dvec.fit_transform(train_features.to_dict(orient='record'))
print(dvec.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


#### 根据TPOT的结果，定义一个GradientBoostingClassifier的分类器并且使用TPOT建议的参数

In [9]:
# Average CV score on the training set was: 0.8305316678174627
exported_pipeline = GradientBoostingClassifier(learning_rate=0.1, max_depth=8, max_features=0.2, min_samples_leaf=8, min_samples_split=13, n_estimators=100, subsample=0.3)

In [10]:
exported_pipeline.fit(train_features, train_labels)

GradientBoostingClassifier(max_depth=8, max_features=0.2, min_samples_leaf=8,
                           min_samples_split=13, subsample=0.3)

In [11]:
test_features=dvec.transform(test_features.to_dict(orient='record'))
# 预测测试集
pred_labels = exported_pipeline.predict(test_features)

In [12]:
# 得到决策树准确率(基于训练集)
acc_decision_tree = round(exported_pipeline.score(train_features, train_labels), 6)
print(u'score准确率为 %.4lf' % acc_decision_tree)

score准确率为 0.8822


#### 使用K折交叉验证 统计决策树准确率

In [13]:
from sklearn.model_selection import cross_val_score

print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(exported_pipeline, train_features, train_labels, cv=10)))

cross_val_score准确率为 0.8238


#### 通过对比交叉验证的结果可以发现，虽然GradientBoostingClassifier对于训练集的准确率没有那么高，但是在交叉验证的结果上比决策树要好一些。