# 导入训练数据

In [1]:
import pandas #ipython notebook
titanic = pandas.read_csv("titanic_train.csv")
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 数据预处理

In [3]:
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
print(titanic.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [5]:
print(titanic["Sex"].unique())

# Replace all the occurences of male with the number 0.
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

['male' 'female']


In [6]:
print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna('S')
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

['S' 'C' 'Q' nan]


In [7]:
titanic_test = pandas.read_csv("titanic_test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0 
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

# 模型训练

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
target='Survived'# Survived的值就是分类的输出  
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = titanic[predictors]# 分割 x 向量变量
y = titanic[target]# 目标训练字段
gbdt = GradientBoostingClassifier(random_state=10)#分类模型
gbdt.fit(X,y)#模型训练

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=10, subsample=1.0, verbose=0,
              warm_start=False)

# 测试模型

In [21]:
from sklearn import metrics  
y_pred = gbdt.predict(X)# 预测
y_predprob = gbdt.predict_proba(X)[:,1] # 预测概率
print ("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))
print(metrics.classification_report(y.values, y_pred))

Accuracy : 0.8889
AUC Score (Train): 0.946583
             precision    recall  f1-score   support

          0       0.87      0.96      0.91       549
          1       0.92      0.78      0.84       342

avg / total       0.89      0.89      0.89       891



# 用测试集进行测试

In [19]:
Xt = titanic_test[predictors]# 分割 x 向量变量
y_pred = gbdt.predict(Xt)# 预测
y_predprob = gbdt.predict_proba(Xt)[:,1] # 预测概率
y_predprob

array([ 0.0453087 ,  0.31492968,  0.2271579 ,  0.11208302,  0.36061187,
        0.1050144 ,  0.45814015,  0.18052626,  0.88709337,  0.08657651,
        0.08225089,  0.15499689,  0.94357795,  0.25032299,  0.90897853,
        0.93400018,  0.08487987,  0.17675905,  0.5280773 ,  0.52461293,
        0.27856014,  0.64440851,  0.94724671,  0.48356076,  0.96206427,
        0.05806317,  0.97135221,  0.17675905,  0.54486608,  0.15912965,
        0.06465929,  0.15110409,  0.45525861,  0.21734469,  0.6697365 ,
        0.15468511,  0.41079323,  0.38708377,  0.10070989,  0.49177213,
        0.06762399,  0.52791144,  0.06768424,  0.88584518,  0.9270119 ,
        0.14188046,  0.1233614 ,  0.13733638,  0.95666762,  0.58546077,
        0.31655857,  0.16210628,  0.86166477,  0.85749495,  0.19073429,
        0.03459709,  0.07058627,  0.09937164,  0.10284057,  0.97260726,
        0.08753385,  0.1917797 ,  0.09690674,  0.83424506,  0.6254223 ,
        0.8817299 ,  0.74347597,  0.15328593,  0.27730358,  0.91

# 模型调参 (加分项)








































首先我们从步长(learning rate)和迭代次数(n_estimators)入手。一般来说,开始选择一个较小的步长来网格搜索最好的迭代次数。这里，我们将步长初始值设置为0.1。对于迭代次数进行网格搜索如下：


In [22]:
from sklearn.grid_search import GridSearchCV
param_test1 = {'n_estimators':[i for i in range(20,81,10)]}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',iid=False,cv=5)
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.85553, std: 0.01667, params: {'n_estimators': 20},
  mean: 0.86189, std: 0.02003, params: {'n_estimators': 30},
  mean: 0.86272, std: 0.02391, params: {'n_estimators': 40},
  mean: 0.86358, std: 0.02543, params: {'n_estimators': 50},
  mean: 0.86602, std: 0.02466, params: {'n_estimators': 60},
  mean: 0.86667, std: 0.02410, params: {'n_estimators': 70},
  mean: 0.86516, std: 0.02558, params: {'n_estimators': 80}],
 {'n_estimators': 70},
 0.8666668213138685)

找到了一个合适的迭代次数，现在我们开始对决策树进行调参。首先我们对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索

In [30]:
param_test2 = {'max_depth':[i for i in range(3,14,2)], 'min_samples_split':[i for i in range(100,401,100)]}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=70, min_samples_leaf=20, 
      max_features='sqrt', subsample=0.8, random_state=10), 
   param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(X,y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.87117, std: 0.02827, params: {'max_depth': 3, 'min_samples_split': 100},
  mean: 0.87104, std: 0.02773, params: {'max_depth': 3, 'min_samples_split': 200},
  mean: 0.86467, std: 0.02377, params: {'max_depth': 3, 'min_samples_split': 300},
  mean: 0.86369, std: 0.01934, params: {'max_depth': 3, 'min_samples_split': 400},
  mean: 0.87008, std: 0.02834, params: {'max_depth': 5, 'min_samples_split': 100},
  mean: 0.87101, std: 0.02950, params: {'max_depth': 5, 'min_samples_split': 200},
  mean: 0.86639, std: 0.02496, params: {'max_depth': 5, 'min_samples_split': 300},
  mean: 0.86447, std: 0.02042, params: {'max_depth': 5, 'min_samples_split': 400},
  mean: 0.87075, std: 0.02974, params: {'max_depth': 7, 'min_samples_split': 100},
  mean: 0.86961, std: 0.03127, params: {'max_depth': 7, 'min_samples_split': 200},
  mean: 0.86667, std: 0.02410, params: {'max_depth': 7, 'min_samples_split': 300},
  mean: 0.86375, std: 0.02138, params: {'max_depth': 7, 'min_samples_split': 400},
  me

我们把它定下来，对于内部节点再划分所需最小样本数min_samples_split，我们暂时不能一起定下来，因为这个还和决策树其他的参数存在关联。下面我们再对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。

In [32]:
param_test3 = {'min_samples_split':[ i for i in range(10,101,10)], 'min_samples_leaf':[ i for i in range(10,101,10)]}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=70,max_depth=9,
                                     max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test3, scoring='roc_auc',iid=False, cv=5)
gsearch3.fit(X,y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.86629, std: 0.03300, params: {'min_samples_leaf': 10, 'min_samples_split': 10},
  mean: 0.86629, std: 0.03300, params: {'min_samples_leaf': 10, 'min_samples_split': 20},
  mean: 0.86756, std: 0.02948, params: {'min_samples_leaf': 10, 'min_samples_split': 30},
  mean: 0.86940, std: 0.03045, params: {'min_samples_leaf': 10, 'min_samples_split': 40},
  mean: 0.86566, std: 0.03050, params: {'min_samples_leaf': 10, 'min_samples_split': 50},
  mean: 0.87183, std: 0.03025, params: {'min_samples_leaf': 10, 'min_samples_split': 60},
  mean: 0.86999, std: 0.03293, params: {'min_samples_leaf': 10, 'min_samples_split': 70},
  mean: 0.86993, std: 0.03069, params: {'min_samples_leaf': 10, 'min_samples_split': 80},
  mean: 0.86992, std: 0.02969, params: {'min_samples_leaf': 10, 'min_samples_split': 90},
  mean: 0.86900, std: 0.03124, params: {'min_samples_leaf': 10, 'min_samples_split': 100},
  mean: 0.87251, std: 0.03051, params: {'min_samples_leaf': 20, 'min_samples_split': 10},
  mean: 0

现在我们再对最大特征数max_features进行网格搜索。

In [35]:
param_test4 = {'max_features':[ i for i in range(1,7,1)]}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=70,max_depth=9, min_samples_leaf =20, 
               min_samples_split =70, subsample=0.8, random_state=10), 
                       param_grid = param_test4, scoring='roc_auc',iid=False, cv=5)
gsearch4.fit(X,y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.87178, std: 0.02983, params: {'max_features': 1},
  mean: 0.87547, std: 0.02975, params: {'max_features': 2},
  mean: 0.87612, std: 0.02722, params: {'max_features': 3},
  mean: 0.87102, std: 0.02612, params: {'max_features': 4},
  mean: 0.87321, std: 0.02942, params: {'max_features': 5},
  mean: 0.87623, std: 0.02815, params: {'max_features': 6}],
 {'max_features': 6},
 0.876230350028832)

现在我们再对子采样的比例进行网格搜索

In [36]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=70,max_depth=9, min_samples_leaf =20, 
               min_samples_split =70, max_features=6, random_state=10), 
                       param_grid = param_test5, scoring='roc_auc',iid=False, cv=5)
gsearch5.fit(X,y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.87283, std: 0.02361, params: {'subsample': 0.6},
  mean: 0.87336, std: 0.02541, params: {'subsample': 0.7},
  mean: 0.87425, std: 0.02541, params: {'subsample': 0.75},
  mean: 0.87623, std: 0.02815, params: {'subsample': 0.8},
  mean: 0.87507, std: 0.02509, params: {'subsample': 0.85},
  mean: 0.87183, std: 0.02644, params: {'subsample': 0.9}],
 {'subsample': 0.8},
 0.876230350028832)

现在我们基本已经得到我们所有调优的参数结果了。这时我们可以减半步长，最大迭代次数加倍来增加我们模型的泛化能力。再次拟合我们的模型：

In [37]:
gbdt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=70,max_depth=9, min_samples_leaf =20, 
               min_samples_split =70, max_features=6, subsample=0.8, random_state=10)
gbdt.fit(X,y)
y_pred = gbdt.predict(X)
y_predprob = gbdt.predict_proba(X)[:,1]
print("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))
print(metrics.classification_report(y.values, y_pred))

Accuracy : 0.9001
AUC Score (Train): 0.961285
             precision    recall  f1-score   support

          0       0.90      0.95      0.92       549
          1       0.91      0.82      0.86       342

avg / total       0.90      0.90      0.90       891

