In [1]:
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
train_df=pd.read_csv('./titanic/train.csv')
test_df=pd.read_csv('./titanic/test.csv')
import warnings
warnings.filterwarnings('ignore')

#重复上一节的操作...
train_df['Cabin'].fillna('missing',inplace=True)
test_df['Cabin'].fillna('missing',inplace=True)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0],inplace=True)
train_df['Age'].fillna(train_df['Age'].mean(),inplace=True)
test_df['Age'].fillna(train_df['Age'].mean(),inplace=True)
test_df['Fare'].fillna(train_df['Fare'].mean(),inplace=True)
import category_encoders as ce
del train_df['Name']
del train_df['Ticket']
del test_df['Name']
del test_df['Ticket']
del train_df['PassengerId']
del test_df['PassengerId']
label=train_df["Survived"]
del train_df["Survived"]
# target 
target_encoder = ce.TargetEncoder(cols=['Embarked','Cabin']).fit(train_df,label)
train_df=target_encoder.transform(train_df)
test_df=target_encoder.transform(test_df)

# one hot
onehot_encoder = ce.OneHotEncoder(cols=['Sex']).fit(train_df)
train_df=onehot_encoder.transform(train_df)
test_df=onehot_encoder.transform(test_df)

from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer
#z-score归一化为例
standard_scaler=StandardScaler()
standard_scaler.fit(train_df)
new_train_df=pd.DataFrame(standard_scaler.transform(train_df),columns=train_df.columns)
new_test_df=pd.DataFrame(standard_scaler.transform(test_df),columns=train_df.columns)

from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(degree=2,include_bias=False,interaction_only=False)#
poly_fea_np=poly.fit_transform(train_df)
poly_fea_df=pd.DataFrame(poly_fea_np,columns=poly.get_feature_names())

模型的优化，可以从两方面考虑：

（1）单模型优化：超参搜索；

（2）多模型集成：集成学习；

### 一.超参数搜索
超参是指需要人为设定的参数，比如前面gbdt中的n_estimators,max_depth,learning_rate等；目前常见的超参搜索有网格搜索、随机搜索、贝叶斯优化搜索，还有基于强化学习的，比如google vizier...，其实比较好的方法是“人工智能”搜索（只需要一个excel表，并记录到相关操作对结果的改变就好了<坏结果也要保留>）...

#### 网格搜索  

网格搜索有个缺点那就是很容易有“漏网之鱼”，网格搜索的参数都位于“交点”上，而最优解不一定落在这上面

In [2]:
from sklearn.model_selection import GridSearchCV
#定义搜索空间
gdbt_parameters = {'max_depth': [3,4,5],'learning_rate':[0.1,0.15,0.2],'n_estimators':[50,80,100,150]}
#定义模型
gbdt=GradientBoostingClassifier()
#进行搜索
grid = GridSearchCV(gbdt, gdbt_parameters,scoring='f1')
grid.fit(poly_fea_df, label)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  pre

In [3]:
grid.best_score_,grid.best_params_

(0.7657182671569628,
 {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150})

In [4]:
classifier=GradientBoostingClassifier(n_estimators=150,max_depth=3,learning_rate=0.2)
scores = cross_val_score(classifier,poly_fea_df,label, scoring='f1', cv = 5)
np.mean(scores),np.std(scores)

(0.7771387926391069, 0.052881600589403784)

#### 随机搜索
随机搜索会在超参数空间内生成很多随机的点，然后利用这些点的超参进行模型训练

更多：https://blog.csdn.net/qq_36810398/article/details/86699842

In [5]:
from sklearn.model_selection import RandomizedSearchCV
#定义搜索空间
gdbt_parameters = {'max_depth': [3,4,5],'learning_rate':[0.1,0.15,0.2],'n_estimators':[50,80,100,150]}
#定义模型
gbdt=GradientBoostingClassifier()
#进行搜索
random_search = RandomizedSearchCV(gbdt, gdbt_parameters,scoring='f1')
random_search.fit(poly_fea_df, label)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                

In [6]:
random_search.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 80}

In [7]:
classifier=GradientBoostingClassifier(n_estimators=80,max_depth=3,learning_rate=0.1)
scores = cross_val_score(classifier,poly_fea_df,label, scoring='f1', cv = 5)
np.mean(scores),np.std(scores)

(0.7796924064819074, 0.037484958581777465)

#### 贝叶斯优化
这里推荐使用Hyperopt工具  
更多：https://www.jianshu.com/p/35eed1567463|

In [8]:
from hyperopt import fmin, tpe, hp,STATUS_OK,Trials

#定义loss函数
def hyperopt_train_test(params):
    clf = GradientBoostingClassifier(**params)
    return cross_val_score(clf, poly_fea_df,label,cv=5,scoring='f1').mean()
#定义搜索空间
space4gbdt = {
    'max_depth': hp.choice('max_depth', [3,4,5]),
    'n_estimators': hp.choice('n_estimators', [50,80,100,150]),
    'learning_rate': hp.choice('learning_rate', [0.1,0.15,0.2])
}
#定义优化目标-最小化-f1
def f(params):
    f1 = hyperopt_train_test(params)
    return {'loss': -f1, 'status': STATUS_OK}
#查找最佳参数
trials = Trials()
best = fmin(f, space4gbdt, algo=tpe.suggest, max_evals=300, trials=trials)
print('best:',best)

100%|█████████████████████████████████████████████████| 300/300 [04:01<00:00,  1.53it/s, best loss: -0.797873471113155]
best: {'learning_rate': 1, 'max_depth': 0, 'n_estimators': 1}


In [9]:
classifier=GradientBoostingClassifier(n_estimators=50,max_depth=3,learning_rate=0.2)
scores = cross_val_score(classifier,poly_fea_df,label, scoring='f1', cv = 5)
np.mean(scores),np.std(scores)

(0.7776312721540197, 0.03726768752987142)

### 二.集成学习
最后我们还可以将多个模型的输出结果进行集成，常见的bagging(代表是rf),boosting(代表是gbdt)；另外gbdt的多种实现版本，大家可以在各种竞赛(特别是kaggle)中经常见到，比如xgboost,lightgbm,catboost等，这里我介绍另外一种比较暴力的集成学习方法：**stacking**，它将模型的预测结果作为上一层模型的特征输入，结构如图：  
![avatar](./source/stacking.jpg)

更多： https://github.com/zhulei227/Stacking_Ensembles  
更多stacking集成工具：https://www.jianshu.com/p/59313f43916f

In [10]:
from stacking_classifier import *
#定义模型结构
classifier = StackingClassifier(
    base_classifiers=[
        RandomForestClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(),
        GradientBoostingClassifier(),
        LightGBMClassifier(),
        SVMClassifier(),
        NaiveBayesClassifier(),
    ],
    meta_classifier=LogisticRegression(),
    subsample_features_rate=0.9,
    n_jobs=-1
)
classifier.build_model()

In [11]:
X_train,X_test, y_train, y_test =train_test_split(poly_fea_df, label,test_size=0.2)
classifier.fit(X_train,y_train)
y_predict=classifier.predict(X_test)
f1_score=metrics.f1_score(y_test,y_predict)
f1_score

0.8159999999999998