
# 模型融合

## 接任务五内容：
使用五折交叉验证法，GridSearch来寻找模型的最优参数¶

ref:https://github.com/toAlgorithm/machineLearning/blob/master/data_mining/task5/task5.ipynb


## 数据导入与切分

In [23]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('./dataset/task2_proc.csv')

labels = dataset['status']
features = dataset.drop(['status'], axis=1)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

random_state = 2018
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3,
                                                   random_state=random_state)

# 标准化
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.fit_transform(x_test)


## 模型训练与调优

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

### 使用逻辑回归运行五折交叉验证，网格搜索来获取最优参数
C为正则化系数λ的倒数，必须为整数，默认为1，值越小，代表正则化越强。一般来说只需要调节这个参数

In [41]:
from sklearn.model_selection import GridSearchCV

grid_lr = GridSearchCV(LogisticRegression(), 
                       param_grid={'C': [0.01, 0.05, 0.1, 0.2, 0.5, 1, 10]},                         
                       cv=5  # 
                      )
grid_lr.fit(x_train_scale, y_train)
print('Best parameters:{} with a score of {:.3f}'.format(grid_lr.best_params_, grid_lr.best_score_))

Best parameters:{'C': 0.05} with a score of 0.797


### 使用SVM运行五折交叉验证，网格搜索获取最后参数
惩罚系数C，核函数参数gamma，可以调节这两个参数


In [43]:
grid_svm = GridSearchCV(SVC(probability=True),
                       param_grid={'C': [0.1, 0.5, 1, 10, 20],
                                   'gamma': [1, 0,5, 0.1, 0.01]})
grid_svm.fit(x_train_scale, y_train)


Best parameters:{'C': 0.05} with a score of 0.797


In [49]:
print('Best parameters:{} with a score of {:.3f}'.format(grid_svm.best_params_, grid_svm.best_score_))

Best parameters:{'C': 1, 'gamma': 0.01} with a score of 0.788


### 使用决策树运行五折交叉验证，网格搜索获取最后参数
决策树的模型一般只需要调节最大深度即可


In [50]:
grid_dt = GridSearchCV(DecisionTreeClassifier(),
                      param_grid={'max_depth':[i for i in range(1, 10)]},
                      cv=5)
grid_dt.fit(x_train_scale, y_train)


print('Best parameters:{} with a score of {:.3f}'.format(grid_dt.best_params_, grid_dt.best_score_))

Best parameters:{'max_depth': 5} with a score of 0.763


### 使用随机森林运行五折交叉验证，网格搜索获取最后参数
[随机森林各参数含义以及调参实例传送门](https://www.cnblogs.com/pinard/p/6160412.html)

**这里先采用了分四步的搜索方法**

In [64]:
# 1、先对步长和迭代次数进行调参
param_test_1 = {'n_estimators': range(10, 71, 10)}
gs_rf_step_1 = GridSearchCV(estimator=RandomForestClassifier(min_samples_split=100,
                                                           min_samples_leaf=20,
                                                           max_depth=8,
                                                           max_features='sqrt',
                                                           random_state=10),
                          param_grid=param_test_1,
                          scoring='roc_auc',
                          cv=5)
gs_rf_step_1.fit(x_train_scale, y_train)
print('Best parameters:{} with a score of {:.3f}'.format(gs_rf_step_1.best_params_, gs_rf_step_1.best_score_))

Best parameters:{'n_estimators': 60} with a score of 0.786


In [80]:
# 2、对其弱分类器决策树进行调参
param_test_2 = {'max_depth':range(3, 14, 2), 'min_samples_split': range(10, 201, 10)}
gs_rf_step_2 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=60,
                                                            min_samples_leaf=20,
                                                            max_features='sqrt',
                                                            oob_score=True, 
                                                            random_state=10),
                           param_grid=param_test_2,
                           scoring='roc_auc',
                           iid=False,
                           cv=5)
gs_rf_step_2.fit(x_train_scale, y_train)

print('Best parameters:{} with a score of {:.3f}'.format(gs_rf_step_2.best_params_, gs_rf_step_2.best_score_))

Best parameters:{'max_depth': 11, 'min_samples_split': 10} with a score of 0.788


In [74]:
# 3、对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参
param_test_3 = {'min_samples_leaf':range(10, 60, 10)}
gs_rf_step_3 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=60, 
                                                            max_depth=9,
                                                            min_samples_split=110,
                                                            max_features='sqrt',
                                                            oob_score=True,
                                                            random_state=10),
                           param_grid=param_test_3,
                           scoring='roc_auc', 
                           iid=False,
                           cv=5)
gs_rf_step_3.fit(x_train_scale, y_train)
print('Best parameters:{} with a score of {:.3f}'.format(gs_rf_step_3.best_params_, gs_rf_step_3.best_score_))

Best parameters:{'min_samples_leaf': 20} with a score of 0.787


In [81]:
# 4、最后我们再对最大特征数max_features做调参:
param_test_4 = {'max_features':(3,11,2)}
gs_rf_step_4 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=60,
                                                            max_depth=9,
                                                            min_samples_leaf=20,
                                                            min_samples_split=10,
                                                            oob_score=True,
                                                            random_state=10),
                           param_grid=param_test_4,
                           scoring='roc_auc',
                           iid=False,
                           cv=5)
gs_rf_step_4.fit(x_train_scale, y_train)
print('Best parameters:{} with a score of {:.3f}'.format(gs_rf_step_4.best_params_, gs_rf_step_4.best_score_))
                                                             

Best parameters:{'max_features': 11} with a score of 0.787


**模型融合**

In [3]:
# 模型融合
model_list = [Lr_cv, svc_cv, dt_cv,  rf_cv, xgb_cv]
S_train, S_test = stacking(model_list, x_train, y_train,  x_test, regression=False, n_folds=5)

# # Initialize 2-nd level model
model = GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 100, max_depth = 3)

# # Fit 2-nd level model
model_s = model.fit(S_train, y_train)

# # Predict
y_pred = model_s.predict(S_test)

# Final prediction score
# print('Final prediction score: [%.8f]' % metrics.accuracy_score(y_test, y_pred))
acc_score_test = metrics.accuracy_score(y_pred, y_test)
precision_score_test = metrics.precision_score(y_pred, y_test)
recall_score_test = metrics.recall_score(y_pred, y_test)
f1_score_test = metrics.f1_score(y_pred, y_test)
roc_auc_score_test = metrics.roc_auc_score(y_pred, y_test)

print('Final 测试集准确率：{}\n'.format(acc_score_test))
print('Final 测试集精确率：{}\n'.format(precision_score_test))
print('Final 测试集召回率：{}\n'.format(recall_score_test))
print('Final 测试集f1评分：{}\n'.format(f1_score_test))
print('Final 测试集AUC值：{}\n'.format(roc_auc_score_test))
        
        

# model_est(model_dict_s, x_train, x_test, y_train, y_test) 



NameError: name 'Lr_cv' is not defined

In [4]:

from mlxtend.classifier import StackingClassifier
sclf = StackingClassifier(classifiers=[Lr_cv, dt_cv, rf_cv, xgb_cv, svc_cv],
                          meta_classifier=xgb_cv)
model_dict_mlxtend = {'融合模型':sclf}
model_est(model_dict_mlxtend, x_train, x_test, y_train, y_test)


ModuleNotFoundError: No module named 'mlxtend'