# 数据挖掘实践任务

<b>任务描述</b>

- 模型融合方式任意，并结合Task5给出你的最优结果
- 例如Stacking融合，用你目前评分最高的模型作为基准模型，和其他模型进行stacking融合，得到最终模型及评分结果

In [40]:
from sklearn.model_selection import train_test_split 
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,roc_curve
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier

In [2]:
# 获取Task2进行特征选择后的数据集
import pandas as pd
dataset = pd.read_csv('./dataset/task2_proc.csv')
features = dataset.iloc[:,:-1]
labels = dataset.iloc[:,-1]
print('feature shape:{}, label shape:{}'.format(features.shape,labels.shape))

random_state = 2018
# 拆分数据集
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size=0.3, random_state=random_state)
x_sub1,x_sub2,y_sub1,y_sub2 = train_test_split(x_train,y_train,test_size = 0.5,random_state=random_state)
print(len(x_sub1),len(x_sub2),len(x_test))

feature shape:(4455, 49), label shape:(4455,)
1559 1559 1337


In [3]:
# get etimators
def getEstimators(file):
    estimator = None
    with open(file,'rb') as pf:
        try:
            estimator = pickle.load(pf)
        except:
            print('getEstimators error')
    return estimator

In [38]:
import os
root = './model_saved/'
estimators = []
for file in os.listdir(root):
    file_path = os.path.join(root,file)
    print(file_path)
    estimators.append(getEstimators(file_path))
estimators

./model_saved/DecisionTree.pickle
./model_saved/logistic.pickle
./model_saved/RandomForest.pickle
./model_saved/svc.pickle
./model_saved/xgboost.pickle


[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=0.1, min_samples_split=0.1,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 LogisticRegression(C=0.49304878062589097, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=25, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
  

# 使用Stacking进行模型融合

In [51]:
x_sub2_predict = np.empty((len(x_sub1),len(estimators)),dtype=np.float32)
x_test_predict = np.empty((len(x_test),len(estimators)),dtype=np.float32)
for i,estimator in enumerate(estimators):
    estimator.fit(x_sub1,y_sub1)
    x_sub2_predict[:,i] = estimator.predict(x_sub2)
    x_test_predict[:,i] = estimator.predict(x_test)    
    
#rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender = XGBClassifier(learning_rate=0.05, max_depth=3)
rnd_forest_blender.fit(x_sub2_predict, y_sub2)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [52]:
# 使用测试集进行测试
y_pred = rnd_forest_blender.predict(x_test_predict)
y_pred_scores = cross_val_predict(rnd_forest_blender,x_test,y_test,cv=5,
                                    method='predict_proba')
accuracy = accuracy_score(y_test,y_pred)
print('Stacking',accuracy)
for estimator in estimators:
    print(estimator.__class__.__name__,estimator.score(x_test, y_test))
    
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
roc_auc = roc_auc_score(y_test,y_pred_scores[:,1])
print('accuracy:{:.3f},precision:{:.3f},recall:{:.3f},f1:{:.3f},roc_auc:{:.3f}'.format(accuracy,precision,recall,f1,roc_auc))

Stacking 0.7958115183246073
DecisionTreeClassifier 0.74943904263276
LogisticRegression 0.7569184741959611
RandomForestClassifier 0.7980553477935677
SVC 0.7576664173522812
XGBClassifier 0.7883320867614061
accuracy:0.796,precision:0.738,recall:0.244,f1:0.367,roc_auc:0.788


# 使用投票分类器进行模型拟合

In [39]:
named_estimators = [naned_estimator for naned_estimator in zip(['dt','lr','rf','svc','xgb'],estimators)]
voting_clf = VotingClassifier(estimators=named_estimators,voting='hard')
new_estimators = estimators.copy()
new_estimators.append(voting_clf)
for clf in new_estimators:
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

DecisionTreeClassifier 0.7786088257292446
LogisticRegression 0.7569184741959611
RandomForestClassifier 0.7928197456993269




SVC 0.7576664173522812
XGBClassifier 0.8010471204188482




VotingClassifier 0.7808526551982049


# 总结

使用Stacking进行模型融合，最后结果：accuracy:0.796,precision:0.738,recall:0.244,f1:0.367,roc_auc:0.788

后续需要对特征衍生、特征选择和模型调参部分进行研究以优化最后模型