# **授权**

In [0]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# **指定文件目录**

In [2]:
!mkdir -p drive
!google-drive-ocamlfuse drive
import os

os.chdir("drive/Colab Notebooks") 
!ls

 “com2018.ipynb”的副本	'“one week.ipynb”的副本'
 data_all.csv		 网格搜索调参.ipynb


# **安装必要的包和库**

In [0]:
!pip install joblib
!pip install pandas
!pip install gbdt
!pip install xgboost
!pip install lightgbm
!pip install imbalanced-learn
!pip install -U git+https://github.com/scikit-learn-contrib/imbalanced-learn.git


# **导入相关库**

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn import metrics
from sklearn.ensemble.forest import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



# **读取数据并拆分数据集**

In [0]:
data_all = pd.read_csv('./data_all.csv')
Y = data_all['status']
X = data_all.drop('status', axis=1)

# 按7:3拆分成training和testing集，随机种子为2018。随机种子的目的是使每次拆分的结果一致
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=2018)

# **数据初探**

In [0]:
X_train.info() # 查看数据信息以及是否有缺失值

# 查看label的分布，存在数据不平衡问题的。直接初步建模也验证了这一现象。
plt.hist(Y_train, bins=3)
plt.show()
plt.hist(Y_test, bins=3)
plt.show()

# **数据不平衡处理**

In [7]:
over_samples = SMOTE(random_state=2018) 
over_samples_X,over_samples_y = over_samples.fit_sample(X_train, Y_train)

# 重抽样前的类别比例
print(Y_train.value_counts()/len(Y_train))
# 重抽样后的类别比例
print(pd.Series(over_samples_y).value_counts()/len(over_samples_y))

0    0.749324
1    0.250676
Name: status, dtype: float64
1    0.5
0    0.5
dtype: float64


# **建模：Logistic回归**

1.这里选择使用pipeline函数将模型种类和标准化操作封装在一起。  
2.Logis调参选择的参数是：C, penalty, solver, max_iter这四个。参数选项封装在param_grid里。  
3.网格搜索评分选择了accuracy，后续尝试使用roc_auc  

In [24]:
pipe_lr = Pipeline([('scl',StandardScaler()),
                 ('clf',LogisticRegression(random_state =2018))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{
    'clf__C': param_range,
    'clf__penalty':['l1','l2'],
    'clf__solver':['liblinear'],
    'clf__max_iter':[10,50,100,150,200,300]}]

gs = GridSearchCV(estimator = pipe_lr,
                 param_grid = param_grid,
                 scoring = 'roc_auc',
                 cv = 5,
                 n_jobs = -1)
gs = gs.fit(over_samples_X, over_samples_y)

print(gs.best_score_)
print(gs.best_params_)

# 测试在testing上的效果
clf = gs.best_estimator_
clf.fit(over_samples_X,over_samples_y)

print('测试集AUC:',metrics.roc_auc_score(Y_test,clf.predict_proba(X_test)[:, 1]))

# 运行时间: 228秒

0.8459936463961129
{'clf__C': 1.0, 'clf__max_iter': 50, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
测试集AUC: 0.7673338341001327


  Xt = transform.transform(Xt)


# **建模: Decision Tree**

**Decision Tree选择调参的参数为:  **  



参数名称 | 解释
------ | ------
criterion                    | 特征选择标准   
splitter                     | 特征划分点选择标准   
max_features          |划分时考虑的最大特征数   
max_depth               |决策树最大深   

In [29]:
pipe_dt = Pipeline([('scl',StandardScaler()),
                 ('clf',tree.DecisionTreeClassifier(random_state =2018))])

param_grid = [{
    'clf__criterion': ['gini','entropy'],
    'clf__splitter':['best','random'],
    'clf__max_features':['auto','sqrt','log2'],
    'clf__max_depth':[1,2,3,4,5,10,20,30,40,50,60,70,80,90,100]}]

gs = GridSearchCV(estimator = pipe_dt,
                 param_grid = param_grid,
                 scoring = 'roc_auc',
                 cv = 5,
                 n_jobs = -1)
gs = gs.fit(over_samples_X, over_samples_y)

print(gs.best_score_)
print(gs.best_params_)

# 测试在testing上的效果
clf = gs.best_estimator_
clf.fit(over_samples_X,over_samples_y)

print('测试集AUC:',metrics.roc_auc_score(Y_test,clf.predict_proba(X_test)[:, 1]))

# 运行时间: 22秒

0.7864487498864556
{'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__max_features': 'auto', 'clf__splitter': 'best'}
测试集AUC: 0.6884174204250258


  Xt = transform.transform(Xt)


# 建模：SVM支持向量机

**SVM选择以下参数进行调参**:  
  
  参数名称 | 解释
------ | ------
 C                 |惩罚系数  
 kernel         |选择核类型  
 gamma     |选择RBF函数作为kernel后，该函数自带的一个参数  
 max_iter   |最大迭代次数

In [0]:
pipe_dt = Pipeline([('scl',StandardScaler()),
                 ('clf',svm.SVC(random_state =2018))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{
    'clf__C': param_range,
    'clf__kernel':['linear','poly','rbf','sigmoid'],
    'clf__gamma': [1, 0.01, 0.001, 0.0001],
    'clf__max_iter':[10,50,100,150,200,300]}]

gs = GridSearchCV(estimator = pipe_dt,
                 param_grid = param_grid,
                 scoring = 'roc_auc',
                 cv = 5,
                 n_jobs = -1)
gs = gs.fit(over_samples_X, over_samples_y)

print(gs.best_score_)
print(gs.best_params_)

# 测试在testing上的效果
clf = gs.best_estimator_
clf.fit(over_samples_X,over_samples_y)

print('测试集AUC:',metrics.roc_auc_score(Y_test,clf.decision_function(X_test)))

# 运行时间：640秒

# 0.8124838738897966
# {'clf__C': 0.1, 'clf__gamma': 1, 'clf__kernel': 'rbf', 'clf__max_iter': 300}
# 测试集AUC: 0.5653435468895079


# **建模: Random forest**

Random forest和决策树的参数类似，不再说明

In [35]:
pipe_rf = Pipeline([('scl',StandardScaler()),
                 ('clf',RandomForestClassifier(random_state =2018))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{
    'clf__criterion': ['gini','entropy'],
    'clf__n_estimators':[5,10,15,20],
    'clf__max_features':['auto','sqrt','log2'],
    'clf__max_depth':[1,2,3,4,5,10,20,30,40,50,60,70,80,90,100]}]

gs = GridSearchCV(estimator = pipe_rf,
                 param_grid = param_grid,
                 scoring = 'roc_auc',
                 cv = 5,
                 n_jobs = -1)
gs = gs.fit(over_samples_X, over_samples_y)

print(gs.best_score_)
print(gs.best_params_)

# 测试在testing上的效果
clf = gs.best_estimator_
clf.fit(over_samples_X,over_samples_y)

print('测试集AUC:',metrics.roc_auc_score(Y_test,clf.predict_proba(X_test)[:, 1]))

# 运行时间：313秒

0.9329562158838433
{'clf__criterion': 'entropy', 'clf__max_depth': 30, 'clf__max_features': 'auto', 'clf__n_estimators': 20}
测试集AUC: 0.7577684057880296


  Xt = transform.transform(Xt)


# **建模: GBDT**

**GBDT选择调参的参数为:  **  



参数名称 | 解释
------ | ------
n_estimators                    | 子树的数量   
loss                     | 损失函数
learning_rate | 即每个弱学习器的权重缩减系数,也称作步长。取值范围(0,1]
max_features          |划分时考虑的最大特征数   
max_depth               |决策树最大深   

In [0]:
pipe_gbdt = Pipeline([('scl',StandardScaler()),
                 ('clf',GradientBoostingClassifier(random_state =2018))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{
    'clf__n_estimators':[5,10,15,20,30,40,50,100,200],
    'clf__max_features':['auto','sqrt','log2'],
    'clf__max_depth':[1,2,3,4,5,10,20,30,40,50,60,70,80,90,100],
    'clf__loss':['deviance','exponential'],
    'clf__learning_rate':[0.001,0.01,0.1,0.2,0.3,0.4,0.5]}]

gs = GridSearchCV(estimator = pipe_gbdt,
                 param_grid = param_grid,
                 scoring = 'roc_auc',
                 cv = 5,
                 n_jobs = -1)
gs = gs.fit(over_samples_X, over_samples_y)

print(gs.best_score_)
print(gs.best_params_)

# 测试在testing上的效果
clf = gs.best_estimator_
clf.fit(over_samples_X,over_samples_y)

print('测试集AUC:',metrics.roc_auc_score(Y_test,clf.predict_proba(X_test)[:, 1]))

# 运行时间：


# **建模: XGBoost**

XGBoost参数和树的类似。详见：  
https://blog.csdn.net/han_xiaoyang/article/details/52665396

In [0]:
pipe_xgb = Pipeline([('scl',StandardScaler()),
                 ('clf',xgb.XGBClassifier(random_state =2018))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{
    'clf__n_estimators':[5,10,15,20,30,40,50,100,200],
    'clf__max_depth':[1,2,3,4,5,10,20,30,40,50,60,70,80,90,100],
    'clf__gamma':[1, 0.01, 0.001, 0.0001],
    'clf__learning_rate':[0.01,0.1,0.2,0.3,0.4,0.5]}]

gs = GridSearchCV(estimator = pipe_xgb,
                 param_grid = param_grid,
                 scoring = 'roc_auc',
                 cv = 5,
                 n_jobs = -1)
gs = gs.fit(over_samples_X, over_samples_y)

print(gs.best_score_)
print(gs.best_params_)

# 测试在testing上的效果
clf = gs.best_estimator_
clf.fit(over_samples_X,over_samples_y)

print('测试集AUC:',metrics.roc_auc_score(Y_test,clf.predict_proba(X_test)[:, 1]))

# 运行时间：



# **建模: LightGBM**

详见：https://www.cnblogs.com/bjwu/p/9307344.html

In [0]:
pipe_lgbm = Pipeline([('scl',StandardScaler()),
                 ('clf',LGBMClassifier(random_state =2018))])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{
    'clf__n_estimators':[5,10,15,20,30,40,50,100,200],
    'clf__feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'clf__bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'clf__max_depth':[1,2,3,4,5,10,20,30,40,50,60,70,80,90,100],
    'clf__reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    'clf__reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    'clf__learning_rate':[0.01,0.1,0.2,0.3,0.4,0.5]}]

gs = GridSearchCV(estimator = pipe_lgbm,
                 param_grid = param_grid,
                 scoring = 'roc_auc',
                 cv = 5,
                 n_jobs = -1)
gs = gs.fit(over_samples_X, over_samples_y)

print(gs.best_score_)
print(gs.best_params_)

# 测试在testing上的效果
clf = gs.best_estimator_
clf.fit(over_samples_X,over_samples_y)

print('测试集AUC:',metrics.roc_auc_score(Y_test,clf.predict_proba(X_test)[:, 1]))

# 运行时间：






# **总结及问题**

**总结：**  
1. 之前只是直接建模，参数都是默认值。本次调参任务开始接触每个模型的具体参数。参数略多，需要好好消化吸收。  
2. 调参工作有两个重点：①必须了解模型涉及到哪些参数以及每个参数的含义；②还需要实践经验。每个模型的参数在不同的应用场合需要怎么调，就要靠经验积累了。


**问题： ** 
  
1. 代码重复量依旧存在，后面的查找最佳参数和测试集结果都可以封装成一个函数。待后续解决。
2. GridSearchCV中的scoring参数，目前调用的是roc_auc。本来是想用准确度和auc一起看的，但是代码报错？
3. 运行code的结果老是会有一些类似地址链接的东西跳出来。。。。。有点讨厌的。。。。
