# **授权**

In [0]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# **指定文件目录**

In [3]:
!mkdir -p drive
!google-drive-ocamlfuse drive
import os

os.chdir("drive/Colab Notebooks") 
!ls

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option
“com2018.ipynb”的副本  data_all.csv


# **安装必要的包和库**

In [0]:
!pip install joblib
!pip install pandas
!pip install gbdt
!pip install xgboost
!pip install lightgbm
!pip install imbalanced-learn
!pip install -U git+https://github.com/scikit-learn-contrib/imbalanced-learn.git


# **导入相关库**

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble.forest import RandomForestClassifier

# **读取数据并拆分数据集**

In [0]:
data_all = pd.read_csv('./data_all.csv')
Y = data_all['status']
X = data_all.drop('status', axis=1)

# 按7:3拆分成training和testing集，随机种子为2018。随机种子的目的是使每次拆分的结果一致
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=2018)

# **数据初探**

In [0]:
X_train.info() # 查看数据信息以及是否有缺失值

# 查看label的分布，存在数据不平衡问题的。直接初步建模也验证了这一现象。
plt.hist(Y_train, bins=3)
plt.show()
plt.hist(Y_test, bins=3)
plt.show()

# **数据不平衡处理**

In [5]:
over_samples = SMOTE(random_state=2018) 
over_samples_X,over_samples_y = over_samples.fit_sample(X_train, Y_train)

# 重抽样前的类别比例
print(Y_train.value_counts()/len(Y_train))
# 重抽样后的类别比例
print(pd.Series(over_samples_y).value_counts()/len(over_samples_y))

0    0.749324
1    0.250676
Name: status, dtype: float64
1    0.5
0    0.5
dtype: float64


# **建模: Random forest**

In [13]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(over_samples_X, over_samples_y)

predict_Y = clf.predict(X_test)
predict_prob_y = clf.predict_proba(X_test)[:, 1]

print(clf.feature_importances_)

acc = metrics.accuracy_score(Y_test,predict_Y)
auc = metrics.roc_auc_score(Y_test,predict_prob_y)
print(acc,auc)
print(metrics.classification_report(Y_test, predict_Y))

# acc:0.7105816398037842
# auc:0.7388266407937154

[0.02393462 0.         0.00535079 0.         0.0005763  0.
 0.00056952 0.00127636 0.00679375 0.00423868 0.         0.
 0.         0.         0.00589873 0.00032809 0.00957432 0.00545136
 0.01063993 0.         0.01721928 0.         0.01166582 0.0011234
 0.         0.00148329 0.00140192 0.         0.16355128 0.06671431
 0.08480424 0.         0.00870784 0.00171416 0.         0.
 0.00298948 0.         0.         0.04389177 0.04447799 0.
 0.         0.         0.00656209 0.         0.         0.
 0.         0.05825357 0.00095936 0.         0.         0.09954234
 0.         0.00148645 0.         0.01169925 0.00334048 0.00039399
 0.         0.12451515 0.01315588 0.11010383 0.00182046 0.
 0.         0.00055327 0.00478948 0.00221861 0.         0.00274299
 0.         0.         0.         0.00275503 0.00438796 0.00390442
 0.00380723 0.         0.01126606 0.         0.00336684 0.00399803]
0.7105816398037842 0.7388266407937154
              precision    recall  f1-score   support

           0     

# **建模: GBDT**

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100,
                                 random_state=2018)

clf.fit(over_samples_X, over_samples_y)

predict_Y = clf.predict(X_test)
predict_prob_y = clf.predict_proba(X_test)[:, 1]

acc = metrics.accuracy_score(Y_test,predict_Y)
auc = metrics.roc_auc_score(Y_test,predict_prob_y)
print(acc,auc)
print(metrics.classification_report(Y_test, predict_Y))

# acc:0.7792571829011913
# auc:0.7773987251311905

0.7792571829011913 0.7773987251311905
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1068
           1       0.57      0.47      0.52       359

   micro avg       0.78      0.78      0.78      1427
   macro avg       0.70      0.68      0.69      1427
weighted avg       0.77      0.78      0.77      1427



# **建模: XGBoost**

In [41]:
import xgboost as xgb

clf = xgb.XGBClassifier(silent=1, max_depth=10, n_estimators=1000, learning_rate=0.05)
clf.fit(X_train, Y_train)

predict_Y = clf.predict(X_test)
predict_prob_y = clf.predict_proba(X_test)[:, 1]

acc = metrics.accuracy_score(Y_test,predict_Y)
auc = metrics.roc_auc_score(Y_test,predict_prob_y)
print(acc,auc)
print(metrics.classification_report(Y_test, predict_Y))

# acc: 0.7659425367904695
# auc: 0.7532888902799078

0.7659425367904695 0.7532888902799078
              precision    recall  f1-score   support

           0       0.80      0.91      0.85      1068
           1       0.56      0.34      0.42       359

   micro avg       0.77      0.77      0.77      1427
   macro avg       0.68      0.63      0.64      1427
weighted avg       0.74      0.77      0.75      1427



# **建模: LightGBM**

In [37]:
import lightgbm as lgb

# 转换数据格式
lgb_train = lgb.Dataset(over_samples_X, over_samples_y)
lgb_eval = lgb.Dataset(X_test, Y_test, reference=lgb_train)

# 将参数写成字典下形式
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'regression', # 目标函数
    'metric': {'12', 'auc'},  # 评估函数
    'num_leaves': 31,   # 叶子节点数
    'learning_rate': 0.05,  # 学习速率
    'feature_fraction': 0.9, # 建树的特征选择比例
    'bagging_fraction': 0.8, # 建树的样本采样比例
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': 1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}


# 训练 cv and train
gbm = lgb.train(params,lgb_train,num_boost_round=20,valid_sets=lgb_eval,early_stopping_rounds=5)

# auc: 0.765976

[1]	valid_0's auc: 0.710716	valid_0's l2: 0.242345
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.727982	valid_0's l2: 0.235519
[3]	valid_0's auc: 0.737203	valid_0's l2: 0.22903
[4]	valid_0's auc: 0.735461	valid_0's l2: 0.223617
[5]	valid_0's auc: 0.738902	valid_0's l2: 0.218251
[6]	valid_0's auc: 0.747436	valid_0's l2: 0.212814
[7]	valid_0's auc: 0.750074	valid_0's l2: 0.208256
[8]	valid_0's auc: 0.749867	valid_0's l2: 0.204482
[9]	valid_0's auc: 0.755952	valid_0's l2: 0.200672
[10]	valid_0's auc: 0.75718	valid_0's l2: 0.197129
[11]	valid_0's auc: 0.760626	valid_0's l2: 0.193982
[12]	valid_0's auc: 0.761257	valid_0's l2: 0.19141
[13]	valid_0's auc: 0.760242	valid_0's l2: 0.18931
[14]	valid_0's auc: 0.764368	valid_0's l2: 0.186509
[15]	valid_0's auc: 0.764041	valid_0's l2: 0.184564
[16]	valid_0's auc: 0.76417	valid_0's l2: 0.182607
[17]	valid_0's auc: 0.76335	valid_0's l2: 0.180954
[18]	valid_0's auc: 0.765976	valid_0's l2: 0.178948
[19]	valid_0's auc

# **总结及问题**

总结：
  
1.优先使用最简单的参数，使模型能够跑出结果，然后再进行调参改进等。
   花太多时间想要得到一个完美的结果，参数解释看了半天，结果跑不出结果。  
2.目前结果看来，GBDT的效果最佳，AUC可以达到0.77，但是依然不够理想。

3.写代码的时候多思考下代码可循坏利用的地方。比如讲所有建模都以clf来命名，后续的预测、计算acc和auc完全可以使用一模一样的代码。  
  
问题：
  
1.由于时间原因，尚未进行调参工作，待后续。  
2.使用smote方法生成的训练集，丢失了属性名，导致在XGBoost无法识别，目前 是依然使用X_train。
之后可以尝试下给oversample后的数据加上标签然后进行。
