<a href="https://colab.research.google.com/github/mailguest/ML-000/blob/main/Week16/stackingml_and_deepml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load data and data preprocessing

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [5]:
from google.colab import drive
drive.mount('/content/drive')

seed = 42 # for the same data division

kf = KFold(n_splits=5, random_state=seed,shuffle=True)
df_train = pd.read_csv('/content/drive/MyDrive/final/train_final.csv')
df_test = pd.read_csv('/content/drive/MyDrive/final/test_final.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
df_train.shape, df_test.shape

((50000, 146), (50000, 146))

In [8]:
train_clean = df_train.copy()
test_clean = df_test.copy()

train_clean.fillna(0,inplace=True)
test_clean.fillna(0,inplace=True)

X_train = train_clean.drop(columns=['loan_status']).values
Y_train = train_clean['loan_status'].values.astype(int)
X_test = test_clean.drop(columns=['loan_status']).values
Y_test = test_clean['loan_status'].values.astype(int)

In [38]:
# split data for five fold
def split_data_for_five_fold(X, Y):
  fold_data = []
  for train_index, eval_index in kf.split(X):
    x_train, x_eval = X[train_index], X[eval_index]
    y_train, y_eval = Y[train_index], Y[eval_index]
    fold_data.append([(x_train, y_train), (x_eval, y_eval)])
  return fold_data

five_fold_data = split_data_for_five_fold(X_train, Y_train)

# Algorithm

In [39]:
def get_model(param, fold_data):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

In [25]:
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}

# num_leaves: 128 -> 168
# num_round: 100 -> 250
# learning_rate: 3e-3 -> 0.0093
# feature_fraction: 0.6 -> 0.84
# bagging_fraction: 0.8 -> 0.972（加大叶子树与随机选取，调节学习率）
param_fine_tuning = {'num_thread': 8,'num_leaves': 168, 'metric': 'binary', 'objective': 'binary', 'num_round': 2500,
                     'learning_rate': 0.0093, 'feature_fraction': 0.84, 'bagging_fraction': 0.972}

In [26]:
# base param train
param_base_model = get_model(param_base, five_fold_data)

# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning, five_fold_data)

0-th model is training:




[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
[2]	valid_0's binary_logloss: 0.494179
[3]	valid_0's binary_logloss: 0.488595
[4]	valid_0's binary_logloss: 0.483207
[5]	valid_0's binary_logloss: 0.477965
[6]	valid_0's binary_logloss: 0.472893
[7]	valid_0's binary_logloss: 0.467948
[8]	valid_0's binary_logloss: 0.463149
[9]	valid_0's binary_logloss: 0.458447
[10]	valid_0's binary_logloss: 0.453916
[11]	valid_0's binary_logloss: 0.449484
[12]	valid_0's binary_logloss: 0.445178
[13]	valid_0's binary_logloss: 0.44097
[14]	valid_0's binary_logloss: 0.436889
[15]	valid_0's binary_logloss: 0.432906
[16]	valid_0's binary_logloss: 0.429019
[17]	valid_0's binary_logloss: 0.425196
[18]	valid_0's binary_logloss: 0.421483
[19]	valid_0's binary_logloss: 0.417836
[20]	valid_0's binary_logloss: 0.414271
[21]	valid_0's binary_logloss: 0.410811
[22]	valid_0's binary_logloss: 0.407386
[23]	valid_0's binary_logloss: 0.404069
[24]	valid_0's binary_logloss: 0.4008
[25]	valid_0's binary_logloss: 0.39761
[26]	valid_

# Test

In [40]:
def test_model(model_list, X, Y):
    data = X
    five_fold_pred = np.zeros((5, len(X)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y)

In [28]:
base_score = test_model(param_base_model, X_test, Y_test)
fine_tuning_score = test_model(param_fine_tuning_model, X_test, Y_test)

In [29]:
print(f'base: {base_score}, fine tuning: {fine_tuning_score}')

base: 0.91626, fine tuning: 0.91702


# 参数筛选的随机性

In [None]:
from sklearn.model_selection import GridSearchCV

bagging_and_feature_params_test={
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
}

model_lgb = lgb.LGBMRegressor(objective='binary', num_leaves=168,
                              learning_rate=0.1, metric='binary', 
                              num_round=2500, num_thread=8)
grid_search = GridSearchCV(estimator=model_lgb, 
                 param_grid=bagging_and_feature_params_test, 
                 scoring='neg_mean_squared_error', 
                 cv=5, verbose=1, n_jobs=4)

grid_search.fit(X_train, Y_train)
grid_search.score(X_test, Y_test)
grid_search.grid_scores_, grid_search.best_params_, grid_search.best_score_

# 构建衍生变量

In [41]:
new_train = df_train.copy()
new_test = df_test.copy()

In [42]:
new_train['new_feature1'] = (new_train['discrete_term_1_one_hot'] + new_train['discrete_term_2_one_hot'])
new_train['new_feature2'] = (new_train['discrete_purpose_1_one_hot'] + new_train['discrete_purpose_5_one_hot'])

new_test['new_feature1'] = (new_test['discrete_term_1_one_hot'] + new_test['discrete_term_2_one_hot'])
new_test['new_feature2'] = (new_test['discrete_purpose_1_one_hot'] + new_test['discrete_purpose_5_one_hot'])

In [43]:
new_train.head(5)

Unnamed: 0,continuous_annual_inc,continuous_annual_inc_joint,continuous_delinq_2yrs,continuous_dti,continuous_dti_joint,continuous_fico_range_high,continuous_fico_range_low,continuous_funded_amnt,continuous_funded_amnt_inv,continuous_inq_last_6mths,continuous_installment,continuous_int_rate,continuous_last_fico_range_high,continuous_last_fico_range_low,continuous_loan_amnt,loan_status,continuous_mths_since_last_delinq,continuous_mths_since_last_major_derog,continuous_mths_since_last_record,continuous_open_acc,continuous_pub_rec,discrete_addr_state_1_one_hot,discrete_addr_state_2_one_hot,discrete_addr_state_3_one_hot,discrete_addr_state_4_one_hot,discrete_addr_state_5_one_hot,discrete_addr_state_6_one_hot,discrete_addr_state_7_one_hot,discrete_addr_state_8_one_hot,discrete_addr_state_9_one_hot,discrete_addr_state_10_one_hot,discrete_addr_state_11_one_hot,discrete_addr_state_12_one_hot,discrete_addr_state_13_one_hot,discrete_addr_state_14_one_hot,discrete_addr_state_15_one_hot,discrete_addr_state_16_one_hot,discrete_addr_state_17_one_hot,discrete_addr_state_18_one_hot,discrete_addr_state_19_one_hot,...,discrete_pymnt_plan_1_one_hot,discrete_sub_grade_1_one_hot,discrete_sub_grade_2_one_hot,discrete_sub_grade_3_one_hot,discrete_sub_grade_4_one_hot,discrete_sub_grade_5_one_hot,discrete_sub_grade_6_one_hot,discrete_sub_grade_7_one_hot,discrete_sub_grade_8_one_hot,discrete_sub_grade_9_one_hot,discrete_sub_grade_10_one_hot,discrete_sub_grade_11_one_hot,discrete_sub_grade_12_one_hot,discrete_sub_grade_13_one_hot,discrete_sub_grade_14_one_hot,discrete_sub_grade_15_one_hot,discrete_sub_grade_16_one_hot,discrete_sub_grade_17_one_hot,discrete_sub_grade_18_one_hot,discrete_sub_grade_19_one_hot,discrete_sub_grade_20_one_hot,discrete_sub_grade_21_one_hot,discrete_sub_grade_22_one_hot,discrete_sub_grade_23_one_hot,discrete_sub_grade_24_one_hot,discrete_sub_grade_25_one_hot,discrete_sub_grade_26_one_hot,discrete_sub_grade_27_one_hot,discrete_sub_grade_28_one_hot,discrete_sub_grade_29_one_hot,discrete_sub_grade_30_one_hot,discrete_sub_grade_31_one_hot,discrete_sub_grade_32_one_hot,discrete_sub_grade_33_one_hot,discrete_sub_grade_34_one_hot,discrete_sub_grade_35_one_hot,discrete_term_1_one_hot,discrete_term_2_one_hot,new_feature1,new_feature2
0,55000.0,,0.0,5.91,,679.0,675.0,3600.0,3600.0,1.0,123.03,13.99,564.0,560.0,3600.0,1,30.0,30.0,,7.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
1,65000.0,,1.0,16.06,,719.0,715.0,24700.0,24700.0,4.0,820.28,11.99,699.0,695.0,24700.0,1,6.0,,,22.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,63000.0,71000.0,0.0,10.78,13.85,699.0,695.0,20000.0,20000.0,0.0,432.66,10.78,704.0,700.0,20000.0,1,,,,6.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,104433.0,,1.0,25.37,,699.0,695.0,10400.0,10400.0,3.0,289.91,22.45,704.0,700.0,10400.0,1,12.0,,,12.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
4,34000.0,,0.0,10.2,,694.0,690.0,11950.0,11950.0,0.0,405.18,13.44,759.0,755.0,11950.0,1,,,,5.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1


In [44]:
X_new_train = new_train.drop(columns=['loan_status']).values
Y_new_train = new_train['loan_status'].values.astype(int)

X_new_test = new_test.drop(columns=['loan_status']).values
Y_new_test = new_test['loan_status'].values.astype(int)

In [45]:
print(X_new_train)

[[5.50e+04      nan 0.00e+00 ... 0.00e+00 1.00e+00 1.00e+00]
 [6.50e+04      nan 1.00e+00 ... 0.00e+00 1.00e+00 0.00e+00]
 [6.30e+04 7.10e+04 0.00e+00 ... 1.00e+00 1.00e+00 0.00e+00]
 ...
 [5.10e+04      nan 0.00e+00 ... 0.00e+00 1.00e+00 0.00e+00]
 [4.55e+05      nan 0.00e+00 ... 0.00e+00 1.00e+00 1.00e+00]
 [5.80e+04      nan 0.00e+00 ... 0.00e+00 1.00e+00 1.00e+00]]


In [46]:
new_five_fold_data = split_data_for_five_fold(X_new_train, Y_new_train)

In [47]:
print(new_five_fold_data)

[[(array([[5.50e+04,      nan, 0.00e+00, ..., 0.00e+00, 1.00e+00, 1.00e+00],
       [6.50e+04,      nan, 1.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [6.30e+04, 7.10e+04, 0.00e+00, ..., 1.00e+00, 1.00e+00, 0.00e+00],
       ...,
       [5.10e+04,      nan, 0.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [4.55e+05,      nan, 0.00e+00, ..., 0.00e+00, 1.00e+00, 1.00e+00],
       [5.80e+04,      nan, 0.00e+00, ..., 0.00e+00, 1.00e+00, 1.00e+00]]), array([1, 1, 1, ..., 1, 1, 1])), (array([[3.4000e+04,        nan, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [8.5000e+04,        nan, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [8.5000e+04,        nan, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       ...,
       [1.0000e+05,        nan, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [1.2075e+05,        nan, 0.0000e+00, ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00],
       [4.7000e+04,    

In [48]:
# base param train
new_param_base_model = get_model(param_base, new_five_fold_data)

# param fine tuning
new_param_fine_tuning_model = get_model(param_fine_tuning, new_five_fold_data)

0-th model is training:




[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
[2]	valid_0's binary_logloss: 0.494198
[3]	valid_0's binary_logloss: 0.488612
[4]	valid_0's binary_logloss: 0.483221
[5]	valid_0's binary_logloss: 0.477997
[6]	valid_0's binary_logloss: 0.472915
[7]	valid_0's binary_logloss: 0.467975
[8]	valid_0's binary_logloss: 0.463181
[9]	valid_0's binary_logloss: 0.458517
[10]	valid_0's binary_logloss: 0.453988
[11]	valid_0's binary_logloss: 0.449547
[12]	valid_0's binary_logloss: 0.445195
[13]	valid_0's binary_logloss: 0.440991
[14]	valid_0's binary_logloss: 0.436936
[15]	valid_0's binary_logloss: 0.432951
[16]	valid_0's binary_logloss: 0.429039
[17]	valid_0's binary_logloss: 0.425187
[18]	valid_0's binary_logloss: 0.421464
[19]	valid_0's binary_logloss: 0.417816
[20]	valid_0's binary_logloss: 0.41424
[21]	valid_0's binary_logloss: 0.410761
[22]	valid_0's binary_logloss: 0.407349
[23]	valid_0's binary_logloss: 0.40403
[24]	valid_0's binary_logloss: 0.400756
[25]	valid_0's binary_logloss: 0.397573
[26]	vali

In [49]:
new_base_score = test_model(new_param_base_model, X_new_test, Y_new_test)
new_fine_tuning_score = test_model(new_param_fine_tuning_model, X_new_test, Y_new_test)

In [51]:
print(f'base_score: {base_score}, fine_tuning_score: {fine_tuning_score}')
print(f'new_base_score: {new_base_score}, new_fine_tuning_score: {new_fine_tuning_score}')
print('base提升：%.6f%%' %((new_base_score-base_score)/base_score))
print('fine tuning提升：%.6f%%' %((new_fine_tuning_score-fine_tuning_score)/fine_tuning_score))

base_score: 0.91626, fine_tuning_score: 0.91702
new_base_score: 0.91582, new_fine_tuning_score: 0.91684
base提升：-0.000480%
fine tuning提升：-0.000196%


# Stacking Model

In [52]:
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC

from lightgbm.sklearn import LGBMClassifier
import xgboost as xgb

from sklearn.model_selection import (GridSearchCV, KFold)
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

In [56]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=20)
rnd_clf.fit(X_train, Y_train)
rnd_clf.score(X_test, Y_test)

0.9164

In [57]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=20)
sgd_clf.fit(X_train, Y_train)
sgd_clf.score(X_test, Y_test)

0.8639

In [58]:
from sklearn.ensemble import GradientBoostingClassifier
gdbt_clf = GradientBoostingClassifier(random_state = 20)
gdbt_clf.fit(X_train, Y_train)
gdbt_clf.score(X_test, Y_test)

0.91772

In [59]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train, Y_train)
ada_clf.score(X_test, Y_test)

0.91604

In [60]:
from lightgbm import LGBMClassifier
lgbm_clf = LGBMClassifier()
lgbm_clf.fit(X_train, Y_train)
lgbm_clf.score(X_test, Y_test)

0.91768

In [61]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state = 20)
lr_clf.fit(X_train, Y_train)
lr_clf.score(X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.91108

In [62]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, Y_train)
xgb_clf.score(X_test, Y_test)

0.91712

In [63]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[('rf',rnd_clf ),('gdbt',gdbt_clf ),
                                          ('ada',ada_clf ),('lgbm',lgbm_clf ),
                                          ('xgb',xgb_clf )],voting='hard')
voting_clf.fit(X_train, Y_train)
voting_clf.score(X_test, Y_test)

0.91814

# Todo

## 机器学习集成
  * Stacking
  * 投票融合法

## 深度学习

  * 加速方法
    * 优化器
    * 学习率调整
    * 不同的训练阶段
    * 损失函数
    * Ensemble集成
    * 数据扩充
    * Encoder 拼接
  * 深度学习网络+Adam
  * 残差网络
  * 使用TabNet
  * 深度网络集成