<a href="https://colab.research.google.com/github/mailguest/ML-000/blob/main/Week16/lgb_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load data and data preprocessing

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
seed = 42 # for the same data division

kf = KFold(n_splits=5, random_state=seed,shuffle=True)
df_train = pd.read_csv('/content/drive/MyDrive/final/train_final.csv')
df_test = pd.read_csv('/content/drive/MyDrive/final/test_final.csv')

X_train = df_train.drop(columns=['loan_status']).values
Y_train = df_train['loan_status'].values.astype(int)
X_test = df_test.drop(columns=['loan_status']).values
Y_test = df_test['loan_status'].values.astype(int)

In [8]:
X_train.shape, Y_train.shape

((44797, 145), (44797,))

In [9]:
# split data for five fold

five_fold_data = []

for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

# Algorithm

In [4]:
def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

# train

In [5]:
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}

param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}

In [10]:
# base param train
param_base_model = get_model(param_base)

# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)

0-th model is training:




[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
[5]	valid_0's binary_logloss: 0.500091
[6]	valid_0's binary_logloss: 0.498308
[7]	valid_0's binary_logloss: 0.496488
[8]	valid_0's binary_logloss: 0.494685
[9]	valid_0's binary_logloss: 0.492905
[10]	valid_0's binary_logloss: 0.491137
[11]	valid_0's binary_logloss: 0.489393
[12]	valid_0's binary_logloss: 0.489103
[13]	valid_0's binary_logloss: 0.488784
[14]	valid_0's binary_logloss: 0.487058
[15]	valid_0's binary_logloss: 0.485354
[16]	valid_0's binary_logloss: 0.483662
[17]	valid_0's binary_logloss: 0.481989
[18]	valid_0's binary_logloss: 0.480327
[19]	valid_0's binary_logloss: 0.478751
[20]	valid_0's binary_logloss: 0.477121
[21]	valid_0's binary_logloss: 0.476826
[22]	valid_0's binary_logloss: 0.475219
[23]	valid_0's binary_logloss: 0.473634
[24]	valid_0's binary_logloss: 0.47206
[25]	valid_0's binary_logloss: 0.470505
[26]	valid_0's binary_logloss: 0.468951
[27]	valid_0's binary_logloss: 0.467411
[28]	valid_0's binary_logloss: 0.465892
[29]	

# Test

In [11]:
def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)

In [12]:
base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)

print('base: {}, fine tuning: {}'.format(base_score, fine_tuning_score))

base: 0.9159952685964247, fine tuning: 0.9175128886111545
