# LGBM - Code

## Data

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, warnings
from itertools import combinations
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
warnings.filterwarnings('ignore')

In [2]:
# Read the cleaned train/test CSVs (paths are relative to the notebook's
# working directory) and preview the first rows of the training frame.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/cleaned_train.csv", "../data/cleaned_test.csv")
)

train_df.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month,credit,CODE
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,No job,2.0,6,1.0,F-13899202500.0Commercial associate
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5,1.0,F-11380247500.0Commercial associate
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22,2.0,M-19087450000.0Working
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37,0.0,F-15088202500.0Commercial associate
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26,2.0,F-15037157500.0State servant


## Preprocessing

### Numerical Data: Scaling

In [3]:
def num_data_scale(train_df, test_df, scaling_cols=None):
    """Standardize the numerical columns of the train and test frames.

    The scaler is fitted on ``train_df`` only and then applied to both frames,
    so no test-set statistics are used for the transformation.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that both contain every column named in ``scaling_cols``.
    scaling_cols : list of str, optional
        Columns to scale. When ``None`` (the default) the notebook's standard
        set of numerical columns is used.

    Returns
    -------
    (train_scaled, test_scaled) : tuple of pandas.DataFrame
        Frames holding only the scaled ``scaling_cols``; both are rebuilt from
        the transformed arrays and therefore carry a fresh default index.
    """
    # Honor a caller-supplied column list; only fall back to the defaults
    # when nothing was passed.
    if scaling_cols is None:
        scaling_cols = ['child_num','income_total','days_birth','days_employed','family_size','begin_month']
    else:
        scaling_cols = list(scaling_cols)

    std_scaler = StandardScaler()
    std_scaler.fit(train_df[scaling_cols])

    train_scaled = std_scaler.transform(train_df[scaling_cols])
    # Apply to the test set using the scaler that was fitted with train data.
    test_scaled = std_scaler.transform(test_df[scaling_cols])

    train_scaled = pd.DataFrame(train_scaled, columns=scaling_cols)
    test_scaled = pd.DataFrame(test_scaled, columns=scaling_cols)

    return train_scaled, test_scaled

In [4]:
# Scale the numerical columns of both frames (default column set).
train_scaled, test_scaled = num_data_scale(train_df=train_df, test_df=test_df)

### Categorical Data: One-hot Encoding

In [5]:
def cat_data_encode(train_df, test_df, onehot_cols=None):
    """One-hot encode the categorical columns of the train and test frames.

    Train and test rows are stacked before encoding so that both results end
    up with exactly the same dummy columns, then split back by position.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that both contain every column named in ``onehot_cols``.
    onehot_cols : list of str, optional
        Columns to encode. When ``None`` (the default) the notebook's standard
        set of categorical columns is used.

    Returns
    -------
    (train_encoded, test_encoded) : tuple of pandas.DataFrame
        Encoded frames with identical columns, each indexed from 0.

    Notes
    -----
    ``pd.get_dummies`` is called with its defaults, which only expand
    object/string/category columns; numeric columns in ``onehot_cols``
    (e.g. 0/1 flags) are passed through unchanged.
    """
    # Honor a caller-supplied column list; only fall back to the defaults
    # when nothing was passed.
    if onehot_cols is None:
        onehot_cols = ['gender','car','reality','income_type','edu_type','family_type','house_type','occup_type','work_phone','home_phone','email', 'CODE']
    else:
        onehot_cols = list(onehot_cols)

    n_train = train_df.shape[0]

    # One-hot encoding using the combination of train and test data.
    combined = pd.concat([train_df[onehot_cols], test_df[onehot_cols]], ignore_index=True)
    encoded = pd.get_dummies(combined)

    # Split back by position: the first n_train rows came from train_df.
    train_encoded = encoded.iloc[:n_train]
    test_encoded = encoded.iloc[n_train:].reset_index(drop=True)

    return train_encoded, test_encoded

In [6]:
# One-hot encode the categorical columns of both frames (default column set).
train_encoded, test_encoded = cat_data_encode(train_df=train_df, test_df=test_df)

### Merge scaled numerical data with encoded categorical data

In [7]:
def merge_scaled_encoded(train_scaled, test_scaled, train_encoded, test_encoded, target=None):
    """Join scaled numerical and encoded categorical features column-wise.

    Parameters
    ----------
    train_scaled, test_scaled : pandas.DataFrame
        Scaled numerical features for train and test.
    train_encoded, test_encoded : pandas.DataFrame
        One-hot encoded categorical features for train and test.
    target : pandas.Series, optional
        Label column to append to the training frame. When ``None`` (the
        default) the ``'credit'`` column of the notebook-level ``train_df``
        is used, which is what this function always did before the parameter
        existed.

    Returns
    -------
    (Train, Test) : tuple of pandas.DataFrame
        ``Train`` holds the numerical features, the encoded features and the
        label column; ``Test`` holds the features only.

    Notes
    -----
    ``pd.concat(..., axis=1)`` aligns rows on the index, so all training
    inputs (including ``target``) must share the same index.
    """
    if target is None:
        # Backward-compatible fallback on the notebook-level training frame.
        target = train_df['credit']

    Train = pd.concat([train_scaled, train_encoded, target], axis=1)
    Test = pd.concat([test_scaled, test_encoded], axis=1)

    return Train, Test

In [8]:
# Assemble the final modelling frames from the scaled and encoded parts.
Train, Test = merge_scaled_encoded(
    train_scaled=train_scaled,
    test_scaled=test_scaled,
    train_encoded=train_encoded,
    test_encoded=test_encoded,
)

## Split Training and Validation Set

In [9]:
# Separate the 'credit' label column from the feature matrix.
y_data = Train['credit']
x_data = Train.drop(columns=['credit'])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state makes the
# split reproducible across kernel restarts, and stratifying on the label
# keeps the class proportions of 'credit' the same in both parts.
train_X, val_X, train_y, val_y = train_test_split(
    x_data, y_data, test_size=0.2, random_state=2018, stratify=y_data
)

## Train with LGBM

In [11]:
# Hand-picked LightGBM settings for the manually configured baseline model.
# NOTE(review): `eval_metric` is normally an argument of the sklearn wrapper's
# fit() rather than a model parameter — confirm it has any effect here.
# NOTE(review): `early_stopping_round` is set here while train_model also
# passes early_stopping_rounds=30 to fit(); confirm which value LightGBM uses.
lgbm_params = dict(
    objective='multiclass',
    boosting_type='gbdt',
    eval_metric='logloss',
    n_estimators=10000,
    early_stopping_round=100,
    max_depth=-1,
    max_bin=255,
    boost_from_average=False,
    bagging_freq=1,
    min_data_in_leaf=40,
    learning_rate=0.02272,
    num_leaves=64,
    feature_fraction=0.89387,
    bagging_fraction=0.76326,
    seed=2018,
    verbose=-1,
    n_jobs=-1,
)

In [12]:
def train_model(model, trn, tst, cv = 5):
    """Fit ``model`` with stratified K-fold CV and predict the test set per fold.

    Parameters
    ----------
    model : estimator
        Classifier whose ``fit`` accepts ``eval_set``, ``verbose`` and
        ``early_stopping_rounds`` and which exposes ``predict_proba`` and
        ``feature_importances_``. The same instance is refitted on every fold.
    trn : pandas.DataFrame
        Training frame holding the features plus a ``'credit'`` label column.
    tst : pandas.DataFrame
        Test features; expected to have the same columns as ``trn`` without
        ``'credit'`` (the importances are indexed by ``tst.columns``).
    cv : int, default 5
        Number of stratified folds.

    Returns
    -------
    tst_preds : list of numpy.ndarray
        One ``predict_proba(tst)`` result per fold.
    feats_importance : pandas.Series
        ``feature_importances_`` averaged over the folds.

    Side effects
    ------------
    Prints the fold progress and the mean validation log-loss.
    """
    # Separate features and labels once instead of dropping the label column
    # on every fold.
    features = trn.drop('credit', axis = 1)
    labels = trn['credit'].values

    tst_preds = []
    vld_scores = []
    feats_importance = np.zeros(tst.shape[1])

    # Only the labels drive the stratified split, so the feature frame is
    # passed as-is rather than converted to a dense array first.
    folds = StratifiedKFold(cv).split(features, labels)
    for n, (trn_idx, vld_idx) in enumerate(folds):
        print(f"{n+1}/{cv}th fold..........")
        # The fold indices are positional, so rows are selected with iloc /
        # array indexing; label-based .loc would pick the wrong rows (or
        # raise) whenever trn does not carry a default 0..n-1 index.
        X_trn, X_vld = features.iloc[trn_idx], features.iloc[vld_idx]
        y_trn, y_vld = labels[trn_idx], labels[vld_idx]

        # NOTE(review): verbose / early_stopping_rounds are passed as fit()
        # keywords, as in the original run; newer LightGBM releases may expect
        # callbacks instead — confirm against the installed version.
        model.fit(
            X_trn, y_trn,
            eval_set = [(X_trn, y_trn), (X_vld, y_vld)],
            verbose = 500, early_stopping_rounds = 30
        )
        vld_scores.append(log_loss(y_vld, model.predict_proba(X_vld)))

        tst_preds.append(model.predict_proba(tst))
        feats_importance += model.feature_importances_

    feats_importance = pd.Series(data = feats_importance / cv, index = tst.columns)

    print('mlogloss: ', np.mean(vld_scores))
    return tst_preds, feats_importance

In [13]:
# 10-fold run of the manually configured LightGBM model.
tst_preds_lgbm, feat_im_lgbm = train_model(
    model=LGBMClassifier(**lgbm_params),
    trn=Train,
    tst=Test,
    cv=10,
)

1/10th fold..........
[500]	training's multi_logloss: 0.560762	valid_1's multi_logloss: 0.759223
2/10th fold..........
[500]	training's multi_logloss: 0.56255	valid_1's multi_logloss: 0.739597
3/10th fold..........
[500]	training's multi_logloss: 0.560735	valid_1's multi_logloss: 0.757353
4/10th fold..........
[500]	training's multi_logloss: 0.56094	valid_1's multi_logloss: 0.746335
5/10th fold..........
[500]	training's multi_logloss: 0.560028	valid_1's multi_logloss: 0.752481
6/10th fold..........
[500]	training's multi_logloss: 0.561561	valid_1's multi_logloss: 0.741933
7/10th fold..........
[500]	training's multi_logloss: 0.559789	valid_1's multi_logloss: 0.759988
8/10th fold..........
9/10th fold..........
[500]	training's multi_logloss: 0.55843	valid_1's multi_logloss: 0.75501
10/10th fold..........
[500]	training's multi_logloss: 0.558702	valid_1's multi_logloss: 0.76764
mlogloss:  0.7525332502904052


In [14]:
# Fold-averaged feature importances returned by train_model, indexed by feature name.
feat_im_lgbm

child_num                                   1871.1
income_total                               15312.1
days_birth                                 23002.4
days_employed                              19902.3
family_size                                 2174.0
                                            ...   
CODE_M-998790000.0Commercial associate         0.0
CODE_M-9990292500.0Commercial associate        0.0
CODE_M-9991157500.0Working                     0.0
CODE_M-9993180000.0Working                     0.0
CODE_M-9996135000.0Working                     0.0
Length: 9690, dtype: float64

## Hyperparameter Tuning

In [15]:
import optuna
from optuna.samplers import TPESampler

# TPE sampler with a fixed seed so the hyperparameter search can be reproduced.
# NOTE(review): the sampler only takes effect if it is handed to
# optuna.create_study(sampler=...) — confirm that the study below does so.
sampler = TPESampler(seed=10)


def objective(trial):
    """Optuna objective: mean 5-fold negative log-loss of one sampled config.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ``neg_log_loss`` over a 5-fold ``cross_val_score`` on the
        notebook-level ``x_data`` / ``y_data``. Higher is better, so the
        study must be created with ``direction='maximize'``.
    """
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        'verbose': -1,
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        # suggest_float(..., log=True) is the non-deprecated spelling of
        # suggest_loguniform and samples from the same distribution.
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM applies row subsampling only when
        # subsample_freq (bagging_freq) is non-zero — confirm this parameter
        # has any effect as configured.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True),
    }
    model = LGBMClassifier(**param)

    # cross_val_score clones and refits the estimator on each fold, so a
    # separate early-stopped fit beforehand would not influence the score;
    # it was removed as dead work.
    scores = cross_val_score(model, x_data, y_data, cv=5, scoring="neg_log_loss")

    # Named so that the imported sklearn `log_loss` function is not shadowed.
    mean_neg_log_loss = scores.mean()
    return mean_neg_log_loss


In [16]:
# `objective` returns a negative log-loss, hence direction='maximize'.
# The seeded TPE sampler created earlier is passed in explicitly; without it
# Optuna builds an unseeded sampler and the search is not reproducible.
lgbm_study = optuna.create_study(direction='maximize', sampler=sampler)
lgbm_study.optimize(objective, n_trials=10)

print("Best Score:",lgbm_study.best_value)
print("Best trial",lgbm_study.best_trial.params)

[32m[I 2022-12-11 16:20:31,879][0m A new study created in memory with name: no-name-04afe976-6ea7-48e8-ab84-49162623e491[0m
[32m[I 2022-12-11 16:21:13,574][0m Trial 0 finished with value: -0.8178811185238409 and parameters: {'max_depth': 14, 'learning_rate': 0.0036694835440513416, 'n_estimators': 278, 'min_child_samples': 75, 'subsample': 0.731834254241718}. Best is trial 0 with value: -0.8178811185238409.[0m
[32m[I 2022-12-11 16:22:02,039][0m Trial 1 finished with value: -0.8185361053014258 and parameters: {'max_depth': 3, 'learning_rate': 0.0008467094608685681, 'n_estimators': 1913, 'min_child_samples': 49, 'subsample': 0.7249337160565229}. Best is trial 0 with value: -0.8178811185238409.[0m
[32m[I 2022-12-11 16:23:30,160][0m Trial 2 finished with value: -0.7954731150805342 and parameters: {'max_depth': 11, 'learning_rate': 0.0012519922459335428, 'n_estimators': 2829, 'min_child_samples': 92, 'subsample': 0.4178388725657859}. Best is trial 2 with value: -0.7954731150805342

Best Score: -0.7910297091060929
Best trial {'max_depth': 7, 'learning_rate': 0.005531892472467897, 'n_estimators': 1534, 'min_child_samples': 5, 'subsample': 0.549757753424006}


In [17]:
# Best hyperparameters reported by the Optuna study, copied in by hand so the
# tuned model can be retrained without re-running the search.
p_optuna = dict(
    max_depth=7,
    learning_rate=0.005531892472467897,
    n_estimators=1534,
    min_child_samples=5,
    subsample=0.549757753424006,
)


In [18]:
# 10-fold run of the LightGBM model with the Optuna-tuned hyperparameters.
tst_preds_lgbm, feat_im_lgbm = train_model(
    model=LGBMClassifier(**p_optuna),
    trn=Train,
    tst=Test,
    cv=10,
)

1/10th fold..........
[500]	training's multi_logloss: 0.779722	valid_1's multi_logloss: 0.805344
[1000]	training's multi_logloss: 0.762134	valid_1's multi_logloss: 0.797011
[1500]	training's multi_logloss: 0.750835	valid_1's multi_logloss: 0.792265
2/10th fold..........
[500]	training's multi_logloss: 0.779436	valid_1's multi_logloss: 0.80382
[1000]	training's multi_logloss: 0.762123	valid_1's multi_logloss: 0.794357
[1500]	training's multi_logloss: 0.751329	valid_1's multi_logloss: 0.789113
3/10th fold..........
[500]	training's multi_logloss: 0.780235	valid_1's multi_logloss: 0.803762
[1000]	training's multi_logloss: 0.762929	valid_1's multi_logloss: 0.79338
[1500]	training's multi_logloss: 0.751803	valid_1's multi_logloss: 0.788057
4/10th fold..........
[500]	training's multi_logloss: 0.779935	valid_1's multi_logloss: 0.802172
[1000]	training's multi_logloss: 0.762698	valid_1's multi_logloss: 0.793328
[1500]	training's multi_logloss: 0.75175	valid_1's multi_logloss: 0.787295
5/10th 

## Results

Mean cross-validated multi-logloss (lower is better): **Manual: 0.7525** / Tuned: 0.7913