# LGBM - Code&New

## Data

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, warnings
from itertools import combinations
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
warnings.filterwarnings('ignore')

In [2]:
# Load the train and test CSVs (paths are relative to the notebook's working directory).
train_df=pd.read_csv("../data/processed_train_code.csv")
test_df=pd.read_csv("../data/processed_test_code.csv")

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,...,birth_month,birth_week,ages_employed,employ_month,employ_week,ages_unemployed,unemploy_month,unemploy_week,income_family,CODE
0,0,F,N,N,202500.0,Commercial associate,Higher education,Married,Municipal apartment,0,...,7.0,1.0,12,0.0,0.0,25,6.0,0.0,101250.0,F13899202500.0Commercial associate
1,1,F,N,Y,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,0,...,7.0,1.0,4,3.0,0.0,26,4.0,1.0,82500.0,F11380247500.0Commercial associate
2,2,M,Y,Y,450000.0,Working,Higher education,Married,House / apartment,0,...,0.0,2.0,12,3.0,1.0,40,8.0,1.0,225000.0,M19087450000.0Working
3,3,F,N,Y,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,0,...,10.0,3.0,5,9.0,2.0,35,1.0,0.0,101250.0,F15088202500.0Commercial associate
4,4,F,Y,Y,157500.0,State servant,Higher education,Married,House / apartment,0,...,9.0,0.0,5,10.0,0.0,35,11.0,3.0,78750.0,F15037157500.0State servant


In [3]:
# Summarize the training frame: column dtypes and non-null counts.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26451 entries, 0 to 26450
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         26451 non-null  int64  
 1   gender             26451 non-null  object 
 2   car                26451 non-null  object 
 3   reality            26451 non-null  object 
 4   income_total       26451 non-null  float64
 5   income_type        26451 non-null  object 
 6   edu_type           26451 non-null  object 
 7   family_type        26451 non-null  object 
 8   house_type         26451 non-null  object 
 9   work_phone         26451 non-null  int64  
 10  home_phone         26451 non-null  int64  
 11  email              26451 non-null  int64  
 12  occup_type         26451 non-null  object 
 13  family_size        26451 non-null  float64
 14  begin_month        26451 non-null  float64
 15  credit             26451 non-null  float64
 16  days_unemployed    264

## Preprocessing

### Numerical Data: Scaling

In [4]:
def num_data_scale(train_df, test_df, scaling_cols=None):
    """Standardize numerical columns with a scaler fitted on the training set only.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames that contain every column listed in ``scaling_cols``.
    scaling_cols : list of str, optional
        Columns to standardize. When ``None`` the notebook's default list of
        numerical columns is used.

    Returns
    -------
    train_scaled, test_scaled : pd.DataFrame
        Frames holding only the scaled ``scaling_cols``; both get a fresh
        default (0..n-1) index.
    """
    if scaling_cols is None:
        scaling_cols = ['income_total', 'family_size', 'days_unemployed', 'income_unemployed', 'Age', 'birth_month', 'birth_week', 'ages_employed', 'employ_month', 'employ_week', 'ages_unemployed', 'unemploy_month', 'unemploy_week', 'income_family']
    else:
        scaling_cols = list(scaling_cols)

    # Fit on train only, then reuse the same statistics for test to avoid leakage.
    std_scaler = StandardScaler()
    std_scaler.fit(train_df[scaling_cols])

    train_scaled = pd.DataFrame(std_scaler.transform(train_df[scaling_cols]), columns=scaling_cols)
    test_scaled = pd.DataFrame(std_scaler.transform(test_df[scaling_cols]), columns=scaling_cols)

    return train_scaled, test_scaled

In [5]:
# Scale the numerical columns using the column list defined inside num_data_scale.
train_scaled, test_scaled = num_data_scale(train_df, test_df)

### Categorical Data: One-hot Encoding

In [6]:
def cat_data_encode(train_df, test_df, onehot_cols=None):
    """One-hot encode categorical columns over the combined train and test rows.

    Train and test are encoded together so both outputs share the same dummy
    columns, then split back by row position.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames that contain every column listed in ``onehot_cols``.
    onehot_cols : list of str, optional
        Columns to pass to ``pd.get_dummies``. When ``None`` the notebook's
        default list is used.

    Returns
    -------
    train_encoded, test_encoded : pd.DataFrame
        Encoded frames; each has a 0..n-1 index.
    """
    if onehot_cols is None:
        onehot_cols = ['gender','car','reality','income_type','edu_type','family_type','house_type','occup_type','work_phone','home_phone','email', 'begin_month', 'CODE']
    else:
        onehot_cols = list(onehot_cols)

    n_train = train_df.shape[0]

    combined = pd.concat([train_df[onehot_cols], test_df[onehot_cols]], ignore_index=True)
    # With default arguments get_dummies only expands object/string/category
    # columns; numeric columns in onehot_cols pass through unchanged.
    combined = pd.get_dummies(combined)

    # Split back by position: the first n_train rows came from train_df.
    train_encoded = combined.iloc[:n_train]
    test_encoded = combined.iloc[n_train:].reset_index(drop=True)

    return train_encoded, test_encoded

In [7]:
# One-hot encode the categorical columns using the column list defined inside cat_data_encode.
train_encoded, test_encoded = cat_data_encode(train_df, test_df)

### Merge scaled numerical data with encoded categorical data

In [8]:
def merge_scaled_encoded(train_scaled, test_scaled, train_encoded, test_encoded, target=None):
    """Join scaled numerical and encoded categorical features column-wise.

    Parameters
    ----------
    train_scaled, train_encoded : pd.DataFrame
        Numerical and categorical feature frames for the training rows.
    test_scaled, test_encoded : pd.DataFrame
        Numerical and categorical feature frames for the test rows.
    target : pd.Series or array-like, optional
        Labels for the training rows, appended as the ``credit`` column.
        When ``None`` the notebook-level ``train_df['credit']`` is used, which
        keeps existing four-argument calls working.

    Returns
    -------
    Train : pd.DataFrame
        Training features plus the ``credit`` column.
    Test : pd.DataFrame
        Test features (no target column).
    """
    if target is None:
        target = train_df['credit']  # fallback to the notebook global
    target = pd.Series(target, name='credit')

    # concat(axis=1) aligns on the index, so all inputs must share the same row index.
    Train = pd.concat([train_scaled, train_encoded, target], axis=1)
    Test = pd.concat([test_scaled, test_encoded], axis=1)

    return Train, Test

In [9]:
# Build the final modelling frames: Train carries the `credit` target, Test holds features only.
Train, Test = merge_scaled_encoded(train_scaled, test_scaled, train_encoded, test_encoded)

## Split Training and Validation Set

In [10]:
# Separate the `credit` target from the feature matrix.
y_data = Train['credit']
x_data = Train.drop(columns=['credit'])

In [11]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. random_state makes the split reproducible across kernel
# restarts; stratify keeps the class proportions of `credit` in both parts.
train_X, val_X, train_y, val_y = train_test_split(
    x_data, y_data, test_size=0.2, random_state=2018, stratify=y_data
)

## Train with LGBM

In [12]:
# Hand-tuned LightGBM settings for the baseline ("Manual") model.
lgbm_params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    # LightGBM's parameter name is `metric`; `eval_metric` is a fit() argument of the
    # sklearn wrapper, not a constructor parameter.
    'metric': 'multi_logloss',
    # Upper bound on boosting rounds; early stopping is expected to end training sooner.
    'n_estimators': 10000,
    # NOTE(review): train_model() also passes early_stopping_rounds=30 to fit();
    # confirm which of the two values the installed LightGBM version honors.
    'early_stopping_round': 100,
    'max_depth': -1,
    'max_bin': 255,
    'boost_from_average' : False,
    # Re-sample the bagging subset every iteration (needed for bagging_fraction to apply).
    'bagging_freq' : 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.02272,
    'num_leaves': 64,
    'feature_fraction': 0.89387,
    'bagging_fraction': 0.76326,
    'seed': 2018,
    'verbose': -1,
    'n_jobs': -1,
}

In [13]:
def train_model(model, trn, tst, cv = 5):
    """Fit ``model`` with stratified K-fold CV and collect per-fold test predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict_proba`` and
        ``feature_importances_`` (e.g. ``LGBMClassifier``). The same object is
        fitted again on every fold.
    trn : pd.DataFrame
        Training frame: feature columns plus the ``credit`` target column.
    tst : pd.DataFrame
        Test features; its columns label the returned importances.
    cv : int, default 5
        Number of stratified folds.

    Returns
    -------
    tst_preds : list
        One ``model.predict_proba(tst)`` result per fold.
    feats_importance : pd.Series
        ``feature_importances_`` averaged over the folds, indexed by ``tst.columns``.
    """
    features = trn.drop('credit', axis = 1)
    target = trn['credit']

    tst_preds = []
    fold_losses = []
    feats_importance = np.zeros(tst.shape[1])

    splitter = StratifiedKFold(cv)
    for n, (trn_idx, vld_idx) in enumerate(splitter.split(features.values, target.values)):
        print(f"{n+1}/{cv}th fold..........")
        # split() yields positional indices, so select rows with iloc rather than
        # label-based loc (which is only equivalent for a default RangeIndex).
        X_trn, X_vld = features.iloc[trn_idx], features.iloc[vld_idx]
        y_trn, y_vld = target.iloc[trn_idx].values, target.iloc[vld_idx].values

        # NOTE(review): the `verbose` / `early_stopping_rounds` fit kwargs were removed
        # in LightGBM 4.0; on newer versions pass callbacks instead.
        model.fit(
            X_trn, y_trn,
            eval_set = [(X_trn, y_trn), (X_vld, y_vld)],
            verbose = 500, early_stopping_rounds = 30
        )
        # Pass the fitted class labels so the probability columns are matched to
        # classes even if a validation fold happens to miss one.
        fold_losses.append(
            log_loss(y_vld, model.predict_proba(X_vld), labels = getattr(model, 'classes_', None))
        )

        tst_preds.append(model.predict_proba(tst))
        feats_importance += model.feature_importances_

    feats_importance = pd.Series(data = feats_importance / cv, index = tst.columns)

    print('mlogloss: ', np.mean(fold_losses))
    return tst_preds, feats_importance

In [14]:
# Baseline run: 10-fold CV with the hand-set lgbm_params.
tst_preds_lgbm, feat_im_lgbm = train_model(LGBMClassifier(**lgbm_params), Train, Test, cv = 10)

1/10th fold..........
[500]	training's multi_logloss: 0.505877	valid_1's multi_logloss: 0.707524
2/10th fold..........
[500]	training's multi_logloss: 0.506251	valid_1's multi_logloss: 0.694409
3/10th fold..........
[500]	training's multi_logloss: 0.504711	valid_1's multi_logloss: 0.715838
4/10th fold..........
[500]	training's multi_logloss: 0.506196	valid_1's multi_logloss: 0.696062
5/10th fold..........
[500]	training's multi_logloss: 0.507217	valid_1's multi_logloss: 0.698763
6/10th fold..........
[500]	training's multi_logloss: 0.5043	valid_1's multi_logloss: 0.707221
7/10th fold..........
[500]	training's multi_logloss: 0.506672	valid_1's multi_logloss: 0.704681
8/10th fold..........
[500]	training's multi_logloss: 0.504059	valid_1's multi_logloss: 0.717582
9/10th fold..........
[500]	training's multi_logloss: 0.504313	valid_1's multi_logloss: 0.713331
10/10th fold..........
[500]	training's multi_logloss: 0.505444	valid_1's multi_logloss: 0.708977
mlogloss:  0.7019213549067368


In [15]:
# Fold-averaged feature importances returned by train_model (Series indexed by feature name).
feat_im_lgbm

income_total                               7307.5
family_size                                1424.5
days_unemployed                           12781.0
income_unemployed                         14395.8
Age                                        8089.8
                                           ...   
CODE_M998790000.0Commercial associate         0.0
CODE_M9990292500.0Commercial associate        0.0
CODE_M9991157500.0Working                     0.0
CODE_M9993180000.0Working                     0.0
CODE_M9996135000.0Working                     0.0
Length: 9699, dtype: float64

## Hyperparameter Tuning

In [16]:
import optuna
from optuna.samplers import TPESampler

# Seeded TPE sampler. The seed only influences a study that receives this object
# through optuna.create_study(sampler=...).
sampler = TPESampler(seed=10)


def objective(trial):
    """Optuna objective: mean 5-fold ``neg_log_loss`` of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of scikit-learn's ``neg_log_loss`` over 5 folds of
        ``x_data`` / ``y_data``. Higher is better, so the study has to be
        created with ``direction='maximize'``.
    """
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        'verbose': -1,
        'max_depth': trial.suggest_int('max_depth',3, 15),
        # NOTE(review): the logged trials with very small learning rates score
        # around -0.88; consider narrowing this range.
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 1e-2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0;
        # confirm whether this parameter has any effect as configured.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True),
    }
    model = LGBMClassifier(**param)

    # cross_val_score clones and fits the estimator on every fold itself, so a
    # separate model.fit() beforehand would be discarded work.
    scores = cross_val_score(model, x_data, y_data, cv=5, scoring="neg_log_loss")

    return scores.mean()


In [17]:
# The objective returns negative log-loss, hence direction='maximize'.
# Passing the seeded `sampler` makes the sequence of sampled trials reproducible.
lgbm_study = optuna.create_study(direction='maximize', sampler=sampler)
lgbm_study.optimize(objective, n_trials=10)

print("Best Score:",lgbm_study.best_value)
print("Best trial",lgbm_study.best_trial.params)

[32m[I 2022-12-11 16:54:54,707][0m A new study created in memory with name: no-name-ad4c123f-3494-4379-a9e1-21cf27e42024[0m
[32m[I 2022-12-11 16:56:05,237][0m Trial 0 finished with value: -0.875077307414122 and parameters: {'max_depth': 9, 'learning_rate': 2.8747873517678205e-05, 'n_estimators': 1492, 'min_child_samples': 29, 'subsample': 0.7103652353059032}. Best is trial 0 with value: -0.875077307414122.[0m
[32m[I 2022-12-11 16:57:05,603][0m Trial 1 finished with value: -0.882528433586099 and parameters: {'max_depth': 8, 'learning_rate': 1.355913866357637e-07, 'n_estimators': 721, 'min_child_samples': 97, 'subsample': 0.49870118149829573}. Best is trial 0 with value: -0.875077307414122.[0m
[32m[I 2022-12-11 16:58:41,946][0m Trial 2 finished with value: -0.8821441386010406 and parameters: {'max_depth': 7, 'learning_rate': 8.923398922742617e-07, 'n_estimators': 2501, 'min_child_samples': 64, 'subsample': 0.8532101590237792}. Best is trial 0 with value: -0.875077307414122.[0

Best Score: -0.7462660148979645
Best trial {'max_depth': 9, 'learning_rate': 0.004477552374896037, 'n_estimators': 2414, 'min_child_samples': 82, 'subsample': 0.4426919582748543}


In [18]:
# Best hyperparameters reported by the Optuna study, pinned here so the final
# CV run does not depend on re-running the search.

p_optuna = dict(
    max_depth=9,
    learning_rate=0.004477552374896037,
    n_estimators=2414,
    min_child_samples=82,
    subsample=0.4426919582748543,
)


In [19]:
# Tuned run: 10-fold CV with the Optuna parameters. This overwrites the baseline
# tst_preds_lgbm / feat_im_lgbm from the earlier run.
tst_preds_lgbm, feat_im_lgbm = train_model(LGBMClassifier(**p_optuna), Train, Test, cv = 10)

1/10th fold..........
[500]	training's multi_logloss: 0.758924	valid_1's multi_logloss: 0.785473
[1000]	training's multi_logloss: 0.718764	valid_1's multi_logloss: 0.770696
[1500]	training's multi_logloss: 0.687918	valid_1's multi_logloss: 0.759505
[2000]	training's multi_logloss: 0.663583	valid_1's multi_logloss: 0.750955
2/10th fold..........
[500]	training's multi_logloss: 0.757985	valid_1's multi_logloss: 0.783876
[1000]	training's multi_logloss: 0.719207	valid_1's multi_logloss: 0.764869
[1500]	training's multi_logloss: 0.689358	valid_1's multi_logloss: 0.750423
[2000]	training's multi_logloss: 0.664338	valid_1's multi_logloss: 0.741547
3/10th fold..........
[500]	training's multi_logloss: 0.756735	valid_1's multi_logloss: 0.79266
[1000]	training's multi_logloss: 0.716267	valid_1's multi_logloss: 0.777487
[1500]	training's multi_logloss: 0.685813	valid_1's multi_logloss: 0.766828
[2000]	training's multi_logloss: 0.659914	valid_1's multi_logloss: 0.758565
4/10th fold..........
[500

## Results

**Manual: 0.7019** / Tuned: 0.7913