<a href="https://colab.research.google.com/github/park-geun-hyeong/Dacon/blob/main/Credit_Card/LGBM_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library Import

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import os
import glob 

from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder


In [80]:
# Folder on the mounted Google Drive that holds the competition files.
path = '/content/drive/MyDrive/Dacon/Credit_card/'

# Read the three CSVs found under `path` in one pass:
# training data, test data and the submission template.
train, test, submission = (
    pd.read_csv(f'{path}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [81]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0


In [82]:
# Print each column's dtype and non-null count for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

## Data Feature Engineering
- Drop the `index` column
- Fill missing values with the placeholder string `'NAN'`
- One-hot encode the `object`-dtype columns



In [83]:
# Remove the `index` column from both frames.
# `errors='ignore'` makes the cell idempotent: running it again after the
# column is already gone is a no-op, whereas `del frame['index']` would raise
# a KeyError on the second run.
train = train.drop(columns='index', errors='ignore')
test = test.drop(columns='index', errors='ignore')

In [84]:
def fill_missing_categories(frame, placeholder='NAN'):
    """Return a copy of ``frame`` with NaNs in non-numeric columns replaced.

    Only non-numeric columns are filled. A frame-wide ``fillna('NAN')`` would
    also write the string into any numeric column that contains a NaN, which
    silently turns that column into object dtype.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to clean; it is not modified.
    placeholder : str, default 'NAN'
        Value written in place of missing entries.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with missing non-numeric entries set to ``placeholder``.
    """
    filled = frame.copy()
    categorical_cols = filled.select_dtypes(exclude='number').columns
    if len(categorical_cols):
        filled[categorical_cols] = filled[categorical_cols].fillna(placeholder)
    return filled


train = fill_missing_categories(train)
test = fill_missing_categories(test)

In [90]:
# Names of the training columns stored with the `object` dtype, in frame order.
obj_col = [name for name in train.columns if train[name].dtype == 'object']

In [91]:
# Display the list of object-dtype column names collected above.
obj_col

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type']

In [92]:
# Fit the one-hot encoder on the training categories only; the same fitted
# encoder is reused for the test frame.
enc = OneHotEncoder()
enc.fit(train.loc[:, obj_col])

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# `get_feature_names_out` and fall back only on older installs.
if hasattr(enc, 'get_feature_names_out'):
    train_one_hot_names = enc.get_feature_names_out(obj_col)
else:
    train_one_hot_names = enc.get_feature_names(obj_col)

train_one_df = pd.DataFrame(
    enc.transform(train.loc[:, obj_col]).toarray(),
    columns=train_one_hot_names,
    index=train.index,  # keep rows aligned for the axis=1 concat below
)

# Replace the raw categorical columns with their one-hot expansion.
train = pd.concat([train.drop(obj_col, axis=1), train_one_df], axis=1)

In [93]:
# Encode the test frame with the encoder that was fitted on the training data.
# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# `get_feature_names_out` and fall back only on older installs.
if hasattr(enc, 'get_feature_names_out'):
    test_one_hot_names = enc.get_feature_names_out(obj_col)
else:
    test_one_hot_names = enc.get_feature_names(obj_col)

test_one_df = pd.DataFrame(
    enc.transform(test.loc[:, obj_col]).toarray(),
    columns=test_one_hot_names,
    index=test.index,  # keep rows aligned for the axis=1 concat below
)

# Drop the raw categorical columns. The original cell referenced an undefined
# name (`test_obj`) here, which raises NameError on a fresh kernel; the columns
# to drop are the ones listed in `obj_col`.
test = pd.concat([test.drop(obj_col, axis=1), test_one_df], axis=1)

In [94]:
# Compare the frame shapes after encoding. The recorded output shows train with
# one more column than test -- presumably the `credit` target; verify.
train.shape, test.shape

((26457, 57), (10000, 56))

## Split Train / Validation Sets (StratifiedKFold)

In [44]:
# Five shuffled folds with a fixed seed, stratified on the `credit` column.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the (train_idx, val_idx) pairs so later cells can pick a fold
# by its position in the list.
folds = list(skf.split(train, train['credit']))

In [53]:
import random

## Train (LGBMClassifier)

In [95]:
# Seed Python's global RNG (kept from the original cell). This call does not
# seed LightGBM, so the same seed is also passed to the estimator below.
random.seed(42)

# fold position (0-based) -> fitted LGBMClassifier
lgb_models={}

# Build the feature frame and target once instead of dropping `credit`
# from `train` twice inside every fold.
X_all = train.drop('credit', axis=1)
y_all = train['credit']

for fold, (train_idx, val_idx) in enumerate(folds):
    print(f'___________________________fold:{fold+1}_________________________')

    # Models are trained on plain NumPy arrays (no column names).
    x_train, x_val = X_all.iloc[train_idx].values, X_all.iloc[val_idx].values
    y_train, y_val = y_all.iloc[train_idx].values, y_all.iloc[val_idx].values

    lgb = LGBMClassifier(n_estimators=1000, random_state=42)

    # Stop when the last eval set (validation) has not improved for 30 rounds;
    # log the metrics every 100 iterations.
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keyword
    # arguments of LightGBM < 4.0. On LightGBM >= 4.0 pass
    # callbacks=[lightgbm.early_stopping(30), lightgbm.log_evaluation(100)]
    # instead -- confirm against the installed version.
    lgb.fit(x_train,y_train, eval_set=[(x_train,y_train),(x_val,y_val)], early_stopping_rounds=30, verbose=100)
    lgb_models[fold] = lgb

    print("___________________________________________________________________\n")

___________________________fold:1_________________________
Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.677988	valid_1's multi_logloss: 0.759487
[200]	training's multi_logloss: 0.60364	valid_1's multi_logloss: 0.740355
[300]	training's multi_logloss: 0.54788	valid_1's multi_logloss: 0.73406
[400]	training's multi_logloss: 0.502351	valid_1's multi_logloss: 0.729556
Early stopping, best iteration is:
[435]	training's multi_logloss: 0.488139	valid_1's multi_logloss: 0.728254
___________________________________________________________________

___________________________fold:2_________________________
Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.677007	valid_1's multi_logloss: 0.769037
[200]	training's multi_logloss: 0.601918	valid_1's multi_logloss: 0.755076
[300]	training's multi_logloss: 0.545834	valid_1's multi_logloss: 0.749569
[400]	training's multi_logloss: 0.498598	valid_1's multi_

## Predict with a Single Fold Model

In [55]:
# Class probabilities for the test set from a single model.
# `lgb` is rebound on every iteration of the training loop, so when the
# notebook is run top to bottom it refers to the model fitted on the last fold.
# NOTE(review): if a specific fold is intended, index `lgb_models` explicitly.
pred = lgb.predict_proba(test)

In [56]:
# Inspect the predicted probability array (one row per test sample).
pred

array([[0.05889216, 0.12938202, 0.81172582],
       [0.11735777, 0.22211774, 0.66052449],
       [0.03667307, 0.16308428, 0.80024265],
       ...,
       [0.0245766 , 0.02129316, 0.95413023],
       [0.16100987, 0.17178689, 0.66720323],
       [0.05508855, 0.24586632, 0.69904513]])

In [57]:
# Overwrite every column after the first with the predicted probabilities;
# the first column is left untouched.
submission.iloc[:,1:]=pred

In [58]:
# Display the filled-in submission frame.
submission

Unnamed: 0,index,0,1,2
0,26457,0.058892,0.129382,0.811726
1,26458,0.117358,0.222118,0.660524
2,26459,0.036673,0.163084,0.800243
3,26460,0.213784,0.112477,0.673739
4,26461,0.088536,0.279802,0.631663
...,...,...,...,...
9995,36452,0.112932,0.182735,0.704333
9996,36453,0.134534,0.279818,0.585648
9997,36454,0.024577,0.021293,0.954130
9998,36455,0.161010,0.171787,0.667203


In [59]:
# Write the single-model submission to the working directory, without the
# DataFrame's row index.
submission.to_csv('lgbm1.csv',index=False)

## Predict with the 5-Fold Ensemble

In [None]:
# Reload the submission template so the ensemble starts from a fresh frame
# (the previous section overwrote its probability columns).
submission = pd.read_csv(path+'sample_submission.csv')

In [67]:
# Zero every column after the first, then add up the class probabilities
# predicted by each of the five fold models. The sum is averaged in a later cell.
submission.iloc[:, 1:] = 0

for fold in range(5):
    fold_proba = lgb_models[fold].predict_proba(test)
    submission.iloc[:, 1:] = submission.iloc[:, 1:] + fold_proba

In [68]:
# Turn the summed fold probabilities into their mean (5 = number of fold models).
# Not idempotent: running this cell a second time divides by 5 again.
submission.iloc[:,1:] = submission.iloc[:,1:] / 5

In [74]:
# Write the fold-averaged submission to the working directory, without the
# DataFrame's row index.
submission.to_csv('lgbm2.csv',index=False)