In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [3]:
cols = ['gender', 'car', 'reality', 'child_num', 'income_total',
       'income_type', 'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone', 'phone', 'email',
       'occyp_type', 'family_size', 'begin_month']
train.drop_duplicates(cols, inplace=True)

train['DAYS_BIRTH'] = - round(train['DAYS_BIRTH'] / 365)
test['DAYS_BIRTH'] = - round(test['DAYS_BIRTH'] / 365)

train['DAYS_EMPLOYED'] = - (train['DAYS_EMPLOYED'] / 365)
train.loc[train['DAYS_EMPLOYED'] < 0, 'DAYS_EMPLOYED'] = 0
test['DAYS_EMPLOYED'] = - (test['DAYS_EMPLOYED'] / 365)
test.loc[test['DAYS_EMPLOYED'] < 0, 'DAYS_EMPLOYED'] = 0

# 일단 None으로 하고 나중에 더 fancy한 imputation 꼭 해야할듯
train['occyp_type'].fillna('none', inplace=True)
test['occyp_type'].fillna('none', inplace=True)

train = train.loc[train['child_num'] < 10, :]

categoric_cols = ['gender','car','reality',
                 'income_type', 'edu_type', 'family_type',
                 'house_type', 'FLAG_MOBIL', 'work_phone',
                 'phone', 'email', 'occyp_type']
numeric_cols = ['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'family_size', 'begin_month']

train_OH = pd.get_dummies(train, columns=categoric_cols, drop_first=True)
test_OH = pd.get_dummies(test, columns=categoric_cols, drop_first=True)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_OH[numeric_cols])
train_OH_scaled = scaler.transform(train_OH[numeric_cols])
test_OH_scaled = scaler.transform(test_OH[numeric_cols])

train_final = train_OH.copy()
test_final = test_OH.copy()

train_final[numeric_cols] = train_OH_scaled
test_final[numeric_cols] = test_OH_scaled

# Relabeling
- 기존 MetaCost와 약간은 다르게 진행

In [14]:
train_y.value_counts()

2.0    15543
1.0     5692
0.0     2865
Name: credit, dtype: int64

In [54]:
train_X = train_final.drop(labels=['credit', 'index'], axis=1)
train_y = train_final['credit']

L = 10 # num of models
relabeld_train_y =  np.zeros((train_y.shape[0], 3)) 

sample_weight = np.array([1]*len(train_y))

for i in range(L):
    lgb = LGBMClassifier(n_estimators=2000)
    
    sample_weight[np.where(train_y==1.0)] = 1 + 0.2*i
    sample_weight[np.where(train_y==0.0)] = 1 + 0.4*i
    
    lgb.fit(train_X, train_y,
            verbose=200, 
            eval_set=[(train_X, train_y)],
            sample_weight=sample_weight)
    relabeld_train_y += lgb.predict_proba(train_X) / L
    print('-------------------------------------------------')

[200]	training's multi_logloss: 0.581584
[400]	training's multi_logloss: 0.474093
[600]	training's multi_logloss: 0.397332
[800]	training's multi_logloss: 0.337098
[1000]	training's multi_logloss: 0.286579
[1200]	training's multi_logloss: 0.248434
[1400]	training's multi_logloss: 0.215747
[1600]	training's multi_logloss: 0.188166
[1800]	training's multi_logloss: 0.16432
[2000]	training's multi_logloss: 0.143884
-------------------------------------------------
[200]	training's multi_logloss: 0.581584
[400]	training's multi_logloss: 0.474093
[600]	training's multi_logloss: 0.397332
[800]	training's multi_logloss: 0.337098
[1000]	training's multi_logloss: 0.286579
[1200]	training's multi_logloss: 0.248434
[1400]	training's multi_logloss: 0.215747
[1600]	training's multi_logloss: 0.188166
[1800]	training's multi_logloss: 0.16432
[2000]	training's multi_logloss: 0.143884
-------------------------------------------------
[200]	training's multi_logloss: 0.581584
[400]	training's multi_loglos

In [55]:
relabeld_train_y[:3]

array([[0.05193172, 0.75973248, 0.1883358 ],
       [0.24692439, 0.57617257, 0.17690304],
       [0.01679196, 0.00626419, 0.97694385]])

In [56]:
relabeled_target = np.argmax(relabeld_train_y, axis=1)

In [57]:
pd.Series(relabeled_target).value_counts()

2    15596
1     5620
0     2884
dtype: int64

# train with relabeled target

In [51]:
lgb = LGBMClassifier(n_estimators=2000)

train_X = train_final.drop(labels=['credit', 'index'], axis=1)
train_y = pd.Series(relabeled_target)
test_X = test_final.drop(labels='index', axis=1)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_val_losses = []
single_sub = np.zeros((test_X.shape[0], 3))

for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
    X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]
                                                  
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            early_stopping_rounds=400, verbose=200)

    single_sub += lgb.predict_proba(test_X) / folds.n_splits
    print('-------------------------------------------')

Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.071963	valid_1's multi_logloss: 0.186712
[400]	training's multi_logloss: 0.0321441	valid_1's multi_logloss: 0.205058
Early stopping, best iteration is:
[164]	training's multi_logloss: 0.0844565	valid_1's multi_logloss: 0.18594
-------------------------------------------
Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.0706168	valid_1's multi_logloss: 0.194518
[400]	training's multi_logloss: 0.0322412	valid_1's multi_logloss: 0.209338
Early stopping, best iteration is:
[160]	training's multi_logloss: 0.0851187	valid_1's multi_logloss: 0.194382
-------------------------------------------
Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.0699552	valid_1's multi_logloss: 0.203137
[400]	training's multi_logloss: 0.0317007	valid_1's multi_logloss: 0.225326
Early stopping, best iteration is:
[138]	training'

In [52]:
submission.iloc[:,[1,2,3]] = single_sub

In [53]:
submission.to_csv('sub/MetaCost_5fold_singleLGB.csv', index=False)