In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix

In [2]:
# Read the training set, the test set and the submission template, in that order.
train, test, submission = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

In [3]:
# Columns compared when searching for duplicated training rows. Neither
# `index` nor `credit` is part of this key, so rows that differ only in those
# two columns count as duplicates of each other.
cols = [
    'gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
    'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
    'family_size', 'begin_month',
]
# Keep the first row of every duplicate group and modify `train` in place.
train.drop_duplicates(subset=cols, keep='first', inplace=True)

In [4]:
# Apply the same unit conversion to both data sets.
for frame in (train, test):
    # Divide the day count by 365, round to whole years and flip the sign.
    frame['DAYS_BIRTH'] = -(frame['DAYS_BIRTH'] / 365).round()

    # Same conversion without rounding; values that come out negative after
    # the sign flip are replaced by 0.
    years_employed = -(frame['DAYS_EMPLOYED'] / 365)
    frame['DAYS_EMPLOYED'] = years_employed.mask(years_employed < 0, 0)

In [5]:
# For now missing occupations are filled with the placeholder 'none'; a more
# sophisticated imputation should definitely be done later.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection: an in-place call on
# `frame[col]` operates on an intermediate object and is not guaranteed to
# update the frame (it never does under pandas Copy-on-Write).
train['occyp_type'] = train['occyp_type'].fillna('none')
test['occyp_type'] = test['occyp_type'].fillna('none')

In [6]:
# Keep only the training rows whose child_num is below 10
# (presumably outlier removal -- the test set is left untouched).
few_children = train['child_num'].lt(10)
train = train[few_children]

In [7]:
# Features treated as categorical (one-hot encoded further down).
categoric_cols = [
    'gender',
    'car',
    'reality',
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'FLAG_MOBIL',
    'work_phone',
    'phone',
    'email',
    'occyp_type',
]

# Features treated as numeric (standardised further down).
numeric_cols = [
    'child_num',
    'income_total',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'family_size',
    'begin_month',
]

In [8]:
# One-hot encode the categorical features.
# Both frames are encoded against the category levels observed in `train`.
# Encoding them independently lets each frame derive its own levels, so a
# level missing from one of them would produce different dummy columns (and
# make drop_first remove a different reference level), leaving the test
# features misaligned with what the models were trained on.
# A test value that never occurs in `train` becomes NaN under the shared dtype
# and is therefore encoded as all zeros for that feature.
category_dtypes = {
    col: train[col].astype('category').dtype for col in categoric_cols
}
train_OH = pd.get_dummies(train.astype(category_dtypes),
                          columns=categoric_cols, drop_first=True)
test_OH = pd.get_dummies(test.astype(category_dtypes),
                         columns=categoric_cols, drop_first=True)

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the numeric training columns only, then apply the fitted
# transformation to both the training and the test frame.
scaler = StandardScaler().fit(train_OH[numeric_cols])
train_OH_scaled, test_OH_scaled = (
    scaler.transform(encoded[numeric_cols]) for encoded in (train_OH, test_OH)
)

In [10]:
# Final training frame: the one-hot encoded data with the numeric columns
# replaced by their scaled values (the copy leaves train_OH untouched).
train_final = train_OH.copy()
train_final[numeric_cols] = train_OH_scaled

# Same construction for the test frame.
test_final = test_OH.copy()
test_final[numeric_cols] = test_OH_scaled

In [11]:
# Preview the first five rows of the prepared training frame.
train_final.head(n=5)

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,family_size,begin_month,credit,gender_M,car_Y,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff,occyp_type_none
0,0,-0.591186,0.15635,-0.495717,1.062968,-0.219251,1.212203,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0.786589,0.601656,-1.102109,-0.275762,0.891057,1.272701,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,-0.591186,2.605532,0.717066,0.946796,-0.219251,0.244238,2.0,1,1,...,0,1,0,0,0,0,0,0,0,0
3,3,-0.591186,0.15635,-0.235835,-0.042572,-0.219251,-0.663229,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,-0.591186,-0.288956,-0.235835,-0.03708,-0.219251,0.002247,2.0,0,1,...,0,1,0,0,0,0,0,0,0,0


# Single LGB

In [113]:
# Single multiclass LightGBM model evaluated with 5-fold stratified CV.
# The test-set class probabilities of the five fold models are averaged into
# `single_sub`; the held-out log loss of every fold is stored in
# `fold_val_losses` and summarised once the loop has finished.
# NOTE(review): the `early_stopping_rounds` / `verbose` arguments of fit() were
# removed in LightGBM 4.0 -- move to the early_stopping / log_evaluation
# callbacks when upgrading.
lgb = LGBMClassifier(n_estimators=2000)

train_X = train_final.drop(labels=['credit', 'index'], axis=1)
train_y = train_final['credit']
test_X = test_final.drop(labels='index', axis=1)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_val_losses = []  # held-out multi-class log loss, one entry per fold
single_sub = np.zeros((test_X.shape[0], 3))

for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
    X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            early_stopping_rounds=400, verbose=200)

    # Score the fitted fold model on its held-out part so the CV result is
    # actually recorded (the list used to stay empty).
    fold_val_losses.append(
        log_loss(y_val, lgb.predict_proba(X_val), labels=lgb.classes_)
    )

    # Each fold contributes an equal share to the averaged test prediction.
    single_sub += lgb.predict_proba(test_X) / folds.n_splits
    print('-------------------------------------------')

print(f'CV multi_logloss: {np.mean(fold_val_losses):.6f} '
      f'(std {np.std(fold_val_losses):.6f})')

Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.561225	valid_1's multi_logloss: 0.750397
[400]	training's multi_logloss: 0.440716	valid_1's multi_logloss: 0.750174
[600]	training's multi_logloss: 0.359317	valid_1's multi_logloss: 0.762936
Early stopping, best iteration is:
[306]	training's multi_logloss: 0.491653	valid_1's multi_logloss: 0.747003
-------------------------------------------
Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.554812	valid_1's multi_logloss: 0.747485
[400]	training's multi_logloss: 0.432458	valid_1's multi_logloss: 0.746893
[600]	training's multi_logloss: 0.350252	valid_1's multi_logloss: 0.760262
Early stopping, best iteration is:
[287]	training's multi_logloss: 0.494078	valid_1's multi_logloss: 0.742806
-------------------------------------------
Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.558059	valid_1's multi

In [114]:
# Number of test rows per most-probable class of the single-model prediction.
pd.Series(single_sub.argmax(axis=1)).value_counts()

2    8825
1     973
0     202
dtype: int64

In [101]:
# Same 5-fold stratified CV as the single model, but every training row is
# weighted by its class through `sample_weight_dic`.
# The averaged test probabilities go into `sub`; the held-out log loss of every
# fold is stored in `fold_val_losses` and summarised after the loop.
# NOTE(review): the `early_stopping_rounds` / `verbose` arguments of fit() were
# removed in LightGBM 4.0 -- move to the early_stopping / log_evaluation
# callbacks when upgrading.
lgb = LGBMClassifier(n_estimators=2000)
sample_weight_dic = {2:1, 1:1.5, 0:2}

train_X = train_final.drop(labels=['credit', 'index'], axis=1)
train_y = train_final['credit']
test_X = test_final.drop(labels='index', axis=1)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_val_losses = []  # held-out multi-class log loss, one entry per fold
sub = np.zeros((test_X.shape[0], 3))

for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
    X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    # sample weight: per-class weight looked up for every training label
    sample_weight = [sample_weight_dic[label] for label in y_train.values]

    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            sample_weight=sample_weight,
            early_stopping_rounds=400, verbose=200)

    # Score the fitted fold model on its held-out part so the CV result is
    # actually recorded (the list used to stay empty). The loss is left
    # unweighted, like the validation loss LightGBM logs above.
    fold_val_losses.append(
        log_loss(y_val, lgb.predict_proba(X_val), labels=lgb.classes_)
    )

    # Each fold contributes an equal share to the averaged test prediction.
    sub += lgb.predict_proba(test_X) / folds.n_splits
    print('-------------------------------------------')

print(f'CV multi_logloss: {np.mean(fold_val_losses):.6f} '
      f'(std {np.std(fold_val_losses):.6f})')

Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.645741	valid_1's multi_logloss: 0.772742
[400]	training's multi_logloss: 0.504058	valid_1's multi_logloss: 0.764042
[600]	training's multi_logloss: 0.403556	valid_1's multi_logloss: 0.771718
Early stopping, best iteration is:
[367]	training's multi_logloss: 0.523582	valid_1's multi_logloss: 0.763749
-------------------------------------------
Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.642402	valid_1's multi_logloss: 0.771047
[400]	training's multi_logloss: 0.494341	valid_1's multi_logloss: 0.761389
[600]	training's multi_logloss: 0.39568	valid_1's multi_logloss: 0.770029
Early stopping, best iteration is:
[324]	training's multi_logloss: 0.539934	valid_1's multi_logloss: 0.760105
-------------------------------------------
Training until validation scores don't improve for 400 rounds
[200]	training's multi_logloss: 0.644638	valid_1's multi_

In [102]:
# Number of test rows per most-probable class of the weighted-model prediction.
pd.Series(sub.argmax(axis=1)).value_counts()

2    8243
1    1295
0     462
dtype: int64

# OVO + Prob Calibration

In [25]:
# Store the target as 8-bit integers (it is displayed as 0.0 / 1.0 / 2.0 above).
train_final = train_final.astype({'credit': 'int8'})

In [26]:
# Class distribution of the target, most frequent class first.
train_final['credit'].value_counts(sort=True, ascending=False)

2    15543
1     5692
0     2865
Name: credit, dtype: int64

- weight는 {2:1, 1:1.5, 0:2}

In [41]:
from sklearn.isotonic import IsotonicRegression

In [116]:
# One-vs-one ensemble with probability calibration.
# For each of the three class pairs a binary LightGBM model is trained with
# 5-fold stratified CV on the rows of those two classes. Its probabilities are
# calibrated with an isotonic regression fitted on the validation fold, and the
# calibrated test probabilities are accumulated into `sub`.
test_X = test_final.drop(labels='index', axis=1)
sub = np.zeros((test_X.shape[0], 3))

lgb = LGBMClassifier(n_estimators=2000)
sample_weight_dic = {2:1, 1:1.5, 0:2}

for c in [[0,1], [0,2], [1,2]]:
    print(f'OVO with class : {c}')
    # Restrict the training data to the two classes of the current pair.
    in_pair = train_final['credit'].isin(c)
    sub_train = train_final.loc[in_pair, :]
    train_X = sub_train.drop(labels=['credit', 'index'], axis=1)
    train_y = sub_train['credit']

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Every (pair, fold) model gets the same share of the final average:
    # number of folds times the three class pairs.
    share = folds.n_splits*3

    for n_fold, (train_index, val_index) in enumerate(folds.split(train_X, train_y)):
        X_train, y_train = train_X.iloc[train_index], train_y.iloc[train_index]
        X_val, y_val = train_X.iloc[val_index], train_y.iloc[val_index]

        # sample weight: per-class weight looked up for every training label
        sample_weight = [sample_weight_dic[i] for i in y_train.values]

        # model fit
        lgb.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                sample_weight=sample_weight,
                early_stopping_rounds=400, verbose=200)

        # calibration
        # Column 1 holds the probability of the second label of the pair,
        # e.g. for the pair (0, 1) it is the probability of label = 1.
        prob_val = lgb.predict_proba(X_val)[:, 1]
        # Recode the validation labels to 0/1: first label of the pair -> 0,
        # the other one -> 1.
        binary_val = np.where(y_val.values == c[0], 0, 1)
        iso_reg = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        iso_reg.fit(prob_val, binary_val)
        calibrated_test_prob = iso_reg.predict(lgb.predict_proba(test_X)[:, 1])

        # test predict: the calibrated probability goes to the second label of
        # the pair and its complement to the first one.
        sub[:, c[1]] += calibrated_test_prob / share
        sub[:, c[0]] += (1-calibrated_test_prob) / share
    print('---------------------------------------------------------')

OVO with class : [0, 1]
Training until validation scores don't improve for 400 rounds
[200]	training's binary_logloss: 0.3776	valid_1's binary_logloss: 0.565685
[400]	training's binary_logloss: 0.2637	valid_1's binary_logloss: 0.568011
[600]	training's binary_logloss: 0.193913	valid_1's binary_logloss: 0.582342
Early stopping, best iteration is:
[258]	training's binary_logloss: 0.337084	valid_1's binary_logloss: 0.561032
Training until validation scores don't improve for 400 rounds
[200]	training's binary_logloss: 0.377634	valid_1's binary_logloss: 0.571399
[400]	training's binary_logloss: 0.258976	valid_1's binary_logloss: 0.575004
[600]	training's binary_logloss: 0.186804	valid_1's binary_logloss: 0.593741
Early stopping, best iteration is:
[277]	training's binary_logloss: 0.324011	valid_1's binary_logloss: 0.569001
Training until validation scores don't improve for 400 rounds
[200]	training's binary_logloss: 0.375862	valid_1's binary_logloss: 0.580895
[400]	training's binary_logloss

sample_weight_dic = {2:1, 1:1.5, 0:2} 의 경우 test 예측값

In [117]:
# Number of test rows per most-probable class of the OVO prediction.
pd.Series(sub.argmax(axis=1)).value_counts()

2    8991
1     875
0     134
dtype: int64

In [108]:
# Copy the three class-probability columns of `sub` into the submission frame.
for class_idx, class_col in enumerate(('0', '1', '2')):
    submission[class_col] = sub[:, class_idx]

submission.head()

Unnamed: 0,index,0,1,2
0,26457,0.15058,0.242412,0.607009
1,26458,0.248737,0.224398,0.526865
2,26459,0.140464,0.263335,0.596201
3,26460,0.231147,0.182914,0.585939
4,26461,0.141697,0.300582,0.557721


In [109]:
# Write the OVO + calibration submission. to_csv does not create missing
# parent directories, so make sure the output folder exists first.
os.makedirs('sub', exist_ok=True)
submission.to_csv('sub/OVO(lgb+sampleWeight)+Calibration.csv', index=False)

# Single + OVO

In [130]:
# Number of test rows per most-probable class after averaging the
# single-model and the OVO predictions.
pd.Series(((single_sub + sub) / 2).argmax(axis=1)).value_counts()

2    8878
1     947
0     175
dtype: int64

In [131]:
# Equal-weight average of the single-model and the OVO class probabilities,
# written column by column into the submission frame.
blended = (single_sub + sub) / 2
for class_idx, class_col in enumerate(('0', '1', '2')):
    submission[class_col] = blended[:, class_idx]

submission.head()

Unnamed: 0,index,0,1,2
0,26457,0.102338,0.161347,0.736315
1,26458,0.201909,0.184521,0.613569
2,26459,0.089675,0.177371,0.732954
3,26460,0.16955,0.158331,0.672119
4,26461,0.107801,0.24091,0.651289


In [132]:
# Write the blended submission. to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('sub', exist_ok=True)
submission.to_csv('sub/Single_mean_OVO+Calibration.csv', index=False)