In [4]:
'''

One imputation method, 10 iterations may take 1 hrs.

test_id 如果是隨機的, 會有引響麼, 理論上只要sort就好, 就算有?
'''
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import multiprocessing
import warnings
import sys
sys.path.append('../py_model')
from utils import init_logging
import logging 
import os
warnings.simplefilter(action='ignore', category=FutureWarning)
np.random.seed(int(time.time()))

NUM_FOLDS = 5
STRATIFIED = True  
TEST_NULL_HYPO = False
ITERATION = (80 if TEST_NULL_HYPO else 10) # It means how many iterations need to get the final stable AUC score.

In [5]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified = False):
    '''
    num_folds: int, how many foles u'r going to split.

    Maybe we can write a helper function, to find a best parametres each time when u add a new features, to make sure reliability of experiment.
    But, the experiement time will go up more.
    '''
    #---------------------
    # Divide in training/validation and test data
    #---------------------
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    # 
    logging.info('no bugging in split' if train_df.shape[0] + test_df.shape[0] == df.shape[0] else " opps")
    #---------------------
    # core
    #---------------------
    logging.info("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=int(time.time()))
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=int(time.time()))
    # Create arrays and dataframes to store results
    # train
    oof_preds = np.zeros(train_df.shape[0])
    train_preds = np.zeros(train_df.shape[0])
    # test
    sub_preds = np.zeros(test_df.shape[0])
    # feature importance
    feature_importance_df = pd.DataFrame()

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    if TEST_NULL_HYPO:
        train_df['TARGET'] = train_df['TARGET'].copy().sample(frac = 1.0).values
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        if TEST_NULL_HYPO:
            clf = LGBMClassifier(
                nthread=int(multiprocessing.cpu_count()*CPU_USE_RATE),
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=127,
                max_depth=8,
                silent=-1,
                verbose=-1,
                random_state=int(time.time()),
                )
        else:
            clf = LGBMClassifier(
                nthread=int(multiprocessing.cpu_count()*CPU_USE_RATE),
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=34, # 20
                colsample_bytree=0.2, #0.9497036 < 0.2
                subsample=0.8715623,
                max_depth=8, # 7
                reg_alpha=0.041545473, # 0.3
                reg_lambda=0.0735294,
                min_split_gain=0.0222415,
                min_child_weight=39.3259775, # 60
                silent=-1,
                verbose=-1,
                random_state=int(time.time()),
                )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= False, early_stopping_rounds= 100) # early_stopping_rounds= 200
        # training/validating
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        train_preds[train_idx] += clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        # testing
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        logging.info('Fold %2d val AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    logging.info('Over-folds train AUC score : {}'.format(roc_auc_score(train_df['TARGET'], train_preds)))
    
    over_folds_val_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    logging.info('Over-folds val AUC score : {}'.format(over_folds_val_auc))
    
    # # Write submission file and plot feature importance
    # test_df.loc[:,'TARGET'] = sub_preds
    # test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)

    return feature_importance_df, over_folds_val_auc


In [3]:
def main():
    checked_already = ['normalized_knn_65_descrete.h5', 
                       'normalized_knn_65_cluster1.h5', 
                       'base_featurs_filled_knn_k_65_by_cluster_2.h5',
                       'base_featurs_filled_knn_k_65_by_cluster_2+1.h5',
                       'base_featurs_filled_knn_k_65_corr_0_95.h5',
                       'base_featurs_filled_knn_k_65_corr_0_95_with_10_fold.h5',
                       'base_featurs_filled_knn_k_65_by_cluster_0.h5',
                       'base_featurs_filled_knn_k_65_by_cluster_1.h5',
                       'normalized_knn_65_similar_features_2.h5',
                       'normalized_knn_65_similar_features_3.h5',
                       'normalized_knn_65_similar_features_7.h5',
                       'normalized_knn_65_similar_features_19.h5',
                       'normalized_knn_65_similar_features_36.h5']
    for i in os.listdir('../features/filled_by_knn/'):
        if 'target' not in i:
            if i not in checked_already:
                #--------------------
                # load features
                #--------------------
                df = pd.read_hdf('../features/filled_by_knn/{}'.format(i))
                logging.info('loading features: {}'.format(i))
                #--------------------
                # out-of-fold validating stratigy + LGB
                #--------------------    
                with timer("Run LightGBM with kfold"):
                    feature_importance_df = pd.DataFrame()
                    over_folds_val_auc_list = np.zeros(ITERATION)
                    for i in range(ITERATION):
                        logging.info('Iteration %i' %i)
                        iter_feat_imp, over_folds_val_auc = kfold_lightgbm(df, num_folds= NUM_FOLDS, stratified= STRATIFIED)
                        feature_importance_df = pd.concat([feature_importance_df, iter_feat_imp], axis=0)
                        over_folds_val_auc_list[i] = over_folds_val_auc

                    logging.info('Over-iterations val AUC score : {}'.format(over_folds_val_auc_list.mean()))
                    logging.info('Standard deviation : {}'.format(over_folds_val_auc_list.std()))

                    # display_importances(feature_importance_df)
                    feature_importance_df_median = feature_importance_df[["feature", "importance"]].groupby("feature").median().sort_values(by="importance", ascending=False)
                    useless_features_df = feature_importance_df_median.loc[feature_importance_df_median['importance'] == 0]
                    feature_importance_df_mean = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)
                    #---------------------
                    # save
                    #---------------------
                    output_path = '../output'
                    if not os.path.isdir(output_path):
                        os.mkdir(output_path)

                    if TEST_NULL_HYPO:
                        feature_importance_df_mean.to_csv(os.path.join(output_path, 'feature_importance-null_hypo.csv'), index = True)
                    else:
                        feature_importance_df_mean.to_csv(os.path.join(output_path, 'feature_importance.csv'), index = True)
                        useless_features_list = useless_features_df.index.tolist()
                        logging.info('useless/overfitting features: \'' + '\', \''.join(useless_features_list) + '\'')


In [6]:
checked_already = ['normalized_knn_65_descrete.h5', 
                   'normalized_knn_65_cluster1.h5', 
                   'base_featurs_filled_knn_k_65_by_cluster_2.h5',
                   'base_featurs_filled_knn_k_65_by_cluster_2+1.h5',
                   'base_featurs_filled_knn_k_65_corr_0_95.h5',
                   'base_featurs_filled_knn_k_65_corr_0_95_with_10_fold.h5',
                   'base_featurs_filled_knn_k_65_by_cluster_0.h5',
                   'base_featurs_filled_knn_k_65_by_cluster_1.h5',
                   'normalized_knn_65_similar_features_2.h5',
                   'normalized_knn_65_similar_features_3.h5',
                   'normalized_knn_65_similar_features_7.h5',
                   'normalized_knn_65_similar_features_19.h5',
                   'normalized_knn_65_similar_features_36.h5']
for i in os.listdir('../features/filled_by_knn/'):
    if 'target' not in i:
        if i not in checked_already:
            print (i)

normalized_knn_65_cluster_1_2_3.h5
normalized_knn_65_similar_features_th_09_1.h5
normalized_knn_65_similar_features_th_09_2.h5
normalized_knn_65_cluster3.h5
normalized_knn_65_continue.h5
normalized_knn_65_cluster2.h5
normalized_knn_65_similar_features_th_09_5.h5


In [5]:
CPU_USE_RATE = 0.4
log_dir = '../log_imputating_exp' # +了11個補完值得new feautures的實驗..
init_logging(log_dir)
with timer("Lightgbm run a score"):
    main()


loading features: normalized_knn_65_cluster_1_2_3.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.798220
Fold  2 val AUC : 0.795830
Fold  3 val AUC : 0.794852
Fold  4 val AUC : 0.793078
Fold  5 val AUC : 0.795101
Over-folds train AUC score : 0.8882852619584785
Over-folds val AUC score : 0.7954091609905474
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.797739
Fold  2 val AUC : 0.794711
Fold  3 val AUC : 0.794917
Fold  4 val AUC : 0.795766
Fold  5 val AUC : 0.794310
Over-folds train AUC score : 0.879977560072644
Over-folds val AUC score : 0.7954692654654422
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.793083
Fold  2 val AUC : 0.793756
Fold  3 val AUC : 0.797186
Fold  4 val AUC : 0.795959
Fold  5 val AUC : 0.797842
Over-folds train AUC score : 0.880963708767

Run LightGBM with kfold - done in 4258s


loading features: normalized_knn_65_similar_features_th_09_1.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.793959
Fold  2 val AUC : 0.799062
Fold  3 val AUC : 0.796857
Fold  4 val AUC : 0.794279
Fold  5 val AUC : 0.797158
Over-folds train AUC score : 0.8881557855697748
Over-folds val AUC score : 0.7962477638215673
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.791157
Fold  2 val AUC : 0.800581
Fold  3 val AUC : 0.794645
Fold  4 val AUC : 0.794614
Fold  5 val AUC : 0.798313
Over-folds train AUC score : 0.8824079146137066
Over-folds val AUC score : 0.7958441111752667
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.798298
Fold  2 val AUC : 0.797533
Fold  3 val AUC : 0.793831
Fold  4 val AUC : 0.793106
Fold  5 val AUC : 0.797053
Over-folds train AUC score : 0.

Run LightGBM with kfold - done in 4783s


loading features: normalized_knn_65_similar_features_th_09_2.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.798301
Fold  2 val AUC : 0.796317
Fold  3 val AUC : 0.797297
Fold  4 val AUC : 0.795294
Fold  5 val AUC : 0.792446
Over-folds train AUC score : 0.880317688546978
Over-folds val AUC score : 0.7959165113406997
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.793717
Fold  2 val AUC : 0.795960
Fold  3 val AUC : 0.793958
Fold  4 val AUC : 0.798575
Fold  5 val AUC : 0.794832
Over-folds train AUC score : 0.8813241630504096
Over-folds val AUC score : 0.7953831031496589
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.796364
Fold  2 val AUC : 0.794413
Fold  3 val AUC : 0.799979
Fold  4 val AUC : 0.794417
Fold  5 val AUC : 0.792866
Over-folds train AUC score : 0.8

Run LightGBM with kfold - done in 4272s


loading features: normalized_knn_65_cluster3.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.799066
Fold  2 val AUC : 0.793820
Fold  3 val AUC : 0.795745
Fold  4 val AUC : 0.795154
Fold  5 val AUC : 0.795044
Over-folds train AUC score : 0.8860713814525238
Over-folds val AUC score : 0.7957703635368978
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.799998
Fold  2 val AUC : 0.792174
Fold  3 val AUC : 0.795889
Fold  4 val AUC : 0.791341
Fold  5 val AUC : 0.802331
Over-folds train AUC score : 0.8863200698662436
Over-folds val AUC score : 0.7963658057852174
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.796976
Fold  2 val AUC : 0.801926
Fold  3 val AUC : 0.791392
Fold  4 val AUC : 0.791411
Fold  5 val AUC : 0.796374
Over-folds train AUC score : 0.885796858066747


Run LightGBM with kfold - done in 4299s


loading features: normalized_knn_65_continue.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (0, 280), test shape: (356251, 280)


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [3]:
pd.read_hdf('../features/filled_by_knn/normalized_knn_65_continue.h5')

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_REQ_CREDIT_BUREAU_QRT,APARTMENTS_MEDI,CODE_GENDER,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,DAYS_LAST_PHONE_CHANGE,...,CC_AMT_TOTAL_RECEIVABLE_SUM,CC_AMT_TOTAL_RECEIVABLE_VAR,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_ATM_CURRENT_VAR,CC_CNT_DRAWINGS_CURRENT_MAX,CC_CNT_DRAWINGS_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_VAR,CC_CNT_DRAWINGS_POS_CURRENT_MAX,CC_NAME_CONTRACT_STATUS_Active_MIN,TARGET
0,-0.184970,-0.454492,-0.483578,0.0,-0.851820,0,1.509654,0.753785,0.581110,-0.186114,...,-0.476829,-0.297476,0.240355,0.114094,-0.012236,0.081407,0.039778,0.077818,,
1,0.561524,1.770417,1.643090,0.0,-0.198047,1,-0.166037,0.517756,1.786060,0.180324,...,-0.264774,0.403508,0.107738,-0.022909,-0.148271,0.091701,-0.005622,0.066681,,
2,-1.403369,-1.135827,-1.073636,0.0,-0.155059,0,-0.689347,0.930272,0.310343,0.195892,...,-0.457041,-0.322280,-0.109949,-0.143619,-0.350775,-0.204116,-0.154654,-0.224310,,
3,0.153457,-0.690089,-0.631092,,-0.045780,1,-0.679940,-0.275150,0.372270,0.432998,...,-0.659841,-0.379177,-0.227304,-0.184378,-0.637578,-0.414112,-0.228381,-0.257743,1.0,
4,-0.377397,-0.187568,-0.041034,0.0,-0.106750,0,-0.892614,-0.274722,-0.300367,-0.152584,...,-0.094759,-0.238434,-0.032183,0.013835,-0.294117,-0.193468,-0.138167,-0.344844,,
5,0.006235,-0.244024,-0.200842,1.0,-0.036000,0,-0.206415,0.346409,1.663523,-1.865022,...,-0.273708,-0.165489,0.044619,0.067550,-0.232746,-0.148525,-0.135246,-0.140970,,
6,0.941796,2.440780,2.368370,1.0,-0.069052,1,0.519243,-0.314131,1.569973,-0.698648,...,-0.273761,-0.219518,-0.284456,-0.188040,-0.314220,-0.136894,-0.106954,-0.031309,,
7,0.994332,2.363700,2.737156,0.0,0.027107,0,-0.644380,0.834318,0.410481,-0.109473,...,-0.415351,-0.253685,-0.298381,-0.178188,-0.338079,-0.259836,-0.166678,-0.119814,,
8,0.434461,1.083324,1.053032,0.0,-0.069140,1,-0.930927,-0.011739,-0.337260,1.171862,...,0.573799,-0.063739,-0.578824,-0.273440,-0.259662,-0.399323,-0.225062,-0.552960,1.0,
9,-0.487050,-0.458500,-0.336063,,0.001797,0,0.360713,0.161783,-0.652168,-0.831571,...,-0.331595,-0.210092,0.144360,0.052782,0.036266,-0.034728,0.009945,0.054250,,
