In [1]:
'''

Running one imputation method for 10 iterations may take about 1 hour.

Open question: if test_id is in random order, does that matter?
In theory sorting should be enough, even if it does.
'''
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import multiprocessing
import warnings
import sys
sys.path.append('../py_model')
from utils import init_logging
import logging 
import os
# Hide FutureWarning messages so they do not flood the training logs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# NumPy's global RNG is seeded from the wall clock so every run shuffles
# differently. The value is kept in SEED (instead of being thrown away) so a
# run can be reported and replayed later.
SEED = int(time.time())
np.random.seed(SEED)

NUM_FOLDS = 5            # number of cross-validation folds per iteration
STRATIFIED = True        # use StratifiedKFold instead of plain KFold
TEST_NULL_HYPO = False   # True: train on shuffled TARGET labels (null-hypothesis run)
# How many repeated cross-validation runs are averaged to get a stable AUC score.
ITERATION = (80 if TEST_NULL_HYPO else 10)

In [2]:
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Prints ``"<title> - done in <seconds>s"`` (whole seconds) when the block
    exits. The message is emitted from a ``finally`` clause, so the elapsed
    time is also reported when the block raises; the exception itself still
    propagates to the caller.

    Parameters
    ----------
    title : str
        Label placed at the start of the printed line.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Report even on failure: for multi-hour runs the time-to-crash matters.
        print("{} - done in {:.0f}s".format(title, time.time() - t0))

# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified = False, seed = None):
    '''
    Run one shuffled k-fold cross-validation of LightGBM and score it out-of-fold.

    Rows of `df` whose TARGET is not null form the training set; rows whose
    TARGET is null form the test set. For every fold a fresh LGBMClassifier is
    fitted with early stopping on the validation AUC, and its predictions are
    collected for the held-out rows, the in-fold training rows and the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table that contains a 'TARGET' column. The columns 'TARGET',
        'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV' and 'index' are never used
        as model features.
    num_folds : int
        How many folds the training rows are split into.
    stratified : bool, default False
        Use StratifiedKFold (stratified on TARGET) instead of KFold.
    seed : int or None, default None
        Seed used for the fold shuffle, every classifier and the
        null-hypothesis label shuffle. None keeps the historical behaviour:
        folds and classifiers are seeded from the wall clock and the label
        shuffle draws from NumPy's global RNG.

    Returns
    -------
    feature_importance_df : pandas.DataFrame
        Columns 'feature', 'importance', 'fold'; one row per feature per fold.
    over_folds_val_auc : float
        ROC AUC of the out-of-fold predictions over all training rows.

    Notes
    -----
    Reads the module-level names TEST_NULL_HYPO and CPU_USE_RATE at call time,
    so both must be defined before this function is called.

    Idea kept from the original author: a helper could re-tune the parameters
    whenever new features are added, to keep the experiment reliable, at the
    price of a longer experiment.

    NOTE(review): `silent=`, and the `verbose=` / `early_stopping_rounds=`
    keyword arguments of `fit`, belong to the pre-4.0 LightGBM API; newer
    releases expect callbacks instead. Confirm the installed version before
    upgrading.
    '''
    def _next_seed():
        # One draw per consumer, exactly as before: wall clock unless pinned.
        return int(time.time()) if seed is None else seed

    #---------------------
    # Divide in training/validation and test data
    #---------------------
    has_target = df['TARGET'].notnull()
    train_df = df[has_target]
    test_df = df[~has_target]
    # Sanity check that no row was lost or duplicated by the split.
    logging.info('no bugging in split' if train_df.shape[0] + test_df.shape[0] == df.shape[0] else " opps")
    #---------------------
    # core
    #---------------------
    logging.info("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    # Slice the feature matrices once instead of re-copying them several times per fold.
    train_features = train_df[feats]
    test_features = test_df[feats]
    target = train_df['TARGET']
    if TEST_NULL_HYPO:
        # Null-hypothesis run: break the link between features and labels.
        # The shuffled labels live in their own Series so that nothing is
        # assigned into a filtered slice of the caller's frame.
        target = pd.Series(target.sample(frac = 1.0, random_state = seed).values,
                           index = target.index, name = 'TARGET')
    # The features and labels are held separately from here on.
    del train_df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=_next_seed())
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=_next_seed())

    # Create arrays and dataframes to store results
    n_train = train_features.shape[0]
    oof_preds = np.zeros(n_train)                   # held-out prediction per training row
    train_preds = np.zeros(n_train)                 # mean in-fold prediction per training row
    sub_preds = np.zeros(test_features.shape[0])    # mean prediction per test row
    fold_importances = []

    # Never request zero threads, even on a single-core machine.
    n_threads = max(1, int(multiprocessing.cpu_count() * CPU_USE_RATE))
    if TEST_NULL_HYPO:
        model_params = dict(
            nthread=n_threads,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=127,
            max_depth=8,
            silent=-1,
            verbose=-1,
            )
    else:
        # LightGBM parameters found by Bayesian optimization
        model_params = dict(
            nthread=n_threads,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34, # 20
            colsample_bytree=0.2, #0.9497036 < 0.2
            subsample=0.8715623,
            max_depth=8, # 7
            reg_alpha=0.041545473, # 0.3
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775, # 60
            silent=-1,
            verbose=-1,
            )

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_features, target)):
        train_x, train_y = train_features.iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = train_features.iloc[valid_idx], target.iloc[valid_idx]

        clf = LGBMClassifier(random_state=_next_seed(), **model_params)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= False, early_stopping_rounds= 100) # early_stopping_rounds= 200

        # training/validating
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Every row is in the training part of exactly (n_splits - 1) folds,
        # so that is the divisor that turns the running sum into a mean.
        train_preds[train_idx] += clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1] / (folds.n_splits - 1)
        # testing (skipped when the frame holds no unlabeled rows)
        if test_features.shape[0]:
            sub_preds += clf.predict_proba(test_features, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importances.append(pd.DataFrame(
            {"feature": feats, "importance": clf.feature_importances_, "fold": n_fold + 1},
            columns=["feature", "importance", "fold"]))

        logging.info('Fold %2d val AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Concatenate once instead of growing a frame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    logging.info('Over-folds train AUC score : {}'.format(roc_auc_score(target, train_preds)))

    over_folds_val_auc = roc_auc_score(target, oof_preds)
    logging.info('Over-folds val AUC score : {}'.format(over_folds_val_auc))

    # # Write submission file and plot feature importance
    # test_df.loc[:,'TARGET'] = sub_preds
    # test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)

    return feature_importance_df, over_folds_val_auc


In [3]:
def main(features_dir = '../features/filled_by_knn/', output_path = '../output', min_k = 20):
    '''
    Score every KNN-imputed feature set with repeated k-fold LightGBM.

    For each file in `features_dir` whose name contains ``knn_<K>`` with
    K > `min_k`, the frame is loaded with `pandas.read_hdf`, `kfold_lightgbm`
    is run ITERATION times, the mean and standard deviation of the
    out-of-fold AUC are logged, and the mean feature importance is written to
    `output_path` as CSV.

    Parameters
    ----------
    features_dir : str, default '../features/filled_by_knn/'
        Directory holding the imputed feature files.
    output_path : str, default '../output'
        Directory the importance CSV files are written to (created if missing).
    min_k : int, default 20
        Only feature sets whose K is strictly greater than this are evaluated.

    Notes
    -----
    Directory entries without a ``knn_<digits>`` part are skipped instead of
    raising. The importance table is written twice: under the historical name
    ('feature_importance.csv' or 'feature_importance-null_hypo.csv', which
    each feature set overwrites) and under a name suffixed with the feature
    set, so the results of earlier feature sets are no longer lost.

    Reads the module-level names ITERATION, NUM_FOLDS, STRATIFIED and
    TEST_NULL_HYPO.
    '''
    #--------------------
    # load features
    #--------------------
    import re
    # Captures K; requires at least one digit so int() can never see ''.
    knn_k_pattern = re.compile(r'knn_(\d+)')
    for p in os.listdir(features_dir):
        k_match = knn_k_pattern.search(p)
        if k_match is None or int(k_match.group(1)) <= min_k:
            continue
        feature_set_name = os.path.splitext(p)[0]
        logging.info('loading features: {}'.format(p))
        df = pd.read_hdf(os.path.join(features_dir, p))
        #--------------------
        # out-of-fold validating strategy + LGB
        #--------------------
        with timer("Run LightGBM with kfold"):
            iteration_importances = []
            over_folds_val_auc_list = np.zeros(ITERATION)
            for i in range(ITERATION):
                logging.info('Iteration %i' %i)
                iter_feat_imp, over_folds_val_auc = kfold_lightgbm(df, num_folds= NUM_FOLDS, stratified= STRATIFIED)
                iteration_importances.append(iter_feat_imp)
                over_folds_val_auc_list[i] = over_folds_val_auc
            # Concatenate once instead of growing a frame inside the loop.
            feature_importance_df = pd.concat(iteration_importances, axis=0)

            logging.info('Over-iterations val AUC score : {}'.format(over_folds_val_auc_list.mean()))
            logging.info('Standard deviation : {}'.format(over_folds_val_auc_list.std()))

            importance_by_feature = feature_importance_df[["feature", "importance"]].groupby("feature")
            feature_importance_df_mean = importance_by_feature.mean().sort_values(by="importance", ascending=False)
            #---------------------
            # save
            #---------------------
            os.makedirs(output_path, exist_ok=True)

            base_name = 'feature_importance-null_hypo' if TEST_NULL_HYPO else 'feature_importance'
            output_names = ('{}.csv'.format(base_name),
                            '{}-{}.csv'.format(base_name, feature_set_name))
            for output_name in output_names:
                feature_importance_df_mean.to_csv(os.path.join(output_path, output_name), index = True)

            if not TEST_NULL_HYPO:
                # Features whose median importance over all folds and iterations is zero.
                feature_importance_df_median = importance_by_feature.median().sort_values(by="importance", ascending=False)
                useless_features_df = feature_importance_df_median.loc[feature_importance_df_median['importance'] == 0]
                useless_features_list = useless_features_df.index.tolist()
                logging.info('useless/overfitting features: \'' + '\', \''.join(useless_features_list) + '\'')

        # Release this feature set before the next one is read.
        del df
        gc.collect()


In [5]:
# Feature sets that were already evaluated, given as (K, target mode) pairs.
# The mode is 'w' or 'wo', the infix used in the feature file names.
_trained_k_and_mode = [(65, 'wo'), (95, 'w'), (30, 'wo'), (45, 'w'), (75, 'wo'), (40, 'w')]
already_trained = ['base_featurs_filled_knn_{}_{}_target'.format(k, mode)
                   for k, mode in _trained_k_and_mode]

In [6]:
import re
# List the feature sets with K > 20 that have not been evaluated yet.
# The pattern captures K from names such as "..._knn_65_wo_target.h5"; it
# requires at least one digit, and entries without a match are skipped rather
# than raising IndexError / ValueError.
compiled_k = re.compile(r'knn_(\d+)')
for p in os.listdir('../features/filled_by_knn/'):
    k_match = compiled_k.search(p)
    if k_match is None:
        continue
    # Strip the extension whatever its length (the original assumed 3 characters).
    feature_set_name = os.path.splitext(p)[0]
    if int(k_match.group(1)) > 20 and feature_set_name not in already_trained:
        print(feature_set_name)

base_featurs_filled_knn_35_wo_target
base_featurs_filled_knn_90_wo_target
base_featurs_filled_knn_80_w_target
base_featurs_filled_knn_40_wo_target
base_featurs_filled_knn_65_w_target
base_featurs_filled_knn_50_w_target
base_featurs_filled_knn_70_wo_target
base_featurs_filled_knn_25_wo_target
base_featurs_filled_knn_90_w_target
base_featurs_filled_knn_85_wo_target
base_featurs_filled_knn_55_wo_target
base_featurs_filled_knn_35_w_target
base_featurs_filled_knn_85_w_target
base_featurs_filled_knn_80_wo_target
base_featurs_filled_knn_95_wo_target
base_featurs_filled_knn_30_w_target
base_featurs_filled_knn_45_wo_target
base_featurs_filled_knn_60_w_target
base_featurs_filled_knn_50_wo_target
base_featurs_filled_knn_55_w_target
base_featurs_filled_knn_60_wo_target
base_featurs_filled_knn_75_w_target
base_featurs_filled_knn_70_w_target
base_featurs_filled_knn_25_w_target


In [None]:
# Fraction of the machine's CPU cores given to LightGBM. kfold_lightgbm reads
# this module-level name when it builds each classifier, so it has to be bound
# before main() is called.
CPU_USE_RATE = 0.8
# Directory handed to init_logging (imported from utils); presumably where the
# log files are written - confirm in utils.
log_dir = '../log_imputating_exp'
init_logging(log_dir)
# Time the whole experiment; main() loops over the feature files itself.
with timer("Lightgbm run a score"):
    main()


loading features: base_featurs_filled_knn_65_wo_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.795578
Fold  2 val AUC : 0.793440
Fold  3 val AUC : 0.791083
Fold  4 val AUC : 0.793510
Fold  5 val AUC : 0.794859
Over-folds train AUC score : 0.8882419196707059
Over-folds val AUC score : 0.7936738837479553
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.790206
Fold  2 val AUC : 0.795358
Fold  3 val AUC : 0.791259
Fold  4 val AUC : 0.796197
Fold  5 val AUC : 0.798026
Over-folds train AUC score : 0.8882638420407751
Over-folds val AUC score : 0.7941974738829686
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.791424
Fold  2 val AUC : 0.791561
Fold  3 val AUC : 0.794598
Fold  4 val AUC : 0.792357
Fold  5 val AUC : 0.798179
Over-folds train AUC score : 0.876641

Run LightGBM with kfold - done in 7897s


loading features: base_featurs_filled_knn_95_w_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.796519
Fold  2 val AUC : 0.793272
Fold  3 val AUC : 0.793979
Fold  4 val AUC : 0.795288
Fold  5 val AUC : 0.789979
Over-folds train AUC score : 0.8912352988775412
Over-folds val AUC score : 0.7938032467072538
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.794676
Fold  2 val AUC : 0.791726
Fold  3 val AUC : 0.791217
Fold  4 val AUC : 0.798257
Fold  5 val AUC : 0.793735
Over-folds train AUC score : 0.8853842747072669
Over-folds val AUC score : 0.7938591403292243
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.791988
Fold  2 val AUC : 0.791791
Fold  3 val AUC : 0.796336
Fold  4 val AUC : 0.794331
Fold  5 val AUC : 0.795788
Over-folds train AUC score : 0.8930021

Run LightGBM with kfold - done in 8166s


loading features: base_featurs_filled_knn_30_wo_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.796100
Fold  2 val AUC : 0.798183
Fold  3 val AUC : 0.789831
Fold  4 val AUC : 0.787950
Fold  5 val AUC : 0.794036
Over-folds train AUC score : 0.8863588853802484
Over-folds val AUC score : 0.7932133426923993
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.795292
Fold  2 val AUC : 0.796955
Fold  3 val AUC : 0.788999
Fold  4 val AUC : 0.792898
Fold  5 val AUC : 0.791656
Over-folds train AUC score : 0.8882306999920264
Over-folds val AUC score : 0.7931474573078117
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.788583
Fold  2 val AUC : 0.794593
Fold  3 val AUC : 0.796301
Fold  4 val AUC : 0.791809
Fold  5 val AUC : 0.792180
Over-folds train AUC score : 0.878413

Run LightGBM with kfold - done in 8314s


loading features: base_featurs_filled_knn_45_w_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.791434
Fold  2 val AUC : 0.791309
Fold  3 val AUC : 0.795739
Fold  4 val AUC : 0.793949
Fold  5 val AUC : 0.794295
Over-folds train AUC score : 0.8879075957609408
Over-folds val AUC score : 0.7933277563400715
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.793838
Fold  2 val AUC : 0.798371
Fold  3 val AUC : 0.791686
Fold  4 val AUC : 0.791256
Fold  5 val AUC : 0.794083
Over-folds train AUC score : 0.8946931972915766
Over-folds val AUC score : 0.7938262361117289
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.791930
Fold  2 val AUC : 0.798911
Fold  3 val AUC : 0.792862
Fold  4 val AUC : 0.791037
Fold  5 val AUC : 0.792012
Over-folds train AUC score : 0.8855060

Run LightGBM with kfold - done in 3316s


loading features: base_featurs_filled_knn_75_wo_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.795101
Fold  2 val AUC : 0.790877
Fold  3 val AUC : 0.797865
Fold  4 val AUC : 0.791207
Fold  5 val AUC : 0.794849
Over-folds train AUC score : 0.8902715165802904
Over-folds val AUC score : 0.7939472436273319
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.790608
Fold  2 val AUC : 0.798392
Fold  3 val AUC : 0.794463
Fold  4 val AUC : 0.791736
Fold  5 val AUC : 0.794684
Over-folds train AUC score : 0.8851009914649146
Over-folds val AUC score : 0.7939376742325006
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.792229
Fold  2 val AUC : 0.796503
Fold  3 val AUC : 0.793552
Fold  4 val AUC : 0.794438
Fold  5 val AUC : 0.794433
Over-folds train AUC score : 0.890476

Run LightGBM with kfold - done in 3291s


loading features: base_featurs_filled_knn_40_w_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.794576
Fold  2 val AUC : 0.796330
Fold  3 val AUC : 0.793971
Fold  4 val AUC : 0.788968
Fold  5 val AUC : 0.793659
Over-folds train AUC score : 0.8812123903413922
Over-folds val AUC score : 0.7934930707208901
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.790830
Fold  2 val AUC : 0.791425
Fold  3 val AUC : 0.794240
Fold  4 val AUC : 0.795480
Fold  5 val AUC : 0.795349
Over-folds train AUC score : 0.8892724808228603
Over-folds val AUC score : 0.793432182357605
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.796684
Fold  2 val AUC : 0.794937
Fold  3 val AUC : 0.791620
Fold  4 val AUC : 0.794617
Fold  5 val AUC : 0.790079
Over-folds train AUC score : 0.88445565

In [None]:
# Don't spend time on work with no technical substance; once the experiment here is done, the result is known.