In [8]:
import os
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import multiprocessing
import warnings
import sys
sys.path.append('../py_model/')
from utils import init_logging
import logging 

warnings.simplefilter(action='ignore', category=FutureWarning)
np.random.seed(int(time.time()))

CPU_USE_RATE = 0.5
NUM_FOLDS = 5
STRATIFIED = True  
TEST_NULL_HYPO = False
ITERATION = (80 if TEST_NULL_HYPO else 10) # It means how many iterations need to get the final stable AUC score.

In [17]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

def kfold_lightgbm(df, num_folds, stratified = False):
    '''
    num_folds: int, how many foles u'r going to split.

    Maybe we can write a helper function, to find a best parametres each time when u add a new features, to make sure reliability of experiment.
    But, the experiement time will go up more.
    '''
    #---------------------
    # Divide in training/validation and test data
    #---------------------
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    # test_id 如果是隨機的, 會有引響麼, 理論上只要sort就好, 就算有
    logging.info('no bugging in split' if train_df.shape[0] + test_df.shape[0] == df.shape[0] else " opps")
    #---------------------
    # core
    #---------------------
    logging.info("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=int(time.time()))
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=int(time.time()))
    # Create arrays and dataframes to store results
    # train
    oof_preds = np.zeros(train_df.shape[0])
    train_preds = np.zeros(train_df.shape[0])
    # test
    sub_preds = np.zeros(test_df.shape[0])
    # feature importance
    feature_importance_df = pd.DataFrame()

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    if TEST_NULL_HYPO:
        train_df['TARGET'] = train_df['TARGET'].copy().sample(frac = 1.0).values
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        if TEST_NULL_HYPO:
            clf = LGBMClassifier(
                nthread=int(multiprocessing.cpu_count()*CPU_USE_RATE),
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=127,
                max_depth=8,
                silent=-1,
                verbose=-1,
                random_state=int(time.time()),
                )
        else:
            clf = LGBMClassifier(
                nthread=int(multiprocessing.cpu_count()*CPU_USE_RATE),
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=34, # 20
                colsample_bytree=0.2, #0.9497036 < 0.2
                subsample=0.8715623,
                max_depth=8, # 7
                reg_alpha=0.041545473, # 0.3
                reg_lambda=0.0735294,
                min_split_gain=0.0222415,
                min_child_weight=39.3259775, # 60
                silent=-1,
                verbose=-1,
                random_state=int(time.time()),
                )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= False, early_stopping_rounds= 100) # early_stopping_rounds= 200
        # training/validating
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        train_preds[train_idx] += clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        # testing
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        logging.info('Fold %2d val AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    logging.info('Over-folds train AUC score : {}'.format(roc_auc_score(train_df['TARGET'], train_preds)))
    
    over_folds_val_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    logging.info('Over-folds val AUC score : {}'.format(over_folds_val_auc))
    
    # # Write submission file and plot feature importance
    # test_df.loc[:,'TARGET'] = sub_preds
    # test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)

    return feature_importance_df, over_folds_val_auc

def main():
    #--------------------
    # load features
    #--------------------
    input_dir = "../features/filled_by_knn/"
    for f_p in os.listdir(input_dir):
        df = pd.read_hdf(os.path.join(input_dir, f_p))
        logging.info('loading features: {}'.format(f_p))
        #--------------------
        # out-of-fold validating stratigy + LGB
        #--------------------    
        with timer("Run LightGBM with kfold"):
            feature_importance_df = pd.DataFrame()
            over_folds_val_auc_list = np.zeros(ITERATION)
            for i in range(ITERATION):
                logging.info('Iteration %i' %i)
                iter_feat_imp, over_folds_val_auc = kfold_lightgbm(df, num_folds= NUM_FOLDS, stratified= STRATIFIED)
                feature_importance_df = pd.concat([feature_importance_df, iter_feat_imp], axis=0)
                over_folds_val_auc_list[i] = over_folds_val_auc

            logging.info('Over-iterations val AUC score : {}'.format(over_folds_val_auc_list.mean()))
            logging.info('Standard deviation : {}'.format(over_folds_val_auc_list.std()))

            # display_importances(feature_importance_df)
            feature_importance_df_median = feature_importance_df[["feature", "importance"]].groupby("feature").median().sort_values(by="importance", ascending=False)
            useless_features_df = feature_importance_df_median.loc[feature_importance_df_median['importance'] == 0]
            feature_importance_df_mean = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)
            #---------------------
            # save
            #---------------------
            output_path = '../output'
            if not os.path.isdir(output_path):
                os.mkdir(output_path)

            if TEST_NULL_HYPO:
                feature_importance_df_mean.to_csv(os.path.join(output_path, 'feature_importance-null_hypo.csv'), index = True)
            else:
                feature_importance_df_mean.to_csv(os.path.join(output_path, 'feature_importance.csv'), index = True)
                useless_features_list = useless_features_df.index.tolist()
                logging.info('useless/overfitting features: \'' + '\', \''.join(useless_features_list) + '\'')



In [None]:
with timer("Lightgbm run a score"):
    main()


loading features: base_featurs_filled_knn_10_w_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.740681
Fold  2 val AUC : 0.749132
Fold  3 val AUC : 0.744320
Fold  4 val AUC : 0.730504
Fold  5 val AUC : 0.745857
Over-folds train AUC score : 0.7643825539076633
Over-folds val AUC score : 0.7402420698934183
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.738166
Fold  2 val AUC : 0.742753
Fold  3 val AUC : 0.741713
Fold  4 val AUC : 0.733229
Fold  5 val AUC : 0.746878
Over-folds train AUC score : 0.7607863036814546
Over-folds val AUC score : 0.7404326337168636
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.737450
Fold  2 val AUC : 0.749093
Fold  3 val AUC : 0.747489
Fold  4 val AUC : 0.737776
Fold  5 val AUC : 0.742487
Over-folds train AUC score : 0.7624675

Run LightGBM with kfold - done in 367s


loading features: base_featurs_filled_knn_5_wo_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.740996
Fold  2 val AUC : 0.731331
Fold  3 val AUC : 0.743621
Fold  4 val AUC : 0.736635
Fold  5 val AUC : 0.736548
Over-folds train AUC score : 0.7600130107517895
Over-folds val AUC score : 0.7375065877582754
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.740080
Fold  2 val AUC : 0.740224
Fold  3 val AUC : 0.744751
Fold  4 val AUC : 0.752092
Fold  5 val AUC : 0.744479
Over-folds train AUC score : 0.7613412717957149
Over-folds val AUC score : 0.7430780719990728
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.749928
Fold  2 val AUC : 0.730736
Fold  3 val AUC : 0.736194
Fold  4 val AUC : 0.744315
Fold  5 val AUC : 0.744674
Over-folds train AUC score : 0.7616927

Run LightGBM with kfold - done in 366s


loading features: base_featurs_filled_knn_5_w_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.738218
Fold  2 val AUC : 0.746455
Fold  3 val AUC : 0.740304
Fold  4 val AUC : 0.746535
Fold  5 val AUC : 0.732652
Over-folds train AUC score : 0.7621957927907818
Over-folds val AUC score : 0.7389209342396372
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.742792
Fold  2 val AUC : 0.730905
Fold  3 val AUC : 0.731653
Fold  4 val AUC : 0.742173
Fold  5 val AUC : 0.709929
Over-folds train AUC score : 0.7578021158901823
Over-folds val AUC score : 0.7313890376165467
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.738399
Fold  2 val AUC : 0.742145
Fold  3 val AUC : 0.737875
Fold  4 val AUC : 0.736545
Fold  5 val AUC : 0.751529
Over-folds train AUC score : 0.76034874

Run LightGBM with kfold - done in 364s


loading features: base_featurs_filled_knn_15_wo_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.746556
Fold  2 val AUC : 0.741693
Fold  3 val AUC : 0.750720
Fold  4 val AUC : 0.745365
Fold  5 val AUC : 0.735587
Over-folds train AUC score : 0.7634345418602351
Over-folds val AUC score : 0.7436007654575371
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.750974
Fold  2 val AUC : 0.716407
Fold  3 val AUC : 0.744531
Fold  4 val AUC : 0.740554
Fold  5 val AUC : 0.744358
Over-folds train AUC score : 0.7632320908203598
Over-folds val AUC score : 0.7385462784813168
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.748464
Fold  2 val AUC : 0.748500
Fold  3 val AUC : 0.742853
Fold  4 val AUC : 0.746684
Fold  5 val AUC : 0.737725
Over-folds train AUC score : 0.763472

Run LightGBM with kfold - done in 365s


loading features: base_featurs_filled_knn_20_wo_target.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.747405
Fold  2 val AUC : 0.740634
Fold  3 val AUC : 0.730414
Fold  4 val AUC : 0.746628
Fold  5 val AUC : 0.744106
Over-folds train AUC score : 0.7629820384037909
Over-folds val AUC score : 0.7412093656522493
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.735089
Fold  2 val AUC : 0.739993
Fold  3 val AUC : 0.749893
Fold  4 val AUC : 0.739302
Fold  5 val AUC : 0.752392
Over-folds train AUC score : 0.7625066532723068
Over-folds val AUC score : 0.7427848114577779
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.745776
Fold  2 val AUC : 0.743328
Fold  3 val AUC : 0.745774
Fold  4 val AUC : 0.750176
Fold  5 val AUC : 0.742980
Over-folds train AUC score : 0.761859

In [6]:
log_dir = '../log_imputating_exp'
init_logging(log_dir)

In [12]:
input_dir = "../features/filled_by_knn/"
for f_p in os.listdir(input_dir):
    print (os.path.join(input_dir, f_p))

../features/filled_by_knn/base_featurs_filled_knn_10_w_target.h5
../features/filled_by_knn/base_featurs_filled_knn_5_wo_target.h5
../features/filled_by_knn/base_featurs_filled_knn_5_w_target.h5
../features/filled_by_knn/base_featurs_filled_knn_15_wo_target.h5
../features/filled_by_knn/base_featurs_filled_knn_20_wo_target.h5
../features/filled_by_knn/base_featurs_filled_knn_20_w_target.h5
../features/filled_by_knn/base_featurs_filled_knn_10_wo_target.h5
../features/filled_by_knn/base_featurs_filled_knn_15_w_target.h5
