In [1]:
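'''

Evaluating one imputation method for 10 iterations may take about 1 hour.

Open question: if test_id is in random order, does that have any effect? In theory sorting by it should be enough, even if it does.
'''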
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import multiprocessing
import warnings
import sys
sys.path.append('../py_model')
from utils import init_logging
import logging 
import os
# Silence FutureWarning messages so they do not flood the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Seed NumPy's global RNG from the wall clock, so every run draws a different random stream
# (results are intentionally not reproducible from run to run).
np.random.seed(int(time.time()))

# Number of cross-validation folds; passed to kfold_lightgbm as num_folds.
NUM_FOLDS = 5
# Passed to kfold_lightgbm as `stratified` (StratifiedKFold when True, KFold otherwise).
STRATIFIED = True  
# When True, kfold_lightgbm shuffles TARGET before training and uses a different parameter set.
TEST_NULL_HYPO = False
# How many times main() repeats the whole K-fold run; the validation AUC is averaged over these runs.
ITERATION = (80 if TEST_NULL_HYPO else 10) # It means how many iterations need to get the final stable AUC score.

In [2]:
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label placed in front of the timing message.

    On normal exit of the ``with`` body, prints
    ``"<title> - done in <N>s"`` with the elapsed wall-clock time rounded to
    whole seconds. If the body raises, the exception propagates and nothing
    is printed.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    report = "{} - done in {:.0f}s".format(title, elapsed_seconds)
    print(report)

# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified = False):
    '''
    Train LightGBM with K-fold cross-validation and report the out-of-fold AUC.

    Rows of `df` with a non-null TARGET form the training/validation set; rows
    with a null TARGET form the test set. Every column except TARGET, the
    SK_ID_* identifiers and 'index' is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a 'TARGET' column.
    num_folds : int
        Number of folds to split the training rows into.
    stratified : bool, default False
        Use StratifiedKFold when True, plain KFold otherwise.

    Returns
    -------
    feature_importance_df : pandas.DataFrame
        One row per (feature, fold) with columns 'feature', 'importance', 'fold'.
    over_folds_val_auc : float
        ROC AUC of the out-of-fold predictions over all training rows.

    Notes
    -----
    Reads the module-level names TEST_NULL_HYPO and CPU_USE_RATE at call time,
    so both must be defined before this function is called.
    When TEST_NULL_HYPO is True the TARGET values are randomly permuted before
    training and a different parameter set is used.

    Idea for later: add a helper that re-tunes the parameters whenever new
    features are added, to keep experiments comparable (at the cost of a longer
    experiment time).
    '''
    #---------------------
    # Divide in training/validation and test data
    #---------------------
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    # Sanity check: the two masks should partition the input frame.
    logging.info('no bugging in split' if train_df.shape[0] + test_df.shape[0] == df.shape[0] else " opps")
    #---------------------
    # core
    #---------------------
    logging.info("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model; the split seed comes from the wall clock so every call differs.
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=int(time.time()))
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=int(time.time()))
    # Each training row is in the validation part of exactly one fold, hence in the
    # training part of the remaining n_splits - 1 folds.
    n_train_folds = folds.n_splits - 1

    # Create arrays to store results
    # train
    oof_preds = np.zeros(train_df.shape[0])
    train_preds = np.zeros(train_df.shape[0])
    # test (kept for the submission step below, which is currently disabled)
    sub_preds = np.zeros(test_df.shape[0])
    # per-fold feature importance frames, concatenated once after the loop
    fold_importances = []

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    if TEST_NULL_HYPO:
        # train_df is a slice of the caller's frame: take an explicit copy before
        # overwriting TARGET so the write is unambiguous and the caller's data is untouched.
        train_df = train_df.copy()
        train_df['TARGET'] = train_df['TARGET'].sample(frac = 1.0).values

    # Parameters shared by both modes.
    common_params = dict(
        nthread=int(multiprocessing.cpu_count()*CPU_USE_RATE),
        n_estimators=10000,
        learning_rate=0.02,
        max_depth=8, # 7
        silent=-1,
        verbose=-1,
    )
    if TEST_NULL_HYPO:
        mode_params = dict(num_leaves=127)
    else:
        # LightGBM parameters found by Bayesian optimization
        mode_params = dict(
            num_leaves=34, # 20
            colsample_bytree=0.2, #0.9497036 < 0.2
            subsample=0.8715623,
            reg_alpha=0.041545473, # 0.3
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775, # 60
        )

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # Model seed also comes from the wall clock (re-read for every fold).
        clf = LGBMClassifier(random_state=int(time.time()), **common_params, **mode_params)

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords depend on the
        # installed LightGBM version - confirm before upgrading the library.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= False, early_stopping_rounds= 100) # early_stopping_rounds= 200
        # training/validating
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Average the in-sample predictions over the folds in which the row was actually trained on.
        train_preds[train_idx] += clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1] / n_train_folds
        # testing
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))

        logging.info('Fold %2d val AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)

    logging.info('Over-folds train AUC score : {}'.format(roc_auc_score(train_df['TARGET'], train_preds)))

    over_folds_val_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    logging.info('Over-folds val AUC score : {}'.format(over_folds_val_auc))

    # # Write submission file and plot feature importance
    # test_df.loc[:,'TARGET'] = sub_preds
    # test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)

    return feature_importance_df, over_folds_val_auc


In [3]:
def main():
    '''
    Repeat the K-fold LightGBM evaluation on one imputed feature file.

    Scans '../features/filled_by_mice/' for the single feature file selected
    below, runs kfold_lightgbm ITERATION times on it, logs the mean and the
    standard deviation of the out-of-fold AUC, and writes the mean feature
    importance per feature to '../output' ('feature_importance-null_hypo.csv'
    when TEST_NULL_HYPO is True, 'feature_importance.csv' otherwise). In the
    latter case, features whose median importance is 0 are logged as well.

    Nothing happens if the selected file is not present in the folder.
    '''
    features_dir = '../features/filled_by_mice/'
    selected_file = 'normalized_mice_discrete_w_peeking_target_612.h5'

    for file_name in os.listdir(features_dir):
        if file_name != selected_file:
            continue
        #--------------------
        # load features
        #--------------------
        df = pd.read_hdf('../features/filled_by_mice/{}'.format(file_name))
        logging.info('loading features: {}'.format(file_name))
        #--------------------
        # out-of-fold validation strategy + LGB
        #--------------------
        with timer("Run LightGBM with kfold"):
            iteration_importances = []
            over_folds_val_auc_list = np.zeros(ITERATION)
            # A dedicated loop variable: the file name of the outer loop is not overwritten.
            for iteration in range(ITERATION):
                logging.info('Iteration %i' %iteration)
                iter_feat_imp, over_folds_val_auc = kfold_lightgbm(df, num_folds= NUM_FOLDS, stratified= STRATIFIED)
                iteration_importances.append(iter_feat_imp)
                over_folds_val_auc_list[iteration] = over_folds_val_auc
            # Concatenate once instead of growing a frame inside the loop.
            feature_importance_df = pd.concat(iteration_importances, axis=0)

            logging.info('Over-iterations val AUC score : {}'.format(over_folds_val_auc_list.mean()))
            logging.info('Standard deviation : {}'.format(over_folds_val_auc_list.std()))

            importance_by_feature = feature_importance_df[["feature", "importance"]].groupby("feature")
            feature_importance_df_mean = importance_by_feature.mean().sort_values(by="importance", ascending=False)
            #---------------------
            # save
            #---------------------
            output_path = '../output'
            os.makedirs(output_path, exist_ok=True)

            if TEST_NULL_HYPO:
                feature_importance_df_mean.to_csv(os.path.join(output_path, 'feature_importance-null_hypo.csv'), index = True)
            else:
                feature_importance_df_mean.to_csv(os.path.join(output_path, 'feature_importance.csv'), index = True)
                # Features whose median importance over all folds and iterations is exactly 0.
                feature_importance_df_median = importance_by_feature.median().sort_values(by="importance", ascending=False)
                useless_features_df = feature_importance_df_median.loc[feature_importance_df_median['importance'] == 0]
                useless_features_list = useless_features_df.index.tolist()
                logging.info('useless/overfitting features: \'' + '\', \''.join(useless_features_list) + '\'')


In [4]:
# Feature files that have already been looked at and should be left out of the listing.
checked_already = [
    'base_featurs_filled_mice_by_cluster_2.h5',
    'base_featurs_filled_mice_by_cluster_2+1.h5',
    'base_featurs_filled_mice_by_cluster_1.h5',
    'base_featurs_filled_mice_half_training_data.h5',
    'base_featurs_filled_mice_clustering.h5',
]
# Print every other entry of the imputed-features folder, one per line.
for i in os.listdir('../features/filled_by_mice/'):
    if i in checked_already:
        continue
    print (i)

normalized_mice_discrete_wo_target.h5
normalized_mice_discrete_w_peeking_target_724.h5
normalized_mice_discrete.h5
normalized_mice_similar_features_th_09_5.h5
normalized_mice_cluster_3.h5
normalized_mice_discrete_w_peeking_target.h5
.DS_Store
normalized_mice_cluster_1_2_3.h5
normalized_mice_similar_features_th_09_2.h5
normalized_mice_similar_features_th_09_1.h5
normalized_mice_cluster_2.h5
normalized_mice_cluster_1.h5
normalized_mice_discrete_w_peeking_target_612.h5
base_featurs_filled_mice_half_training_data_wo_target.h5


In [5]:
# Fraction of the machine's CPU cores handed to LightGBM; read by kfold_lightgbm at call time,
# so it has to be defined before main() runs.
CPU_USE_RATE = 0.4
# Directory passed to init_logging for this experiment.
log_dir = '../log_imputating_exp' # experiment that adds 11 new features built from the imputed values
init_logging(log_dir)
# Run the whole experiment and print its total wall-clock time.
with timer("Lightgbm run a score"):
    main()


loading features: normalized_mice_discrete_w_peeking_target_612.h5
Iteration 0
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.796300
Fold  2 val AUC : 0.798675
Fold  3 val AUC : 0.791667
Fold  4 val AUC : 0.792033
Fold  5 val AUC : 0.796188
Over-folds train AUC score : 0.8754258688854542
Over-folds val AUC score : 0.7949617620710923
Iteration 1
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.797527
Fold  2 val AUC : 0.796904
Fold  3 val AUC : 0.795919
Fold  4 val AUC : 0.791305
Fold  5 val AUC : 0.795934
Over-folds train AUC score : 0.884716231654566
Over-folds val AUC score : 0.7955004250645841
Iteration 2
no bugging in split
Starting LightGBM. Train shape: (307507, 280), test shape: (48744, 280)
Fold  1 val AUC : 0.799445
Fold  2 val AUC : 0.796333
Fold  3 val AUC : 0.793401
Fold  4 val AUC : 0.791880
Fold  5 val AUC : 0.800060
Over-folds train AUC score : 

Run LightGBM with kfold - done in 6382s
Lightgbm run a score - done in 6383s


In [6]:
# Why does it overfit? Think this over.

# TODO: add a dashed line for the standard deviation...
# NOTE(review): presumably a mean AUC minus one standard deviation compared against another
# run's AUC - confirm which runs these numbers come from.
0.7959379558537741 - 0.0003017394118141831 > 0.795883003439369 #(0.7962306295343666)

False

In [None]:
# Clean up the submission file in place: drop the stray index column written by an
# earlier to_csv call and store SK_ID_CURR as an integer-looking string.
sub_path = '../py_model/sub.csv'
sub = pd.read_csv(sub_path)
# errors='ignore' keeps this cell re-runnable: after the first run the file no longer
# contains 'Unnamed: 0', and a plain drop would raise KeyError.
sub = sub.drop(columns=['Unnamed: 0'], errors='ignore')
sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(lambda x: str(int(x)))
sub.to_csv(sub_path, index= False)

In [None]:
# Scratch check: print the values of a descending float range (stop value excluded).
# Values are floating-point, so they may not print as exact two-decimal numbers.
for i in np.arange(0.05, 0.01, -0.01):
    print (i)