In [4]:
"""
python3 main.py ../../dataset/train.csv ../../dataset/test.csv ../result/cv_results.csv ../result/submission.csv > ../result/logs.txt

make train

"""
import sys
sys.path.append("../fraud_detection/src/")
import time
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from contextlib import contextmanager
import gc 
from util import s_to_time_format, string_to_datetime, hour_to_range, kfold_lightgbm, kfold_xgb
from util import rolling_stats_target_by_cols
#from util import _time_elapsed_between_last_transactions,time_elapsed_between_last_transactions
#from util import num_transaction_in_past_n_days
#from util import add_auto_encoder_feature
#from util import group_target_by_cols_split_by_users
from time import strftime, localtime
import logging
import sys
from config import Configs

# logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))
#log_file = '{}-{}-{}.log'.format(opt.model_name, opt.dataset, strftime("%y%m%d-%H%M", localtime()))
log_file = '../fraud_detection/result/fs_{}.log'.format(strftime("%y%m%d-%H%M", localtime()))
logger.addHandler(logging.FileHandler(log_file))

def group_target_by_cols(df_train, df_test, recipe):
    df = pd.concat([df_train, df_test], axis = 0)
    for m in range(len(recipe)):
        cols = recipe[m][0]
        for n in range(len(recipe[m][1])):
            target = recipe[m][1][n][0]
            method = recipe[m][1][n][1]
            name_grouped_target = method+"_"+target+'_BY_'+'_'.join(cols)
            tmp = df[cols + [target]].groupby(cols).agg(method)
            tmp = tmp.reset_index().rename(index=str, columns={target: name_grouped_target})
            df_train = df_train.merge(tmp, how='left', on=cols)
            df_test = df_test.merge(tmp, how='left', on=cols)

        # reduced memory    
        del tmp
        gc.collect()
    
    return df_train, df_test

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    logger.info("{} - done in {:.0f}s".format(title, time.time() - t0))
    
def main(args):
    if args.load_feature == True:
        with timer("Load train/test features extracted"):
            #-------------------------
            # load dataset
            #-------------------------
            df_train = pd.read_csv("../fraud_detection/features/train.csv")
            df_test = pd.read_csv("../fraud_detection/features/test.csv")

            #-------------------------
            # pre-processing
            #-------------------------

            for cat in Configs.CATEGORY:
                df_train[cat] = df_train[cat].astype('category') #.cat.codes
                df_test[cat] = df_test[cat].astype('category')
            for df in [df_train, df_test]:
                df["hour_range"] = df["hour_range"].astype('category')

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))     

    else:
        with timer("Process train/test application"):
            #-------------------------
            # load dataset
            #-------------------------
            df_train = pd.read_csv(args.train_file)
            df_test = pd.read_csv(args.test_file)

            #-------------------------
            # pre-processing
            #-------------------------

            for cat in Configs.CATEGORY:
                df_train[cat] = df_train[cat].astype('category') #.cat.codes
                df_test[cat] = df_test[cat].astype('category')
                
            for df in [df_train, df_test]:
                # pre-processing
                df["loctm_"] = df.loctm.astype(int).astype(str)
                df.loctm_ = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
                # # time-related feature
                df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour).astype('category')
                df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
                df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)
                # df["loctm_absolute_time"] = [h*60+m for h,m in zip(df.loctm_hour_of_day,df.loctm_minute_of_hour)]
                df["hour_range"] = df.loctm_.apply(lambda x: hour_to_range(x.hour)).astype("category")
                # removed the columns no need
                df.drop(columns = ["loctm_"], axis = 1, inplace = True)
                # auxiliary fields
                df["day_hr_min"] = ["{}:{}:{}".format(i,j,k) for i,j,k in zip(df.locdt,df.loctm_hour_of_day,df.loctm_minute_of_hour)]
                df["day_hr_min_sec"] = ["{}:{}:{}:{}".format(i,j,k,z) for i,j,k,z in zip(df.locdt,df.loctm_hour_of_day,df.loctm_minute_of_hour,df.loctm_second_of_min)]

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add bacno/cano feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.CONAM_AGG_RECIPE_1)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add iterm-related feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.ITERM_AGG_RECIPE)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add conam-related feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.CONAM_AGG_RECIPE_2)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add hour-related feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.HOUR_AGG_RECIPE)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add cano/conam feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.CANO_CONAM_COUNT_RECIPE)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add cano/bacno latent feature"):
            df = pd.read_csv("../fraud_detection/features/bacno_latent_features_w_cano.csv")
            df_train = df_train.merge(df, on = "bacno", how = "left")
            df_test = df_test.merge(df, on = "bacno", how = "left")
            df = pd.read_csv("../fraud_detection/features/bacno_cano_latent_features.csv")
            df_train = df_train.merge(df, on = "cano", how = "left")
            df_test = df_test.merge(df, on = "cano", how = "left")

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add locdt-related feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.LOCDT_CONAM_RECIPE)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add mchno-related feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.MCHNO_CONAM_RECIPE)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add scity-related feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.SCITY_CONAM_RECIPE)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add stocn-related feature"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.STOCN_CONAM_RECIPE)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add mchno/bacno latent feature"):
            df = pd.read_csv("../fraud_detection/features/bacno_latent_features_w_mchno.csv")
            df_train = df_train.merge(df, on = "bacno", how = "left")
            df_test = df_test.merge(df, on = "bacno", how = "left")
            df = pd.read_csv("../fraud_detection/features/bacno_mchno_latent_features.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add time second-level feature on bacno"):
            df_train, df_test = group_target_by_cols(
                df_train, 
                df_test, 
                Configs.HOUR_AGG_SEC_LEVEL_RECIPE_BACNO,
                )
            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add time second-level feature on cano"):
            df_train, df_test = group_target_by_cols(
                df_train, 
                df_test, 
                Configs.HOUR_AGG_SEC_LEVEL_RECIPE_CANO,
                )
            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add time second-level feature on mchno"):
            df_train, df_test = group_target_by_cols(
                df_train, 
                df_test, 
                Configs.HOUR_AGG_SEC_LEVEL_RECIPE_MCHNO,
                )
            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add time second-level feature on csmcu/stocn/scity"):
            df_train, df_test = group_target_by_cols(
                df_train, 
                df_test, 
                Configs.HOUR_AGG_SEC_LEVEL_RECIPE,
                )
            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add time second-level feature on acqic/csmcu/stocn/scity"):
            df_train, df_test = group_target_by_cols(
                df_train, 
                df_test, 
                Configs.HOUR_AGG_SEC_LEVEL_RECIPE_2,
                )
            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add conam-related feature v3"):
            df_train, df_test = group_target_by_cols(
                df_train, 
                df_test, 
                Configs.CONAM_AGG_RECIPE_3,
                )
            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add locdt-related feature v2"):
            df_train, df_test = group_target_by_cols(df_train, df_test, Configs.LOCDT_CONAM_RECIPE_2)

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add conam-related feature v4"):
            df_train, df_test = group_target_by_cols(
                df_train, 
                df_test, 
                Configs.CONAM_AGG_RECIPE_4,
                )
            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add cano/mchno latent feature"):
            df = pd.read_csv("../fraud_detection/features/cano_latent_features_w_mchno.csv")
            df_train = df_train.merge(df, on = "cano", how = "left")
            df_test = df_test.merge(df, on = "cano", how = "left")
            df = pd.read_csv("../fraud_detection/features/cano_mchno_latent_features.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add cano/locdt latent feature"):
            df = pd.read_csv("../fraud_detection/features/cano_latent_features_w_locdt.csv")
            df_train = df_train.merge(df, on = "cano", how = "left")
            df_test = df_test.merge(df, on = "cano", how = "left")
            df = pd.read_csv("../fraud_detection/features/cano_locdt_latent_features.csv")
            df_train = df_train.merge(df, on = "locdt", how = "left")
            df_test = df_test.merge(df, on = "locdt", how = "left")

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add mchno/locdt latent feature"):
            df = pd.read_csv("../fraud_detection/features/mchno_latent_features_w_locdt.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")
            df = pd.read_csv("../fraud_detection/features/mchno_locdt_latent_features.csv")
            df_train = df_train.merge(df, on = "locdt", how = "left")
            df_test = df_test.merge(df, on = "locdt", how = "left")

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

        with timer("Add mchno time aggregate average feature"):
            # df = pd.read_csv("../features/average_mchno_time_agg.csv")
            # df_train = df_train.merge(df, on = "txkey", how = "left")
            # df_test = df_test.merge(df, on = "txkey", how = "left")
            df = pd.read_csv("../fraud_detection/features/average_mchno_mean_conam_in_past_7_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_mean_conam_in_past_14_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_std_conam_in_past_7_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_std_conam_in_past_14_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_min_conam_in_past_7_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_min_conam_in_past_14_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_max_conam_in_past_7_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_max_conam_in_past_14_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_median_conam_in_past_7_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            df = pd.read_csv("../fraud_detection/features/average_mchno_median_conam_in_past_14_days.csv")
            df_train = df_train.merge(df, on = "mchno", how = "left")
            df_test = df_test.merge(df, on = "mchno", how = "left")

            logger.info("Train application df shape: {}".format(df_train.shape))
            logger.info("Test application df shape: {}".format(df_test.shape))

    with timer("Add bacno time aggregate average feature"):
        df = pd.read_csv("../fraud_detection/features/average_bacno_min_conam_in_past_7_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_max_conam_in_past_7_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_mean_conam_in_past_7_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_median_conam_in_past_7_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_std_conam_in_past_7_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_min_conam_in_past_14_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_max_conam_in_past_14_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_mean_conam_in_past_14_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_median_conam_in_past_14_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_bacno_std_conam_in_past_14_days.csv").iloc[:,1:]
        df_train = df_train.merge(df, on = "bacno", how = "left")
        df_test = df_test.merge(df, on = "bacno", how = "left")

        logger.info("Train application df shape: {}".format(df_train.shape))
        logger.info("Test application df shape: {}".format(df_test.shape))

    with timer("Add mcc time aggregate average feature"):
        df = pd.read_csv("../fraud_detection/features/average_mcc_median_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = "mcc", how = "left")
        df_test = df_test.merge(df, on = "mcc", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_mcc_max_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = "mcc", how = "left")
        df_test = df_test.merge(df, on = "mcc", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_mcc_min_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = "mcc", how = "left")
        df_test = df_test.merge(df, on = "mcc", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_mcc_mean_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = "mcc", how = "left")
        df_test = df_test.merge(df, on = "mcc", how = "left")

        df = pd.read_csv("../fraud_detection/features/average_mcc_std_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = "mcc", how = "left")
        df_test = df_test.merge(df, on = "mcc", how = "left")

        logger.info("Train application df shape: {}".format(df_train.shape))
        logger.info("Test application df shape: {}".format(df_test.shape))

    with timer("Add scity time aggregate feature"):
        df = pd.read_csv("../fraud_detection/features/scity_mean_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = ["scity","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["scity","locdt"], how = "left")

        df = pd.read_csv("../fraud_detection/features/scity_mean_conam_in_past_14_days.csv")
        df_train = df_train.merge(df, on = ["scity","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["scity","locdt"], how = "left")

        logger.info("Train application df shape: {}".format(df_train.shape))
        logger.info("Test application df shape: {}".format(df_test.shape))

    with timer("Add stocn time aggregate feature"):
        df = pd.read_csv("../fraud_detection/features/stocn_mean_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = ["stocn","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["stocn","locdt"], how = "left")

        df = pd.read_csv("../fraud_detection/features/stocn_mean_conam_in_past_14_days.csv")
        df_train = df_train.merge(df, on = ["stocn","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["stocn","locdt"], how = "left")
        
        logger.info("Train application df shape: {}".format(df_train.shape))
        logger.info("Test application df shape: {}".format(df_test.shape))

    with timer("Add acqic time aggregate feature"):
        df = pd.read_csv("../fraud_detection/features/acqic_mean_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = ["acqic","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["acqic","locdt"], how = "left")

        df = pd.read_csv("../fraud_detection/features/acqic_mean_conam_in_past_14_days.csv")
        df_train = df_train.merge(df, on = ["acqic","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["acqic","locdt"], how = "left")
        
        logger.info("Train application df shape: {}".format(df_train.shape))
        logger.info("Test application df shape: {}".format(df_test.shape))

    with timer("Add mchno time aggregate feature"):
        df = pd.read_csv("../fraud_detection/features/mchno_mean_conam_in_past_7_days.csv")
        df_train = df_train.merge(df, on = ["mchno","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["mchno","locdt"], how = "left")

        df = pd.read_csv("../fraud_detection/features/mchno_mean_conam_in_past_14_days.csv")
        df_train = df_train.merge(df, on = ["mchno","locdt"], how = "left")
        df_test = df_test.merge(df, on = ["mchno","locdt"], how = "left")
        
        logger.info("Train application df shape: {}".format(df_train.shape))
        logger.info("Test application df shape: {}".format(df_test.shape))   
        
    return df_train, df_test

#         ITERATION = (5 if args.TEST_NULL_HYPO else 1)
#         feature_importance_df = pd.DataFrame()
#         over_iterations_val_auc = np.zeros(ITERATION)
#         for i in range(ITERATION):
#             logger.info('Iteration %i' %i)
#             if args.model == "lgb":    
#                 iter_feat_imp, over_folds_val_auc = kfold_lightgbm(df_train, df_test, num_folds = args.NUM_FOLDS, args = args, stratified = args.STRATIFIED, seed = args.SEED, logger = logger)
#             elif args.model == "xgb":
#                 iter_feat_imp, over_folds_val_auc = kfold_xgb(df_train, df_test, num_folds = args.NUM_FOLDS, args = args, stratified = args.STRATIFIED, seed = args.SEED, logger = logger)
#             else:
#                 print("Now we only support LightGBM or Xgboost model!")           
#             feature_importance_df = pd.concat([feature_importance_df, iter_feat_imp], axis=0)
#             over_iterations_val_auc[i] = over_folds_val_auc

#         logger.info('============================================\nOver-iterations val f1 score %.6f' %over_iterations_val_auc.mean())
#         logger.info('Standard deviation %.6f\n============================================' %over_iterations_val_auc.std())
    
#     if args.feature_importance_plot == True:
#         from util import display_importances
#         display_importances(feature_importance_df, args.model)
        
#     feature_importance_df_median = feature_importance_df[["feature", "importance"]].groupby("feature").median().sort_values(by="importance", ascending=False)
#     useless_features_df = feature_importance_df_median.loc[feature_importance_df_median['importance'] == 0]
#     feature_importance_df_mean = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)

#     if args.TEST_NULL_HYPO:
#         feature_importance_df_mean.to_csv("../fraud_detection/result/feature_importance-null_hypo.csv", index = True)
#     else:
#         feature_importance_df_mean.to_csv("../fraud_detection/result/feature_importance.csv", index = True)
#         useless_features_list = useless_features_df.index.tolist()
#         logger.info('Useless features: \'' + '\', \''.join(useless_features_list) + '\'')
#     return feature_importance_df_mean

In [5]:
args = {
 "train_file":"/data/yunrui_li/fraud/dataset/train.csv",
 "test_file":"/data/yunrui_li/fraud/dataset/test.csv",
 "result_path":"/data/yunrui_li/fraud/fraud_detection/result/submission.csv",
 "feature_selection":True,
 "feature_importance_plot": False,
 "SEED": 1030,
 "NUM_FOLDS": 5, # 5
 "CPU_USE_RATE":1.0,
 "STRATIFIED": True,
 "NUM_LEAVES":31,
 "COLSAMPLE_BYTREE":1.0,
 "SUBSAMPLE": 1.0,
 "SUBSAMPLE_FREQ": 0,
 "MAX_DEPTH": -1,
 "REG_ALPHA": 0.0,
 "REG_LAMBDA": 0.0,
 "MIN_SPLIT_GAIN": 0.0,
 "MIN_CHILD_WEIGHT": 0.001,
 "MAX_BIN": 255,
 "SCALE_POS_WEIGHT": 3,
 "TEST_NULL_HYPO":False,
 "model": "lgb",
 "ensemble":False,
 "seed":1030,
 "load_feature": True
}

class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
args = AttrDict(args)
args

{'train_file': '/data/yunrui_li/fraud/dataset/train.csv',
 'test_file': '/data/yunrui_li/fraud/dataset/test.csv',
 'result_path': '/data/yunrui_li/fraud/fraud_detection/result/submission.csv',
 'feature_selection': True,
 'feature_importance_plot': False,
 'SEED': 1030,
 'NUM_FOLDS': 5,
 'CPU_USE_RATE': 1.0,
 'STRATIFIED': True,
 'NUM_LEAVES': 31,
 'COLSAMPLE_BYTREE': 1.0,
 'SUBSAMPLE': 1.0,
 'SUBSAMPLE_FREQ': 0,
 'MAX_DEPTH': -1,
 'REG_ALPHA': 0.0,
 'REG_LAMBDA': 0.0,
 'MIN_SPLIT_GAIN': 0.0,
 'MIN_CHILD_WEIGHT': 0.001,
 'MAX_BIN': 255,
 'SCALE_POS_WEIGHT': 3,
 'TEST_NULL_HYPO': False,
 'model': 'lgb',
 'ensemble': False,
 'seed': 1030,
 'load_feature': True}

In [6]:
df_train, df_test = main(args)

  if (await self.run_code(code, result,  async_=asy)):


Train application df shape: (1521787, 477)
Train application df shape: (1521787, 477)
Test application df shape: (421665, 476)
Test application df shape: (421665, 476)
Load train/test features extracted - done in 135s
Load train/test features extracted - done in 135s
Train application df shape: (1521787, 487)
Train application df shape: (1521787, 487)
Test application df shape: (421665, 486)
Test application df shape: (421665, 486)
Add bacno time aggregate average feature - done in 244s
Add bacno time aggregate average feature - done in 244s
Train application df shape: (1521787, 492)
Train application df shape: (1521787, 492)
Test application df shape: (421665, 491)
Test application df shape: (421665, 491)
Add mcc time aggregate average feature - done in 20s
Add mcc time aggregate average feature - done in 20s
Train application df shape: (1521787, 494)
Train application df shape: (1521787, 494)
Test application df shape: (421665, 493)
Test application df shape: (421665, 493)
Add scity 

In [7]:
df = pd.concat([df_train,df_test], axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [15]:
#df.to_csv("../fraud_detection/features/features.csv", index = False)
df.shape

(1943452, 500)

In [8]:
corr = df.corr()


In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

# csmcu & scity positive correlation (幣別與消費城市)
plt.figure(figsize = (14,14))
plt.title('Credit Card Transactions features correlation plot (Pearson)')
sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns,linewidths=.1,cmap="Reds")
plt.show()

<Figure size 1400x1400 with 2 Axes>

In [14]:
corr

Unnamed: 0,acqic,acqic_mean_conam_in_past_14_days,acqic_mean_conam_in_past_7_days,agg_mchno_mean_conam_in_past_14_days,agg_mchno_mean_conam_in_past_7_days,average_bacno_max_conam_in_past_14_days,average_bacno_max_conam_in_past_7_days,average_bacno_mean_conam_in_past_14_days,average_bacno_mean_conam_in_past_7_days,average_bacno_median_conam_in_past_14_days,...,var_conam_BY_stocn_locdt,var_iterm_BY_acqic,var_iterm_BY_bacno,var_iterm_BY_cano,var_iterm_BY_csmcu,var_iterm_BY_etymd,var_iterm_BY_loctm_hour_of_day,var_iterm_BY_mcc,var_iterm_BY_mchno,var_iterm_BY_scity
acqic,1.000000,0.320225,0.326286,0.160841,0.152204,-0.004555,-0.006539,0.044403,0.034820,0.046436,...,-0.112477,0.268873,0.026811,0.032059,0.448791,-0.206753,0.113479,0.112542,0.079699,0.205128
acqic_mean_conam_in_past_14_days,0.320225,1.000000,0.956168,0.302235,0.278542,0.107147,0.105148,0.156368,0.138444,0.153749,...,0.018734,0.375302,0.030571,0.034325,0.354873,0.053693,0.026289,0.177197,0.123213,0.344075
acqic_mean_conam_in_past_7_days,0.326286,0.956168,1.000000,0.296824,0.275176,0.103225,0.101161,0.152094,0.134622,0.149628,...,0.008544,0.371062,0.030365,0.034145,0.366249,0.058029,0.024129,0.176460,0.122027,0.340907
agg_mchno_mean_conam_in_past_14_days,0.160841,0.302235,0.296824,1.000000,0.923376,0.135408,0.131393,0.234383,0.200843,0.235897,...,-0.008018,0.183910,0.053379,0.057764,0.208587,0.051218,-0.012918,0.173418,0.195831,0.152786
agg_mchno_mean_conam_in_past_7_days,0.152204,0.278542,0.275176,0.923376,1.000000,0.121683,0.119705,0.216464,0.186823,0.218744,...,-0.015783,0.168561,0.051446,0.055730,0.198980,0.042117,-0.009302,0.169040,0.193530,0.150856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
var_iterm_BY_etymd,-0.206753,0.053693,0.058029,0.051218,0.042117,0.140184,0.137869,0.088563,0.088087,0.083396,...,0.110368,-0.013853,0.020704,0.023244,0.211011,1.000000,0.072035,0.134191,0.116551,0.057371
var_iterm_BY_loctm_hour_of_day,0.113479,0.026289,0.024129,-0.012918,-0.009302,0.025343,0.026045,-0.000931,0.004199,-0.000673,...,0.032707,0.041975,0.018992,0.019950,0.058031,0.072035,1.000000,0.080910,0.067462,0.048408
var_iterm_BY_mcc,0.112542,0.177197,0.176460,0.173418,0.169040,0.056615,0.056782,0.064041,0.059964,0.062424,...,-0.006004,0.313597,0.099081,0.104489,0.113184,0.134191,0.080910,1.000000,0.425703,0.335558
var_iterm_BY_mchno,0.079699,0.123213,0.122027,0.195831,0.193530,0.039344,0.038053,0.052592,0.047000,0.051263,...,-0.016700,0.317029,0.209848,0.223278,0.082405,0.116551,0.067462,0.425703,1.000000,0.337273


In [29]:
th = 0.9
f = []
feature_seen = []
for feat1 in corr.columns.tolist():
    for feat2, value in corr[feat1].iteritems():
        if feat1!=feat2:
            if value > th or value < (-1)* th:
                if feat2 not in feature_seen:
                    f.append((feat1,feat2,value))
                    feature_seen.append(feat1)

In [30]:
FEATURE_GRAVEYARD = []
for fea_pair in f:
    if fea_pair[0]!=fea_pair[1]:
        if (fea_pair[0]) not in FEATURE_GRAVEYARD and (fea_pair[1]) not in FEATURE_GRAVEYARD:
                FEATURE_GRAVEYARD.append(fea_pair[0])
            
FEATURE_GRAVEYARD = list(set(FEATURE_GRAVEYARD))

In [31]:
len(FEATURE_GRAVEYARD)

16

In [32]:
FEATURE_GRAVEYARD

['mean_iterm_BY_etymd',
 'sum_conam_BY_bacno_locdt',
 'stocn_mean_conam_in_past_14_days',
 'mean_conam_BY_bacno_mcc',
 'var_conam_BY_bacno_locdt_stocn_scity',
 'median_conam_BY_bacno_locdt_stocn',
 'mean_conam_BY_cano_locdt',
 'mean_conam_BY_bacno_locdt',
 'count_conam_BY_acqic',
 'mean_conam_BY_mchno_locdt',
 'mean_iterm_BY_mcc',
 'mean_iterm_BY_loctm_hour_of_day',
 'sum_conam_BY_cano_flbmk',
 'average_mcc_median_conam_in_past_7_days',
 'average_mcc_mean_conam_in_past_7_days',
 'average_bacno_mean_conam_in_past_7_days']

In [33]:
for pair in f:
    if pair[0]=="mean_iterm_BY_etymd":
        print (pair)

('mean_iterm_BY_etymd', 'var_iterm_BY_etymd', 0.9878016116458587)
