In [1]:
import sys
sys.path.append("../fraud_detection/src/")
import time
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from contextlib import contextmanager
import gc 
from util import s_to_time_format, string_to_datetime, hour_to_range, kfold_lightgbm, kfold_xgb
from util import rolling_stats_target_by_cols
#from util import _time_elapsed_between_last_transactions,time_elapsed_between_last_transactions
#from util import num_transaction_in_past_n_days
#from util import add_auto_encoder_feature
#from util import group_target_by_cols_split_by_users
from time import strftime, localtime
import logging
import sys
from config import Configs

# logging
# Messages go to the root logger, mirrored to stdout and to a per-run log file.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Re-running this cell in a live kernel must not stack another pair of handlers
# on the root logger (each message would then be emitted once per run), so any
# handlers installed by a previous run of this cell are removed and closed first.
_NOTEBOOK_HANDLER_NAMES = ("fraud_notebook_stdout", "fraud_notebook_file")
for _stale_handler in [h for h in logger.handlers if h.get_name() in _NOTEBOOK_HANDLER_NAMES]:
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

_stdout_handler = logging.StreamHandler(sys.stdout)
_stdout_handler.set_name(_NOTEBOOK_HANDLER_NAMES[0])
logger.addHandler(_stdout_handler)

# One log file per run, named after the local start time (yymmdd-HHMM).
log_file = '../fraud_detection/result/{}.log'.format(strftime("%y%m%d-%H%M", localtime()))
_file_handler = logging.FileHandler(log_file)
_file_handler.set_name(_NOTEBOOK_HANDLER_NAMES[1])
logger.addHandler(_file_handler)

In [2]:
def group_target_by_cols(df_train, df_test, recipe):
    """Add group-wise aggregate features computed over train and test combined.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that both contain every grouping and target column named in
        ``recipe``. They are not modified; merged copies are returned.
    recipe : sequence
        Each entry is indexable as ``entry[0]`` (the list of group-by column
        names) and ``entry[1]`` (a sequence of ``(target, method)`` pairs,
        where ``method`` is an aggregation name passed to ``.agg``).

    Returns
    -------
    (df_train, df_test)
        The inputs left-joined with one new column per ``(target, method)``
        pair, named ``"<method>_<target>_BY_<col1>_<col2>..."``.
    """
    # sort=False: column order is irrelevant here (columns are selected by
    # name below) and it silences the pandas "sort" FutureWarning that concat
    # raises when train/test columns are not aligned.
    df = pd.concat([df_train, df_test], axis=0, sort=False)

    for entry in recipe:
        cols = list(entry[0])
        for spec in entry[1]:
            target, method = spec[0], spec[1]
            name_grouped_target = method + "_" + target + '_BY_' + '_'.join(cols)

            tmp = df[cols + [target]].groupby(cols).agg(method)
            tmp = tmp.reset_index().rename(columns={target: name_grouped_target})

            df_train = df_train.merge(tmp, how='left', on=cols)
            df_test = df_test.merge(tmp, how='left', on=cols)

            # Release the aggregate right away; deleting it here (rather than
            # after the inner loop) also keeps an entry with no aggregations
            # from failing on an unbound name.
            del tmp

        # reduced memory
        gc.collect()

    return df_train, df_test

In [3]:
@contextmanager
def timer(title):
    """Context manager that logs how long the wrapped block took.

    On normal exit of the ``with`` body, one INFO line of the form
    ``"<title> - done in <seconds>s"`` is written through ``logger``.
    Nothing is logged if the body raises.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    message = "{} - done in {:.0f}s".format(title, elapsed_seconds)
    logger.info(message)
    
def _log_df_shapes(df_train, df_test):
    """Log the current shapes of the train and test frames."""
    logger.info("Train application df shape: {}".format(df_train.shape))
    logger.info("Test application df shape: {}".format(df_test.shape))


def _merge_feature_file(df_train, df_test, path, key):
    """Left-join the feature table stored in the CSV at ``path`` onto both frames on ``key``."""
    features = pd.read_csv(path)
    df_train = df_train.merge(features, on=key, how="left")
    df_test = df_test.merge(features, on=key, how="left")
    return df_train, df_test


def main(args):
    """Build the engineered train/test feature frames.

    Parameters
    ----------
    args : object with attribute access
        ``args.train_file`` and ``args.test_file`` are CSV paths read with
        pandas; when ``args.feature_selection`` is truthy the columns listed
        in ``Configs.FEATURE_GRAVEYARD`` and ``Configs.FEATURE_USELESSNESS``
        are dropped at the end.

    Returns
    -------
    (df_train, df_test)
        The two frames after all feature steps, with the auxiliary time
        columns removed.

    Every step runs inside ``timer`` and logs both frame shapes afterwards.
    """
    feature_dir = "../fraud_detection/features/"

    # Ordered feature pipeline. Each step is (timer title, spec) where spec is
    #   * a str  -> name of a Configs recipe fed to group_target_by_cols
    #               (looked up lazily, right before the step runs), or
    #   * a list -> (csv file name under feature_dir, merge key) pairs that
    #               are left-joined onto both frames in the given order.
    # Steps run strictly in this order.
    steps = [
        ("Add bacno/cano feature", "CONAM_AGG_RECIPE_1"),
        ("Add iterm-related feature", "ITERM_AGG_RECIPE"),
        ("Add conam-related feature", "CONAM_AGG_RECIPE_2"),
        ("Add hour-related feature", "HOUR_AGG_RECIPE"),
        ("Add cano/conam feature", "CANO_CONAM_COUNT_RECIPE"),
        ("Add cano/bacno latent feature", [
            ("bacno_latent_features_w_cano.csv", "bacno"),
            ("bacno_cano_latent_features.csv", "cano"),
        ]),
        ("Add locdt-related feature", "LOCDT_CONAM_RECIPE"),
        ("Add mchno-related feature", "MCHNO_CONAM_RECIPE"),
        ("Add scity-related feature", "SCITY_CONAM_RECIPE"),
        ("Add stocn-related feature", "STOCN_CONAM_RECIPE"),
        ("Add mchno/bacno latent feature", [
            ("bacno_latent_features_w_mchno.csv", "bacno"),
            ("bacno_mchno_latent_features.csv", "mchno"),
        ]),
        ("Add time second-level feature on bacno", "HOUR_AGG_SEC_LEVEL_RECIPE_BACNO"),
        ("Add time second-level feature on cano", "HOUR_AGG_SEC_LEVEL_RECIPE_CANO"),
        ("Add time second-level feature on mchno", "HOUR_AGG_SEC_LEVEL_RECIPE_MCHNO"),
        ("Add time second-level feature on csmcu/stocn/scity", "HOUR_AGG_SEC_LEVEL_RECIPE"),
        ("Add time second-level feature on acqic/csmcu/stocn/scity", "HOUR_AGG_SEC_LEVEL_RECIPE_2"),
        ("Add conam-related feature v3", "CONAM_AGG_RECIPE_3"),
        ("Add locdt-related feature v2", "LOCDT_CONAM_RECIPE_2"),
        ("Add conam-related feature v4", "CONAM_AGG_RECIPE_4"),
        ("Add cano/mchno latent feature", [
            ("cano_latent_features_w_mchno.csv", "cano"),
            ("cano_mchno_latent_features.csv", "mchno"),
        ]),
        ("Add cano/locdt latent feature", [
            ("cano_latent_features_w_locdt.csv", "cano"),
            ("cano_locdt_latent_features.csv", "locdt"),
        ]),
        ("Add mchno/locdt latent feature", [
            ("mchno_latent_features_w_locdt.csv", "mchno"),
            ("mchno_locdt_latent_features.csv", "locdt"),
        ]),
        ("Add mchno time aggregate feature", [
            ("average_mchno_time_agg.csv", "txkey"),
        ]),
    ]
    # Experiments that used to live here as commented-out code and are not
    # part of the pipeline: cano/bacno transaction-count ratio
    # (num_transaction), if_conam_zero, DAGMM latent features, bacno/locdt,
    # stocn/locdt, scity/bacno and stocn/bacno latent features, elapsed time
    # between transactions and its aggregates (TIME_ELAPSED_AGG_RECIPE[_2]),
    # number of transactions in the past n days, rolling history stats
    # (rolling_stats_target_by_cols with HISTORY_RECIPE) and the
    # last-x-day mean conam features from the extraction module.

    with timer("Process train/test application"):
        #-------------------------
        # load dataset
        #-------------------------
        df_train = pd.read_csv(args.train_file)
        df_test = pd.read_csv(args.test_file)

        #-------------------------
        # pre-processing
        #-------------------------
        for cat in Configs.CATEGORY:
            df_train[cat] = df_train[cat].astype('category')
            df_test[cat] = df_test[cat].astype('category')

        # Both frames are mutated in place, so the loop variable is enough.
        for df in [df_train, df_test]:
            # Temporary column: loctm as an integer string, then converted by
            # the project helpers into objects exposing .hour/.minute/.second.
            df["loctm_"] = df.loctm.astype(int).astype(str)
            df["loctm_"] = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
            # time-related features
            df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour).astype('category')
            df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
            df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)
            df["hour_range"] = df.loctm_.apply(lambda x: hour_to_range(x.hour)).astype("category")
            # the temporary column is no longer needed
            df.drop(columns=["loctm_"], inplace=True)
            # auxiliary string keys "<locdt>:<hour>:<minute>[:<second>]";
            # they are dropped again at the end of main.
            df["day_hr_min"] = [
                "{}:{}:{}".format(i, j, k)
                for i, j, k in zip(df.locdt, df.loctm_hour_of_day, df.loctm_minute_of_hour)
            ]
            df["day_hr_min_sec"] = [
                "{}:{}:{}:{}".format(i, j, k, z)
                for i, j, k, z in zip(df.locdt, df.loctm_hour_of_day,
                                      df.loctm_minute_of_hour, df.loctm_second_of_min)
            ]

        _log_df_shapes(df_train, df_test)

    for title, spec in steps:
        with timer(title):
            if isinstance(spec, str):
                df_train, df_test = group_target_by_cols(df_train, df_test, getattr(Configs, spec))
            else:
                for filename, key in spec:
                    df_train, df_test = _merge_feature_file(
                        df_train, df_test, feature_dir + filename, key)
            _log_df_shapes(df_train, df_test)

    # NOTE: the timer title below is kept verbatim for log compatibility; no
    # model is trained in this block, it only prunes columns.
    with timer("Run LightGBM with kfold"):
        if args.feature_selection:
            logger.info("==============Feature Selection==============")
            for df in [df_train, df_test]:
                # drop random features (by null hypothesis)
                df.drop(Configs.FEATURE_GRAVEYARD, axis=1, inplace=True, errors='ignore')

                # drop unused features features_with_no_imp_at_least_twice
                df.drop(Configs.FEATURE_USELESSNESS, axis=1, inplace=True, errors='ignore')

                gc.collect()

        # Remove the auxiliary time columns created during pre-processing.
        for df in [df_train, df_test]:
            df.drop(columns=["loctm_hour_of_day",
                             "loctm_minute_of_hour",
                             "loctm_second_of_min",
                             "day_hr_min",
                             "day_hr_min_sec",
                             ], inplace=True)

        _log_df_shapes(df_train, df_test)

    return df_train, df_test

In [4]:
class AttrDict(dict):
    """A ``dict`` whose items are also readable and writable as attributes.

    The instance's ``__dict__`` is the dictionary itself, so ``d.key`` and
    ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        # Accept exactly what dict() accepts (mapping, iterable of pairs, kwargs).
        super().__init__(*args, **kwargs)
        # Alias attribute storage to the mapping itself.
        self.__dict__ = self
        
        
# Run configuration, built directly as an AttrDict so that settings can be
# read as attributes (e.g. args.train_file) as well as by key.
args = AttrDict(
    train_file="/data/yunrui_li/fraud/dataset/train.csv",
    test_file="/data/yunrui_li/fraud/dataset/test.csv",
    result_path="/data/yunrui_li/fraud/fraud_detection/result/submission.csv",
    feature_selection=True,
    feature_importance_plot=True,
    SEED=1030,
    NUM_FOLDS=5,  # 5
    CPU_USE_RATE=1.0,
    STRATIFIED=True,
    TEST_NULL_HYPO=False,
    NUM_LEAVES=31,
    COLSAMPLE_BYTREE=1.0,
    SUBSAMPLE=1.0,
    SUBSAMPLE_FREQ=0,
    MAX_DEPTH=-1,
    REG_ALPHA=0.0,
    REG_LAMBDA=0.0,
    MIN_SPLIT_GAIN=0.0,
    MIN_CHILD_WEIGHT=0.001,
    MAX_BIN=255,
    SCALE_POS_WEIGHT=3.0,
)
args

{'train_file': '/data/yunrui_li/fraud/dataset/train.csv',
 'test_file': '/data/yunrui_li/fraud/dataset/test.csv',
 'result_path': '/data/yunrui_li/fraud/fraud_detection/result/submission.csv',
 'feature_selection': True,
 'feature_importance_plot': True,
 'SEED': 1030,
 'NUM_FOLDS': 5,
 'CPU_USE_RATE': 1.0,
 'STRATIFIED': True,
 'TEST_NULL_HYPO': False,
 'NUM_LEAVES': 31,
 'COLSAMPLE_BYTREE': 1.0,
 'SUBSAMPLE': 1.0,
 'SUBSAMPLE_FREQ': 0,
 'MAX_DEPTH': -1,
 'REG_ALPHA': 0.0,
 'REG_LAMBDA': 0.0,
 'MIN_SPLIT_GAIN': 0.0,
 'MIN_CHILD_WEIGHT': 0.001,
 'MAX_BIN': 255,
 'SCALE_POS_WEIGHT': 3.0}

In [5]:
# Run the full feature pipeline; main() returns the engineered
# (train, test) frames and logs shapes and per-step timings as it goes.
df_train, df_test = main(args)


Train application df shape: (1521787, 29)
Test application df shape: (421665, 28)
Process train/test application - done in 67s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Train application df shape: (1521787, 43)
Test application df shape: (421665, 42)
Add bacno/cano feature - done in 16s
Train application df shape: (1521787, 115)
Test application df shape: (421665, 114)
Add iterm-related feature - done in 93s
Train application df shape: (1521787, 185)
Test application df shape: (421665, 184)
Add conam-related feature - done in 125s
Train application df shape: (1521787, 209)
Test application df shape: (421665, 208)
Add hour-related feature - done in 465s
Train application df shape: (1521787, 210)
Test application df shape: (421665, 209)
Add cano/conam feature - done in 38s
Train application df shape: (1521787, 230)
Test application df shape: (421665, 229)
Add cano/bacno latent feature - done in 4s
Train application df shape: (1521787, 265)
Test application df shape: (421665, 264)
Add locdt-related feature - done in 196s
Train application df shape: (1521787, 272)
Test application df shape: (421665, 271)
Add mchno-related feature - done in 91s
Train appli

In [6]:
# Persist the engineered training frame as CSV, without writing the row index.
df_train.to_csv("../fraud_detection/features/train.csv",index = False)

In [7]:
# Persist the engineered test frame as CSV, without writing the row index.
df_test.to_csv("../fraud_detection/features/test.csv",index = False)

In [8]:
# Display the engineered training frame (the rendered output is truncated).
df_train

Unnamed: 0,acqic,bacno,cano,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,...,mchno_locdt_latent_features_2,mchno_locdt_latent_features_3,mchno_locdt_latent_features_4,mchno_locdt_latent_features_5,mchno_locdt_latent_features_6,mchno_locdt_latent_features_7,mchno_locdt_latent_features_8,mchno_locdt_latent_features_9,agg_mchno_mean_conam_in_past_7_days,agg_mchno_mean_conam_in_past_14_days
0,6881,113261,38038,5,0,N,0,N,N,0,...,0.444603,-1.241370,0.374940,-0.124085,0.665890,0.468656,0.617257,-0.801575,515.868949,515.847982
1,0,134508,45725,5,0,N,2,N,N,0,...,0.493498,-1.215936,0.416892,-0.243898,0.684927,0.503158,0.713435,-0.765226,490.810931,490.684879
2,6881,15408,188328,5,0,N,0,N,N,0,...,0.482227,-1.447159,0.431211,-0.139393,0.728385,0.499592,0.723351,-0.872364,515.868949,515.847982
3,6716,157159,29967,5,62,N,5,N,N,0,...,0.507119,-1.384453,0.441587,-0.249081,0.661592,0.543381,0.754968,-0.826779,1042.617104,1049.036332
4,5975,105985,81305,5,62,N,4,N,N,0,...,0.482227,-1.447159,0.431211,-0.139393,0.728385,0.499592,0.723351,-0.872364,414.305820,394.415259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521782,6322,91008,15189,5,75,Y,8,,,0,...,0.475790,-1.247112,0.503047,-0.262964,0.771106,0.481871,0.776127,-0.830193,799.426085,799.306556
1521783,3226,145107,116252,5,75,Y,8,,,0,...,0.550710,-1.359502,0.412460,-0.260315,0.696833,0.493119,0.776544,-0.829842,306.726232,306.400758
1521784,6769,162168,93598,5,75,Y,8,,,0,...,0.574788,-1.295733,0.465079,-0.299732,0.707674,0.461632,0.648257,-0.859653,1.380000,1.380000
1521785,6032,45406,197460,5,75,Y,2,,,0,...,0.562824,-1.249367,0.392798,-0.133247,0.686282,0.515431,0.664315,-0.825394,1.380000,1.380000
