In [1]:
"""
python3 main.py ../../dataset/train.csv ../../dataset/test.csv ../result/cv_results.csv ../result/submission.csv > ../result/logs.txt

make train

"""
import sys
sys.path.append("../fraud_detection/src/")
import time
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from contextlib import contextmanager
import gc 
from util import s_to_time_format, string_to_datetime, hour_to_range, kfold_lightgbm, kfold_xgb
from util import _time_elapsed_between_last_transactions,time_elapsed_between_last_transactions
from util import num_transaction_in_past_n_days
from time import strftime, localtime
import logging
import sys
from config import Configs
from extraction import merge_and_split_dfs, get_conam_dict_by_day, last_x_day_conam

# Logging: grab the root logger. The level/handler configuration below is
# disabled, so no handler is attached here; the visible cells report progress
# with print() instead.
logger = logging.getLogger()
# Disabled configuration (stdout handler plus a timestamped log file), kept for reference:
# logger.setLevel(logging.INFO)
# logger.addHandler(logging.StreamHandler(sys.stdout))
# #log_file = '{}-{}-{}.log'.format(opt.model_name, opt.dataset, strftime("%y%m%d-%H%M", localtime()))
# log_file = '../result/{}.log'.format(strftime("%y%m%d-%H%M", localtime()))
# logger.addHandler(logging.FileHandler(log_file))

def group_target_by_cols(df_train, df_test, recipe):
    """Attach group-level aggregate columns to the train and test frames.

    Statistics are computed on train and test stacked together and then
    left-merged back onto each frame using the grouping columns as keys.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to enrich. They are not modified in place; merged frames are
        returned (the very same objects when the recipe adds nothing).
    recipe : sequence
        Entries of the form ``(cols, [(target, method), ...])`` where ``cols``
        is the list of grouping columns, ``target`` is the column to aggregate
        and ``method`` is an aggregation name accepted by ``GroupBy.agg``
        (for example ``"count"`` or ``"mean"``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_train`` and ``df_test`` with one extra column per
        ``(target, method)`` pair, named ``<method>_<target>_BY_<col1>_<col2>...``.
    """
    # sort=False keeps the existing column order and avoids the pandas
    # FutureWarning emitted when the two frames do not share the same columns.
    df = pd.concat([df_train, df_test], axis=0, sort=False)
    for entry in recipe:
        cols = entry[0]
        # Build the grouping once per key set and reuse it for every statistic.
        grouped = df.groupby(cols)
        for spec in entry[1]:
            target, method = spec[0], spec[1]
            name_grouped_target = method + "_" + target + "_BY_" + "_".join(cols)
            tmp = grouped[[target]].agg(method).reset_index()
            tmp = tmp.rename(columns={target: name_grouped_target})
            df_train = df_train.merge(tmp, how="left", on=cols)
            df_test = df_test.merge(tmp, how="left", on=cols)
        # Release the per-key-set intermediates before moving on.
        del grouped
        gc.collect()

    return df_train, df_test

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    On normal exit it prints ``"<title> - done in <N>s"`` with the elapsed
    wall-clock seconds rounded to a whole number. Nothing is printed when the
    wrapped block raises.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed_seconds))
    
def main(args):
    """Load the train/test CSVs and derive time-of-day columns on both.

    Parameters
    ----------
    args : object
        Must expose ``train_file`` and ``test_file`` attributes holding the
        CSV paths.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The train and test frames with every ``Configs.CATEGORY`` column cast
        to ``category`` and the columns ``loctm_``, ``loctm_hour_of_day``,
        ``loctm_minute_of_hour``, ``loctm_second_of_min`` and ``hour_range``
        added.
    """
    with timer("Process train/test application"):
        # ---- load dataset ----
        df_train, df_test = pd.read_csv(args.train_file), pd.read_csv(args.test_file)

        # ---- pre-processing ----
        # NOTE(review): each frame is cast on its own, so the category sets of
        # train and test may differ - confirm downstream code tolerates that.
        for column in Configs.CATEGORY:
            for frame in (df_train, df_test):
                frame[column] = frame[column].astype("category")

        for frame in (df_train, df_test):
            # loctm -> int -> str, then through the two util helpers; the
            # resulting values expose .hour / .minute / .second.
            frame["loctm_"] = (
                frame["loctm"]
                .astype(int)
                .astype(str)
                .apply(s_to_time_format)
                .apply(string_to_datetime)
            )
            clock = frame["loctm_"]

            # time-related features
            frame["loctm_hour_of_day"] = clock.apply(lambda ts: ts.hour).astype("category")
            frame["loctm_minute_of_hour"] = clock.apply(lambda ts: ts.minute)
            frame["loctm_second_of_min"] = clock.apply(lambda ts: ts.second)
            frame["hour_range"] = clock.apply(lambda ts: hour_to_range(ts.hour)).astype("category")
            # The helper column "loctm_" is intentionally kept on the frame.

        print("Train application df shape: {}".format(df_train.shape))
        print("Test application df shape: {}".format(df_test.shape))

    # Disabled step (bacno/cano aggregates), kept for reference:
    #     with timer("Add bacno/cano feature"):
    #         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.CONAM_AGG_RECIPE_1)
    #         logger.info("Train application df shape: {}".format(df_train.shape))
    #         logger.info("Test application df shape: {}".format(df_test.shape))

    return df_train, df_test


In [2]:
# Run configuration. Of these keys, the main() defined in this notebook reads
# only "train_file" and "test_file"; the rest are presumably consumed by
# training helpers not shown here - confirm before removing any.
# NOTE(review): the paths are absolute and machine-specific.
args = {
    # input / output locations
    "train_file": "/data/yunrui_li/fraud/dataset/train.csv",
    "test_file": "/data/yunrui_li/fraud/dataset/test.csv",
    "result_path": "/data/yunrui_li/fraud/fraud_detection/result/submission.csv",
    # run switches
    "feature_selection": False,
    "feature_importance_plot": True,
    "SEED": 1030,
    "NUM_FOLDS": 2,  # 5
    "CPU_USE_RATE": 1.0,
    "STRATIFIED": True,
    "TEST_NULL_HYPO": False,
    # model hyper-parameters
    "NUM_LEAVES": 31,
    "COLSAMPLE_BYTREE": 1.0,
    "SUBSAMPLE": 1.0,
    "SUBSAMPLE_FREQ": 0,
    "MAX_DEPTH": -1,
    "REG_ALPHA": 0.0,
    "REG_LAMBDA": 0.0,
    "MIN_SPLIT_GAIN": 0.0,
    "MIN_CHILD_WEIGHT": 0.001,
    "MAX_BIN": 255,
    "SCALE_POS_WEIGHT": 3,
}

class AttrDict(dict):
    """A dict whose items are also reachable as attributes (``d.key``).

    The instance's attribute namespace is the dict itself, so item writes and
    attribute writes are visible through either access style.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Alias the attribute namespace to the mapping so d.key is d["key"].
        self.__dict__ = self
        
# Wrap the plain dict so main() can read settings as attributes (args.train_file).
args = AttrDict(args)
# Load both CSVs and derive the time-of-day columns (about a minute according to the recorded output).
df_train, df_test = main(args)


Train application df shape: (1521787, 28)
Test application df shape: (421665, 27)
Process train/test application - done in 63s


In [6]:
# Build colon-joined string keys on both frames:
#   day_hr_min     -> "<locdt>:<hour>:<minute>"
#   day_hr_min_sec -> "<locdt>:<hour>:<minute>:<second>"
# Components are formatted as-is (no zero padding).
for df in (df_train, df_test):
    df["day_hr_min"] = [
        "{}:{}:{}".format(*stamp)
        for stamp in zip(df.locdt, df.loctm_hour_of_day, df.loctm_minute_of_hour)
    ]
    df["day_hr_min_sec"] = [
        "{}:{}:{}:{}".format(*stamp)
        for stamp in zip(
            df.locdt,
            df.loctm_hour_of_day,
            df.loctm_minute_of_hour,
            df.loctm_second_of_min,
        )
    ]

In [9]:
def group_target_by_cols(df_train, df_test, recipe):
    """Attach group-level aggregate columns to the train and test frames.

    Statistics are computed on train and test stacked together and then
    left-merged back onto each frame using the grouping columns as keys.
    A recipe entry with an empty aggregation list is a no-op.

    NOTE(review): this cell redefines a function of the same name from the
    first cell of the notebook; whichever cell ran last wins.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames to enrich. They are not modified in place; merged frames are
        returned (the very same objects when the recipe adds nothing).
    recipe : sequence
        Entries of the form ``(cols, [(target, method), ...])`` where ``cols``
        is the list of grouping columns, ``target`` is the column to aggregate
        and ``method`` is an aggregation name accepted by ``GroupBy.agg``
        (for example ``"count"`` or ``"mean"``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_train`` and ``df_test`` with one extra column per
        ``(target, method)`` pair, named ``<method>_<target>_BY_<col1>_<col2>...``.
    """
    # sort=False keeps the existing column order and avoids the pandas
    # FutureWarning emitted when the two frames do not share the same columns.
    df = pd.concat([df_train, df_test], axis=0, sort=False)
    for entry in recipe:
        cols = entry[0]
        # Build the grouping once per key set and reuse it for every statistic.
        grouped = df.groupby(cols)
        for spec in entry[1]:
            target, method = spec[0], spec[1]
            name_grouped_target = method + "_" + target + "_BY_" + "_".join(cols)
            tmp = grouped[[target]].agg(method).reset_index()
            tmp = tmp.rename(columns={target: name_grouped_target})
            df_train = df_train.merge(tmp, how="left", on=cols)
            df_test = df_test.merge(tmp, how="left", on=cols)

        # Reduce memory: drop the per-key-set intermediates. Only `grouped` is
        # deleted because it is always bound, even when entry[1] is empty.
        del grouped
        gc.collect()

    return df_train, df_test

In [8]:
# Spot-check the second-level key column ("<locdt>:<hour>:<minute>:<second>", not zero-padded).
df_train["day_hr_min_sec"]

0          33:17:26:52
1           9:10:51:14
2           6:15:24:58
3           5:17:29:46
4           6:18:21:29
              ...     
1521782     4:19:16:42
1521783    13:10:23:38
1521784    29:23:46:18
1521785    24:21:52:18
1521786     13:16:36:3
Name: day_hr_min_sec, Length: 1521787, dtype: object

In [20]:
# Aggregation recipe for group_target_by_cols: for each of csmcu / stocn / scity,
# grouped first by the minute-level key and then by the second-level key, compute
# seven statistics of conam (count, min, max, mean, median, var, sum).
#
# Translated from the original notes: csmcu is the transaction-location currency,
# and the features answer "within the same day and minute (or second): how many
# card swipes, the largest amount, ...".
# NOTE(review): the original notes on the stocn and scity groups both said
# "same account", which looks copy-pasted - confirm what those columns mean.
# NOTE(review): the name ends in BACNO but no entry groups by bacno.
HOUR_AGG_SEC_LEVEL_RECIPE_BACNO = [
    (
        [entity_col, time_key],
        [
            ("conam", stat)
            for stat in ("count", "min", "max", "mean", "median", "var", "sum")
        ],
    )
    for entity_col in ("csmcu", "stocn", "scity")
    for time_key in ("day_hr_min", "day_hr_min_sec")
]
# Merge the aggregate columns described by the recipe into both frames, timing the step.
# NOTE(review): df_train/df_test are rebound to the merged results, so running this
# cell a second time merges the same column names again (pandas would then add
# _x/_y suffixes to the overlapping columns) - run it once per fresh kernel.
# NOTE(review): the timer title mentions bacno, but the recipe groups by
# csmcu / stocn / scity.
with timer("Add time second-level feature on bacno"):
    df_train, df_test = group_target_by_cols(
        df_train, 
        df_test, 
        HOUR_AGG_SEC_LEVEL_RECIPE_BACNO,
        )

    print("Train application df shape: {}".format(df_train.shape))
    print("Test application df shape: {}".format(df_test.shape))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Train application df shape: (1521787, 86)
Test application df shape: (421665, 85)
Add time second-level feature on bacno - done in 275s


In [21]:
# Inspect the column names currently on the training frame.
df_train.columns

Index(['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
       'flbmk', 'flg_3dsmk', 'fraud_ind', 'hcefg', 'insfg', 'iterm', 'locdt',
       'loctm', 'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'txkey',
       'loctm_', 'loctm_hour_of_day', 'loctm_minute_of_hour',
       'loctm_second_of_min', 'hour_range', 'day_hr_min', 'day_hr_min_sec',
       'count_conam_BY_bacno_day_hr_min', 'min_conam_BY_bacno_day_hr_min',
       'max_conam_BY_bacno_day_hr_min', 'mean_conam_BY_bacno_day_hr_min',
       'median_conam_BY_bacno_day_hr_min', 'var_conam_BY_bacno_day_hr_min',
       'sum_conam_BY_bacno_day_hr_min', 'count_conam_BY_bacno_day_hr_min_sec',
       'min_conam_BY_bacno_day_hr_min_sec',
       'max_conam_BY_bacno_day_hr_min_sec',
       'mean_conam_BY_bacno_day_hr_min_sec',
       'median_conam_BY_bacno_day_hr_min_sec',
       'var_conam_BY_bacno_day_hr_min_sec',
       'sum_conam_BY_bacno_day_hr_min_sec', 'count_conam_BY_csmcu_day_hr_min',
       'min_conam_BY_cs

In [22]:
# Raise the display limits so the inspection below is not truncated.
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 500

# Order rows by bacno, then day and time of day. The original key list named
# "loctm_second_of_min" twice; the duplicate was redundant and is dropped.
sort_keys = [
    "bacno",
    "locdt",
    "loctm_hour_of_day",
    "loctm_minute_of_hour",
    "loctm_second_of_min",
]
df_train = df_train.sort_values(by=sort_keys)

# Columns to eyeball for a single bacno value.
# NOTE(review): the *_BY_bacno_* columns are not produced by the recipe cell
# shown in this notebook (it groups by csmcu / stocn / scity only), so on a
# fresh kernel this selection would raise KeyError unless another step adds them.
inspect_cols = [
    "bacno",
    "cano",
    "conam",
    "fraud_ind",
    "locdt",
    "loctm_hour_of_day",
    "loctm_minute_of_hour",
    "loctm_second_of_min",
    "count_conam_BY_bacno_day_hr_min",
    "count_conam_BY_bacno_day_hr_min_sec",
    "count_conam_BY_csmcu_day_hr_min",
]

# Single .loc lookup (row mask + column list) instead of chained indexing.
df_train.loc[df_train["bacno"] == 22313, inspect_cols]


Unnamed: 0,bacno,cano,conam,fraud_ind,locdt,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,count_conam_BY_bacno_day_hr_min,count_conam_BY_bacno_day_hr_min_sec,count_conam_BY_csmcu_day_hr_min
206556,22313,164797,103.48,0,5,1,58,37,1,1,3
206557,22313,164796,465.62,0,10,10,44,45,1,1,49
206558,22313,164797,468.74,0,14,5,50,11,5,2,6
206559,22313,164797,467.7,0,14,5,50,11,5,2,6
206560,22313,164797,473.87,0,14,5,50,12,5,1,6
206561,22313,164797,519.1,0,14,5,50,13,5,2,6
206562,22313,164797,492.59,0,14,5,50,13,5,2,6
206563,22313,164797,576.19,0,14,6,20,51,7,2,8
206564,22313,164797,701.45,0,14,6,20,51,7,2,8
206565,22313,164797,701.45,0,14,6,20,52,7,4,8


In [19]:
# Frequency of each mcc value in the training frame, most frequent first.
df_train.mcc.value_counts()

251    193332
247    141106
263    108297
457    103307
292     74036
264     65846
343     60033
451     44515
380     43252
289     39989
250     38514
248     36090
191     34380
257     33743
192     30718
349     26438
373     21512
453     20432
203     18754
277     17807
209     16483
207     14535
286     13719
291     12925
306     12584
270     12241
305     11322
281     11208
294     10050
432      9626
189      9584
299      8508
397      7694
288      7551
245      6542
296      6525
388      6317
411      6166
322      6140
319      6107
337      5832
275      5438
340      5345
317      5152
329      5134
441      5002
459      4810
280      4802
421      4304
255      3984
201      3978
272      3970
446      3940
212      3927
57       3650
210      3610
334      3529
218      3374
357      3363
417      3132
318      3062
295      3028
413      2977
193      2599
381      2559
367      2551
300      2529
374      2513
408      2490
339      2133
309      1952
205   