In [1]:
"""
python3 main.py ../../dataset/train.csv ../../dataset/test.csv ../result/cv_results.csv ../result/submission.csv > ../result/logs.txt

make train

"""
import sys
sys.path.append("../fraud_detection/src/")
import time
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from contextlib import contextmanager
import gc 
from util import s_to_time_format, string_to_datetime, hour_to_range, kfold_lightgbm, kfold_xgb
from util import _time_elapsed_between_last_transactions,time_elapsed_between_last_transactions
from util import num_transaction_in_past_n_days
from time import strftime, localtime
import logging
import sys
from config import Configs
from extraction import merge_and_split_dfs, get_conam_dict_by_day, last_x_day_conam

# logging
logger = logging.getLogger()
# logger.setLevel(logging.INFO)
# logger.addHandler(logging.StreamHandler(sys.stdout))
# #log_file = '{}-{}-{}.log'.format(opt.model_name, opt.dataset, strftime("%y%m%d-%H%M", localtime()))
# log_file = '../result/{}.log'.format(strftime("%y%m%d-%H%M", localtime()))
# logger.addHandler(logging.FileHandler(log_file))

def group_target_by_cols(df_train, df_test, recipe):
    """Attach group-level aggregate features to both the train and test frames.

    Aggregates are computed over the concatenation of ``df_train`` and
    ``df_test`` and then left-merged back onto each frame on the grouping
    columns. Every (cols, target, method) combination adds one column named
    ``"<method>_<target>_BY_<col1>_<col2>..."``.

    Args:
        df_train: training DataFrame.
        df_test: test DataFrame.
        recipe: sequence of entries whose first item is a list of grouping
            column names and whose second item is a sequence of
            ``(target, method)`` pairs. ``method`` must be a string because it
            is used both as the aggregation passed to ``GroupBy.agg`` and as
            part of the new column name.

    Returns:
        Tuple ``(df_train, df_test)`` with the aggregate columns appended.
        An empty ``recipe`` returns the inputs unchanged.
    """
    df = pd.concat([df_train, df_test], axis=0)
    for entry in recipe:
        cols, targets = entry[0], entry[1]
        for spec in targets:
            target, method = spec[0], spec[1]
            name_grouped_target = method + "_" + target + '_BY_' + '_'.join(cols)
            grouped = (
                df[cols + [target]]
                .groupby(cols)
                .agg(method)
                .reset_index()
                .rename(columns={target: name_grouped_target})
            )
            df_train = df_train.merge(grouped, how='left', on=cols)
            df_test = df_test.merge(grouped, how='left', on=cols)
    # Release the concatenated copy; unlike a per-iteration temporary it always
    # exists, so this cannot fail when `recipe` is empty.
    del df
    gc.collect()

    return df_train, df_test

@contextmanager
def timer(title):
    """Context manager that prints the wall-clock duration of the wrapped block.

    Args:
        title: label printed in front of the elapsed time.

    The elapsed time is printed even when the wrapped block raises, so a
    failing stage still reports how long it ran; the exception then propagates.
    """
    t0 = time.time()
    try:
        yield
    finally:
        print("{} - done in {:.0f}s".format(title, time.time() - t0))
    
def main(args):
    """Load the train/test CSVs and build the feature frames.

    Steps performed by the active (non-commented) code:
      1. Read ``args.train_file`` and ``args.test_file``.
      2. Cast every column listed in ``Configs.CATEGORY`` to ``category``.
      3. Derive hour / minute / second / hour-range features from ``loctm``.
      4. Add ``num_transaction_in_past_{n}_days`` for several window sizes.

    The commented-out ``with timer(...)`` stages are optional feature groups
    that can be re-enabled individually.

    Args:
        args: object exposing ``train_file`` and ``test_file`` attributes.

    Returns:
        Tuple ``(df_train, df_test)``; ``df_test`` has no ``fraud_ind`` column.
    """
    with timer("Process train/test application"):
        #-------------------------
        # load dataset
        #-------------------------
        df_train = pd.read_csv(args.train_file)
        df_test = pd.read_csv(args.test_file)

        #-------------------------
        # pre-processing
        #-------------------------

        for cat in Configs.CATEGORY:
            df_train[cat] = df_train[cat].astype('category') #.cat.codes
            df_test[cat] = df_test[cat].astype('category')

        for df in [df_train, df_test]:
            # pre-processing: temporary parsed-time column, dropped again below
            df["loctm_"] = df.loctm.astype(int).astype(str)
            df.loctm_ = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
            # # time-related feature
            df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour).astype('category')
            df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
            df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)
            # df["loctm_absolute_time"] = [h*60+m for h,m in zip(df.loctm_hour_of_day,df.loctm_minute_of_hour)]
            df["hour_range"] = df.loctm_.apply(lambda x: hour_to_range(x.hour)).astype("category")
            # removed the columns no need
            df.drop(columns = ["loctm_"], axis = 1, inplace = True)
        print("Train application df shape: {}".format(df_train.shape))
        print("Test application df shape: {}".format(df_test.shape))

#     with timer('Add time-aggregate features'):

#         df, split_df = merge_and_split_dfs(df_train, df_test)
#         conam_dict = get_conam_dict_by_day(df)

#         df['last_3_day_mean_conam_per_day'] = last_x_day_conam(3, df, conam_dict)
#         df['last_7_day_mean_conam_per_day'] = last_x_day_conam(7, df, conam_dict)
#         df['last_10_day_mean_conam_per_day'] = last_x_day_conam(10, df, conam_dict)
#         df['last_14_day_mean_conam_per_day'] = last_x_day_conam(14, df, conam_dict)
#         df['last_30_day_mean_conam_per_day'] = last_x_day_conam(30, df, conam_dict)
#         df['last_45_day_mean_conam_per_day'] = last_x_day_conam(45, df, conam_dict)
#         df['last_60_day_mean_conam_per_day'] = last_x_day_conam(60, df, conam_dict)

#         df_train, df_test = split_df(df)
#         print("Train application df shape: {}".format(df_train.shape))
#         print("Test application df shape: {}".format(df_test.shape))

#     with timer("Add bacno/cano feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.CONAM_AGG_RECIPE_1)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add iterm-related feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.ITERM_AGG_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add conam-related feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.CONAM_AGG_RECIPE_2)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add hour-related feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.HOUR_AGG_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add cano/conam feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.CANO_CONAM_COUNT_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add cano/bacno latent feature"):
#         df = pd.read_csv("../fraud_detection/features/bacno_latent_features.csv")
#         df_train = df_train.merge(df, on = "bacno", how = "left")
#         df_test = df_test.merge(df, on = "bacno", how = "left")
#         df = pd.read_csv("../fraud_detection/features/cano_latent_features.csv")
#         df_train = df_train.merge(df, on = "cano", how = "left")
#         df_test = df_test.merge(df, on = "cano", how = "left")

#         print("Train application df shape: {}".format(df_train.shape))
#         print("Test application df shape: {}".format(df_test.shape))

#     with timer("Add locdt-related feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.LOCDT_CONAM_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add mchno-related feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.MCHNO_CONAM_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add scity-related feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.SCITY_CONAM_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add stocn-related feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.STOCN_CONAM_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add mchno/bacno latent feature"):
#         df = pd.read_csv("../fraud_detection/features/bacno_latent_features_w_mchno.csv")
#         df_train = df_train.merge(df, on = "bacno", how = "left")
#         df_test = df_test.merge(df, on = "bacno", how = "left")
#         df = pd.read_csv("../fraud_detection/features/mchno_latent_features.csv")
#         df_train = df_train.merge(df, on = "mchno", how = "left")
#         df_test = df_test.merge(df, on = "mchno", how = "left")

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add scity/bacno latent feature"):
#         df = pd.read_csv("../fraud_detection/features/bacno_latent_features_w_scity.csv")
#         df_train = df_train.merge(df, on = "bacno", how = "left")
#         df_test = df_test.merge(df, on = "bacno", how = "left")
#         df = pd.read_csv("../fraud_detection/features/scity_latent_features.csv")
#         df_train = df_train.merge(df, on = "scity", how = "left")
#         df_test = df_test.merge(df, on = "scity", how = "left")

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add stocn/bacno latent feature"):
#         df = pd.read_csv("../fraud_detection/features/bacno_latent_features_w_stocn.csv")
#         df_train = df_train.merge(df, on = "bacno", how = "left")
#         df_test = df_test.merge(df, on = "bacno", how = "left")
#         df = pd.read_csv("../fraud_detection/features/stocn_latent_features.csv")
#         df_train = df_train.merge(df, on = "stocn", how = "left")
#         df_test = df_test.merge(df, on = "stocn", how = "left")

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

#     with timer("Add elapsed time feature"):
#         df = pd.concat([df_train, df_test], axis = 0)
#         df.sort_values(by = ["bacno","locdt"], inplace = True)

#         df["time_elapsed_between_last_transactions"] = df[["bacno","locdt"]] \
#         .groupby("bacno").apply(_time_elapsed_between_last_transactions).values

#         df_train = df[~df.fraud_ind.isnull()]
#         df_test = df[df.fraud_ind.isnull()]

#         df_test.drop(columns = ["fraud_ind"], axis = 1, inplace = True)
#         del df
#         gc.collect()

#         df_train["time_elapsed_between_last_transactions"] = df_train[["bacno","locdt","time_elapsed_between_last_transactions"]] \
#         .groupby(["bacno","locdt"]).apply(time_elapsed_between_last_transactions).values

#         df_test["time_elapsed_between_last_transactions"] = df_test[["bacno","locdt","time_elapsed_between_last_transactions"]] \
#         .groupby(["bacno","locdt"]).apply(time_elapsed_between_last_transactions).values

#         print("Train application df shape: {}".format(df_train.shape))
#         print("Test application df shape: {}".format(df_test.shape))

    with timer("Add historical-related feature"):
        df = pd.concat([df_train, df_test], axis = 0)
        # Sorting by bacno first keeps the frame in the same group order that
        # groupby("bacno") emits, which the positional `.values` assignment
        # below relies on.
        # NOTE(review): this also assumes the helper returns one value per row
        # in the group's row order — confirm against util.
        df.sort_values(by = ["bacno","locdt"], inplace = True)

        for past_n_days in [2,3,4,5,6,7,14,30]:
            df["num_transaction_in_past_{}_days".format(past_n_days)] = df[["bacno","locdt"]].groupby("bacno")\
            .apply(lambda x: num_transaction_in_past_n_days(x,past_n_days)).values

        # Rows with a missing label are treated as the test rows.
        is_test_row = df.fraud_ind.isnull()
        df_train = df[~is_test_row]
        # Non-inplace drop: dropping in place on a slice of `df` triggers
        # pandas' SettingWithCopyWarning.
        df_test = df[is_test_row].drop(columns = ["fraud_ind"])
        del df
        gc.collect()

        print("Train application df shape: {}".format(df_train.shape))
        print("Test application df shape: {}".format(df_test.shape))


#     with timer("Add elapsed time agg feature"):
#         df_train, df_test = group_target_by_cols(df_train, df_test, Configs.TIME_ELAPSED_AGG_RECIPE)

#         logger.info("Train application df shape: {}".format(df_train.shape))
#         logger.info("Test application df shape: {}".format(df_test.shape))

    return df_train, df_test




In [2]:
#NUM_FOLDS = 5
#CPU_USE_RATE = 0.8
#STRATIFIED = True
#TEST_NULL_HYPO = False
#ITERATION = (80 if TEST_NULL_HYPO else 1)

args = {
 "train_file":"/data/yunrui_li/fraud/dataset/train.csv",
 "test_file":"/data/yunrui_li/fraud/dataset/test.csv",
 "result_path":"/data/yunrui_li/fraud/fraud_detection/result/submission.csv",
 "feature_selection":False,
 "feature_importance_plot": True,
 "SEED": 1030,
 "NUM_FOLDS": 2, # 5
 "CPU_USE_RATE":1.0,
 "STRATIFIED": True,
 "TEST_NULL_HYPO":False,
 "NUM_LEAVES":31,
 "COLSAMPLE_BYTREE":1.0,
 "SUBSAMPLE": 1.0,
 "SUBSAMPLE_FREQ": 0,
 "MAX_DEPTH": -1,
 "REG_ALPHA": 0.0,
 "REG_LAMBDA": 0.0,
 "MIN_SPLIT_GAIN": 0.0,
 "MIN_CHILD_WEIGHT": 0.001,
 "MAX_BIN": 255,
 "SCALE_POS_WEIGHT": 3
    
}

class AttrDict(dict):
    """Dictionary whose entries are also reachable as attributes.

    ``d.key`` and ``d["key"]`` read and write the same storage, which lets a
    plain dict stand in for an ``argparse`` namespace.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Point the instance namespace at the mapping itself so attribute
        # access and item access share one store.
        self.__dict__ = self

In [3]:
args = AttrDict(args)
args

{'train_file': '/data/yunrui_li/fraud/dataset/train.csv',
 'test_file': '/data/yunrui_li/fraud/dataset/test.csv',
 'result_path': '/data/yunrui_li/fraud/fraud_detection/result/submission.csv',
 'feature_selection': False,
 'feature_importance_plot': True,
 'SEED': 1030,
 'NUM_FOLDS': 2,
 'CPU_USE_RATE': 1.0,
 'STRATIFIED': True,
 'TEST_NULL_HYPO': False,
 'NUM_LEAVES': 31,
 'COLSAMPLE_BYTREE': 1.0,
 'SUBSAMPLE': 1.0,
 'SUBSAMPLE_FREQ': 0,
 'MAX_DEPTH': -1,
 'REG_ALPHA': 0.0,
 'REG_LAMBDA': 0.0,
 'MIN_SPLIT_GAIN': 0.0,
 'MIN_CHILD_WEIGHT': 0.001,
 'MAX_BIN': 255,
 'SCALE_POS_WEIGHT': 3}

In [4]:
df_train, df_test = main(args)
#assert 1==0

Train application df shape: (1521787, 27)
Test application df shape: (421665, 26)
Process train/test application - done in 63s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Train application df shape: (1521787, 35)
Test application df shape: (421665, 34)
Add historical-related feature - done in 295s


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 500

past_n_days = 2
df_train[df_train.bacno == 22313][["bacno",
                                   "conam","locdt","fraud_ind","num_transaction_in_past_{}_days".format(past_n_days)]]

Unnamed: 0,bacno,conam,locdt,fraud_ind,num_transaction_in_past_2_days
325442,22313,103.48,5,0.0,0
62624,22313,465.62,10,0.0,0
260554,22313,519.1,14,0.0,0
323567,22313,492.59,14,0.0,0
505497,22313,468.74,14,0.0,0
562900,22313,467.7,14,0.0,0
568596,22313,701.45,14,0.0,0
622879,22313,738.75,14,0.0,0
640117,22313,576.19,14,0.0,0
873689,22313,716.83,14,0.0,0


In [None]:
# capture temporal dependencies of conam (in a typical fraud run, the fraudulent transactions have very low amounts, and then the amounts go back up)
# e.g. df_train.bacno == 131439
# e.g. df_train.bacno == 22313 (especially obvious)

In [None]:
df_train.groupby([""])

In [11]:
df_train[df_train.fraud_ind == 1].bacno.sample()

413449    22313
Name: bacno, dtype: int64

In [20]:
 df_test[['txkey',"bacno","locdt"]]

Unnamed: 0,txkey,bacno,locdt
182946,1751253,0,102
412770,1688453,5,119
97049,635437,6,99
97048,417354,6,113
223371,888505,10,91
...,...,...,...
299285,1254903,163885,108
299287,899458,163885,113
299282,1329852,163885,115
148335,300674,163886,109


In [None]:
# time-series split

In [31]:
df_train_ = df_train.sort_values(by = "locdt")

In [33]:
feats = ["bacno","locdt","fraud_ind"]
df_train_[feats]

Unnamed: 0,bacno,locdt,fraud_ind
484091,121415,1,0.0
1185472,42906,1,0.0
1009736,42906,1,0.0
496888,42906,1,0.0
374897,42906,1,0.0
...,...,...,...
267970,91258,90,0.0
99342,17258,90,0.0
1033135,17264,90,0.0
798342,91308,90,0.0


In [39]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# df_train_ is sorted by locdt, so positional splits are chronological.
# NOTE(review): `feats` as defined above includes "fraud_ind", so X contains
# the label itself — fine for inspecting split sizes, not for fitting a model.
X = df_train_[feats]
y = df_train_["fraud_ind"]
tscv = TimeSeriesSplit(n_splits=5)
print(tscv)

for train_index, test_index in tscv.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    print ("NUM_TRAIN",len(train_index))
    print ("NUM_TEST",len(test_index))
    # The split yields positions, and the frame keeps its original (shuffled)
    # index labels after sort_values, so everything must be taken with .iloc.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

TimeSeriesSplit(max_train_size=None, n_splits=5)
NUM_TRAIN 253632
NUM_TEST 253631
NUM_TRAIN 507263
NUM_TEST 253631
NUM_TRAIN 760894
NUM_TEST 253631
NUM_TRAIN 1014525
NUM_TEST 253631
NUM_TRAIN 1268156
NUM_TEST 253631


In [36]:
X_train

Unnamed: 0,bacno,locdt,fraud_ind
484091,121415,1,0.0
1185472,42906,1,0.0
1009736,42906,1,0.0
496888,42906,1,0.0
374897,42906,1,0.0
...,...,...,...
253469,55262,75,0.0
732289,87473,75,0.0
974156,87473,75,0.0
1255827,93555,75,0.0


In [41]:
df_train[df_train.locdt > 70][feats]

Unnamed: 0,bacno,locdt,fraud_ind
545593,1,79,0.0
471824,2,71,0.0
1484119,2,71,0.0
526545,2,72,0.0
569282,2,72,0.0
...,...,...,...
748156,163883,80,0.0
888236,163883,80,0.0
247174,163883,83,0.0
1018048,163883,83,0.0


In [43]:
df_train[df_train.bacno == 163883][feats]

Unnamed: 0,bacno,locdt,fraud_ind
953092,163883,1,0.0
1506892,163883,1,0.0
224327,163883,2,0.0
1047638,163883,2,0.0
897478,163883,16,0.0
1162249,163883,16,0.0
394870,163883,26,0.0
1358644,163883,26,0.0
688748,163883,27,0.0
1352654,163883,28,0.0


In [79]:
def unique_transaction(x):
    """Return 1 when the group ``x`` has more than one row, otherwise 0.

    NOTE(review): despite the name (and the ``only_one_transaction`` column
    the result is later stored under), 1 means the group has *multiple*
    rows — confirm the intended polarity before reading the ratios.
    """
    return 1 if len(x) > 1 else 0

df = pd.concat([df_train, df_test], axis = 0)
test = df.groupby("bacno").apply(unique_transaction)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [80]:
train_user = set(df_train.bacno.unique())
test_user = set(df_test.bacno.unique())

In [82]:
test = test.to_frame("only_one_transaction").reset_index()

In [86]:
test[test.bacno.isin(train_user)].only_one_transaction.value_counts(normalize = True)

1    0.914088
0    0.085912
Name: only_one_transaction, dtype: float64

In [87]:
test[test.bacno.isin(test_user)].only_one_transaction.value_counts(normalize = True)

1    0.790616
0    0.209384
Name: only_one_transaction, dtype: float64

In [None]:
# How to inspect a feature (case study)

In [15]:
pd.options.display.max_rows = 100
df_train[df_train.bacno == 99141][["bacno","cano","locdt","conam","time_elapsed_between_last_transactions",
                                    "fraud_ind",
                                    #"min_time_elapsed_between_last_transactions_BY_conam",
                                    #"max_time_elapsed_between_last_transactions_BY_conam",
                                    #"mean_time_elapsed_between_last_transactions_BY_bacno_hour_range"
                                   ]
                                  ]
# For the same account, has the card number changed?
# For the same card number, how many transactions share the same amount?

Unnamed: 0,bacno,cano,locdt,conam,time_elapsed_between_last_transactions,fraud_ind
993764,99141,145591,2,915.01,-1.0,0.0
1306615,99141,145591,6,734.84,4.0,0.0
66691,99141,145591,9,465.62,3.0,0.0
415133,99141,145591,12,513.8,3.0,0.0
125467,99141,145591,13,513.8,1.0,0.0
509921,99141,145591,13,932.27,1.0,0.0
697468,99141,145591,13,513.8,1.0,0.0
737057,99141,145591,17,881.97,4.0,0.0
629442,99141,145591,19,513.8,2.0,0.0
80501,99141,145591,21,465.38,2.0,0.0


In [7]:
df_test.shape

(421665, 79)

In [None]:
pd.options.display.max_columns = 300
# len(df_train[df_train.reconstruction_error.isnull()])
df_train[["bacno","locdt","fraud_ind","time_elapsed_between_last_transactions"]]

In [12]:
test_case = df_train[df_train.bacno == 163884][["bacno","locdt","fraud_ind","time_elapsed_between_last_transactions"]]

In [13]:
test_case

Unnamed: 0,bacno,locdt,fraud_ind,time_elapsed_between_last_transactions
47582,163884,14,0.0,
884424,163884,14,0.0,
984588,163884,14,0.0,
374166,163884,16,0.0,2.0
662712,163884,17,0.0,1.0
164387,163884,23,0.0,6.0
113023,163884,25,0.0,2.0
240983,163884,30,0.0,5.0
623366,163884,32,0.0,2.0
725381,163884,38,0.0,6.0


In [14]:
test_case = df_test[df_test.bacno == 163884][["bacno","locdt","fraud_ind",
                                              "time_elapsed_between_last_transactions"]]

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,hcefg,hour_range,insfg,iterm,locdt,loctm,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,time_elapsed_between_last_transactions
182946,6189,0,114775,929.46,5,62,N,2,N,N,5,morning,N,0,102,100045.0,10,0,45,343,87081,N,5817,102,0,1751253,
412770,5975,5,49266,225.84,5,62,N,4,N,N,5,morning,N,0,119,101355.0,10,13,55,263,92577,N,5817,102,0,1688453,
97049,5112,6,213186,513.80,5,0,N,0,N,N,5,early_morning,N,0,99,72127.0,7,21,27,453,544,N,0,102,0,635437,
97048,6769,6,213186,210.36,5,62,N,5,N,N,5,night,N,0,113,202621.0,20,26,21,251,84157,N,5817,102,0,417354,14.0
223371,5982,10,59511,419.79,5,62,N,4,N,N,5,afternoon,N,0,91,151539.0,15,15,39,251,24881,N,3297,102,0,888505,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299285,6677,163885,13454,561.61,5,62,N,4,N,N,5,afternoon,N,0,108,151015.0,15,10,15,292,27266,N,6032,102,0,1254903,1.0
299287,6231,163885,13454,794.93,5,62,Y,2,N,Y,5,noon,N,0,113,123048.0,12,30,48,191,20434,N,5817,102,0,899458,5.0
299282,6231,163885,13454,794.93,5,62,Y,2,N,Y,5,noon,N,0,115,140454.0,14,4,54,191,20434,N,5817,102,0,1329852,2.0
148335,6769,163886,42854,2403.67,4,62,N,2,N,N,5,morning,N,0,109,90150.0,9,1,50,343,78638,N,5817,102,0,300674,


In [17]:
df_test[df_test.bacno == 0]

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,hcefg,hour_range,insfg,iterm,locdt,loctm,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,time_elapsed_between_last_transactions
182946,6189,0,114775,929.46,5,62,N,2,N,N,5,morning,N,0,102,100045.0,10,0,45,343,87081,N,5817,102,0,1751253,


In [22]:
df_test[df_test.bacno == 6][["bacno","locdt","time_elapsed_between_last_transactions"]]
# add one column to indicate whether this bacno is a new user
# fill NaN with -1 (optional)

Unnamed: 0,bacno,locdt,time_elapsed_between_last_transactions
97049,6,99,
97048,6,113,14.0


In [38]:
df_test[df_test.bacno == 26056][["bacno","locdt","time_elapsed_between_last_transactions"]]

Unnamed: 0,bacno,locdt,time_elapsed_between_last_transactions
360110,26056,93,
360118,26056,93,
360115,26056,94,1.0
360116,26056,94,1.0
360113,26056,95,1.0
360114,26056,98,3.0
360117,26056,98,3.0
360112,26056,114,16.0
360119,26056,115,1.0
360111,26056,120,5.0


In [39]:
df_test.sample()

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,hcefg,hour_range,insfg,iterm,locdt,loctm,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,time_elapsed_between_last_transactions
162746,6769,9541,12155,297.11,5,62,N,5,N,N,5,afternoon,N,0,101,163555.0,16,35,55,251,78074,N,5812,102,0,98115,6.0


In [40]:
# new-user feature (worth adding if the proportion of new users is high)
df = pd.concat([df_train, df_test], axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [45]:
history_user = set(df_train.bacno.unique())

In [46]:
df_test["is_new_user"] = [0 if user in history_user else 1 for user in df_test.bacno]

In [48]:
df_test.is_new_user.value_counts(normalize = True)

1    0.948516
0    0.051484
Name: is_new_user, dtype: float64

In [53]:
# Per new-user account: 1 if it has more than one test transaction, else 0.
# Computed from the group sizes directly, so no helper function has to be
# defined under the same name as the resulting Series.
unique_transaction = (
    df_test[df_test.is_new_user == 1]
    .groupby("bacno")
    .size()
    .gt(1)
    .astype(int)
)

In [56]:
unique_transaction.value_counts(normalize = True)

1    0.78098
0    0.21902
dtype: float64

In [None]:
def _time_elapsed_between_last_transactions(df):
    if len(df) > 1:
        df.time_elapsed_between_last_transactions = [df.time_elapsed_between_last_transactions.iloc[0] for i in range(len(df))]
        return df
    else:
        return df
test_case.groupby(["bacno","locdt"]).apply(_time_elapsed_between_last_transactions)

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
import multiprocessing
import gc

# Use the configured seed so fold assignment and model training are
# reproducible across kernel restarts (a time-based seed is not).
seed = args.SEED

# Cross validation model. The option keys in `args` are upper-case
# (STRATIFIED / NUM_FOLDS); lower-case attribute names do not exist.
if args.STRATIFIED:
    folds = StratifiedKFold(n_splits=args.NUM_FOLDS, shuffle=True, random_state=seed)
else:
    folds = KFold(n_splits=args.NUM_FOLDS, shuffle=True, random_state=seed)
# Create arrays and dataframes to store results
oof_preds = np.zeros(df_train.shape[0])
#train_preds = np.zeros(df_train.shape[0])
sub_preds = np.zeros(df_test.shape[0])
feature_importance_df = pd.DataFrame()
feats = [f for f in df_train.columns if f not in ["fraud_ind"]]

In [None]:
len(feats)

In [None]:
import lightgbm as lgb

# NOTE(review): `lgb_f1_score` is not defined or imported anywhere in the
# visible notebook — presumably it lives in util; confirm and import it.
# From its use below it is expected to return a tuple whose element [1] is
# the F1 value.

if args.TEST_NULL_HYPO:
    # shuffling our label for feature selection
    df_train['fraud_ind'] = df_train['fraud_ind'].copy().sample(frac=1.0).values
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[feats], df_train['fraud_ind'])):
    train_x, train_y = df_train[feats].iloc[train_idx], df_train['fraud_ind'].iloc[train_idx]
    valid_x, valid_y = df_train[feats].iloc[valid_idx], df_train['fraud_ind'].iloc[valid_idx]
    # LightGBM parameters found by Bayesian optimization
    if args.TEST_NULL_HYPO:
        clf = lgb.LGBMClassifier(
            nthread=int(multiprocessing.cpu_count()*args.CPU_USE_RATE),
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=127,
            max_depth=args.MAX_DEPTH,
            silent=-1,
            verbose=-1,
            random_state=seed,
            )
    else:
        clf = lgb.LGBMClassifier(
            nthread=int(multiprocessing.cpu_count()*args.CPU_USE_RATE),
            n_estimators=10000,
            learning_rate=0.1, # 0.02
            num_leaves=args.NUM_LEAVES,
            colsample_bytree=args.COLSAMPLE_BYTREE,
            subsample=args.SUBSAMPLE,
            subsample_freq=args.SUBSAMPLE_FREQ,
            max_depth=args.MAX_DEPTH,
            reg_alpha=args.REG_ALPHA,
            reg_lambda=args.REG_LAMBDA,
            min_split_gain=args.MIN_SPLIT_GAIN,
            min_child_weight=args.MIN_CHILD_WEIGHT,
            max_bin=args.MAX_BIN,
            silent=-1,
            verbose=-1,
            random_state=seed,
            scale_pos_weight=args.SCALE_POS_WEIGHT
            )
    clf.fit(train_x,
            train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= lgb_f1_score,
            verbose= True,
            early_stopping_rounds= 100,
            categorical_feature='auto') # early_stopping_rounds= 200
    # probability of belonging to class 1 (fraud)
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    #train_preds[train_idx] += clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    # test predictions are averaged over the folds
    sub_preds += clf.predict_proba(df_test[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('Fold %2d val f1-score : %.6f' % (n_fold + 1, lgb_f1_score(valid_y, oof_preds[valid_idx])[1]))
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()


In [None]:
int(multiprocessing.cpu_count()*args.CPU_USE_RATE)
(0.661943+0.679792)/2

In [None]:
pd.Series(train_preds).plot(kind = "hist")

In [None]:
pd.Series(np.round(train_preds)).value_counts()

In [None]:
#print('---------------------------------------\nOver-folds train f1-score %.6f' % lgb_f1_score(df_train['fraud_ind'], train_preds)[1])
print('---------------------------------------\n')
over_folds_val_score = lgb_f1_score(df_train['fraud_ind'], oof_preds)[1]
print('Over-folds val f1-score %.6f\n---------------------------------------' % over_folds_val_score)
# Write submission file and plot feature importance
df_test.loc[:,'fraud_ind'] = np.round(sub_preds)
df_test[['txkey', 'fraud_ind']].to_csv(args.result_path, index= False)


In [None]:
fold_importance_df

In [None]:
# display_importances(feature_importance_df)
feature_importance_df_median = feature_importance_df[["feature", "importance"]].groupby("feature").median().sort_values(by="importance", ascending=False)
useless_features_df = feature_importance_df_median.loc[feature_importance_df_median['importance'] == 0]
feature_importance_df_mean = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)


In [None]:
useless_features_df

In [None]:
feature_importance_df_mean

In [None]:
# Display/plot feature importance
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    Args:
        feature_importance_df_: DataFrame with ``feature`` and ``importance``
            columns, one row per (feature, fold).

    Features are ranked by their mean importance across rows; the bar plot is
    written to ``lgbm_importances.png`` in the working directory.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    # tight_layout must be called; referencing the attribute alone is a no-op.
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# Pass the frame accumulated over all folds; `fold_importance_df` only holds
# the last fold, which contradicts the "avg over folds" title.
display_importances(feature_importance_df)

In [None]:
if args.TEST_NULL_HYPO:
    feature_importance_df_mean.to_csv("feature_importance-null_hypo.csv", index = True)
else:
    feature_importance_df_mean.to_csv("feature_importance.csv", index = True)
    useless_features_list = useless_features_df.index.tolist()
    print('Useless features: \'' + '\', \''.join(useless_features_list) + '\'')

In [None]:
import lightgbm as lgb

# Reload the raw training data for a quick scale_pos_weight grid search.
df_train = pd.read_csv(args.train_file)

# The categorical column list lives on Configs (same list used in main()).
for cat in Configs.CATEGORY:
    df_train[cat] = df_train[cat].astype('category')#.cat.codes

y_train = df_train['fraud_ind']
x_train = df_train.drop('fraud_ind', axis=1)

# Hold out 20% as the early-stopping evaluation set passed to fit() later.
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)

estimator = lgb.LGBMClassifier(num_leaves=31)

param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100],
        'scale_pos_weight': [3, 5, 70, 100]
        }



In [None]:
gbm = GridSearchCV(estimator, 
                   param_grid, 
                   cv = 2,
                   scoring='f1', 
                   return_train_score = True,
                   n_jobs = -1)

In [None]:
gbm

In [None]:
help(gbm)

In [None]:
gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=lgb_f1_score, early_stopping_rounds=5, categorical_feature='auto')


In [None]:
pd.options.display.max_colwidth = 100
# Sort best-first so that row 0 really is the best parameter set
# (an ascending sort would put the worst configuration there).
df = pd.DataFrame(gbm.cv_results_).sort_values(by = "mean_test_score", ascending = False)
mean_test_score = df.iloc[0].mean_test_score
std_test_score = df.iloc[0].std_test_score
# Report the number of folds the search actually used instead of a fixed "10".
print ("{}-fold result on best paras : {} with +/- {}".format(gbm.n_splits_, round(mean_test_score, 4), round(std_test_score,4)))

In [None]:
gbm.best_estimator_

In [None]:
gbm.best_score_

In [None]:
gbm.refit_time_

In [None]:
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 100
cv_results = pd.read_csv("/data/yunrui_li/fraud/fraud_detection/result/cv_results.csv")
cv_results.sort_values(by = "mean_test_score", ascending = False)

# historical data

In [14]:
from collections import defaultdict

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.metrics import f1_score
import numpy as np
#from pandarallel import pandarallel

from statistics import mean 
def merge_and_split_dfs(df_train, df_test):
    """Stack train and test into one frame and return it with an "un-stacker".

    Args:
        df_train: training DataFrame (its rows come first in the merged frame).
        df_test: test DataFrame.

    Returns:
        ``(merged, split_df)`` where ``merged`` is the concatenation with the
        old index kept in an ``index`` column, and ``split_df(frame)`` drops
        that ``index`` column and cuts ``frame`` back by position into
        ``(train_part, test_part)``; ``fraud_ind`` is removed from the test part.
    """
    n_train_rows = len(df_train)
    merged = pd.concat([df_train, df_test])
    merged = merged.reset_index()

    def split_df(df):
        frame = df.drop(columns=['index'])
        train_part = frame.iloc[:n_train_rows]
        test_part = frame.iloc[n_train_rows:].drop(columns=['fraud_ind'])
        return train_part, test_part

    return merged, split_df


def get_conam_dict_by_day(df):
    """Sum ``conam`` per card (``cano``) and day (``locdt``).

    Args:
        df: DataFrame with ``cano``, ``locdt`` and ``conam`` columns.

    Returns:
        Nested defaultdict ``{cano: {locdt: summed conam on that day}}``.
        There is one outer key per distinct ``cano``; missing entries
        default to 0.
    """
    dt_dict = defaultdict(lambda: defaultdict(lambda : 0))
    # Walk the three columns in lockstep instead of using iterrows(): it avoids
    # building a Series per row (very slow on large frames) and keeps each
    # column's own dtype rather than coercing the row to a common one.
    for cano, locdt, conam in zip(df['cano'], df['locdt'], df['conam']):
        dt_dict[cano][locdt] += conam

    return dt_dict

def _get_last_x_day_conam(cano, locdt, days_back, dt_dict):
    return mean(dict(filter(lambda dt: dt[0]<=locdt and locdt -dt[0] <=days_back, dt_dict.items())).values())

def last_x_day_conam(days_back, df, cano_dict):
    """
    Row-wise feature: for every row of `df`, the value computed by
    _get_last_x_day_conam for that row's card and day.

    Parameters
    ----------
    days_back : window length in days, passed through to the helper.
    df        : DataFrame with 'cano' and 'locdt' columns.
    cano_dict : mapping {cano: {locdt: daily conam total}}.

    Returns
    -------
    Series with one value per row of `df`.
    """
    def _row_feature(row):
        return _get_last_x_day_conam(
            row['cano'], row['locdt'], days_back, cano_dict[row['cano']]
        )

    card_and_day = df[['cano', 'locdt']]
    return card_and_day.apply(_row_feature, axis=1)

In [25]:
# Working sample: the transactions of accounts (bacno) 1 and 2, ordered by
# card and day. The sort produces a new frame instead of sorting the filtered
# slice in place, which is what raised SettingWithCopyWarning.
df = df_train[df_train.bacno.isin([1, 2])].sort_values(by=["cano", "locdt"])
df[["cano", "locdt"]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,cano,locdt
263679,96923,1
481299,96923,1
1278126,96923,1
559276,96923,2
655603,96923,2
1287235,96923,2
1518063,96923,2
1107826,96923,5
1517834,96923,5
1336989,96923,6


In [250]:
# Stack train and test rows, then keep only the first 10,000 rows.
df = pd.concat([df_train, df_test], axis=0).iloc[:10000]
len(df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


10000

In [244]:
def num_transaction_in_past_n_days(df, n):
    """
    how many transaction that this user have in the past n days

    For a transaction on day d the window is [d - n, d): transactions on the
    same day are not counted, transactions exactly n days earlier are.

    Parameters
    ----------
    df : DataFrame with a `locdt` column (the day of each transaction).
         All rows of `df` are treated as belonging to one user, so callers
         pass one group at a time.
    n  : window length in days.

    Returns
    -------
    pd.Series of counts in the row order of `df`, with a fresh 0..len-1
    index (not the index of `df`).
    """
    # Go through tolist() so narrow or unsigned column dtypes are widened
    # before `n` is subtracted.
    days = np.asarray(df.locdt.tolist())
    ordered = np.sort(days)
    # In the sorted days, the entries in [d - n, d) sit between the left
    # insertion points of d - n and d. This replaces a pairwise double loop.
    upper = np.searchsorted(ordered, days, side='left')
    lower = np.searchsorted(ordered, days - n, side='left')
    return pd.Series(upper - lower)  # return Series instead of list

"""
amount spent by the card-holder in shops
from a given country in the last 24h"
"""

'\namount spent by the card-holder in shops\nfrom a given country in the last 24h"\n'

In [26]:
# Preview the per-card transaction counts as a flat array.
(
    df[["cano", "locdt"]]
    .groupby("cano")
    .apply(lambda x: num_transaction_in_past_n_days(x, past_n_days))
    .values
)

array([0, 0, 0, 3, 3, 3, 3, 0, 0, 2, 3, 3, 3, 3, 4, 1, 1, 2, 2, 3, 3, 2,
       0, 1, 2, 2, 2, 2, 2, 0, 0, 2, 3, 1, 1, 0, 1, 2, 2, 2, 2, 4, 4, 2,
       1, 1, 2, 1, 1, 1, 1, 1, 1, 6, 1, 1, 3, 3, 1, 1, 1, 3, 1, 1, 1, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 5, 5, 6, 3, 3, 0, 1, 1, 1,
       4, 4, 5, 5, 4, 4, 4, 4, 4, 5, 4, 4, 3, 3, 3, 5, 5, 5, 0, 0, 0, 0,
       0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [27]:
s = time.time()
pd.options.display.max_rows = 300
past_n_days = 2
# test_case = df[["bacno","locdt"]].head(100).groupby("bacno").apply(lambda x: num_transaction_in_past_n_days(x,past_n_days))
# Per-card count of transactions in the previous `past_n_days` days.
# transform() writes each group's counts back to that group's own rows, so the
# feature stays aligned with df whatever its row order. Assigning the
# `.values` of groupby().apply() is positional and only lines up when the rows
# are already ordered by cano.
# The group Series is re-wrapped as a one-column frame named "locdt" because
# the helper reads `df.locdt`.
df["num_transaction_in_past_{}_days_w_cano".format(past_n_days)] = df.groupby("cano")["locdt"].transform(
    lambda x: num_transaction_in_past_n_days(x.to_frame("locdt"), past_n_days).values
)

e = time.time()
print (e-s)

0.007372140884399414


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [29]:
# Disabled variant: the same count feature grouped by bacno instead of cano.
# df["num_transaction_in_past_{}_days".format(past_n_days)] = df[["bacno","locdt"]].groupby("bacno")\
# .apply(lambda x: num_transaction_in_past_n_days(x,past_n_days)).values
# # df[["bacno","locdt"]].groupby("bacno")\
# # .apply(lambda x: num_transaction_in_past_n_days(x,past_n_days))
# Show the card, the day and the per-card count feature side by side.
df[["cano","locdt","num_transaction_in_past_{}_days_w_cano".format(past_n_days)]]

Unnamed: 0,cano,locdt,num_transaction_in_past_2_days_w_cano
263679,96923,1,0
481299,96923,1,0
1278126,96923,1,0
559276,96923,2,3
655603,96923,2,3
1287235,96923,2,3
1518063,96923,2,3
1107826,96923,5,0
1517834,96923,5,0
1336989,96923,6,2


In [252]:
# Back-of-envelope runtime in minutes: a ~0.150 s measurement scaled from
# 10,000 rows to 1,943,452 rows.
# NOTE(review): presumably 1,943,452 is the full row count and 0.150 s a timing
# on the 10,000-row sample; this also assumes cost grows linearly - confirm.
1943452/10000 * 0.15033984184265137 / 60

0.48696377718130746

In [190]:
# Display the current working frame.
df

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,hour_range,insfg,iterm,locdt,loctm,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,time_elapsed_between_last_transactions
502741,6413,1,117264,934.49,5,62,N,4,N,N,0.0,5,night,N,0,3,200000.0,20,0,0,275,53099,N,5817,102,0,1549254,-1.0
994932,6189,1,117264,939.19,5,62,Y,2,N,N,0.0,5,night,N,0,4,221428.0,22,14,28,317,90151,N,1463,102,0,1837177,1.0
606676,6189,1,117264,1267.47,5,62,Y,2,N,N,0.0,5,night,N,0,25,212635.0,21,26,35,317,90151,N,1463,102,0,1859385,21.0
1388156,6231,1,117264,1017.37,5,62,N,5,N,N,0.0,5,night,N,0,30,200947.0,20,9,47,277,12726,N,5817,102,0,994333,5.0
10441,6189,1,117264,613.81,5,62,N,4,N,N,0.0,5,afternoon,N,0,34,150512.0,15,5,12,263,92571,N,5817,102,0,1639576,4.0
621811,6189,1,117264,643.76,5,62,N,4,N,N,0.0,5,noon,N,0,50,144817.0,14,48,17,263,92641,N,5817,102,0,1149335,16.0
617285,6189,1,117264,645.26,5,62,N,4,N,N,0.0,5,noon,N,0,60,134734.0,13,47,34,263,92641,N,5817,102,0,1227066,10.0
1264456,6189,1,117264,1621.54,5,62,Y,2,N,N,0.0,5,night,N,0,63,213723.0,21,37,23,317,90151,N,1463,102,0,1900543,3.0
327323,6413,1,117264,954.83,5,62,Y,2,N,Y,0.0,5,night,N,0,68,232029.0,23,20,29,273,53416,N,515,102,0,1905868,5.0
1106685,6413,1,117264,954.83,5,62,N,5,N,N,0.0,5,night,N,0,68,192933.0,19,29,33,277,52334,N,5817,102,0,1660525,5.0


In [53]:
# Add, per row, the mean of the card's daily conam totals over the current day
# and the 2 days before it.
# NOTE(review): conam_dict is defined outside this view; presumably built with
# get_conam_dict_by_day - confirm.
df['last_2_day_mean_conam_per_day'] = last_x_day_conam(2, df, conam_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [54]:
# Compare each transaction's amount with the new 2-day feature.
df[["bacno","cano","locdt","conam","last_2_day_mean_conam_per_day"]]

Unnamed: 0,bacno,cano,locdt,conam,last_2_day_mean_conam_per_day
502741,1,117264,3,934.49,934.49
994932,1,117264,4,939.19,936.84
606676,1,117264,25,1267.47,1267.47
1388156,1,117264,30,1017.37,1017.37
10441,1,117264,34,613.81,613.81
621811,1,117264,50,643.76,643.76
617285,1,117264,60,645.26,645.26
1264456,1,117264,63,1621.54,1621.54
327323,1,117264,68,954.83,1909.66
1106685,1,117264,68,954.83,1909.66


In [56]:
# Sanity check: the mean of the amounts on days 3 and 4 should equal the
# feature value shown for the day-4 row.
(934.49+939.19)/2

936.84