In [3]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
import sys
sys.path.append("../fraud_detection/src/")

from util import s_to_time_format, string_to_datetime,hour_to_range
from tqdm import tqdm

#-----------------------------
# load data
#-----------------------------
df_train = pd.read_csv("/data/yunrui_li/fraud/dataset/train.csv")
df_test = pd.read_csv("/data/yunrui_li/fraud/dataset/test.csv")


for df in [df_train, df_test]:
    # pre-processing
    df["loctm_"] = df.loctm.astype(int).astype(str)
    df.loctm_ = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
    # time-related feature
    df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour)
    df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
    df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)

    # removed the columns no need
    df.drop(columns = ["loctm_"], axis = 1, inplace = True)


In [4]:
def average_stats_in_past_n_days(df, n, method):
    """
    Calcuate average conam given sorted df by locdt.
    """
    current_day_at_this_transaction = df.locdt.tolist()
    time_interval = [(current_date-n,current_date) for current_date in current_day_at_this_transaction]
    out = []
    check_if_exist = []
    for s,e in time_interval:
        tmp = df[(df.locdt < e) & (df.locdt >= s)]
        if len(tmp)!=0:
            if check_if_exist == tmp.index.tolist():
                pass
            else:
                if method == "mean":
                    out.append(tmp.conam.mean())
                elif method == "sum":
                    out.append(tmp.conam.sum())
                elif method == "max":
                    out.append(tmp.conam.max())
                elif method == "min":
                    out.append(tmp.conam.min())
                elif method == "std":
                    out.append(tmp.conam.std())
                else:
                    raise ValueError("this method {} we don't support".format(method))
                check_if_exist = tmp.index.tolist()
    if len(out) == 0:
        return 0.0
    else:
        return 1.0* sum(out)/len(out)

def get_agg_time_feature(df, gby = "hcefg"):
    for past_n_days in [7,14]:
        for m in ["mean",
#                   "sum",
#                   "max",
#                   "min",
#                   "std"
                 ]:
            cols = "average_{}_{}_conam_in_past_{}_days".format(gby,m,past_n_days)
            tmp = df[["{}".format(gby),"locdt","conam"]].groupby(gby).apply(lambda x: average_stats_in_past_n_days(x,past_n_days,m)).to_frame(cols).reset_index()
            df = df.merge(tmp, on = gby, how = "left")
    return df   

In [5]:
df

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min
0,6881,163188,116799,513.80,5,0,N,0,N,N,5,N,0,102,215328.0,457,59360,N,0,102,0,592489,21,53,28
1,6881,163188,116799,513.80,5,0,N,0,N,N,5,N,0,102,222007.0,457,59360,N,0,102,0,592452,22,20,7
2,6881,163188,116799,513.80,5,0,N,0,N,N,5,N,0,100,170013.0,457,59360,N,0,102,0,590212,17,0,13
3,6881,163188,116799,513.80,5,0,N,0,N,N,5,N,0,100,165914.0,457,59360,N,0,102,0,590209,16,59,14
4,6881,163188,116799,513.80,5,0,N,0,N,N,5,N,0,102,215311.0,457,59360,N,0,102,0,592488,21,53,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421660,5975,147326,79511,633.76,5,62,N,2,N,N,5,N,0,116,111228.0,343,87282,N,5817,102,0,1187507,11,12,28
421661,6716,14305,136493,952.84,5,62,N,5,N,N,8,N,0,107,85839.0,245,48784,N,5859,102,0,1182598,8,58,39
421662,5975,156543,137963,713.42,5,62,N,4,N,N,5,N,0,111,184921.0,263,98326,N,5817,102,0,898724,18,49,21
421663,6231,156543,137964,903.94,5,62,N,5,N,N,5,N,0,114,144434.0,251,17763,N,5817,102,0,971467,14,44,34


In [14]:
df.etymd.nunique()

11

In [9]:
df.bacno.nunique()

71099

In [10]:
import time
s = time.time()

df = pd.concat([df_train, df_test], axis = 0)
df.sort_values(by = ["cano","locdt","loctm_hour_of_day","loctm_minute_of_hour","loctm_second_of_min"], inplace = True)

#df = get_agg_time_feature(df,gby = "mchno")
#df = get_agg_time_feature(df,gby = "hcefg")
# df = get_agg_time_feature(df,gby = "mcc")
# df = get_agg_time_feature(df,gby = "contp")
# df = get_agg_time_feature(df,gby = "etymd")
df = get_agg_time_feature(df,gby = "bacno")

e = time.time()
print (e-s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


5821.417182683945


In [18]:
df[df.bacno == 3]

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,average_bacno_mean_conam_in_past_7_days,average_bacno_mean_conam_in_past_14_days
170,6581,3,190586,401.39,5,62,N,5,N,N,0.0,5,N,0,1,171218.0,17,12,18,397,5945,N,3588,102,0,1476111,523.61,690.288333
171,6032,3,190586,703.98,5,62,Y,2,N,N,0.0,5,N,0,3,210911.0,21,9,11,247,82174,N,5817,102,0,362136,523.61,690.288333
172,6581,3,190586,909.88,5,62,Y,2,N,N,0.0,5,N,0,18,122618.0,12,26,18,343,2106,N,5817,102,0,1344305,523.61,690.288333
173,6581,3,190586,513.8,5,62,Y,2,N,Y,0.0,5,N,0,30,113926.0,11,39,26,432,2134,N,5817,102,0,1708199,523.61,690.288333
174,6231,3,190586,1273.09,5,62,N,4,N,N,0.0,5,N,0,63,172310.0,17,23,10,414,19957,N,5817,102,0,942361,523.61,690.288333
175,6430,3,190586,387.93,5,62,N,5,N,N,0.0,5,N,0,71,181404.0,18,14,4,397,53974,N,3588,102,0,1908429,523.61,690.288333
176,6189,3,190586,513.8,5,62,N,4,N,N,0.0,5,N,0,83,111902.0,11,19,2,263,98841,N,5817,102,0,1241459,523.61,690.288333
177,6716,3,190586,797.48,5,62,Y,2,N,N,0.0,5,N,0,89,101831.0,10,18,31,248,34239,N,5820,102,0,958493,523.61,690.288333
178,6032,3,190586,614.14,5,62,Y,2,N,Y,0.0,5,N,0,90,204623.0,20,46,23,286,78724,N,5817,102,0,211165,523.61,690.288333


In [13]:
df[['txkey',"average_bacno_mean_conam_in_past_7_days","average_bacno_mean_conam_in_past_14_days"]] \
.to_csv("../fraud_detection/features/average_bacno_time_agg.csv",index = False)

In [None]:
print (e-s)


In [None]:
import time
s = time.time()

df = pd.concat([df_train, df_test], axis = 0)
df.sort_values(by = ["mchno","locdt","loctm_hour_of_day","loctm_minute_of_hour","loctm_second_of_min"], inplace = True)

df = get_agg_time_feature(df,gby = "mchno")
# df = get_agg_time_feature(df,gby = "hcefg")
# df = get_agg_time_feature(df,gby = "mcc")
# df = get_agg_time_feature(df,gby = "contp")
# df = get_agg_time_feature(df,gby = "etymd")
# df = get_agg_time_feature(df,gby = "bacno")

e = time.time()
print (e-s)
