In [6]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
import sys
sys.path.append("../fraud_detection/src/")

from util import s_to_time_format, string_to_datetime,hour_to_range
from tqdm import tqdm

#-----------------------------
# load data
#-----------------------------
df_train = pd.read_csv("/data/yunrui_li/fraud/dataset/train.csv")
df_test = pd.read_csv("/data/yunrui_li/fraud/dataset/test.csv")


for df in [df_train, df_test]:
    # pre-processing
    df["loctm_"] = df.loctm.astype(int).astype(str)
    df.loctm_ = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
    # time-related feature
    df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour)
    df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
    df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)

    # removed the columns no need
    df.drop(columns = ["loctm_"], axis = 1, inplace = True)


In [12]:
def merge_and_split_dfs(df_train, df_test):
    len_train = len(df_train)
    df = pd.concat([df_train, df_test]).reset_index()

    def split_df(df):
        df = df.drop(['index'], axis=1)
        return df.iloc[:len_train], df.iloc[len_train:].drop(['fraud_ind'], axis=1)
    return df, split_df


def get_conam_dict_by_day(df, gby = "cano"):
    from collections import defaultdict

    dt_dict = defaultdict(lambda: defaultdict(lambda : 0))
    for index, row in df.iterrows():
        dt_dict[row[gby]][row['locdt']] += row['conam']

    return dt_dict

def _get_last_x_day_conam(cano, locdt, days_back, dt_dict):
    from statistics import mean 

    return mean(dict(filter(lambda dt: dt[0]<=locdt and locdt -dt[0] <=days_back, dt_dict.items())).values())

def last_x_day_conam(days_back, df, cano_dict, gby = "cano"):
    return df[[gby, 'locdt']].apply(lambda row: _get_last_x_day_conam(row[gby], row['locdt'], days_back, cano_dict[row[gby]]), axis=1)


In [15]:
# y_train = df_train['fraud_ind']
# x_train = df_train.iloc[:1000]


#del df_train

import time
s = time.time()


df, split_df = merge_and_split_dfs(df_train, df_test)

conam_dict = get_conam_dict_by_day(df, gby = "mchno")

#df['mchno_last_3_day_mean_conam_per_day'] = last_x_day_conam(3, df, conam_dict)
df['mchno_last_7_day_mean_conam_per_day'] = last_x_day_conam(7, df, conam_dict,gby = "mchno")
df['mchno_last_14_day_mean_conam_per_day'] = last_x_day_conam(14, df, conam_dict,gby = "mchno")
#df['mchno_last_30_day_mean_conam_per_day'] = last_x_day_conam(30, df, conam_dict)

e = time.time()
print (e-s)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


1006.3263487815857


In [17]:
df_train, df_test = split_df(df)

In [20]:
df[df.mchno == 1]


Unnamed: 0,index,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,mchno_last_7_day_mean_conam_per_day,mchno_last_14_day_mean_conam_per_day
275715,275715,6450,21717,78670,1631.29,5,71,N,4,N,N,0.0,5,N,0,49,191431.0,19,14,31,87,1,Y,4185,36,0,1885484,1631.29,1631.29
1740858,219071,6450,158425,208698,1803.03,5,71,N,4,N,N,,5,N,0,102,211047.0,21,10,47,87,1,N,2155,36,0,1938230,1803.03,1803.03


Unnamed: 0,index,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,loctm,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,mchno_last_7_day_mean_conam_per_day,mchno_last_14_day_mean_conam_per_day
0,0,6881,113261,38038,513.80,5,0,N,0,N,N,0.0,5,N,0,33,172652.0,17,26,52,457,59333,N,0,102,0,516056,206552.431250,202833.343333
1,1,0,134508,45725,465.62,5,0,N,2,N,N,0.0,0,N,0,9,105114.0,10,51,14,451,0,N,5817,102,0,4376,265920.801250,266498.060000
2,2,6881,15408,188328,513.80,5,0,N,0,N,N,0.0,5,N,0,6,152458.0,15,24,58,457,59333,N,0,102,0,483434,182684.141667,182684.141667
3,3,6716,157159,29967,1016.11,5,62,N,5,N,N,0.0,5,N,0,5,172946.0,17,29,46,247,50436,N,3281,102,0,1407164,46312.312000,46312.312000
4,4,5975,105985,81305,713.66,5,62,N,4,N,N,0.0,5,N,0,6,182129.0,18,21,29,263,93775,N,5817,102,0,1051004,402.230000,402.230000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943447,421660,5975,147326,79511,633.76,5,62,N,2,N,N,,5,N,0,116,111228.0,11,12,28,343,87282,N,5817,102,0,1187507,156211.070000,151521.795000
1943448,421661,6716,14305,136493,952.84,5,62,N,5,N,N,,8,N,0,107,85839.0,8,58,39,245,48784,N,5859,102,0,1182598,10587.356250,11151.808000
1943449,421662,5975,156543,137963,713.42,5,62,N,4,N,N,,5,N,0,111,184921.0,18,49,21,263,98326,N,5817,102,0,898724,1129.370000,890.042000
1943450,421663,6231,156543,137964,903.94,5,62,N,5,N,N,,5,N,0,114,144434.0,14,44,34,251,17763,N,5817,102,0,971467,2357.315714,2054.612500
