In [5]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
import sys
sys.path.append("../fraud_detection/src/")

from util import s_to_time_format, string_to_datetime,hour_to_range
from tqdm import tqdm

#-----------------------------
# load data
#-----------------------------
df_train = pd.read_csv("/data/yunrui_li/fraud/dataset/train.csv")
df_test = pd.read_csv("/data/yunrui_li/fraud/dataset/test.csv")


for df in [df_train, df_test]:
    # pre-processing
    df["loctm_"] = df.loctm.astype(int).astype(str)
    df.loctm_ = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
    # time-related feature
    df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour)
    df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
    df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)

    # removed the columns no need
    df.drop(columns = ["loctm_"], axis = 1, inplace = True)


In [169]:
df = pd.concat([df_train, df_test], axis = 0)
df.sort_values(by = ["mchno","locdt","loctm_hour_of_day","loctm_minute_of_hour","loctm_second_of_min"], inplace = True)

df[["mchno","locdt","conam"]]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,mchno,locdt,conam
1441953,0,1,1806.49
963034,0,1,1485.11
1438765,0,1,1806.49
79513,0,1,0.00
987011,0,1,496.39
...,...,...,...
1516129,103307,3,0.00
1400871,103307,79,0.00
20244,103307,92,0.00
20241,103307,92,0.00


In [168]:
tmp = df.groupby("mchno").size().to_frame("sizes").reset_index()
tmp.sort_values(by = "sizes", ascending = False)

Unnamed: 0,mchno,sizes
0,0,63719
54553,54828,48405
53966,54238,46561
59034,59333,41203
86132,86584,26175
...,...,...
57141,57425,1
57142,57426,1
57144,57428,1
57146,57430,1


In [166]:
test = df[df.mchno.isin([86584])][["mchno","locdt","conam","fraud_ind","txkey"]]
print (test.shape)
test

(26175, 5)


Unnamed: 0,mchno,locdt,conam,fraud_ind,txkey
271113,86584,1,513.80,0.0,477029
76035,86584,1,513.80,0.0,477031
810320,86584,1,513.80,0.0,477095
955575,86584,1,513.80,0.0,477097
1192450,86584,1,513.80,0.0,477119
...,...,...,...,...,...
193171,86584,120,513.80,,609677
111125,86584,120,683.25,,609680
191768,86584,120,683.25,,609697
365529,86584,120,513.80,,609700


In [124]:
3197.64/4

799.41

In [None]:
# def average_states_in_past_n_days(df, n, m):
#     """
#     Calculate how many transaction that this user have in the past n days
#     """
#     current_day_at_this_transaction = df.locdt.tolist()
#     history_date_at_this_transaction = [(current_date-n,current_date) for current_date in current_day_at_this_transaction]
#     conam_at_this_transaction = df.conam.tolist()
#     conam_out = []
# #     for current_date, current_conam in zip(current_day_at_this_transaction,conam_at_this_transaction):
# #         history_date = current_date-n
# #         #c = 0
# #         for i in current_day_at_this_transaction:
# #             if (i >= history_date) & (i < current_date):
# #                 conam_out.append(current_conam)
#     for current_date, current_conam, history_date in zip(current_day_at_this_transaction,
#                                            conam_at_this_transaction,
#                                            history_date_at_this_transaction
#                                           ):
#         s,e = history_date
#         print (s,e)
#         if (current_date >= s) & (current_date <= e):
#             conam_out.append(current_conam)
#     if m == "mean":
#         if len(conam_out) == 0:
#             return 0
#         else:
#             return 1.0 * sum(conam_out)/ len(conam_out)
# #     print ("sum",sum(conam_out))
# #     print ("min",pd.Series(conam_out).min())
# #     print ("max",pd.Series(conam_out).max())
# #     print ("mean",pd.Series(conam_out).mean())

def average_stats_in_past_n_days(df, n, method):
    """
    Calcuate average conam given sorted df by locdt.
    """
    current_day_at_this_transaction = df.locdt.tolist()
    time_interval = [(current_date-n,current_date) for current_date in current_day_at_this_transaction]
    out = []
    check_if_exist = []
    for s,e in time_interval:
        tmp = df[(df.locdt < e) & (df.locdt >= s)]
        if len(tmp)!=0:
            if check_if_exist == tmp.index.tolist():
                pass
            else:
                if method == "mean":
                    out.append(tmp.conam.mean())
                elif method == "sum":
                    out.append(tmp.conam.sum())
                elif method == "max":
                    out.append(tmp.conam.max())
                elif method == "min":
                    out.append(tmp.conam.min())
                elif method == "std":
                    out.append(tmp.conam.std())
                else:
                    raise ValueError("this method {} we don't support".format(method))
                check_if_exist = tmp.index.tolist()
    #print (out)
    if len(out) == 0:
        return 0.0
    else:
        return 1.0* sum(out)/len(out)

def get_agg_time_feature(df, gby = "hcefg"):
    for past_n_days in [7,14]:
        for m in ["mean",
#                   "sum",
#                   "max",
#                   "min",
#                   "std"
                 ]:
            cols = "agg_{}_{}_conam_in_past_{}_days".format(gby,m,past_n_days)
            tmp = df[["{}".format(gby),"locdt","conam"]].groupby(gby).apply(lambda x: average_stats_in_past_n_days(x,past_n_days,m)).to_frame(cols).reset_index()
            df = df.merge(tmp, on = gby, how = "left")
    return df   

import time
s = time.time()

df = get_agg_time_feature(df,gby = "mchno")

# past_n_days = 3
# m = "mean"
# cols = "average_{}_{}_conam_in_past_{}_days".format("mchno",m,past_n_days)
# tmp = test[["mchno","locdt","conam"]].groupby("mchno")\
#     .apply(lambda x: average_stats_in_past_n_days(x,past_n_days,m)).to_frame(cols).reset_index()
e = time.time()
print (e-s)

tmp

In [None]:
print (e-s)


In [None]:
# def average_stats_in_past_n_days(df, n, method):
#     """
    
#     """
#     time_interval = [(i+1,i+n) for i in range(120-n+1)]

#     out = []
#     for s,e in time_interval:
#         tmp = filter_(df,s,e)
#         #print (len(tmp))
#         if len(tmp)!=0:
#             if method == "mean":
#                 out.append(pd.Series(tmp).mean())
#             elif method == "sum":
#                 out.append(pd.Series(tmp).sum())
#             elif method == "max":
#                 out.append(pd.Series(tmp).max())
#             elif method == "min":
#                 out.append(pd.Series(tmp).min())
#             elif method == "std":
#                 out.append(pd.Series(tmp).std())
#             else:
#                 raise ValueError("this method {} we don't support".format(method))

#     return 1.0* sum(out)/len(out)

