In [3]:
import numpy as np
import pandas as pd
import sys
sys.path.append("../fraud_detection/src/")

from util import s_to_time_format, string_to_datetime,hour_to_range
from tqdm import tqdm

#-----------------------------
# load data
#-----------------------------
df_train = pd.read_csv("/data/yunrui_li/fraud/dataset/train.csv")
df_test = pd.read_csv("/data/yunrui_li/fraud/dataset/test.csv")


for df in [df_train, df_test]:
    # pre-processing
    df["loctm_"] = df.loctm.astype(int).astype(str)
    df.loctm_ = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
    # time-related feature
    df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour)
    df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
    df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)

    # removed the columns no need
    df.drop(columns = ["loctm_", "loctm","txkey"], axis = 1, inplace = True)

df_train["cano_locdt_index"] = ["{}_{}_{}_{}_{}".format(str(i),str(j),str(k),str(l),str(m)) for i,j,k,l,m in zip(df_train.cano,
                                                                                   df_train.locdt,
                                                                                   df_train.loctm_hour_of_day,
                                                                                   df_train.loctm_minute_of_hour,
                                                                                   df_train.loctm_second_of_min,
                                                                                  )]
df_test["cano_locdt_index"] = ["{}_{}_{}_{}_{}".format(str(i),str(j),str(k),str(l),str(m)) for i,j,k,l,m in zip(df_test.cano,
                                                                                  df_test.locdt,
                                                                                  df_test.loctm_hour_of_day,
                                                                                  df_test.loctm_minute_of_hour,
                                                                                  df_test.loctm_second_of_min,
                                                                                 )]

df_train["cano_help"] = df_train.cano
df_test["cano_help"] = df_test.cano

df_train["locdt_help"] = df_train.locdt
df_test["locdt_help"] = df_test.locdt


#-----------------------------
# feature extraction
#-----------------------------
df = pd.concat([df_train, df_test], axis = 0)
df.sort_values(by = ["cano", "locdt","loctm_hour_of_day","loctm_minute_of_hour","loctm_second_of_min"], inplace = True)

#-----------------------------
# prepare training data
#-----------------------------
df_train.sort_values(by = ["cano", "locdt","loctm_hour_of_day","loctm_minute_of_hour","loctm_second_of_min"], inplace = True)

# df_train, df_test = value_to_count(df_train, df_test)
# df_train, df_test = feature_normalization_auto(df_train, df_test)

fraud_cano_id = df_train[df_train.fraud_ind == 1].cano.unique().tolist()

df_train_normal_cano_id = df_train[~df_train.cano.isin(fraud_cano_id)]
print ("number of training data",df_train_normal_cano_id.shape)

df_train, df_test, df_train_normal_cano_id, df = value_to_count(df_train, df_test,df_train_normal_cano_id, df)
df_train_normal_cano_id, df = feature_normalization_auto(df_train, df_test,df_train_normal_cano_id, df)

#-----------------------------
# post-processing
#-----------------------------
df.drop(columns = ["fraud_ind"], axis = 1, inplace = True)
df_train_normal_cano_id.drop(columns = ["fraud_ind"], axis = 1, inplace = True)
feats = [
   #'acqic', 'bacno', 'cano', 
   'conam',
    #'contp', 'csmcu', 'ecfg', 'etymd',
   #'flbmk', 'flg_3dsmk', 'hcefg', 'insfg', 'iterm', 
    'locdt',
   #'mcc', 
    'mchno', 
#     'ovrlt', 
#     'scity', 
#     'stocn', 
#     'stscd', 
    'loctm_hour_of_day',
   'loctm_minute_of_hour', 'loctm_second_of_min'] + ["cano_locdt_index","cano_help","locdt_help"]

df = df[feats]
df_train_normal_cano_id = df_train_normal_cano_id[feats]


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




number of training data (1390382, 27)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 17/17 [00:29<00:00,  1.75s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 23/23 [00:55<00:00,  2.41s/it]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# for f in ['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
#    'flbmk', 'flg_3dsmk', 'hcefg', 'insfg', 'iterm', 'locdt',
#    'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'loctm_hour_of_day',
#    'loctm_minute_of_hour', 'loctm_second_of_min']:
#     print (df_train_normal_cano_id[f].max())
#     print (df_train_normal_cano_id[f].min())

In [2]:
def value_to_count(df_train, df_test, df_train_normal_cano_id, df_):
    """
    convert categorial features into number of occurence in the dataset.
    """
    # separate continuous feature and categorial features
    feats = ['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
       'flbmk', 'flg_3dsmk', 'hcefg', 'insfg', 'iterm', 'locdt',
       'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'loctm_hour_of_day',
       'loctm_minute_of_hour', 'loctm_second_of_min'] 
    cont_feats = [
                  'conam',
                  'iterm', 
                  'locdt',
                  'loctm_hour_of_day',
                  'loctm_minute_of_hour', 
                  'loctm_second_of_min']
    feats = [f for f in feats if f not in cont_feats]
    # we only coner categorial features
    
    df = pd.concat([df_train[feats], df_test[feats]], axis = 0)
    for f in tqdm(feats):
        count_dict = df[f].value_counts(dropna = False).to_dict() 
        df_train_normal_cano_id[f] = df_train_normal_cano_id[f].apply(lambda v: count_dict[v])
        df_train[f] = df_train[f].apply(lambda v: count_dict[v])
        df_test[f] = df_test[f].apply(lambda v: count_dict[v])
        df_[f] = df_[f].apply(lambda v: count_dict[v])
    return df_train,df_test,df_train_normal_cano_id, df_

def feature_normalization_auto(df_train, df_test, df_train_normal_cano_id,df_):
    """
    return two inputs of autoencoder, one is for train and another one is for test
    """
    #from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
    feats = ['acqic', 'bacno', 'cano', 'conam', 'contp', 'csmcu', 'ecfg', 'etymd',
       'flbmk', 'flg_3dsmk', 'hcefg', 'insfg', 'iterm', 'locdt',
       'mcc', 'mchno', 'ovrlt', 'scity', 'stocn', 'stscd', 'loctm_hour_of_day',
       'loctm_minute_of_hour', 'loctm_second_of_min']
    df = pd.concat([df_train[feats], df_test[feats]], axis = 0)


    for f in tqdm(feats):
        try:
            #scaler = MinMaxScaler()
            max_ = df[f].max()
            min_ = df[f].min()
            df_train_normal_cano_id[f] = df_train_normal_cano_id[f].apply(lambda x: (x-min_)/(max_-min_))
            df_[f] = df_[f].apply(lambda x: (x-min_)/(max_-min_))
            #df_test[f] = df_test[f].apply(lambda x: (x-min_)/(max_-min_))
        except:
            print(f)
    return df_train_normal_cano_id,df_

In [None]:
df_train_normal_cano_id

In [None]:
def partition_(df, num_features):
    data = []
    for i in range(len(df)):
        out = None
        if i == 0:
            out = np.concatenate(((np.zeros((2,num_features))),df.iloc[:1].values))
        elif i== 1:
            out = np.concatenate(((np.zeros((1,num_features))),df.iloc[:i+1].values))
        else:
            out = df.iloc[i+1-3:i+1].values
        data.append(out)
    return data

def partition(df_, sequence_length = 3):
    feats = [f for f in df_.columns if f not in {"fraud_ind","cano_help","locdt_help"}]
    sequences = []
    for _, df in df_.groupby(by = "cano_help"):
        data = partition_(df[feats], num_features = len(feats))
        for d in data:
            sequences.append(d)
    return sequences

def get_sequence_dataframe(df):
    df_train_sequences = partition(df)
    df_train_sequences = np.concatenate(df_train_sequences)
    df_train_sequences = pd.DataFrame(df_train_sequences)
    return df_train_sequences
#-----------------------------
# get train/test data
#-----------------------------

X_train = get_sequence_dataframe(df_train_normal_cano_id)
Feature = get_sequence_dataframe(df)
#-----------------------------
# modeling (unsupervised learning)
#-----------------------------
import sys
#sys.path.append("/data/yunrui_li/fraud/DeepADoTS")
#from src.algorithms.dagmm import DAGMM
# import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0"  
from DAGMM import DAGMM
detectors = DAGMM(num_epochs=50, sequence_length=3)
detectors.fit(X_train.iloc[:,:-1].copy())

score = detectors.predict(Feature.iloc[:,:-1].copy())
output = pd.DataFrame({"cano_locdt_index":Feature.iloc[:,-1]})
output["score"] = score

print (output.shape)

output["cosine_errors_mean"] = detectors.prediction_details["cosine_errors_mean"]
output["euclidean_errors_mean"]  = detectors.prediction_details["euclidean_errors_mean"]
data = detectors.prediction_details["reconstructions_mean"]
reconstructions_mean = pd.DataFrame(data.T,
             columns = ["reconstructions_mean_latent_features_{}".format(i) for i in range(data.shape[0])]
            )
print (reconstructions_mean.shape)
data = detectors.prediction_details["latent_representations"]
latent_representations = pd.DataFrame(data.T,
             columns = ["latent_representations_latent_features_{}".format(i) for i in range(data.shape[0])]
            )
print (latent_representations.shape)
output = pd.concat([output,reconstructions_mean,latent_representations], axis = 1)
print (output.shape)

feature = []
for i in range(len(output)):
    if i%3 == 2:
        feature.append(output.iloc[i:i+1])
feature = pd.concat(feature,axis = 0)

feature.to_csv("/data/yunrui_li/fraud/fraud_detection/features/DAGMM_features_less_input.csv", index = False)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
 22%|██▏       | 11/50 [2:03:30<7:16:02, 670.82s/it]

In [7]:
feature

Unnamed: 0,cano_locdt_index,score,cosine_errors_mean,euclidean_errors_mean,reconstructions_mean_latent_features_0,reconstructions_mean_latent_features_1,reconstructions_mean_latent_features_2,reconstructions_mean_latent_features_3,reconstructions_mean_latent_features_4,reconstructions_mean_latent_features_5,latent_representations_latent_features_0,latent_representations_latent_features_1,latent_representations_latent_features_2,latent_representations_latent_features_3,latent_representations_latent_features_4
2,0_1_15_19_48,-22.662193,-1.084394,-1.117315,-0.296740,-0.068038,-0.053378,-0.296283,-0.163238,-0.179484,0.894681,-1.117315,-1.084394,-4.337092,4.635670
5,0_4_15_44_7,-27.490714,-4.578724,-3.045611,0.166659,0.176413,0.220550,0.435034,0.350870,0.346924,4.344673,-3.045611,-4.578724,-1.159269,1.482810
8,0_20_14_53_42,-27.474175,-5.092361,-3.328975,0.110268,0.328782,0.303428,0.583366,0.454730,0.445215,4.851853,-3.328975,-5.092361,-0.691998,1.019327
11,0_29_15_22_43,-27.213982,-5.102502,-3.334569,0.142504,0.339089,0.321281,0.601988,0.465076,0.455646,4.861865,-3.334569,-5.102502,-0.682776,1.010175
14,0_37_14_37_10,-26.059532,-5.084417,-3.324591,0.110154,0.350612,0.290773,0.569704,0.444154,0.436974,4.844006,-3.324591,-5.084417,-0.699225,1.026497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5830343,213570_119_13_56_3,-14.883758,-0.437768,-0.760364,-0.615070,-0.201883,-0.147716,-0.450633,-0.234637,-0.249322,0.256390,-0.760364,-0.437768,-4.925027,5.219095
5830346,213571_119_21_46_3,-14.677338,-2.131566,-1.695157,-0.354073,-0.043709,0.025809,-0.096460,0.043120,0.011579,1.928665,-1.695157,-2.131566,-3.384656,3.690858
5830349,213571_119_21_46_22,-20.515411,-3.918107,-2.680963,-0.278236,0.270400,0.166815,0.369971,0.299933,0.302892,3.692516,-2.680963,-3.918107,-1.759879,2.078836
5830352,213572_120_14_14_8,-16.658675,0.084298,-0.472286,-0.216788,-0.266204,-0.137874,-0.480453,-0.304188,-0.309235,-0.259116,-0.472286,0.084298,-5.399842,5.690127


In [None]:
df_train["cano_locdt_index"] = ["{}_{}_{}_{}_{}".format(str(i),str(j),str(k),str(l),str(m)) for i,j,k,l,m in zip(df_train.cano,
                                                                                   df_train.locdt,
                                                                                   df_train.loctm_hour_of_day,
                                                                                   df_train.loctm_minute_of_hour,
                                                                                   df_train.loctm_second_of_min,
                                                                                  )]
df_test["cano_locdt_index"] = ["{}_{}_{}_{}_{}".format(str(i),str(j),str(k),str(l),str(m)) for i,j,k,l,m in zip(df_test.cano,
                                                                                  df_test.locdt,
                                                                                  df_test.loctm_hour_of_day,
                                                                                  df_test.loctm_minute_of_hour,
                                                                                  df_test.loctm_second_of_min,
                                                                                 )]
