In [62]:
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.metrics import f1_score
import numpy as np
from keras.models import load_model
# Column names that the rest of the notebook handles as categorical features
# (cast to pandas 'category' dtype on load, label-encoded for the autoencoder).
CATEGORY = [
    'ecfg', 'flbmk', 'flg_3dsmk', 'insfg', 'ovrlt', 'scity', 'csmcu',
    'cano', 'mchno', 'hcefg', 'bacno', 'contp', 'etymd', 'acqic',
]


def lgb_f1_score(y_true, y_pred):
    """Compute the F1 score of rounded predictions as a 3-tuple metric.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores; rounded with ``np.round`` before scoring.

    Returns:
        The tuple ``('f1', score, True)``.
        NOTE(review): the trailing ``True`` is presumably LightGBM's
        "higher is better" flag for custom eval metrics -- confirm at the call site.
    """
    hard_labels = np.round(y_pred)
    score = f1_score(y_true, hard_labels)
    return 'f1', score, True

def label_encoder(x_train, x_test, df_test):
    """Label-encode every CATEGORY column consistently across three frames.

    One ``LabelEncoder`` per column is fitted on the values of all three
    frames stacked together, so a given value maps to the same integer in
    each frame. The columns are overwritten in place on the frames passed
    in, and the same three objects are returned.

    Args:
        x_train, x_test, df_test: DataFrames that each contain all CATEGORY columns.

    Returns:
        ``(x_train, x_test, df_test)`` with the CATEGORY columns encoded.
    """
    from sklearn import preprocessing

    frames = (x_train, x_test, df_test)
    combined = pd.concat(list(frames), axis=0)
    assert len(combined) == sum(len(frame) for frame in frames), "it should be same"

    for column in CATEGORY:
        # Fit on the union of values so transform never sees an unknown label.
        encoder = preprocessing.LabelEncoder()
        encoder.fit(combined[column].tolist())
        for frame in frames:
            frame[column] = encoder.transform(frame[column].tolist())

    print("*" * 100)
    print("finished label encoding")
    return x_train, x_test, df_test

def pre_processing_for_auto_encoder(df):
    """Return a copy of ``df`` without the 'txkey' column.

    The input frame is not modified. Raises ``KeyError`` (from pandas) when
    'txkey' is absent.
    """
    return df.drop(columns=['txkey'])

def normalizing_for_auto_encoder(x_train,x_test,df_test):
    """Min-max scale three frames with one scaler fitted on all of them.

    A single ``MinMaxScaler`` is fitted on the row-wise concatenation of the
    three inputs and then applied to each input separately.

    Returns:
        Three arrays (the outputs of ``scaler.transform``), in the order
        ``x_train, x_test, df_test``.
    """
    from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

    parts = [x_train, x_test, df_test]
    stacked = pd.concat(parts, axis=0)
    assert len(stacked) == sum(map(len, parts)), "it should be same"

    # Fit once on everything so all three outputs share the same value range.
    scaler = MinMaxScaler()
    scaler.fit(stacked)
    scaled_train, scaled_valid, scaled_test = [scaler.transform(part) for part in parts]

    print("*" * 100)
    print("finished data normalizing")
    return scaled_train, scaled_valid, scaled_test

def add_auto_encoder_feature(df_raw, df, model=None):
    """Append autoencoder reconstruction features to the raw feature frame.

    Args:
        df_raw: DataFrame with the original rows. Its index is kept as the
            leading column produced by ``reset_index()``.
        df: 2-D numeric matrix (array or DataFrame) fed to the model, one row
            per row of ``df_raw`` and in the same order.
        model: object exposing ``predict(matrix)``. When omitted, the
            notebook-level ``autoencoder`` global is used, which must have
            been loaded beforehand.

    Returns:
        DataFrame made of ``df_raw.reset_index()`` followed by one
        ``reconstructed_dim_<i>`` column per model output dimension and a
        ``reconstruction_error`` column (per-row mean squared error).

    Raises:
        AssertionError: if ``df_raw`` and ``df`` differ in row count.
    """
    if model is None:
        model = autoencoder

    # Work on a plain array so the error is positional and cannot be
    # re-aligned on a DataFrame index when it is assigned below.
    features = np.asarray(df)
    # Check before concatenating: a length mismatch would otherwise be padded with NaN rows.
    assert len(df_raw) == len(features), "it should be same"

    predictions = np.asarray(model.predict(features))  # reconstructed vectors, [num_samples, num_features]
    mse = np.mean(np.power(features - predictions, 2), axis=1)  # reconstruction error, [num_samples,]

    reconstructed = pd.DataFrame(
        predictions,
        columns=["reconstructed_dim_{}".format(i) for i in range(predictions.shape[1])],
    )
    reconstructed["reconstruction_error"] = mse

    # `reconstructed` already has a fresh 0..n-1 index; calling reset_index()
    # on it too would add a second column named "index" next to the one coming
    # from df_raw, leaving the result with duplicate column names.
    out = pd.concat([df_raw.reset_index(), reconstructed], axis=1)
    assert len(out) == len(df_raw) == len(reconstructed), "it should be same"
    print("*" * 100)
    print("finished adding auto_encoder_feature")
    return out


In [23]:
class AttrDict(dict):
    """A dict whose entries can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Use the dict itself as the instance namespace, so `obj.key` and
        # `obj["key"]` refer to the same storage.
        self.__dict__ = self


# Input file locations, accessible as args.train_file / args.test_file.
args = AttrDict(
    train_file="/data/yunrui_li/fraud/dataset/train.csv",
    test_file="/data/yunrui_li/fraud/dataset/test.csv",
)

In [9]:
#-------------------------
# load dataset
#-------------------------
df_train = pd.read_csv(args.train_file)
df_test = pd.read_csv(args.test_file)

# Store the categorical columns with pandas' 'category' dtype in both frames.
for cat in CATEGORY:
    df_train[cat] = df_train[cat].astype('category')#.cat.codes
    df_test[cat] = df_test[cat].astype('category')

# Separate the 'fraud_ind' target from the feature columns.
y_train = df_train['fraud_ind']
x_train = df_train.drop('fraud_ind', axis=1)

# Hold out 20% of the labelled rows. The seed is fixed so that re-running the
# notebook yields the same split; without it every run shuffles differently
# and downstream numbers cannot be reproduced.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=SPLIT_SEED
)


In [63]:
#-------------------------
# auto_encoder
#-------------------------
# Work on copies: label_encoder overwrites the CATEGORY columns of the frames it is given.
x_train_, x_test_, df_test_ =  x_train.copy(), x_test.copy(), df_test.copy()

# Replace the CATEGORY columns with integer codes shared across all three frames.
x_train_, x_test_, df_test_ = label_encoder(x_train_, x_test_, df_test_)

# Drop the 'txkey' column from each frame.
x_train_ = pre_processing_for_auto_encoder(x_train_) 
x_test_ = pre_processing_for_auto_encoder(x_test_)
df_test_ = pre_processing_for_auto_encoder(df_test_)

# Min-max scale; from here on the three variables hold arrays, not DataFrames.
x_train_, x_test_, df_test_ = normalizing_for_auto_encoder(x_train_, x_test_, df_test_)

# NOTE(review): get_auto_encoder_feature is not defined anywhere in the visible
# notebook (the helper defined above is add_auto_encoder_feature, which has a
# different signature and return value). The recorded output stops after the
# normalizing banner, so this line presumably failed -- confirm and replace.
x_train_constructed, mse_x_train = get_auto_encoder_feature(x_train_)


****************************************************************************************************
finished label encoding
****************************************************************************************************
finished data normalizing


In [16]:
# Load the saved Keras model from disk; other cells and add_auto_encoder_feature
# read it through the global name `autoencoder`, so this cell must run first.
autoencoder = load_model('/data/yunrui_li/fraud/fraud_detection/models/model.h5')









In [31]:
# Sanity check: (rows, columns) of the scaled hold-out split.
x_test_.shape

(304358, 21)

In [34]:
# Run the scaled training matrix through the autoencoder to get its reconstruction.
predictions = autoencoder.predict(x_train_)
# Per-row reconstruction error: mean of the squared element-wise differences.
mse = np.power(x_train_ - predictions, 2).mean(axis=1)

In [47]:
# One column per reconstructed dimension, then the per-row error as the last column.
df = pd.DataFrame(
    predictions,
    columns=["reconstructed_dim_%d" % i for i in range(predictions.shape[1])],
)
df["reconstruction_error"] = mse

In [61]:
# Raise the column display limit so the joined frame is shown without elided columns.
pd.set_option("display.max_columns", 100)

# Side-by-side preview of the raw training rows and their autoencoder features.
# Both operands go through reset_index(), so the result carries two "index" columns.
pd.concat([x_train.reset_index(), df.reset_index()], axis="columns")

Unnamed: 0,index,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,index.1,reconstructed_dim_0,reconstructed_dim_1,reconstructed_dim_2,reconstructed_dim_3,reconstructed_dim_4,reconstructed_dim_5,reconstructed_dim_6,reconstructed_dim_7,reconstructed_dim_8,reconstructed_dim_9,reconstructed_dim_10,reconstructed_dim_11,reconstructed_dim_12,reconstructed_dim_13,reconstructed_dim_14,reconstructed_dim_15,reconstructed_dim_16,reconstructed_dim_17,reconstructed_dim_18,reconstructed_dim_19,reconstructed_dim_20,reconstruction_error
0,246363,6716,70705,111441,1135.92,5,62,N,5,N,N,5,N,0,79,154847.0,247,70305,N,5813,102,0,1096121,0,0.922855,0.424791,0.549933,0.098942,0.802798,0.838150,0.000000,0.474100,0.0,0.0,0.534933,0.0,0.0,0.643321,0.725546,0.596005,0.669012,0.0,0.862285,0.950920,0.0,0.000846
1,1125098,5975,157315,182130,407.87,5,62,N,5,N,N,5,N,0,60,85734.0,263,92655,N,5817,102,0,1155340,1,0.836729,0.947282,0.883448,0.091057,0.778249,0.811184,0.000000,0.482356,0.0,0.0,0.510775,0.0,0.0,0.408361,0.576736,0.618658,0.897041,0.0,0.913469,0.925958,0.0,0.003152
2,899275,6413,106996,95162,592.25,5,62,N,2,N,N,5,N,0,22,145031.0,432,51387,N,5817,102,0,1703412,2,0.948547,0.647410,0.484338,0.101923,0.786149,0.740434,0.000000,0.131991,0.0,0.0,0.505655,0.0,0.0,0.199399,0.537068,0.674349,0.474580,0.0,0.831086,1.031928,0.0,0.005075
3,367068,5720,92041,177639,387.00,5,35,Y,8,N,N,5,N,0,53,90729.0,192,67459,N,6221,75,0,727442,3,0.783295,0.552764,0.875027,0.086367,0.824696,0.830714,0.961113,0.737349,0.0,0.0,0.550814,0.0,0.0,0.336500,0.577321,0.561406,0.650205,0.0,0.715640,0.720010,0.0,0.012325
4,401687,3348,54901,165434,1016.34,5,62,Y,8,N,N,5,N,0,16,22104.0,289,54828,N,3460,46,0,665781,4,0.534770,0.317893,0.808989,0.072265,0.854872,0.697914,1.000768,0.909359,0.0,0.0,0.520517,0.0,0.0,0.000000,0.355243,0.577320,0.563246,0.0,0.483750,0.468888,0.0,0.006188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1217424,1282388,6881,67997,204496,513.80,5,0,N,0,N,N,5,N,0,45,190409.0,457,59333,N,0,102,0,529978,1217424,1.028681,0.411723,0.989522,0.072657,0.815654,0.041676,0.000000,0.000000,0.0,0.0,0.553140,0.0,0.0,0.426053,0.691503,0.973342,0.549823,0.0,0.000000,0.943896,0.0,0.001026
1217425,1280825,5666,27859,112208,1201.74,5,62,N,2,N,N,5,N,0,60,224407.0,343,53687,N,5824,102,0,739724,1217425,0.963878,0.168792,0.564643,0.100854,0.803854,0.770255,0.000000,0.232540,0.0,0.0,0.520199,0.0,0.0,0.602829,0.728441,0.641203,0.485972,0.0,0.801012,1.013693,0.0,0.005498
1217426,1232733,6678,112297,191126,544.32,5,62,N,5,N,N,5,N,0,72,165017.0,247,23658,N,3454,102,0,1591467,1217426,0.925965,0.678926,0.917494,0.096374,0.833090,0.776348,0.000000,0.389214,0.0,0.0,0.539277,0.0,0.0,0.602720,0.771232,0.609791,0.221057,0.0,0.664241,0.873440,0.0,0.002620
1217427,1467373,6189,156598,129093,513.80,5,62,N,4,N,N,5,N,0,41,173110.0,263,92592,N,5817,102,0,1144115,1217427,0.916396,0.948923,0.643624,0.095868,0.782237,0.823418,0.000000,0.375980,0.0,0.0,0.541139,0.0,0.0,0.413250,0.604042,0.614242,0.863875,0.0,0.848603,0.946504,0.0,0.001521


In [54]:
# Sanity check: (rows, columns) of the autoencoder feature frame.
df.shape

(1217429, 22)

In [55]:
# Sanity check: (rows, columns) of the raw training split, for comparison with df.shape.
x_train.shape

(1217429, 22)

In [57]:
# Display the raw training split; its index holds the shuffled row labels left by train_test_split.
x_train

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,hcefg,insfg,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
246363,6716,70705,111441,1135.92,5,62,N,5,N,N,5,N,0,79,154847.0,247,70305,N,5813,102,0,1096121
1125098,5975,157315,182130,407.87,5,62,N,5,N,N,5,N,0,60,85734.0,263,92655,N,5817,102,0,1155340
899275,6413,106996,95162,592.25,5,62,N,2,N,N,5,N,0,22,145031.0,432,51387,N,5817,102,0,1703412
367068,5720,92041,177639,387.00,5,35,Y,8,N,N,5,N,0,53,90729.0,192,67459,N,6221,75,0,727442
401687,3348,54901,165434,1016.34,5,62,Y,8,N,N,5,N,0,16,22104.0,289,54828,N,3460,46,0,665781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282388,6881,67997,204496,513.80,5,0,N,0,N,N,5,N,0,45,190409.0,457,59333,N,0,102,0,529978
1280825,5666,27859,112208,1201.74,5,62,N,2,N,N,5,N,0,60,224407.0,343,53687,N,5824,102,0,739724
1232733,6678,112297,191126,544.32,5,62,N,5,N,N,5,N,0,72,165017.0,247,23658,N,3454,102,0,1591467
1467373,6189,156598,129093,513.80,5,62,N,4,N,N,5,N,0,41,173110.0,263,92592,N,5817,102,0,1144115
