In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append("../fraud_detection/src/")

from util import s_to_time_format, string_to_datetime,hour_to_range
from tqdm import tqdm

#-----------------------------
# load data
#-----------------------------
df_train = pd.read_csv("/data/yunrui_li/fraud/dataset/train.csv")
df_test = pd.read_csv("/data/yunrui_li/fraud/dataset/test.csv")

# Output column -> attribute read from the parsed `loctm` value.
time_parts = (
    ("loctm_hour_of_day", "hour"),
    ("loctm_minute_of_hour", "minute"),
    ("loctm_second_of_min", "second"),
)

# Replace the raw `loctm` column of both frames (in place) with separate
# hour / minute / second columns.
for df in (df_train, df_test):
    # int -> str -> project helpers; the helpers come from util and are
    # presumed to yield an object exposing .hour/.minute/.second.
    parsed_time = (
        df.loctm.astype(int).astype(str)
        .apply(s_to_time_format)
        .apply(string_to_datetime)
    )
    for column, part in time_parts:
        df[column] = parsed_time.apply(lambda t, part=part: getattr(t, part))

    # The raw time column is no longer needed once it has been split up.
    df.drop(columns=["loctm"], inplace=True)


In [2]:
gby = "bacno"

# Order every frame by account first, then chronologically (day, hour, minute,
# second), so that each account's rows appear in transaction order.
sort_keys = [gby, "locdt", "loctm_hour_of_day", "loctm_minute_of_hour", "loctm_second_of_min"]
for frame in (df_train, df_test):
    frame.sort_values(by=sort_keys, inplace=True)

In [3]:
# Allow up to 100 columns when rendering wide frames.
pd.set_option("display.max_columns", 100)

df_train

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,fraud_ind,hcefg,insfg,iterm,locdt,mcc,mchno,ovrlt,scity,stocn,stscd,txkey,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min
502741,6413,1,117264,934.49,5,62,N,4,N,N,0,5,N,0,3,275,53099,N,5817,102,0,1549254,20,0,0
994932,6189,1,117264,939.19,5,62,Y,2,N,N,0,5,N,0,4,317,90151,N,1463,102,0,1837177,22,14,28
606676,6189,1,117264,1267.47,5,62,Y,2,N,N,0,5,N,0,25,317,90151,N,1463,102,0,1859385,21,26,35
1388156,6231,1,117264,1017.37,5,62,N,5,N,N,0,5,N,0,30,277,12726,N,5817,102,0,994333,20,9,47
10441,6189,1,117264,613.81,5,62,N,4,N,N,0,5,N,0,34,263,92571,N,5817,102,0,1639576,15,5,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353575,5975,163884,211804,1119.11,5,62,N,5,N,N,0,5,N,0,65,288,88870,N,5817,102,0,1801193,15,52,56
1215077,6767,163884,211804,1334.91,5,62,N,5,N,N,0,5,N,0,65,247,6475,N,5817,102,0,872380,17,22,3
1225070,6767,163884,211804,1125.71,5,62,N,5,N,N,0,5,N,0,65,247,6475,N,5817,102,0,872476,17,55,48
884262,6767,163884,211804,1103.64,5,62,N,5,N,N,0,5,N,0,65,247,6475,N,5817,102,0,1658468,18,8,3


In [4]:
# Stack train and test rows into one frame. The two frames do not share an
# identical column set, so pandas warns about its implicit column sorting;
# `sort=False` opts into the future default (keep column order as encountered)
# and silences the FutureWarning.
df = pd.concat([df_train, df_test], axis = 0, sort = False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [5]:
# Number of distinct merchant ids (mchno) across train and test combined.
df.mchno.nunique()

102783

In [6]:
# Largest number of training transactions belonging to a single account.
df_train.groupby("bacno").size().max()

1117

In [7]:
from tqdm import tqdm

In [8]:

max_seq_lent = 100  # every account sequence is cut or zero-padded to this many steps
feat = ["conam","locdt","loctm_hour_of_day","loctm_minute_of_hour","loctm_second_of_min"]
num_feat = len(feat)


def _pad_or_truncate(values, length):
    """Return `values` with exactly `length` entries along axis 0.

    Longer inputs keep only their first `length` entries; shorter (or equal)
    inputs are extended with zeros at the end. Trailing axes are untouched.
    """
    if values.shape[0] > length:
        return values[:length]
    pad = np.zeros((length - values.shape[0],) + values.shape[1:])
    return np.concatenate([values, pad])


def _build_sequences(frame, group_col, feature_cols, seq_len, with_label):
    """Turn a transaction frame into one fixed-length sequence per group.

    Parameters
    ----------
    frame : DataFrame holding `group_col`, `feature_cols`, "mchno", "txkey"
        (and "fraud_ind" when `with_label` is true).
    group_col : column whose values define the groups.
    feature_cols : numeric columns stacked into the (seq_len, n_features) input.
    seq_len : number of time steps kept per group.
    with_label : when true, also emit one 0/1 label per group.

    Returns
    -------
    (keys, numeric_seqs, merchant_seqs, labels) as parallel lists, one entry
    per group. `keys` holds ALL txkeys of the group joined by "_", including
    those of rows dropped by truncation. `labels` is empty when `with_label`
    is false.
    """
    keys, numeric_seqs, merchant_seqs, labels = [], [], [], []
    for _, group in tqdm(frame.groupby(group_col)):
        keys.append("_".join(str(k) for k in group.txkey.tolist()))
        numeric_seqs.append(_pad_or_truncate(group[feature_cols].values, seq_len))
        merchant_seqs.append(_pad_or_truncate(group["mchno"].values, seq_len))
        if with_label:
            # A group is positive as soon as any of its transactions is fraud.
            labels.append(int((group.fraud_ind == 1).any()))
    return keys, numeric_seqs, merchant_seqs, labels


txkey, x_train, x_train_2, y_train = _build_sequences(
    df_train, gby, feat, max_seq_lent, with_label=True)

# The test frame is processed without labels.
txkey_test, x_test, x_test_2 = _build_sequences(
    df_test, gby, feat, max_seq_lent, with_label=False)[:3]

100%|██████████| 95214/95214 [01:54<00:00, 830.48it/s]
100%|██████████| 71099/71099 [01:16<00:00, 933.11it/s]


In [9]:
# Number of per-group key strings (one per account sequence) for train and test.
len(txkey),len(txkey_test)

(95214, 71099)

In [10]:
# Stack the per-account lists into dense arrays.
x_train, x_train_2, y_train = np.array(x_train), np.array(x_train_2), np.array(y_train)
x_test, x_test_2 = np.array(x_test), np.array(x_test_2)

# Sanity check: there must be exactly one label per training sequence.
print (len(y_train) == len(x_train))
x_train.shape

True


(95214, 100, 5)

In [11]:
# Persist the training inputs and labels so the slow sequence-building step can
# be skipped later. The labels file must be written from y_train: the previous
# version stored a second copy of x_train under "y_train.npy".
np.save("x_train.npy", x_train)
np.save("y_train.npy", y_train)

In [12]:
# x_train_ = np.load("x_train.npy")

In [13]:
# Share of negative (0) and positive (1) account-level labels.
pd.Series(y_train).value_counts(normalize = True)

0    0.909226
1    0.090774
dtype: float64

In [14]:
# Shape of the numeric-feature training array.
x_train.shape

(95214, 100, 5)

In [15]:
# Shape of the merchant-id (mchno) training array.
x_train_2.shape

(95214, 100)

In [16]:
import keras
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model
import numpy as np
np.random.seed(0)  # Seeds NumPy's global RNG only.
# NOTE(review): the TensorFlow backend keeps its own random state, which is not
# seeded here -- confirm whether run-to-run reproducibility is required.

# Rows in the merchant embedding table: the distinct mchno count over train and
# test (102783) plus a safety margin of 500.
# NOTE(review): every id fed to the Embedding must be < input_dim; verify that
# max(mchno) fits. Zero-padded time steps are fed as id 0 and are not masked.
merchant_vocab_size = 102783+500

# Numeric input: one (max_seq_lent, num_feat) float sequence per account.
main_input = Input(shape=(max_seq_lent,num_feat,), dtype='float32', name='main_input')
print ("main_input",main_input)
# Merchant input: one integer merchant id per time step.
main_input_2 = Input(shape=(max_seq_lent,), dtype='int32', name='main_input_2')
x = main_input
print ("x",x.shape)
# Encode every merchant id as a dense 15-dimensional vector.
x2 = Embedding(output_dim=15, input_dim=merchant_vocab_size, input_length=max_seq_lent)(main_input_2)
print ("x2",x2.shape)
# Join numeric features and merchant embedding along the feature axis.
x = keras.layers.concatenate([x, x2], axis = 2)
print ("x",x.shape)
# The LSTM returns its output at every time step (lstm_out) together with the
# final hidden and cell states. The classifier below consumes the final hidden
# state; the per-step outputs are extracted later through the layer name.
lstm_out,state_h,state_c = LSTM(10, name = "lstm_out", return_sequences = True, return_state = True)(x)
print (lstm_out)

# Small densely-connected stack on top of the final hidden state.
x = Dense(10, activation='relu',name="dense_one")(state_h)
x = Dense(5, activation='relu',name="dense_two")(x)
x = Dense(3, activation='relu',name="dense_three")(x)

# Sigmoid unit producing the account-level fraud probability.
main_output = Dense(1, activation='sigmoid', name='main_output')(x)
model = Model(inputs=[main_input, main_input_2], outputs=[main_output])
model.compile(optimizer='rmsprop', 
              loss='binary_crossentropy',
              metrics=['accuracy'],
              loss_weights=None)

Using TensorFlow backend.



main_input Tensor("main_input:0", shape=(?, 100, 5), dtype=float32)
x (?, 100, 5)


x2 (?, 100, 15)
x (?, 100, 20)
Tensor("lstm_out/transpose_1:0", shape=(?, ?, 10), dtype=float32)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [17]:
# from sklearn.model_selection import train_test_split
# x_train_, x_test_, y_train_, y_test_ = train_test_split(
#     x_train, y_train, test_size=0.20, random_state=1030)

In [18]:
import os
# Expose only CUDA device 0 to this process.
# NOTE(review): this runs after keras/TensorFlow were imported and the model was
# built in an earlier cell -- confirm the setting still takes effect, or move it
# ahead of the framework import.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [19]:
# Single source of truth for the mini-batch size. The previous version defined
# batch_size = 64 but then passed a hard-coded 32 to fit(); 32 (the value that
# was actually used) is kept.
batch_size = 32

# Inputs and targets are passed by layer name. Per the Keras documentation,
# validation_split holds out the LAST 20% of the samples (taken before
# shuffling) for validation.
model.fit({'main_input': x_train, 
           'main_input_2': x_train_2,
          },
          {'main_output': y_train},
          epochs=10, batch_size=batch_size, validation_split = 0.2)


Train on 76171 samples, validate on 19043 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

# train

In [None]:
# new_model = Model(model.inputs, model.get_layer("dense_three").output)
# hidden_out = new_model.predict({'main_input': x_train})
# hidden_out.shape

In [None]:
# new_model = Model(model.inputs, model.get_layer("main_output").output)
# prediction = new_model.predict({'main_input': x_train})
# prediction.shape

In [None]:
# Sub-model that stops at the LSTM layer. That layer was created with
# return_sequences=True and return_state=True, so it yields three tensors:
# the per-step outputs plus the final hidden and cell states.
new_model = Model(model.inputs, model.get_layer("lstm_out").output)
# The model has two inputs; both must be fed (the merchant-id input was
# missing before, which makes predict fail).
lstm_out, state_h, state_c = new_model.predict({'main_input': x_train,
                                                'main_input_2': x_train_2})
lstm_out.shape

In [None]:
# Flatten the per-account LSTM outputs back to one feature vector per
# transaction key. Each entry of txkey holds all txkeys of an account joined
# by "_", in the same order as the time steps of the matching lstm_out entry.
key_ = []
lstm_features = []
for joined_keys, seq_out in zip(txkey, lstm_out):
    account_keys = joined_keys.split("_")
    # Only the first 100 transactions have a time step of their own.
    n_steps = min(len(account_keys), 100)
    n_overflow = len(account_keys) - n_steps

    key_.extend(account_keys)
    lstm_features.extend(seq_out[:n_steps])
    # Transactions beyond the 100th reuse the vector of the last kept step.
    lstm_features.extend([seq_out[n_steps - 1]] * n_overflow)

In [None]:
# One column per LSTM output dimension (the LSTM layer was built with 10 units).
no_components = 10
latent_columns = [
    "{}_latent_features_{}".format("lstm", idx) for idx in range(no_components)
]
lstm_features = pd.DataFrame(lstm_features, columns=latent_columns)
lstm_features

In [None]:
# Put the transaction keys next to their latent features (both are in the same
# row order, so a positional column-wise concat lines them up).
key_frame = pd.DataFrame({"txkey": key_})
output = pd.concat([key_frame, lstm_features], axis=1)
output

# test

In [None]:
# Sub-model that stops at the LSTM layer, which yields the per-step outputs
# plus the final hidden and cell states.
new_model = Model(model.inputs, model.get_layer("lstm_out").output)
# Both model inputs must be fed (the merchant-id input was missing before,
# which makes predict fail).
lstm_out, state_h, state_c = new_model.predict({'main_input': x_test,
                                                'main_input_2': x_test_2})
print (lstm_out.shape)

# Flatten the per-account outputs back to one feature vector per transaction
# key. Each entry of txkey_test holds all txkeys of an account joined by "_",
# in the same order as the time steps of the matching lstm_out entry.
key_ = []
lstm_features = []
for joined_keys, seq_out in zip(txkey_test, lstm_out):
    account_keys = joined_keys.split("_")
    # Only as many transactions as there are time steps have their own vector.
    n_steps = min(len(account_keys), seq_out.shape[0])
    key_.extend(account_keys)
    lstm_features.extend(seq_out[:n_steps])
    # Transactions dropped by truncation reuse the vector of the last kept step.
    lstm_features.extend([seq_out[n_steps - 1]] * (len(account_keys) - n_steps))

# One column per LSTM output dimension (the LSTM layer was built with 10 units).
no_components = 10
lstm_features = pd.DataFrame(
    lstm_features,
    columns = ["{}_latent_features_{}".format("lstm",i) for i in range(no_components)]
                            )
print (lstm_features.shape)

# Keys and features share the same row order, so a positional column-wise
# concat lines them up.
output_test = pd.concat(
    [pd.DataFrame(key_,columns = ["txkey"]),
     lstm_features
    ],
    axis = 1
)
output_test.shape

In [None]:
# Frequency of each distinct value of the first latent feature in the test output.
output_test.lstm_latent_features_0.value_counts()

In [None]:
# Number of distinct transaction keys in the training frame.
df_train.txkey.nunique()

In [None]:
# Number of distinct transaction keys in the test frame.
df_test.txkey.nunique()

In [None]:
# Stack the train and test feature tables row-wise. Note that this rebinds
# `df`, which earlier held the concatenated raw frames.
feature_tables = [output, output_test]
df = pd.concat(feature_tables, axis=0)
df.shape

In [None]:
# Write the combined per-transaction LSTM features to CSV, without the row index.
df.to_csv("../fraud_detection/features/lstm_features.csv", index = False)