Data Preparation
* Data loading from data_train.json
* Basic features(user type, application time) processing and exploration
 * Application time ---- processed into day of week, time of the day.
 * Check the relationship between fraud and user type/application time
* Sequential User behavior features processing and exploration
 * stay time, lag time between pages, time span of an application, page transitions overview.
* Processing data for model building
 * Sequential data padding
 * Standardization
 * Save in json format
* Generate Markov Transition Field along features. 
 * Zhang, R., Zheng, F., & Min, W. (2018). Sequential Behavioral Data Processing Using Deep Learning and the Markov Transition Field in Online Fraud Detection. arXiv preprint arXiv:1808.05329.
 * All features are binned and one-hot encoded into a binary vector.
 * matrix[i,j]  represents the probability that j=1 given i=1
* Generate Markov Transition Field along timesteps and Gramian Angular Field(code only)
 * Wang, Z., & Oates, T. (2015, April). Encoding time series as images for visual inspection and classification using tiled convolutional neural networks. In Workshops at the Twenty-Ninth AAAI Conference on Artificial Intelligence.

In [0]:
from datetime import datetime
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from scipy import stats
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import os
import pickle


In [None]:
# Mount Google Drive (Colab only) and make the project's data folder the working directory.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# The relative paths used later ('raw/...', 'processed/') are resolved against this folder.
os.chdir("drive/My Drive/Online Lending/Sequential Embedding/Spring 2020/low_income_data") 
# os.chdir("") # give the path to the data file.
# Show the folder contents as a quick check that the directory change worked.
os.listdir()

In [0]:
# Output folder for the processed artifacts. File names are appended to it with
# plain string concatenation further down, so the trailing slash is required.
data_dir = 'processed/'

# Data loading

In [0]:
# change to raw data folder
# os.chdir(os.path.abspath(os.path.join(data_dir, "../../data/raw")))

In [0]:
%%time
# Load the raw records. The file is read line by line and every non-blank line
# is parsed as one JSON document.
sequential_data = []

line_number = 0
max_lines = 100  # only referenced by the optional debugging cap below
# with open('raw/data_train.json', 'r') as f:
with open('raw/test_new.json', 'r') as f:
    for line in f:
#         if len(sequential_data) > max_lines:
#             break
        # A blank line would make json.loads raise, so skip it instead of
        # aborting the whole load.
        if not line.strip():
            continue
        sequential_data.append(json.loads(line))
# sequential_data = sequential_data[:100]
len(sequential_data)

CPU times: user 1.49 s, sys: 420 ms, total: 1.91 s
Wall time: 2.72 s


In [0]:
# Split every raw record into two dictionaries that share the key
# "<user_id>|<application_time>":
#   sequential_driver   -> the record's 'order_info' dictionary
#   sequential_behavior -> the record's page events that ended before the application
sequential_driver = {}
sequential_behavior = {}

for record in sequential_data:
    payload = record[1]
    order_info = payload['order_info']
    application_time = int(order_info['order_time'])
    session_key = f"{record[0]}|{application_time}"

    sequential_driver[session_key] = order_info

    # We only keep events that end before the application time. The extra 100
    # margin is not necessary for offline data cleaning, but the same trick is
    # sometimes used for online calculation to avoid network slowdown.
    cutoff = application_time - 100
    sequential_behavior[session_key] = [
        event for event in payload['data'] if event['petime'] <= cutoff
    ]

len(sequential_behavior), len(sequential_driver)

(30672, 30672)

# Preprocessing data for model building

In [0]:
unique_user_session = list(set(sequential_behavior.keys()))
# Keep, in dictionary order, only the sessions that have at least one behavior
# event; this list fixes the row order of everything built afterwards.
keys_order = [
    session_key
    for session_key, events in sequential_behavior.items()
    if events != []
]
keys_order[:10]
# len(keys_order)

['0ed9672fa61f4d6da241a6289000e2f2|1507102200000',
 '48b72ca5b43248d3b50dfadb76c651a2|1508810700000',
 '8e669f481a4645c89de0f29f980522e9|1508354520000',
 'd9a86017a67140ab96d9c405d1ebb02d|1507129920000',
 '96e3ded73e6b47ba91108e5b6a923c81|1509348180000',
 '593c8f76b5164c08a52c7fc1bf99d283|1508970600000',
 'e87ffe735fba4541a009967d25b0e714|1507058160000',
 '8e6c5ba3d14c41539b64816c4692f142|1507671840000',
 '3f8e1ae4c91147138fbdc5ea8aa17302|1508903820000',
 'ed76629b471b4496ab9b979eda237c20|1508883240000']

## Processing Data for Word2vec Embedding

In [0]:
# This is for word2vec embedding. In this part, we will not do one-hot encoding for page types.
def data_process_for_embedding(sequence_for_a_single_application):
    '''
    Process the page events of a single application.

    The event list is sorted IN PLACE by 'petime' (the caller's list is
    reordered). Two parallel lists are then returned:

    * the page names ('pname') in that order;
    * the stay time of each page, computed from petime - pstime as
      log(difference / 1000 + 1). Non-positive differences, and differences
      whose floor division by 1000 reaches 800 or more, are treated as
      outliers and marked with -1.

    The lag time between consecutive pages is currently not produced.
    '''
    events = sequence_for_a_single_application
    events.sort(key=lambda event: event['petime'])

    page_sequence = []
    page_stay_time = []
    for event in events:
        page_sequence.append(event['pname'])
        duration = event['petime'] - event['pstime']
        if duration > 0 and duration // 1000 < 800:
            page_stay_time.append(np.log(duration / 1000 + 1))
        else:
            # outlier marker
            page_stay_time.append(-1)

    return page_sequence, page_stay_time


def get_data_for_embedding(x):
    """
    Build the word2vec inputs for every session listed in the global keys_order.

    x maps a session key to its list of page events. Each list is passed to
    data_process_for_embedding, and the results are returned as two parallel
    lists (page-name sequences, stay-time sequences) in keys_order order.
    """
    processed = [data_process_for_embedding(x[session_key])
                 for session_key in keys_order]
    sequence_data = [pages for pages, _ in processed]
    stay_time_sequence = [stay_times for _, stay_times in processed]
    return sequence_data, stay_time_sequence


# Run the preprocessing over all sessions: one page-name sequence and one
# stay-time sequence per entry of keys_order.
sequence_data_for_embedding, sequence_stay_time = get_data_for_embedding(sequential_behavior)
# sequence_data_for_embedding=np.array(sequence_data_for_embedding)
# sequence_data_for_embedding.shape
# Print a few sizes to illustrate that the sequences have different lengths.
print("some information about the the data shape to help you undestand the data:")
print(f"number of users: {len(sequence_data_for_embedding)}")
print(
    f"timestamps length of behaviors of the first user: {len(sequence_data_for_embedding[0])}")
print(
    f"timestamps length of behaviors of the second user: {len(sequence_data_for_embedding[1])}")
print("Different user may have different page viewing sequence length(i.e. timestaps length), but we later decide to modify all to 60 timestamps")

some information about the the data shape to help you undestand the data:
number of users: 19617
timestamps length of behaviors of the first user: 15
timestamps length of behaviors of the second user: 21
Different user may have different page viewing sequence length(i.e. timestaps length), but we later decide to modify all to 60 timestamps


In [0]:
# This is to train our own word2vec model. NB: we do not use pre-trained word2vec model!
# Every page-name sequence is fed to Word2Vec as one "sentence"; min_count=1 is
# passed so that no page name is dropped for being rare.
# size=50 is the embedding length that the padding step further down relies on.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm against the installed gensim version.
model = Word2Vec(sequence_data_for_embedding, min_count=1,
                 size=50, workers=3, window=5, sg=1)

In [0]:
# Replace every page name by its learned word2vec vector.
# The nesting stays the same (users -> behaviors), but each behavior is now a
# vector of length 50 (the embedding length is a word2vec hyperparameter).
sequence_data_embedded = [
    [model.wv[page_name] for page_name in page_sequence]
    for page_sequence in sequence_data_for_embedding
]
print(f"number of users: {len(sequence_data_embedded)}")
print(f"number of behaviors of first user: {len(sequence_data_embedded[0])}")
print(f"number of behaviors of second user: {len(sequence_data_embedded[1])}")
print(f"embedding size of every page type: {len(sequence_data_embedded[0][0])}")
print("the shape of sequence_data_embedded is (19617,X,50) where X is the number of behaviors of each user, it is not a fixed number")

number of users: 19617
number of behaviors of first user: 15
number of behaviors of second user: 21
embedding size of every page type: 50
the shape of sequence_data_embedded is (19617,X,50) where X is the number of behaviors of each user, it is not a fixed number


## Pad the sequence into the same length

In [0]:
padding_vector = [0]*50 # 50 is the embedding size
maxLen = 60

# Bring every embedded sequence to exactly maxLen timesteps: longer sequences
# are cut after maxLen entries, shorter ones are filled up at the end with
# padding_vector (the same list object is reused for every padded position).
padded_sequence_data_embedded = []
for embedded_sequence in sequence_data_embedded:
    fixed_length = embedded_sequence[:maxLen]
    fixed_length.extend([padding_vector] * (maxLen - len(fixed_length)))
    padded_sequence_data_embedded.append(fixed_length)
len(padded_sequence_data_embedded), len(padded_sequence_data_embedded[0]), len(padded_sequence_data_embedded[0][0])

(19617, 60, 50)

## Pad Stay Time

In [0]:
padding_vector = 0 # scalar pad value: each stay-time timestep is a single number

# Same truncate-or-pad rule as for the embedded sequences, applied to the
# stay-time lists: cut after maxLen entries, otherwise append zeros up to maxLen.
padded_sequence_stay_time = []
for stay_times in sequence_stay_time:
    fixed_length = stay_times[:maxLen]
    fixed_length.extend([padding_vector] * (maxLen - len(fixed_length)))
    padded_sequence_stay_time.append(fixed_length)
len(padded_sequence_stay_time), len(padded_sequence_stay_time[0])

(19617, 60)

In [0]:
# check if the len has been padded to maxLen
# First the original (unpadded) length of the first sequence is printed ...
print(len(sequence_stay_time[0])) 
# print("the shape of padded_sequence_data_embedded is {0} where {1} is the max len we set".format(padded_sequence_data_embedded.shape, len(padded_sequence_data_embedded[0])))
# ... then the padded and the original sequence are displayed together for comparison.
[padded_sequence_stay_time[0], sequence_stay_time[0]]

15


[[2.320916049678769,
  0.8082599876604498,
  1.6397731122301733,
  1.4678743481123135,
  1.6867692553704239,
  1.7967470107390942,
  1.660701206371642,
  2.273259263612217,
  2.595180077306471,
  0.9439058989071285,
  0.8666802313208206,
  1.3724489542978375,
  1.0511712140679732,
  1.4156104154539437,
  1.7112722183153684,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [2.320916049678769,
  0.8082599876604498,
  1.6397731122301733,
  1.4678743481123135,
  1.6867692553704239,
  1.7967470107390942,
  1.660701206371642,
  2.273259263612217,
  2.595180077306471,
  0.9439058989071285,
  0.8666802313208206,
  1.3724489542978375,
  1.0511712140679732,
  1.4156104154539437,
  1.7112722183153684]]

## Save the Processed Data in Json Format

In [0]:
# Column names of the non-sequential ("driver") features; only referenced by
# the commented-out loop below.
driver_columns=['new_client', '1', '2', '3', '4', '5', '6', '7', '1_time', '2_time','3_time', '4_time', '5_time', '6_time']
# Output containers. They are created empty here and stay empty, because the
# loop that would fill them is commented out.
features_sequential={} ## sequential data
features_nonsequential={} ## non sequential features -- user type, application time
features_sequential_embedded={} ## embedded sequential data
label={}
# NOTE(review): the loop below refers to `keys`, `driver` and
# `padded_sequence_data`, none of which is defined in the visible cells.
# for i in range(len(keys)):
#     uid=keys[i]
#     driver_data=driver[driver["index"]==uid]
#     feature_nonsequential=list(driver_data[driver_columns].values[0].astype("float"))
#     features_sequential[uid]=padded_sequence_data[i]
#     features_nonsequential[uid]=feature_nonsequential
#     features_sequential_embedded[uid]=padded_sequence_data_embedded[i]
#     label[uid]=driver_data["label"].values[0]
#     if i%1000==0:
#         print(i,"/", len(keys))

In [0]:
# driver[["index", "label"]]

In [0]:
# Sanity check: length of one timestep vector of the first padded sequence.
len(padded_sequence_data_embedded[0][0])

50

In [0]:
# len(features_nonsequential["56f889ee11df4a72955147cb2f29a638|1509322980000"])
# features_sequential_embedded = dict(zip(keys, padded_sequence_data_embedded))
# Write the padded embedding sequences and the padded stay-time sequences
# into pickle files inside data_dir.
for file_name, payload in (
    ('embedding_sequence.p', padded_sequence_data_embedded),
    ('stay_time_sequence.p', padded_sequence_stay_time),
):
    with open(data_dir + file_name, "wb") as handle:  # Pickling
        pickle.dump(payload, handle)

In [0]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps``.
    NumPy booleans, integers and floating-point scalars become ``bool``,
    ``int`` and ``float``; arrays become (nested) lists. Any other
    unsupported object is delegated to the base class, which raises
    ``TypeError``.
    """

    def default(self, obj):
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it a NumPy boolean is rejected as not JSON serializable.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


        
# with open(data_dir+'features_sequential_embedded.json', 'w') as f: # LSTM input with word2vec embedded 
#     json.dump(features_sequential_embedded, f, cls=NpEncoder)
# print("Done saving padded_sequential_features")


# with open(data_dir+'padded_sequential_features.json', 'w') as f:
#     json.dump(features_sequential, f, cls=NpEncoder)
# print("Done saving padded_sequential_features")

# with open(data_dir+'non_sequential_features.json', 'w') as f:
#     json.dump(features_nonsequential, f, cls=NpEncoder)
# print("Done saving features_nonsequential")

# with open(data_dir+'label.json', 'w') as f:
#     json.dump(label, f, cls=NpEncoder)
# print("Done saving label.json")

# Generate Markov Transition Field along features

In [None]:
# with open(data_dir+'padded_sequential_features_3.json') as f:
#     sequential_features = json.load(f)
# Stack the per-session feature sequences into one array (one entry per session).
# NOTE(review): in the visible cells features_sequential is only created as an
# empty dict (the loop that fills it is commented out), so feature1 would be
# empty here unless the dict is populated or the JSON file above is loaded.
sequential_features = features_sequential
feature1 = np.asarray(list(sequential_features.values()))
len(feature1[0])

- To transform continuous time features into categorical features, we need to cut them into bins
- We first explore the distribution of the time features

In [0]:
# Flattened copy of feature column 15 over all sessions and timesteps
# (the column that the binning step treats as lag time).
x = feature1[:, :, 15].flatten()
# Summary statistics of that column, used to choose the binning.
pd.DataFrame(x).describe()

- Cut stay time, lag time into 8 categories

In [0]:
# Discretise the two continuous time features (column 14: stay time,
# column 15: lag time) into 8 bins labelled 0..7 so that they can be one-hot
# encoded afterwards.
# Each column is binned from its OWN values. The sequence length is taken from
# feature1 itself instead of being hard-coded.
stay_time = feature1[:, :, 14].flatten()
stay_time = pd.cut(stay_time, 8, labels=list(range(8)))
stay_time = np.array(stay_time).reshape(-1, feature1.shape[1])
feature1[:, :, 14] = stay_time

lag_time = feature1[:, :, 15].flatten()
lag_time = pd.cut(lag_time, 8, labels=list(range(8)))
lag_time = np.array(lag_time).reshape(-1, feature1.shape[1])
feature1[:, :, 15] = lag_time

- Generate transition matrix
- A 31 by 31 matrix 
- All features are binned and one-hot encoded into a binary vector.
- matrix[i,j]  represents the probability that j=1 given i=1

In [0]:
from sklearn import preprocessing
# Fit a one-hot encoder on the eight bin labels 0..7, given as a column vector.
# `enc` is reused below to encode the binned time columns; `onehotlabels` is
# the encoding of the labels themselves (a quick test of the encoder).
X = np.arange(8)[:, np.newaxis]
enc = preprocessing.OneHotEncoder(categories='auto').fit(X)
onehotlabels = enc.transform(X).toarray()

In [0]:
# Build one transition matrix per session and collect them in `arr`.
# matrix[given, col] = (sum of column `col` over the timesteps where column
# `given` equals 1) / (sum of column `given`), i.e. the share of "given = 1"
# timesteps at which `col` is also set when the columns are binary.
arr = []
for sample_index in range(feature1.shape[0]):
    seq = feature1[sample_index]
    # Drop the timesteps whose first feature is -1
    # (presumably the padded timesteps -- TODO confirm).
    seq = np.array([step for step in seq if step[0] != -1])

    # One-hot encode the binned time columns 14 and 15.
    onehot_staytime = enc.transform(seq[:, 14][:, np.newaxis]).toarray()
    onehot_lagtime = enc.transform(seq[:, 15][:, np.newaxis]).toarray()

    # Remove column 0 and the two binned time columns, then append the
    # one-hot versions of the time columns.
    seq = np.delete(seq, np.s_[0, 14, 15], 1)
    seq = np.concatenate([seq, onehot_staytime, onehot_lagtime], axis=1)

    n_states = seq.shape[1]
    matrix = np.zeros([n_states, n_states])
    for given in range(n_states):
        total = np.sum(seq[:, given])
        if total == 0:
            # This feature is never set in the session: its row stays all zero.
            continue
        sub = seq[seq[:, given] == 1]
        # Column sums of the selected timesteps fill the whole row at once.
        matrix[given] = sub.sum(axis=0) / total
    arr.append(matrix)

- Save the matrix

In [0]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps``.
    NumPy booleans, integers and floating-point scalars become ``bool``,
    ``int`` and ``float``; arrays become (nested) lists. Any other
    unsupported object is delegated to the base class, which raises
    ``TypeError``.
    """

    def default(self, obj):
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it a NumPy boolean is rejected as not JSON serializable.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
# Write the list of transition matrices to data_dir as JSON; NpEncoder turns
# each NumPy array into nested lists.
with open(data_dir+'featurematrix.json', 'w') as f:
    json.dump(arr, f,cls=NpEncoder)