## Train LSTM Network to Predict Occurrence of Next Event

### Load Libraries

In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Dask for lazy loading and computation of data
import dask.dataframe as dd
import time
from dask import delayed
import dask.array as da

#Word embedding
from gensim.models import Word2Vec
# Keras DeepLearning Framework
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Dropout, Embedding, LSTM

Using TensorFlow backend.


### Load Data
- log_file.csv is a transformed version of orignal event log data in xes format
- Orignal dataset by:
- van Dongen, B.F. (Boudewijn) (2017) BPI Challenge 2017. Eindhoven University of Technology. Dataset. https://doi.org/10.4121/uuid:5f3067df-f10b-45da-b98b-86ae4c7a310b

In [2]:
df = dd.read_csv("log_file.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,trace:concept:name,trace:ApplicationType,trace:LoanGoal,trace:RequestedAmount,caseid,event:concept:name,event:org:resource,event:EventID,event:lifecycle:transition,...,event:EventOrigin,event:FirstWithdrawalAmount,event:MonthlyCost,event:Accepted,event:CreditScore,event:OfferedAmount,event:Selected,event:NumberOfTerms,event:OfferID,Id
0,0,Application_652823628,New credit,Existing loan takeover,20000.0,Application_652823628,A_Create Application,User_1,Application_652823628,complete,...,Application,,,,,,,,,652823628
1,1,Application_652823628,New credit,Existing loan takeover,20000.0,Application_652823628,A_Submitted,User_1,ApplState_1582051990,complete,...,Application,,,,,,,,,652823628
2,2,Application_652823628,New credit,Existing loan takeover,20000.0,Application_652823628,W_Handle leads,User_1,Workitem_1298499574,schedule,...,Workflow,,,,,,,,,652823628
3,3,Application_652823628,New credit,Existing loan takeover,20000.0,Application_652823628,W_Handle leads,User_1,Workitem_1673366067,withdraw,...,Workflow,,,,,,,,,652823628
4,4,Application_652823628,New credit,Existing loan takeover,20000.0,Application_652823628,W_Complete application,User_1,Workitem_1493664571,schedule,...,Workflow,,,,,,,,,652823628


### Group events
- Group events part of same transaction
- In absence of known final event in the sequence , added event type: 'End' after occurrence of last event.

In [7]:
event_grouped = df.groupby('Id')["event:concept:name"].apply(list)
event_grouped = event_grouped.map_partitions(lambda x: x + ["End"])

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  """Entry point for launching an IPython kernel.


### Lazy loading of events for training Word2Vec Embedding

In [8]:
def gen_events(event_list):
    for x in range(event_list.npartitions):
            events = event_list.get_partition(x).compute()
            events = events.tolist()
            for x in events:
                yield x

In [56]:
class generate_Sequence():
    '''
    Streaming class to generate grouped events in a lazy way to avoid issues of RAM 
    running out of Memory.
    '''
    def __init__(self, generator_function,event_list):
        self.event_list = event_list
        self.generator_function = generator_function
        self.generator = self.generator_function(self.event_list)

    def __iter__(self):
        # reset the generator
        self.generator = self.generator_function(self.event_list)
        return self

    def __next__(self):
        result = next(self.generator)
        if result is None:
            raise StopIteration
        else:
            return result

In [10]:
iterate = generate_Sequence(gen_events,event_grouped)

In [13]:
w2vmodel = Word2Vec(iterate)

In [14]:
w2vmodel.save('w2v.model')

W0423 15:00:44.465139 140633082271488 smart_open_lib.py:379] this function is deprecated, use smart_open.open instead


In [57]:
print('Size / Length of Each Word Vector: %s'%w2vmodel.vector_size)

Size / Length of Each Word Vector: 100


In [59]:
print('Count of Distinct Events in the Journey of a Transaction: %s'%len(w2vmodel.wv.vocab))

Count of Distinct Events in the Journey of a Transaction: 27


#### Extract Event Embedding Matrix to be used in training of LSTM Network

In [None]:
embedding_matrix = np.zeros(shape = (len(w2vmodel.wv.vocab),w2vmodel.vector_size))

for i in range(len(w2vmodel.wv.vocab)):
    embedding_matrix[i] = w2vmodel.wv[w2vmodel.wv.index2word[i]]

#print(embedding_matrix.shape)
embedding_matrix[0:2]

### Data streaming pipeline for inputs to LSTM Network
- Delayed Function
- Delayed funciton output - X, y
- Rechunk the dask array to size == Batch Size to be used for Training Model (512 in current 
    scenario)
- Generator Function to initiate above computation graph and feed data in batch size of 512

In [18]:
def input_feed(series):
    X_temp , y_temp = [], []
    for event in series:
        for i in range(1, len(event)):
            temp_x = event[0:i]
            temp_x = [w2vmodel.wv.vocab.get(x).index for x in temp_x]
            #print(temp_x)
            X_temp.append(temp_x)
            temp_y = w2vmodel.wv.vocab.get(event[i]).index
            y_temp.append(temp_y)
    
    X_events = pad_sequences(X_temp,maxlen= 50)
    y_temp = da.array(y_temp)
    y = da.from_array(y_temp,chunks = {0:512})
    #print("Completed....")
    return (X_events,y)

In [20]:
input_feed_dask = delayed(input_feed) # Delayed function does lazy computation on data

In [21]:
a = (input_feed_dask)(event_grouped) # delayed output

In [22]:
X_events = a[0]
y_events = a[1]

In [23]:
X_events, y_events

(Delayed('getitem-a5720561b354d5e13fa9ff81d95d308e'),
 Delayed('getitem-5bbac8d59dc1f684b92b721666e1dba3'))

In [24]:
X_events_ar = da.from_delayed(X_events,dtype= float ,shape = (1202267, 50))
y_events = da.from_delayed(y_events,dtype = float, shape = (1202267,))

In [25]:
X_events_ar = X_events_ar.rechunk({0:512,1:-1})
y_events = y_events.rechunk({0:512})

In [26]:
chunkList = [(x,y) for x,y in zip(range(0,X_events_ar.shape[0]+X_events_ar.chunksize[0],
                    X_events_ar.chunksize[0]),
                    range(X_events_ar.chunksize[0],X_events_ar.shape[0]+X_events_ar.chunksize[0],
                                                    X_events_ar.chunksize[0]))]

In [27]:
len(chunkList)

2349

#### Generator function to stream batches to LSTM Network

In [29]:
def gen_inputs(x,y_array):
    for chunks in chunkList:
        X = x[chunks[0]:chunks[1]]
        X = X.map_blocks(np.copy)
        X = X.compute()
        y = y_array[chunks[0]:chunks[1]]
        y = y.map_blocks(np.copy)
        y = y.compute()
        y_transformed = np.zeros((len(y),27))
        y_transformed[np.arange(len(y)),y] = 1
        yield (X,y_transformed)

In [32]:
inp = gen_inputs(X_events_ar, y_events)

### LSTM Model Architecture

In [None]:
init = 'glorot_uniform'
model = Sequential()
model.add(Embedding(len(w2vmodel.wv.vocab),100,input_length= 50, 
                    weights = [embedding_matrix], trainable = False))
model.add(LSTM(100,implementation =2 , kernel_initializer = init, return_sequences = False))
model.add(Dense(50,kernel_initializer = init, activation = 'relu'))
model.add(Dense(27,kernel_initializer = init, activation = 'softmax' , name ="output"))
print(model.summary())
model.compile(loss ='categorical_crossentropy',optimizer= 'rmsprop', metrics =['accuracy'])

In [None]:
est = model.fit_generator(inp, steps_per_epoch= np.ceil(1202267/512),verbose =1
                         ,use_multiprocessing = True, workers = 6)

## Training Model by loading data in RAM

- Training is faster as ther is no overhead on CPU side to compute Dask graph for every input batch
- Drawback : with larger dataset RAM could run out of Memory

In [37]:
def input_feed(series):
    X_temp , y_temp = [], []
    for event in series:
        for i in range(1, len(event)):
            temp_x = event[0:i]
            temp_x = [w2vmodel.wv.vocab.get(x).index for x in temp_x]
            #print(temp_x)
            X_temp.append(temp_x)
            temp_y = w2vmodel.wv.vocab.get(event[i]).index
            y_temp.append(temp_y)
    
    X_events = pad_sequences(X_temp,maxlen= 50)
    y_temp = np.array(y_temp)
    #y = da.from_array(y_temp,chunks = {0:512})
    #print("Completed....")
    return (X_events,y_temp)

In [39]:
events = event_grouped.compute()

In [40]:
events.head()

Id
94215      [A_Create Application, W_Complete application,...
919303     [A_Create Application, A_Concept, W_Complete a...
2528658    [A_Create Application, A_Submitted, W_Handle l...
2595810    [A_Create Application, A_Concept, W_Complete a...
3108939    [A_Create Application, A_Concept, W_Complete a...
Name: event:concept:name, dtype: object

In [41]:
X_events, y_events = input_feed(events)

In [42]:
X_events.shape, y_events.shape

((1202267, 50), (1202267,))

In [45]:
y_transformed = np.zeros((len(y_events),27))
y_transformed[np.arange(len(y_events)),y_events] = 1

In [46]:
y_transformed.shape

(1202267, 27)

In [61]:
init = 'glorot_uniform'
model = Sequential()
model.add(Embedding(len(w2vmodel.wv.vocab),100,input_length= 50, 
                    weights = [embedding_matrix], trainable = False))
model.add(LSTM(100,implementation =2 , kernel_initializer = init, return_sequences = False))
model.add(Dense(50,kernel_initializer = init, activation = 'relu'))
model.add(Dense(27,kernel_initializer = init, activation = 'softmax' , name ="output"))
print(model.summary())
model.compile(loss ='categorical_crossentropy',optimizer= 'rmsprop', metrics =['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 100)           2700      
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                5050      
_________________________________________________________________
output (Dense)               (None, 27)                1377      
Total params: 89,527
Trainable params: 86,827
Non-trainable params: 2,700
_________________________________________________________________
None


In [62]:
est = model.fit(X_events,y_transformed, batch_size= 512 , epochs= 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
