In [1]:
import pandas as pd
import Pipeline as pm

## Load the Data

In [2]:
# Location of the BPIC13 event-log CSV, relative to the notebook's working directory.
DATASET_PATH = 'Data/Real_Datasets/bpic13-0.1-3.csv'

# Read the event log into a DataFrame using pandas' default parsing options.
df = pd.read_csv(DATASET_PATH)

In [3]:
# Inspect the raw event log exactly as it was loaded from the CSV file.
df

Unnamed: 0,case_id,event_position,name,timestamp,org:group,org:resource,resource country,anomaly,isAnomaly
0,1-147898401,0,Accepted+In Progress,00:36.0,Org line A2,Tomas,Sweden,Normal,0
1,1-147898401,1,Accepted+In Progress,05:44.0,Org line A2,Tomas,Sweden,Normal,0
2,1-165554831,0,Accepted+In Progress,06:25.0,Org line A2,Tomas,Sweden,Normal,0
3,1-172473423,0,Accepted+In Progress,21:54.0,Org line A2,Tomas,Sweden,Normal,0
4,1-182640781,0,Accepted+In Progress,21:05.0,Org line A2,Niklas,Sweden,Normal,0
...,...,...,...,...,...,...,...,...,...
2371,1-759255125,1,Accepted+Wait,21:56.0,Org line C,Ranveer,Sweden,Normal,0
2372,1-759255152,0,Accepted+In Progress,36:37.0,Org line A2,Hanna,POLAND,Normal,0
2373,1-759255152,1,Accepted+In Progress,46:31.0,Org line A2,Hanna,POLAND,Normal,0
2374,1-759318221,0,Accepted+In Progress,17:18.0,Org line A2,Ian,United Kingdom,Normal,0


## Encode the Data into numerical format

The data should be in a numerical format, and every value should lie in the range 0-1.

The encoder is able to leverage different encoding methods such as:
* *one-hot*
* *word2vec*
* *llm_chunked*: a model built on 'bert-base-uncased'
* *sentence-transformer*: a BERT-based model pre-trained on sentences

The encoder is designed to work on event logs containing string values, i.e. CSV rows (**events**) that can be grouped by a single key (**case_id**). The columns to encode are specified in **columns2encode**. Because the events are put in order, the **timestamp** column is essential for the encoder!

In [6]:
# Configure the encoder: word2vec embeddings of the 'name' column, with events
# grouped by 'case_id' and the 'timestamp' column passed as the timestamp.
encoder = pm.Encoder(
    method='word2vec',
    group_column='case_id',
    timestamp='timestamp',
    columns2encode=['name'],
)

# encode() returns an iterable; its first element is kept as `encoded`, and the
# remaining elements are collected into the throwaway list `_`.
# NOTE(review): the notebook text says values should lie within 0-1, yet the
# displayed `encoded` output contains negative values - confirm whether an
# additional scaling step is expected.
encoded, *_ = encoder.encode(df)

In [4]:
# Inspect the encoder output kept from the previous cell.
encoded

Unnamed: 0,case_id,0,1,2,3,4,5,6,7,8,...,3890,3891,3892,3893,3894,3895,3896,3897,3898,3899
0,1-147898401,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1-165554831,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1-172473423,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1-182640781,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1-230541365,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,1-759020352,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,1-759169644,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,1-759255125,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
817,1-759255152,-0.060084,0.035758,0.045838,0.081878,-0.003439,-0.046377,0.071399,0.075351,-0.076194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Apply the Anomaly Detection Approach

It is possible to choose between two types of anomaly detection approaches:
* *autoencoder*: finds anomalies based on reconstruction errors
* *iforest*: finds anomalies based on the path length within the decision trees

For the autoencoder, the parameters **epochs** (how many times the model views the data) and **learning_rate** (size of the correction taken in each batch) can be set.
In the loss analysis file you can find the predictions made by the model as well as the loss for every case.

In [5]:
# Keyword arguments shared by training and evaluation: cases are identified by
# 'case_id' and no label column is supplied.
case_args = {'group_column': 'case_id', 'label_column': None}

# Create the autoencoder-based detector and train it on the encoded data
# for 2 epochs with a learning rate of 1e-3.
anomalydetector = pm.AnomalyDetector(model_type='autoencoder')
anomalydetector.fit_train(encoded, epochs=2, learning_rate=1e-3, **case_args)

# evaluate() is unpacked into two values; the first is discarded and the
# second is kept as `loss_analysis`.
_, loss_analysis = anomalydetector.evaluate(encoded, **case_args)

Epoch 1/2
Epoch 2/2
Threshold_lower: 3.058299314448325e-05 and Threshold_upper: 0.00010920435935474707


In [7]:
# Inspect the second value returned by evaluate() in the previous cell.
loss_analysis

Unnamed: 0,case_id,loss,label
0,1-147898401,0.000073,1.0
1,1-165554831,0.000040,0.0
2,1-172473423,0.000040,0.0
3,1-182640781,0.000132,1.0
4,1-230541365,0.000109,1.0
...,...,...,...
814,1-759020352,0.000040,0.0
815,1-759169644,0.000040,0.0
816,1-759255125,0.000031,0.0
817,1-759255152,0.000046,0.0
