0 Define input parameters

In [20]:
path_normative_model = "model1_1.decl"
path_auditor_model = "model1_2.decl"
path_real_model = "model1_3.decl"

1 Log generation

In [21]:
# Create synthetic event log 
import os
from Declare4Py.ProcessModels.DeclareModel import DeclareModel
from Declare4Py.ProcessMiningTasks.LogGenerator.ASP.ASPLogGenerator import AspGenerator

In [34]:
# import declarative model
declare_model: DeclareModel = DeclareModel().parse_from_file(os.path.join(path_real_model))


In [35]:
# set input parameters log generator
num_of_cases = 10000
(num_min_events, num_max_events) = (8,15)
verbose = True

In [36]:
# run generator
asp_gen: AspGenerator = AspGenerator(declare_model, num_of_cases, num_min_events, num_max_events, verbose=verbose)
asp_gen.run()

DEBUG:ASP generator:Distribution for traces UNIFORM
DEBUG:ASP generator:traces: 10000, events can have a trace min(8) max(15)
DEBUG:Distribution:Distribution() UNIFORM min_mu: 8 max_sigma: 15 num_traces: 10000 custom_prob: None
DEBUG:Distribution:Calculating Uniform Distribution
DEBUG:Distribution:Generating Uniform Probabilities since either distribution is uniform or custom probabilities are None
DEBUG:Distribution:Probabilities sum is 1
DEBUG:Distribution:Distribution result: [10  8  9 ... 11 14 10]
DEBUG:ASP generator:Prepared distribution of 10000 positive traces with distribution: Counter({14: 1318, 10: 1278, 12: 1260, 11: 1243, 15: 1238, 13: 1226, 9: 1225, 8: 1212})
DEBUG:ASP generator:Prepared distribution of 0 negative traces with distribution: Counter()
DEBUG:ASP generator:Generating positive Traces
DEBUG:ASP generator:Translate declare model to ASP
DEBUG:ASP generator:Declare model translated to ASP. Total Facts 10
DEBUG:ASP generator:ASP encoding generated
DEBUG:ASP generat

In [37]:
# Save file
model_name = 'M_TEST'
asp_gen.to_xes(f"{model_name}.xes")

exporting log, completed traces :: 100%|██████████| 10000/10000 [00:02<00:00, 4579.95it/s]


2 Conformance checking 

2.1 Check between event log and normative model (results in deviation log)

In [41]:
from Declare4Py.ProcessModels.DeclareModel import DeclareModel
from Declare4Py.D4PyEventLog import D4PyEventLog

In [43]:
log_path = os.path.join("M_TEST.xes")

In [44]:
# Load data
event_log = D4PyEventLog(case_name="case:concept:name")
event_log.parse_xes_log(log_path)

model_path = os.path.join(path_normative_model)
declare_model = DeclareModel().parse_from_file(model_path)

# Show constraints in normative model
model_constraints = declare_model.get_decl_model_constraints()
model_constraints

parsing log, completed traces :: 100%|██████████| 10000/10000 [00:02<00:00, 3615.08it/s]


['Init[g] | | |',
 'End[d] | | |',
 'Chain Precedence[c, f] | | |',
 'Alternate Response[h, f] | | |',
 'Precedence[h, b] | | |',
 'Responded Existence[j, f] | | |',
 'Chain Precedence[j, f] | | |',
 'Response[a, c] | | |',
 'Alternate Precedence[a, e] | | |',
 'Responded Existence[g, c] | | |']

In [45]:
from Declare4Py.ProcessMiningTasks.ConformanceChecking.MPDeclareAnalyzer import MPDeclareAnalyzer
from Declare4Py.ProcessMiningTasks.ConformanceChecking.MPDeclareResultsBrowser import MPDeclareResultsBrowser 

In [46]:
# conformance check between eventlog and normative model
basic_checker = MPDeclareAnalyzer(log=event_log, declare_model=declare_model, consider_vacuity=True)
conf_check_res: MPDeclareResultsBrowser = basic_checker.run()

In [54]:
# show results
df_norm = conf_check_res.get_metric(metric="state")

print("Conformant cases: " +  str(df_norm.loc[(df_norm==1).all(axis=1)].shape[0]) + 
      "\nNon-conformant cases: " + str(df_norm.loc[(df_norm==0).any(axis=1)].shape[0]))



Conformant cases: 3
Non-conformant cases: 9997


In [48]:
df_norm.to_csv("M_rules_norm_TEST.csv")

2.1 Check between deviation log and auditor model (results in anomaly log)

In [55]:
from Declare4Py.ProcessModels.DeclareModel import DeclareModel
from Declare4Py.ProcessMiningTasks.ConformanceChecking.MPDeclareAnalyzer import MPDeclareAnalyzer
from Declare4Py.ProcessMiningTasks.ConformanceChecking.MPDeclareResultsBrowser import MPDeclareResultsBrowser

In [56]:
model_path = os.path.join(path_auditor_model)
declare_model = DeclareModel().parse_from_file(model_path)

# Show constraints in auditor model
model_constraints = declare_model.get_decl_model_constraints()
model_constraints

['End[d] | | |',
 'Response[h, f] | | |',
 'Precedence[h, b] | | |',
 'Responded Existence[j, f] | | |',
 'Alternate Precedence[j, f] | | |',
 'Responded Existence[a, c] | | |',
 'Precedence[a, e] | | |',
 'Responded Existence[g, c] | | |']

In [57]:
# conformance check between deviation log and auditor model (deviation log = event log in this case, because all cases were non-conformant)
basic_checker = MPDeclareAnalyzer(log=event_log, declare_model=declare_model, consider_vacuity=True)
conf_check_res: MPDeclareResultsBrowser = basic_checker.run()

In [58]:
# show results
df_audit = conf_check_res.get_metric(metric="state")

print("Conformant cases: " +  str(df_norm.loc[(df_audit==1).all(axis=1)].shape[0]) + 
      "\nNon-conformant cases: " + str(df_norm.loc[(df_audit==0).any(axis=1)].shape[0]))

Conformant cases: 1655
Non-conformant cases: 8345


In [59]:
df_audit.to_csv("M_rules_auditor_TEST.csv")

3 Label deviations as Anomaly (1) or Exception (0)

In [60]:
import random
import numpy as np
import pandas as pd

In [62]:
df_norm = pd.read_csv("M_rules_norm_TEST.csv")
df_audit = pd.read_csv("M_rules_auditor_TEST.csv")

# log_path = os.path.join("M_eventlog.xes")
# event_log = D4PyEventLog(case_name="case:concept:name")
# event_log.parse_xes_log(log_path)

In [63]:
# Get deviations and anomalies
violations_auditor = df_audit[(df_audit == 0).any(axis='columns')]
violations_norm = df_norm[(df_norm == 0).any(axis='columns')]

# Generate lists to get indices
anomalies = violations_auditor.index.values.tolist() 
deviations = violations_norm.index.values.tolist() 
exceptions = np.setdiff1d(deviations, anomalies)

In [64]:
violations_auditor

Unnamed: 0.1,Unnamed: 0,End[d] | | |,"Response[h, f] | | |","Precedence[h, b] | | |","Responded Existence[j, f] | | |","Alternate Precedence[j, f] | | |","Responded Existence[a, c] | | |","Precedence[a, e] | | |","Responded Existence[g, c] | | |"
0,0,1,1,1,1,1,1,1,1
6,6,1,1,1,1,1,0,1,1
7,7,1,1,1,1,0,1,1,1
8,8,1,1,1,1,1,0,1,1
11,11,1,1,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...
9995,9995,1,1,1,1,0,0,1,1
9996,9996,1,1,1,1,1,0,1,1
9997,9997,1,1,1,1,0,1,1,1
9998,9998,1,0,1,1,0,1,1,1


In [74]:
len(deviations)

9997

In [65]:
event_log.to_dataframe()
df_log = event_log.get_log()

In [66]:
df_log

Unnamed: 0,concept:name,lifecycle:transition,time:timestamp,case:concept:name,case:label
0,a,complete,2024-05-22 18:04:19.882815+00:00,case_0,positive
1,c,complete,2024-05-22 19:14:47.882815+00:00,case_0,positive
2,c,complete,2024-05-22 19:57:48.882815+00:00,case_0,positive
3,c,complete,2024-05-22 21:29:30.882815+00:00,case_0,positive
4,c,complete,2024-05-22 22:45:02.882815+00:00,case_0,positive
...,...,...,...,...,...
115249,f,complete,2024-05-23 00:24:18.342162+00:00,case_9999,positive
115250,j,complete,2024-05-23 00:50:38.342162+00:00,case_9999,positive
115251,f,complete,2024-05-23 02:07:02.342162+00:00,case_9999,positive
115252,f,complete,2024-05-23 03:30:02.342162+00:00,case_9999,positive


In [77]:
# Deep copy log 
eventlog = df_log.copy(deep=True)

# Format case_id such that only integer value is shown
eventlog['case:concept:name'] = eventlog['case:concept:name'].str.replace('case_', '')
eventlog['case:concept:name'] = eventlog['case:concept:name'].astype(int)

In [78]:
eventlog

Unnamed: 0,concept:name,lifecycle:transition,time:timestamp,case:concept:name,case:label
0,a,complete,2024-05-22 18:04:19.882815+00:00,0,positive
1,c,complete,2024-05-22 19:14:47.882815+00:00,0,positive
2,c,complete,2024-05-22 19:57:48.882815+00:00,0,positive
3,c,complete,2024-05-22 21:29:30.882815+00:00,0,positive
4,c,complete,2024-05-22 22:45:02.882815+00:00,0,positive
...,...,...,...,...,...
115249,f,complete,2024-05-23 00:24:18.342162+00:00,9999,positive
115250,j,complete,2024-05-23 00:50:38.342162+00:00,9999,positive
115251,f,complete,2024-05-23 02:07:02.342162+00:00,9999,positive
115252,f,complete,2024-05-23 03:30:02.342162+00:00,9999,positive


In [79]:
# Add column that indicates whether case is deviation or not
eventlog['deviation'] = np.where(eventlog['case:concept:name'].isin(deviations), 1, 0)
len(eventlog)
# test = eventlog[eventlog['deviation'] == 0] # df should be empty 
# test

115254

In [80]:
# Delete traces without deviation ###################################
eventlog = eventlog[eventlog.deviation != 0]
len(eventlog)

115223

In [81]:
# Add column that indicates whether case is anomaly or not (this is our label)
eventlog['anomaly'] = np.where(eventlog['case:concept:name'].isin(anomalies), 1, 0)

# test = eventlog[eventlog['anomaly'] == 1]
# test

print(anomalies)

[0, 6, 7, 8, 11, 12, 13, 14, 16, 19, 21, 22, 23, 24, 27, 29, 30, 31, 34, 35, 36, 38, 39, 41, 43, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 59, 60, 62, 63, 65, 68, 69, 70, 71, 72, 75, 76, 78, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 123, 124, 125, 126, 127, 129, 133, 134, 135, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 153, 154, 156, 158, 159, 161, 162, 164, 165, 166, 167, 168, 170, 171, 172, 173, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 195, 196, 197, 198, 199, 203, 204, 205, 206, 207, 212, 213, 214, 215, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 234, 235, 236, 237, 238, 239, 241, 242, 243, 244, 245, 248, 249, 250, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 271, 272, 273, 274, 275, 276, 277, 278, 279, 282,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eventlog['anomaly'] = np.where(eventlog['case:concept:name'].isin(anomalies), 1, 0)


In [83]:
# Set input parameter: how many anomalies we want in the entire dataset
percent_anomalies = 0.1

In [94]:
# set number of anomalies and exceptions in entire dataset
num_anomalies = len(anomalies)
num_exceptions = len(exceptions)
num_deviations = num_anomalies + num_exceptions

# Check data imbalance
print("Number of anomalies: " + str(num_anomalies) + 
      "\nNumber of exceptions: " + str(num_exceptions))

adjusted_num_anomalies = round(percent_anomalies * num_exceptions)
adjusted_num_exceptions = num_exceptions - adjusted_num_anomalies

Number of anomalies: 8346
Number of exceptions: 1651


In [97]:
print("Adjusted number of anomalies: " + str(len(adjusted_anomalies)))
print("Adjusted number of exceptions: " + str(len(adjusted_exceptions)))

Adjusted number of anomalies: 165
Adjusted number of exceptions: 1486


In [98]:
# Get df with anomalies only
anomalies_df = eventlog[eventlog['anomaly']==1]
adjusted_anomalies = random.sample(sorted(anomalies), adjusted_num_anomalies)
anomalies_df['to_keep'] = np.where(anomalies_df['case:concept:name'].isin(adjusted_anomalies), 1, 0)

anomalies_df_new = anomalies_df[anomalies_df['to_keep']==1]
final_anomalies = anomalies_df_new.drop(['deviation', 'to_keep'],axis='columns')


# Balance data: reduce number of exceptions to equal number of anomalies
exceptions_df = eventlog[eventlog['anomaly']==0] 
adjusted_exceptions = random.sample(sorted(exceptions), adjusted_num_exceptions)
exceptions_df['to_keep'] = np.where(exceptions_df['case:concept:name'].isin(adjusted_exceptions), 1, 0)

exception_df_new = exceptions_df[exceptions_df['to_keep']==1]
final_exceptions = exception_df_new.drop(['deviation', 'to_keep'],axis='columns')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomalies_df['to_keep'] = np.where(anomalies_df['case:concept:name'].isin(adjusted_anomalies), 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exceptions_df['to_keep'] = np.where(exceptions_df['case:concept:name'].isin(adjusted_exceptions), 1, 0)


In [99]:
# Get labeled deviation log 
labeled_deviations = pd.concat([final_anomalies,final_exceptions])
labeled_deviations

labeled_deviations.to_csv("M_labeled_deviations_TEST.csv")


4 Transform event log to trace log

In [102]:
# import dataframe deviation log
# deviations = pd.read_csv("labeled_deviations_jupyter_imbalanced.csv")
deviations = labeled_deviations

# convert 'time:timestamp' to datetime
deviations['time:timestamp'] = pd.to_datetime(deviations['time:timestamp'],format="ISO8601")
deviations_sorted = deviations.groupby('case:concept:name').apply(lambda x: x.sort_values('time:timestamp')).reset_index(drop=True)

deviations_sorted

Unnamed: 0,concept:name,lifecycle:transition,time:timestamp,case:concept:name,case:label,anomaly
0,h,complete,2024-05-22 18:02:45.882815+00:00,1,positive,0
1,j,complete,2024-05-22 19:15:46.882815+00:00,1,positive,0
2,c,complete,2024-05-22 20:15:21.882815+00:00,1,positive,0
3,a,complete,2024-05-22 21:08:46.882815+00:00,1,positive,0
4,a,complete,2024-05-22 21:47:00.882815+00:00,1,positive,0
...,...,...,...,...,...,...
18177,j,complete,2024-05-22 23:55:22.342162+00:00,9992,positive,0
18178,b,complete,2024-05-23 01:13:07.342162+00:00,9992,positive,0
18179,j,complete,2024-05-23 01:53:46.342162+00:00,9992,positive,0
18180,f,complete,2024-05-23 03:01:28.342162+00:00,9992,positive,0


In [103]:
# transform event log to trace log
traces = deviations_sorted.groupby('case:concept:name').agg({
    'concept:name': lambda x: f"<{', '.join(x)}>",
    'anomaly': 'first'
    }).reset_index()
traces.columns = ['case:concept:name', 'trace', 'label']

traces

Unnamed: 0,case:concept:name,trace,label
0,1,"<h, j, c, a, a, c, d, j, f, d>",0
1,2,"<h, c, c, c, j, b, a, c, f, d>",0
2,3,"<h, c, j, a, c, b, e, j, f, d>",0
3,4,"<j, h, g, a, c, a, b, b, f, d>",0
4,5,"<a, h, b, g, c, a, a, j, f, d>",0
...,...,...,...
1646,9959,"<h, j, h, f, g, f, f, b, f, f, d>",1
1647,9964,"<c, j, f, h, b, j, j, h, i, f, d>",0
1648,9984,"<j, d, f, h, b, d, j, b, j, f, d>",0
1649,9991,"<h, j, h, f, f, f, f, c, f, f, d>",1


In [104]:
# merge trace log and results conformance checking
# merged_norm = pd.merge(left=traces, right=df_norm, how="left", left_on="case:concept:name", right_index=True)
merged_audit = pd.merge(left=traces, right=df_audit, how="left", left_on="case:concept:name", right_index=True)

In [105]:
merged_audit.sample(20)

Unnamed: 0.1,case:concept:name,trace,label,Unnamed: 0,End[d] | | |,"Response[h, f] | | |","Precedence[h, b] | | |","Responded Existence[j, f] | | |","Alternate Precedence[j, f] | | |","Responded Existence[a, c] | | |","Precedence[a, e] | | |","Responded Existence[g, c] | | |"
1478,8397,"<c, d, d, a, a, a, a, d, d, a, d, d, e, d>",0,8397,1,1,1,1,1,1,1,1
504,2492,"<a, h, c, j, c, a, b, f, d>",0,2492,1,1,1,1,1,1,1,1
442,2226,"<j, f, d, f, d, h, d, d>",1,2226,1,0,1,1,0,1,1,1
346,1834,"<h, c, c, j, d, d, f, d>",0,1834,1,1,1,1,1,1,1,1
43,163,"<a, a, h, h, c, j, f, j, d, d>",0,163,1,1,1,1,1,1,1,1
1337,7087,"<h, d, j, f, d, d, j, d, d, d, d, f, d>",0,7087,1,1,1,1,1,1,1,1
605,2765,"<j, j, j, j, h, f, d, c, d>",0,2765,1,1,1,1,1,1,1,1
34,112,"<c, j, h, h, d, d, d, d, f, d>",0,112,1,1,1,1,1,1,1,1
600,2726,"<h, b, h, h, d, j, f, d, d>",0,2726,1,1,1,1,1,1,1,1
478,2373,"<h, j, f, d, i, d, b, d>",0,2373,1,1,1,1,1,1,1,1


In [106]:
# export files 
# merged_norm.to_csv("M_TEST_results_normative_imbalanced.csv", index=False)  
merged_audit.to_csv("M_results_TEST__.csv", index=False)

5 Apply Machine Learning technique to classify deviating traces as anomaly (1) or exception (0)

In [107]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn import metrics

DEBUG:matplotlib:CACHEDIR=C:\Users\lucp10256\.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from C:\Users\lucp10256\.matplotlib\fontlist-v330.json


In [109]:
data = merged_audit

In [113]:
# get index of 'label' column
label_index = data.columns.get_loc('label')

# select all columns after 'label' column
features = data.iloc[:, label_index + 1:]

# split dataset in features and target variable
X = features 
y = data['label'] 

# split dataset in training and test set
train_test_perc = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_test_perc) 

In [122]:
# apply Decision Tree Classifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

# apply Decision Tree Classifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)


In [126]:
# metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)
auc_score = metrics.roc_auc_score(y_test, y_pred)

print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('F1-score: ', f1_score)
print('AUC-score: ', auc_score)

Accuracy:  0.9959677419354839
Precision:  1.0
Recall:  0.9629629629629629
F1-score:  0.9811320754716981
AUC-score:  0.9814814814814814


6 Save results

In [124]:
import time
import csv 

In [132]:
# Files
time_file = time.time()

result = {
    "num_constraints": [],
    "num_activities": [],
    "len_training_set": [],
    "len_test_set":[],
    "train_test_perc":[],
    "num_labeled_deviations":[],
    "perc_anomalies": [],
    "classifier":[],
    "precision": [],
    "recall": [],
    "F1_score": [],
    "accuracy": [],
    "AUC_score": [],
}
df1 = pd.DataFrame(result)
df1.to_csv("TEST_result_{0}.csv".format(time_file), sep=',',index=False)

In [None]:
# def save_results(filename):
#     file = open(filename, 'w',  encoding="utf-8") # overwrite if file already exists
#     output = Model.constraintlist.list_to_decl_extension(constraint_list, activities)
#     file.write(str(output))
#     file.close()

In [133]:
num_constraints = 0
num_activities = 0
len_training = len(y_train)
len_test = len(y_test)
num_labeled_deviations = len_training + len_test
classifier = "DT"

In [134]:
fields1 = [num_constraints, 
            num_activities, 
            len_training, 
            len_test,
            train_test_perc, 
            num_labeled_deviations, 
            percent_anomalies,
            classifier, 
            precision, 
            recall,
            f1_score,
            accuracy,
            auc_score]

with open(r"TEST_result_{0}.csv".format(time_file), 'a') as f:
    writer = csv.writer(f)
    writer.writerow(fields1)