In [15]:
import pandas as pd
import numpy as np
from datetime import date
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
def printmd(string):
    display(Markdown(string))

In [17]:
fn = 'src/process-mining-tutorial/ArtificialPatientTreatment.csv'
events = pd.read_csv(fn)

events.columns = ['patient', 'action', 'resource', 'datetime']
events['datetime'] = pd.to_datetime(events['datetime'])
events.head()

Unnamed: 0,patient,action,resource,datetime
0,patient 0,First consult,Dr. Anna,2017-01-02 11:40:11
1,patient 0,Blood test,Lab,2017-01-02 12:47:33
2,patient 0,Physical test,Nurse Jesse,2017-01-02 12:53:50
3,patient 0,Second consult,Dr. Anna,2017-01-02 16:21:06
4,patient 0,Surgery,Dr. Charlie,2017-01-05 13:23:09


In [18]:
 ## Get the case start times to get the time deltas for the 'age' of each activity with respect to start
case_starts_ends = events.pivot_table(index='patient', aggfunc={'datetime': ['min', 'max']})
case_starts_ends = case_starts_ends.reset_index()
case_starts_ends.columns = ['patient', 'caseend', 'casestart']
events = events.merge(case_starts_ends, on='patient')
events['relativetime'] = events['datetime'] - events['casestart']
events.head()

Unnamed: 0,patient,action,resource,datetime,caseend,casestart,relativetime
0,patient 0,First consult,Dr. Anna,2017-01-02 11:40:11,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 00:00:00
1,patient 0,Blood test,Lab,2017-01-02 12:47:33,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:07:22
2,patient 0,Physical test,Nurse Jesse,2017-01-02 12:53:50,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:13:39
3,patient 0,Second consult,Dr. Anna,2017-01-02 16:21:06,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 04:40:55
4,patient 0,Surgery,Dr. Charlie,2017-01-05 13:23:09,2017-01-09 08:29:28,2017-01-02 11:40:11,3 days 01:42:58


In [19]:
events['action'] = events['action'].apply(lambda x: x.strip())

In [20]:
delimiter = '___'

makeEventString = lambda x: delimiter.join(x)
makeEventString.__name__ = 'makeEventString'

numEvents = lambda x: len(x)
numEvents.__name__ = 'numEvents'


caselogs = events.pivot_table(index='patient', aggfunc={'action': [makeEventString, numEvents]})
caselogs = caselogs.reset_index()
caselogs.columns = ['patient', 'action_sequence', 'numactions']

events = pd.merge(events, caselogs, on='patient')
events['caselength'] = events['caseend'] - events['casestart']

events.head()

Unnamed: 0,patient,action,resource,datetime,caseend,casestart,relativetime,action_sequence,numactions,caselength
0,patient 0,First consult,Dr. Anna,2017-01-02 11:40:11,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 00:00:00,First consult___Blood test___Physical test___S...,6,6 days 20:49:17
1,patient 0,Blood test,Lab,2017-01-02 12:47:33,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:07:22,First consult___Blood test___Physical test___S...,6,6 days 20:49:17
2,patient 0,Physical test,Nurse Jesse,2017-01-02 12:53:50,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:13:39,First consult___Blood test___Physical test___S...,6,6 days 20:49:17
3,patient 0,Second consult,Dr. Anna,2017-01-02 16:21:06,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 04:40:55,First consult___Blood test___Physical test___S...,6,6 days 20:49:17
4,patient 0,Surgery,Dr. Charlie,2017-01-05 13:23:09,2017-01-09 08:29:28,2017-01-02 11:40:11,3 days 01:42:58,First consult___Blood test___Physical test___S...,6,6 days 20:49:17


In [21]:
## Get day of week
events['weekday'] = events['datetime'].apply(lambda x: x.weekday())
events['date'] = events['datetime'].apply(lambda x: x.date())
events['startdate'] = events['casestart'].apply(lambda x: x.date())
events['hour'] = events['datetime'].apply(lambda x: x.time().hour)
## Get relative times in more friendly terms
events['relativetime_s'] = events['relativetime'].dt.seconds + 86400*events['relativetime'].dt.days
events['relativedays'] = events['relativetime'].dt.days

In [22]:
 events.head()

Unnamed: 0,patient,action,resource,datetime,caseend,casestart,relativetime,action_sequence,numactions,caselength,weekday,date,startdate,hour,relativetime_s,relativedays
0,patient 0,First consult,Dr. Anna,2017-01-02 11:40:11,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 00:00:00,First consult___Blood test___Physical test___S...,6,6 days 20:49:17,0,2017-01-02,2017-01-02,11,0,0
1,patient 0,Blood test,Lab,2017-01-02 12:47:33,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:07:22,First consult___Blood test___Physical test___S...,6,6 days 20:49:17,0,2017-01-02,2017-01-02,12,4042,0
2,patient 0,Physical test,Nurse Jesse,2017-01-02 12:53:50,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 01:13:39,First consult___Blood test___Physical test___S...,6,6 days 20:49:17,0,2017-01-02,2017-01-02,12,4419,0
3,patient 0,Second consult,Dr. Anna,2017-01-02 16:21:06,2017-01-09 08:29:28,2017-01-02 11:40:11,0 days 04:40:55,First consult___Blood test___Physical test___S...,6,6 days 20:49:17,0,2017-01-02,2017-01-02,16,16855,0
4,patient 0,Surgery,Dr. Charlie,2017-01-05 13:23:09,2017-01-09 08:29:28,2017-01-02 11:40:11,3 days 01:42:58,First consult___Blood test___Physical test___S...,6,6 days 20:49:17,3,2017-01-05,2017-01-02,13,265378,3


In [23]:
#!pip install pm4py --user
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.importer.xes import importer as xes_importer

# process mining 
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery

# viz
from pm4py.visualization.petrinet import visualizer as pn_visualizer
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer
from pm4py.visualization.dfg import visualizer as dfg_visualization

# misc 
from pm4py.objects.conversion.process_tree import converter as pt_converter

In [24]:
eventlog = events.copy()
### Specify which columns correspond to case (case:concept:name), 
###event (concept:name) and timestamp (time:timestamp) - rename columns in accordance
###with pm4py

eventlog.rename(columns={'datetime': 'time:timestamp', 'patient': 'case:concept:name', 'action': 'concept:name', 'resource': 'org:resource'}, inplace=True)

## Convert to log format
log = log_converter.apply(eventlog)


In [26]:
from pm4py.objects.log.util import get_log_representation

data, feature_names = get_log_representation.get_default_representation(log)
print(feature_names)

['event:concept:name@Blood test', 'event:concept:name@Final consult', 'event:concept:name@First consult', 'event:concept:name@Medicine', 'event:concept:name@Physical test', 'event:concept:name@Second consult', 'event:concept:name@Surgery', 'event:concept:name@X-ray scan', 'event:relativetime_s', 'event:relativedays', 'event:hour', 'event:numactions', 'event:weekday']


In [27]:
data

array([[ 1,  1,  1, ...,  8,  6,  0],
       [ 1,  1,  1, ..., 16,  7,  4],
       [ 1,  1,  1, ..., 11,  7,  1],
       ...,
       [ 1,  1,  1, ..., 14,  7,  0],
       [ 1,  1,  1, ..., 14,  7,  2],
       [ 1,  1,  1, ..., 12,  7,  2]])

In [28]:
import pandas as pd
df = pd.DataFrame(data, columns=feature_names)
df

Unnamed: 0,event:concept:name@Blood test,event:concept:name@Final consult,event:concept:name@First consult,event:concept:name@Medicine,event:concept:name@Physical test,event:concept:name@Second consult,event:concept:name@Surgery,event:concept:name@X-ray scan,event:relativetime_s,event:relativedays,event:hour,event:numactions,event:weekday
0,1,1,1,0,1,1,1,0,593357,6,8,6,0
1,1,1,1,1,1,1,0,1,359926,4,16,7,4
2,1,1,1,1,1,1,0,1,1129919,13,11,7,1
3,1,1,1,1,1,1,0,1,1026368,11,13,7,1
4,1,1,1,1,1,1,0,1,1196574,13,8,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,1,1,0,1,1,1,0,954266,11,10,6,4
96,1,1,1,1,1,1,0,1,799596,9,16,7,2
97,1,1,1,1,1,1,0,1,453235,5,14,7,0
98,1,1,1,0,1,1,1,1,1221565,14,14,7,2


In [29]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
df2 = pd.DataFrame(pca.fit_transform(df))

In [30]:
df2

Unnamed: 0,0,1,2,3,4
0,234099.440006,-5.056761,-1.348567,1.783601,0.378244
1,467530.440037,2.834607,1.321287,-0.172491,-0.183911
2,-302462.560028,-0.447830,-0.713862,-0.340515,0.079107
3,-198911.560007,1.192413,-1.001870,-0.317632,0.163057
4,-369117.560032,-3.424009,-1.322569,-0.414187,0.097664
...,...,...,...,...,...
95,-126809.560018,-1.585827,2.512814,1.221182,0.672444
96,27860.440009,3.737435,-0.462293,-0.171639,0.058702
97,374221.440024,0.613836,-2.340869,0.220279,-0.272090
98,-394108.560025,2.861815,0.130340,0.804980,-0.345579


In [31]:
from sklearn.ensemble import IsolationForest
model=IsolationForest()
model.fit(df2)
df2["scores"] = model.decision_function(df2)

In [32]:
df2["@@index"] = df2.index
df2 = df2[["scores", "@@index"]]
df2 = df2.sort_values("scores")
df2

Unnamed: 0,scores,@@index
0,-0.142624,0
40,-0.122759,40
15,-0.105262,15
25,-0.097450,25
95,-0.086860,95
...,...,...
41,0.091552,41
34,0.092615,34
13,0.096559,13
53,0.097770,53
