In [1]:
import pandas as pd
import numpy as np
import os
import pm4py

In [2]:
df_small = pd.read_csv(os.getcwd() + '/small3insert.csv')

In [3]:
df_small = df_small.rename(columns={"Case": "caseid", "Activity": "activity", "Timestamp": "ts", "Resource": "resource", "resource_anomaly_type": "outcome"})

In [4]:
event_log = df_small[["caseid","activity","ts", "resource", "outcome"]]

In [None]:
# Convert 'ts' to datetime format
event_log['ts'] = pd.to_datetime(event_log['ts'])

In [6]:
event_log

Unnamed: 0,caseid,activity,ts,resource,outcome
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal
1,case_0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
2,case_0,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal
3,case_0,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,normal
4,case_0,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal
...,...,...,...,...,...
45842,case_999,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal
45843,case_999,Activity K,1970-01-01 14:00:00,Resource_Group1_res_1,normal
45844,case_999,Activity Q,1970-01-01 15:00:00,Resource_Group0_res_0,normal
45845,case_999,Activity L,1970-01-01 16:00:00,Resource_Group0_res_0,normal


In [None]:
# Use the `replace()` function to remove commas
event_log['caseid'] = event_log['caseid'].replace('case_', '', regex=True)

In [None]:
# Convert the column to floats
event_log['caseid'] = event_log['caseid'].astype(int)

In [9]:
event_log

Unnamed: 0,caseid,activity,ts,resource,outcome
0,0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal
1,0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
2,0,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal
3,0,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,normal
4,0,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal
...,...,...,...,...,...
45842,999,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal
45843,999,Activity K,1970-01-01 14:00:00,Resource_Group1_res_1,normal
45844,999,Activity Q,1970-01-01 15:00:00,Resource_Group0_res_0,normal
45845,999,Activity L,1970-01-01 16:00:00,Resource_Group0_res_0,normal


In [10]:
def extract_prefixes(event_log, L):
    # Group the event log by caseid
    grouped_by_case = event_log.groupby('caseid')

    # Filter cases with at least L events
    valid_cases = grouped_by_case.filter(lambda x: len(x) >= L)

    # Extract the prefixes of length L
    prefixes = valid_cases.groupby('caseid').head(L)

    return prefixes

In [11]:
L = 4  # Replace with your value of L
prefixes = extract_prefixes(event_log, L)

In [12]:
prefixes

Unnamed: 0,caseid,activity,ts,resource,outcome
0,0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal
1,0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
2,0,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal
3,0,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,normal
7,1,Activity A,1970-01-01 09:00:00,Resource_Group2_res_0,insert
...,...,...,...,...,...
45831,998,Activity D,1970-01-01 12:00:00,Resource_Group2_res_1,normal
45838,999,Activity A,1970-01-01 09:00:00,Resource_Group2_res_0,normal
45839,999,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
45840,999,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal


In [13]:
prefixes['ts']

0       1970-01-01 09:00:00
1       1970-01-01 10:00:00
2       1970-01-01 11:00:00
3       1970-01-01 12:00:00
7       1970-01-01 09:00:00
                ...        
45831   1970-01-01 12:00:00
45838   1970-01-01 09:00:00
45839   1970-01-01 10:00:00
45840   1970-01-01 11:00:00
45841   1970-01-01 12:00:00
Name: ts, Length: 20000, dtype: datetime64[ns]

In [14]:
def aggregate_encoding(prefix):
    prefix['ts'] = prefix['ts'].astype('int64')
    # Aggregate "Activity" and "Resource" using frequency
    activity_counts = prefix['activity'].value_counts(normalize=True).to_dict()
    resource_counts = prefix['resource'].value_counts(normalize=True).to_dict()

    # Aggregate "Timestamp" using average
    avg_timestamp = prefix['ts'].mean()

    # Create a dictionary to store the aggregated values
    aggregated_values = {
        'activity_frequency': activity_counts,
        'resource_frequency': resource_counts,
        'avg_timestamp': avg_timestamp
    }

    return aggregated_values


In [15]:
aggregated_values = aggregate_encoding(prefixes)

In [16]:
aggregated_values

{'activity_frequency': {'Activity A': 0.25,
  'Activity B': 0.2482,
  'Activity C': 0.2432,
  'Activity D': 0.23945,
  'Activity S': 0.00185,
  'Activity L': 0.0016,
  'Activity R': 0.00155,
  'Activity Q': 0.00155,
  'Activity O': 0.0015,
  'Activity P': 0.00145,
  'Activity G': 0.0013,
  'Activity I': 0.00125,
  'Activity T': 0.00125,
  'Activity N': 0.0012,
  'Activity J': 0.0012,
  'Activity M': 0.0012,
  'Activity H': 0.00115,
  'Activity K': 0.0011},
 'resource_frequency': {'Resource_Group0_res_0': 0.25615,
  'Resource_Group2_res_1': 0.16645,
  'Resource_Group2_res_0': 0.1626,
  'Resource_Group2_res_2': 0.1617,
  'Resource_Group1_res_1': 0.127,
  'Resource_Group1_res_0': 0.1261},
 'avg_timestamp': 37699142750000.0}

In [17]:
def index_based_encoding(prefix):
    # One-hot encode "Activity" and "Resource"
    activity_encoded = pd.get_dummies(prefix['activity'], prefix='activity', drop_first=True).astype(int)
    resource_encoded = pd.get_dummies(prefix['resource'], prefix='resource', drop_first=True).astype(int)

    # Concatenate the encoded columns with the original DataFrame
    encoded_prefix = pd.concat([prefix, activity_encoded, resource_encoded], axis=1)

    # Drop the original "Activity" and "Resource" columns
    encoded_prefix = encoded_prefix.drop(['activity', 'resource'], axis=1)

    # Encode "outcome" column
    encoded_prefix['outcome'] = (encoded_prefix['outcome'] == 'normal').astype(int)

    return encoded_prefix


In [18]:
encoded_prefix = index_based_encoding(prefixes)

In [19]:
encoded_prefix

Unnamed: 0,caseid,ts,outcome,activity_Activity B,activity_Activity C,activity_Activity D,activity_Activity G,activity_Activity H,activity_Activity I,activity_Activity J,...,activity_Activity P,activity_Activity Q,activity_Activity R,activity_Activity S,activity_Activity T,resource_Resource_Group1_res_0,resource_Resource_Group1_res_1,resource_Resource_Group2_res_0,resource_Resource_Group2_res_1,resource_Resource_Group2_res_2
0,0,32400000000000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,36000000000000,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,39600000000000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,43200000000000,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,1,32400000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45831,998,43200000000000,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
45838,999,32400000000000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
45839,999,36000000000000,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45840,999,39600000000000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def train_decision_tree(event_log, L):
    # Extract prefixes of length L
    prefixes = extract_prefixes(event_log, L)
    prefixes['ts'] = prefixes['ts'].astype('int64')

    # Index-based encoding for prefixes
    encoded_prefixes = index_based_encoding(prefixes)

    # Split data into features (X) and target variable (y)
    X = encoded_prefixes.drop('outcome', axis=1)
    y = encoded_prefixes['outcome']

    # Split the data into 70% training and 30% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Initialize the Decision Tree Classifier
    dt_classifier = DecisionTreeClassifier(random_state=42, max_depth=5)

    # Train the Decision Tree model
    dt_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = dt_classifier.predict(X_test)

    # Evaluate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    return dt_classifier, accuracy

In [21]:
# Train Decision Tree for L=2
dt_model_L2, accuracy_L2 = train_decision_tree(event_log, L=2)
print(f"Decision Tree Accuracy for L=2: {accuracy_L2}")

# Train Decision Tree for L=5
dt_model_L5, accuracy_L5 = train_decision_tree(event_log, L=5)
print(f"Decision Tree Accuracy for L=5: {accuracy_L5}")

Decision Tree Accuracy for L=2: 0.8733333333333333
Decision Tree Accuracy for L=5: 0.8746666666666667


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_random_forest(event_log, L, max_depth=None, n_estimators=100, test_size=0.3, random_state=42):
    # Extract prefixes of length L
    prefixes = extract_prefixes(event_log, L)
    prefixes['ts'] = prefixes['ts'].astype('int64')

    # Index-based encoding for prefixes
    encoded_prefixes = index_based_encoding(prefixes)

    # Split data into features (X) and target variable (y)
    X = encoded_prefixes.drop('outcome', axis=1)
    y = encoded_prefixes['outcome']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize the Random Forest classifier
    rf_classifier = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=random_state)

    # Train the Random Forest model
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    return rf_classifier, accuracy

In [25]:
# Train RF for L=2
rf_model_L2, rf_accuracy_L2 = train_random_forest(event_log, L=2, max_depth=5, n_estimators=100, test_size=0.3, random_state=42)
print(f"Random Forest Accuracy for L=2: {rf_accuracy_L2}")

# Train RF for L=5
rf_model_L2, rf_accuracy_L5 = train_random_forest(event_log, L=5, max_depth=5, n_estimators=100, test_size=0.3, random_state=42)
print(f"Random Forest Accuracy for L=5: {rf_accuracy_L5}")

Random Forest Accuracy for L=2: 0.8733333333333333
Random Forest Accuracy for L=5: 0.8765333333333334
