In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import pm4py
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Location of the BPI Challenge 2017 XES event log.
# Set the BPI2017_LOG_PATH environment variable to point at a local copy of
# the file; the original hard-coded path is kept as the fallback so existing
# setups behave exactly as before.
log_path = os.environ.get(
    'BPI2017_LOG_PATH',
    '/Users/nick/BPPSO/BPI Challenge 2017.xes',
)

# Parse the XES file into `log`, which every later cell reads.
log = pm4py.read_xes(log_path)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 31509/31509 [00:29<00:00, 1076.84it/s]


In [3]:
# The feature-extraction cell indexes `log` trace by trace and reads
# `.attributes` on each trace, so a log held as a pandas DataFrame is converted
# to a pm4py event log first. A log that is not a DataFrame is left untouched.
if isinstance(log, pd.DataFrame):
    log = pm4py.convert_to_event_log(log)

# Read the BPMN diagram from the working directory and translate it into a
# Petri net together with its initial and final markings.
bpmn_graph = pm4py.read_bpmn("BPMN.bpmn")
petri_net_parts = pm4py.convert_to_petri_net(bpmn_graph)
net, initial_marking, final_marking = petri_net_parts

# Quick sanity report on what was loaded.
# NOTE(review): the last two prints report the same transition count under
# different wording; one of them is redundant.
trace_count = len(log)
transition_count = len(net.transitions)
print(f"Loaded Log: {trace_count} traces")
print(f"Loaded Model: {transition_count} transitions")
print(f"Model loaded: {transition_count} transitions")

Loaded Log: 31509 traces
Loaded Model: 34 transitions
Model loaded: 34 transitions


In [4]:
# Align the log against the Petri net, starting from `initial_marking` and
# ending in `final_marking`. Each result exposes its sequence of moves under
# the 'alignment' key, which the feature-extraction cell walks through.
# This is the expensive step of the notebook: the recorded run below took
# roughly 12.5 minutes.
compute_alignments = pm4py.algo.conformance.alignments.petri_net.algorithm.apply
alignments = compute_alignments(log, net, initial_marking, final_marking)

print(f"Computed {len(alignments)} alignments.")

aligning log, completed variants :: 100%|██████████| 15930/15930 [12:35<00:00, 21.08it/s] 

Computed 31509 alignments.





In [5]:
def _decision_features(case_attributes, last_activity, history_counts):
    """Build the feature dict describing the state just before one decision.

    Args:
        case_attributes: Trace-level features shared by every decision of the
            case (here 'RequestedAmount' and 'LoanGoal').
        last_activity: Label of the most recent model move, or "START" when
            no model move has happened yet in this trace.
        history_counts: Mapping from activity label to the number of times it
            has occurred so far in this trace.

    Returns:
        A new dict with 'last_activity', the case attributes, and one
        ``past_<activity>`` entry per activity seen so far.
    """
    features = {'last_activity': last_activity, **case_attributes}
    for activity, count in history_counts.items():
        features[f"past_{activity}"] = count
    return features


# One feature dict (X_raw) and one target label (y_raw) per decision point.
X_raw = []
y_raw = []

# Alignments are paired with traces by position, so the two sequences must
# have the same length; fail loudly instead of mis-pairing or truncating.
assert len(alignments) == len(log), "expected exactly one alignment per trace"

for original_trace, alignment in zip(log, alignments):
    case_attributes = {
        'RequestedAmount': float(original_trace.attributes.get('RequestedAmount', 0)),
        'LoanGoal': str(original_trace.attributes.get('LoanGoal', 'Unknown')),
    }

    # Running state of the trace. The counts are updated incrementally rather
    # than recounted from the full history at every step, which keeps the work
    # per decision proportional to the number of distinct activities only.
    last_activity = "START"
    history_counts = {}

    for step in alignment['alignment']:
        model_move = step[1]

        # Only steps with a real label on the model side become decisions.
        # NOTE(review): ">>" presumably marks "no move on the model side" and
        # None presumably a silent (unlabelled) transition; confirm against
        # the pm4py alignment documentation.
        if model_move == ">>" or model_move is None:
            continue

        # Features describe the state *before* the move; the move itself is
        # the label to predict.
        X_raw.append(_decision_features(case_attributes, last_activity, history_counts))
        y_raw.append(model_move)

        history_counts[model_move] = history_counts.get(model_move, 0) + 1
        last_activity = model_move

    # Adding the "End" decision point to explicitly model process termination
    X_raw.append(_decision_features(case_attributes, last_activity, history_counts))
    y_raw.append('__END__')

print(f"Extracted {len(X_raw)} decisions (including Ends).")

Extracted 1000470 decisions (including Ends).


In [7]:
# Step 1 - encode the list of feature dicts as a dense numeric matrix and the
# labels as a NumPy array. The fitted vectorizer is kept because it is saved
# alongside the model in the final cell.
vectorizer = DictVectorizer(sparse=False)
X_matrix = vectorizer.fit_transform(X_raw)
y_vector = np.asarray(y_raw)

# Step 2 - hold out 20% of the decision rows for evaluation (fixed seed).
# NOTE(review): the split is per row, and each trace contributes many rows, so
# decisions from the same trace can land in both train and test. The reported
# accuracy may therefore be optimistic; a split grouped by trace would be the
# stricter evaluation.
split_parts = train_test_split(
    X_matrix,
    y_vector,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Step 3 - fit the random forest on the training portion.
forest_params = dict(n_estimators=100, max_depth=15, random_state=42)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

# Step 4 - score the held-out rows.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {acc:.4f}")

Model Accuracy: 0.8987


In [None]:
# Persist the trained classifier together with the fitted vectorizer: both are
# needed to turn a new feature dict into a prediction. Files are written to
# the current working directory.
artifacts = (
    (clf, 'decision_model.pkl'),
    (vectorizer, 'feature_vectorizer.pkl'),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("Files saved: 'decision_model.pkl' and 'feature_vectorizer.pkl'")

Files saved: 'decision_model.pkl' and 'feature_vectorizer.pkl'
