## Desirability of MP-Declare constraint violations w.r.t. costs

### Importing a log (P2P Process BPIC'19)

In [1]:
import operator
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import pm4py
import seaborn as sns
from tqdm import tqdm

from process_atoms.mine.declare.enums.mp_constants import Template
from process_atoms.mine.declare.regexchecker import RegexChecker
from process_atoms.models.column_types import (
    CaseID,
    Categorical,
    EventType,
    EventTime,
    Continuous,
)
from process_atoms.models.event_log import EventLog, EventLogSchemaTypes
from process_atoms.processatoms import ProcessAtoms

def penalty(duration, slack, amount, percentage):
    """Return the cost charged for a delay.

    The delay ``duration`` is measured in units of ``slack``; every such unit
    is charged ``percentage`` of ``amount``, i.e. the result is
    ``(duration / slack) * percentage * amount``.
    """
    slack_periods = duration / slack
    return slack_periods * percentage * amount

# Case-level columns that are typed as Categorical, listed in the order in
# which they appear in the case schema.
_categorical_case_columns = [
    "(case) Company",
    "(case) Document Type",
    "(case) GR-Based Inv. Verif.",
    "(case) Goods Receipt",
    "(case) Item",
    "(case) Item Category",
    "(case) Item Type",
    "(case) Name",
    "(case) Purch. Doc. Category name",
    "(case) Purchasing Document",
    "(case) Source",
    "(case) Spend area text",
    "(case) Spend classification text",
    "(case) Sub spend area text",
    "(case) Vendor",
]

schema = EventLogSchemaTypes(
    # case-level attributes: the case id, the categorical columns, the net worth
    cases={
        "Case ID": CaseID,
        **{column: Categorical for column in _categorical_case_columns},
        "Cumulative net worth (EUR)": Continuous,
    },
    # event-level attributes
    events={
        "Case ID": CaseID,
        "Activity": EventType,
        "Complete Timestamp": EventTime,
        "Resource": Categorical,
    },
)

# Location of the exported log and the name under which the process is handled.
LOG_PATH = "data/BPI_Challenge_2019.csv"
PROCESS = "BPIC_19"

# Load the complete log; the event timestamp column is parsed as datetimes.
log = pd.read_csv(LOG_PATH, parse_dates=["Complete Timestamp"])

# Case table: the case-level columns, reduced to one row per case id.
case_columns = list(schema.cases.keys())
df_cases = log[case_columns].drop_duplicates(subset="Case ID")

# Event table: every row of the log, restricted to the event-level columns.
event_columns = list(schema.events.keys())
df_events = log[event_columns]

# Bundle both tables and the schema into the event log object.
event_log = EventLog(df_cases, df_events, schema)

### Create the process model by mining declarative constraints from the log 

In [2]:
# Restrict mining to the Response and Precedence templates.
considered_templates = [Template.RESPONSE.templ_str, Template.PRECEDENCE.templ_str]

# Minimum support handed to the miner. The previous expression
# (0.1 * len(event_log)) / len(event_log) reduces to this value but failed on
# an empty log and added floating-point rounding.
# NOTE(review): presumably a relative share of cases — confirm against the miner.
MIN_SUPPORT = 0.1

api = ProcessAtoms()

# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
atoms = api.mine_atoms_from_log(
    PROCESS,
    event_log,
    considered_templates,
    min_support=MIN_SUPPORT,
    local=True,
    consider_vacuity=False,
)
end_time = time.perf_counter()

delta = end_time - start_time

print(f"Atom mining took {delta} seconds.")
def atoms_to_df(atoms):
    """Tabulate mined atoms, ordered by descending confidence.

    Each atom must expose ``atom_type``, ``operands`` (at least one entry),
    ``support`` and an ``attributes`` mapping containing ``"confidence"``.

    Returns a DataFrame with the columns ``type``, ``op_0``, ``op_1``,
    ``support`` and ``confidence``. ``op_1`` is the empty string for atoms
    that have only one operand. Rows keep the positional index of the atom in
    ``atoms`` as their index label. An empty ``atoms`` yields an empty frame
    with the same columns.
    """
    columns = ["type", "op_0", "op_1", "support", "confidence"]
    records = [
        {
            "type": atom.atom_type,
            "op_0": atom.operands[0],
            # A second operand only exists when there are at least two.
            "op_1": atom.operands[1] if len(atom.operands) > 1 else "",
            "support": atom.support,
            "confidence": atom.attributes["confidence"],
        }
        for atom in atoms
    ]
    # Fixing the columns keeps sort_values working when there are no records.
    return pd.DataFrame(records, columns=columns).sort_values(
        by="confidence", ascending=False
    )
# Number of nanoseconds in one day.
# NOTE(review): assumes activity_pair_durations reports nanoseconds — confirm.
NS_PER_DAY = (24 * 3600) * 10**9


def _pair_durations_scaled(row):
    """Return the durations between the row's two operands, divided by NS_PER_DAY."""
    raw_durations = event_log.activity_pair_durations(a=row["op_0"], b=row["op_1"])
    return [value / NS_PER_DAY for value in raw_durations]


atoms_df = atoms_to_df(atoms)
# One list of scaled durations per atom (row).
atoms_df["durations"] = atoms_df.apply(_pair_durations_scaled, axis=1)

100%|██████████| 27/27 [00:01<00:00, 17.99it/s]


Atom mining took 1.8090260028839111 seconds.


### Select three constraints and restrict the time allowed between their two activities

In [3]:
# Index labels in atoms_df of the constraints under study.
constraints = [3, 7, 16]

constraints_dict = {}
for con in constraints:
    constraints_dict[con] = {
        'type': atoms_df.loc[con, 'type'],
        'op_0': atoms_df.loc[con, 'op_0'],
        'op_1': atoms_df.loc[con, 'op_1'],
        # between op_0 and op_1, a maximum of 90 days should pass
        'required_time': 90,
        # for every 7 days after that, 5% are added to the cost
        'cost_function': [7, 0.05],
    }
constraints_dict

{3: {'type': 'Precedence',
  'op_0': 'Create Purchase Order Item',
  'op_1': 'Clear Invoice',
  'required_time': 90,
  'cost_function': [7, 0.05]},
 7: {'type': 'Precedence',
  'op_0': 'Vendor creates invoice',
  'op_1': 'Clear Invoice',
  'required_time': 90,
  'cost_function': [7, 0.05]},
 16: {'type': 'Response',
  'op_0': 'Record Invoice Receipt',
  'op_1': 'Clear Invoice',
  'required_time': 90,
  'cost_function': [7, 0.05]}}

### Quantify the impact of each deviation based on the cost function

In [4]:
# Net worth per case, indexed by case id. Built once up front so the penalty
# loop does not rescan the whole case table for every case; keeping only the
# first row per case id mirrors a first-match lookup.
net_worth_by_case = event_log.cases.drop_duplicates(subset="Case ID").set_index(
    "Case ID"
)["Cumulative net worth (EUR)"]

for con in constraints:
    spec = constraints_dict[con]

    # Sorted durations stored in atoms_df for this constraint.
    # NOTE(review): not read again in this cell; kept in case later cells use it.
    durations = sorted(
        atoms_df.loc[
            (atoms_df["type"] == spec['type'])
            & (atoms_df["op_0"] == spec['op_0'])
            & (atoms_df["op_1"] == spec['op_1']),
            "durations",
        ].values[0]
    )

    # Find the mined atom that corresponds to this constraint (last match wins).
    the_atom = None
    for atom in atoms:
        if atom.atom_type == spec['type'] and atom.operands == [
            spec['op_0'],
            spec['op_1'],
        ]:
            the_atom = atom
    if the_atom is None:
        # Fail here with a clear message instead of deep inside the checker.
        raise ValueError(
            f"No mined atom matches constraint {con}: "
            f"{spec['type']}({spec['op_0']}, {spec['op_1']})"
        )

    # Keep only the cases whose variant satisfies the atom.
    checker = RegexChecker(PROCESS, event_log)
    activities = checker.log.unique_activities()
    activity_map = checker._map_activities_to_letters(activities)
    variant_frame = checker.create_variant_frame_from_log(activity_map)
    variant_frame["sat"] = checker.compute_satisfaction(
        the_atom, variant_frame, activity_map, consider_vacuity=False
    )
    variant_frame = variant_frame[variant_frame["sat"]]
    case_ids = set(val for cases in variant_frame["case_ids"].values for val in cases)

    # Check the time restriction for every remaining case.
    # NOTE(review): presumably "max" / "d" mean a maximum duration in days and
    # the result is the amount of the violation (falsy when none) — confirm.
    metric_satisfaction = dict()
    for case_id in tqdm(case_ids):
        metric_satisfaction[case_id] = checker.check_time_constraint_violation(
            case_id, the_atom, "max", spec['required_time'], "d"
        )

    # Turn every violation into a monetary penalty; compliant cases cost nothing.
    slack_days, added_percentage = spec['cost_function']
    penalties = {}
    for case_id, violation in metric_satisfaction.items():
        if violation:
            penalties[case_id] = penalty(
                violation,
                slack_days,  # slack in days
                net_worth_by_case.at[case_id],
                added_percentage,  # added penalties in percentage
            )
        else:
            penalties[case_id] = 0
    spec['total_costs'] = sum(penalties.values())

100%|██████████| 9676/9676 [08:02<00:00, 20.05it/s]
100%|██████████| 9674/9674 [08:02<00:00, 20.06it/s]
100%|██████████| 9498/9498 [07:53<00:00, 20.04it/s]


In [5]:
# Display the constraint specifications together with their 'total_costs'.
constraints_dict

{3: {'type': 'Precedence',
  'op_0': 'Create Purchase Order Item',
  'op_1': 'Clear Invoice',
  'required_time': 90,
  'cost_function': [7, 0.05],
  'total_costs': 8573453.846279772},
 7: {'type': 'Precedence',
  'op_0': 'Vendor creates invoice',
  'op_1': 'Clear Invoice',
  'required_time': 90,
  'cost_function': [7, 0.05],
  'total_costs': 25320962.23710812},
 16: {'type': 'Response',
  'op_0': 'Record Invoice Receipt',
  'op_1': 'Clear Invoice',
  'required_time': 90,
  'cost_function': [7, 0.05],
  'total_costs': 35450.55287698414}}

In [6]:
# NOTE(review): this repeats the last assignment of the cost-quantification loop and
# relies on whatever `con` and `penalties` were left behind by it — it looks redundant;
# confirm and remove this cell.
constraints_dict[con]['total_costs']= sum(penalties.values())

### Defining the severity function
The severity of a constraint is derived from the total cost of its violations:
* A total of exactly 0 is fine
* Anything up to 100,000 is low
* Anything up to 1,000,000 is medium
* Anything above 1,000,000 is high

In [9]:
# Severity rules: each key is a comparison (operator and threshold) that is
# applied to a total cost, each value is the label assigned when it holds.
# The order matters: get_severity returns the label of the first rule that holds.
boundaries = {
    " == 0": "fine",
    " <= 100000": "low",
    " <= 1000000": "medium",
    " > 1000000": "high"
}


# Comparison symbols accepted in a boundary string. Two-character symbols are
# listed first so that "<=" is not mistaken for "<".
_BOUNDARY_OPERATORS = {
    "==": operator.eq,
    "!=": operator.ne,
    "<=": operator.le,
    ">=": operator.ge,
    "<": operator.lt,
    ">": operator.gt,
}


def _parse_boundary(boundary):
    """Split a boundary such as " <= 100000" into (comparison function, threshold)."""
    text = boundary.strip()
    for symbol, compare in _BOUNDARY_OPERATORS.items():
        if text.startswith(symbol):
            raw_threshold = text[len(symbol):].strip()
            try:
                # Prefer int so that integer thresholds are compared exactly.
                threshold = int(raw_threshold)
            except ValueError:
                threshold = float(raw_threshold)
            return compare, threshold
    raise ValueError(f"Unsupported severity boundary: {boundary!r}")


def get_severity(penalty, boundaries):
    """Return the label of the first boundary that ``penalty`` satisfies.

    Parameters
    ----------
    penalty : number
        The value to classify.
    boundaries : dict
        Maps boundary strings of the form ``"<operator> <threshold>"``
        (e.g. ``" <= 100000"``) to labels; they are tried in insertion order.

    Returns
    -------
    The label of the first matching boundary, or ``"unknown"`` when none
    matches (which includes a NaN ``penalty``).

    Raises
    ------
    ValueError
        If a boundary that is reached cannot be parsed.
    """
    # Boundaries are parsed explicitly rather than passed to eval(): eval fails
    # on values whose text is not a Python literal (nan, inf) and would run
    # whatever code a boundary string contains.
    for boundary, category in boundaries.items():
        compare, threshold = _parse_boundary(boundary)
        if compare(penalty, threshold):
            return category
    return "unknown"


# Classify every constraint by the total cost of its violations.
severities = {}
for con, constraint_spec in constraints_dict.items():
    total_costs = constraint_spec['total_costs']
    severities[con] = get_severity(total_costs, boundaries)

In [10]:
# Display the severity label assigned to each constraint.
severities

{3: 'high', 7: 'high', 16: 'low'}