# Validation config generator

In [37]:
from pathlib import Path

In [None]:
base_config = '''
- seed: 42
  dataset:
    path: <dataset>
    case_id_col: case:concept:name
    activity_col: concept:name
    timestamp_col: time:timestamp
  split:
    type: temporal
    params:
      test_ratio: 0.15
      val_ratio: 0.15
      seed: 42
  prefix:
    min_prefix_len: 1
    max_prefix_len: null
  target: remaining_time
  model:
    type: lstm
    params:
      task: regression
      num_layers: 2
      hidden_size: 256
      lr: 1e-3
      dropout: 0.3
      batch_size: 32 
      epochs: 50
      early_stopping_patience: 10
      reduce_lr_patience: 5
      reduce_lr_factor: 0.5
      verbose: 0
  features:
  # Always included
<always_features>

  
  # Cycliality based
<cyclicality_features>

  transformer:
    key: lstm
    params:
      maxlen: null
      pad_value: 0.0
      X_normalization: 0/1
      y_normalization: 0/1
  name: <name>
'''

In [24]:
## v03 features

always_features_v03 = '''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot

  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: minute
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
  
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: day
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: week 
'''

cyclicality_daily_features_v03 = '''
  - feature_key: time_in_day
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
  - feature_key: time_in_day
    encoding_key: sincos
    source_col_name: time:timestamp
    encoding_params: {period: 86400}
    granularity_key: minute
  - feature_key: time_in_day
    encoding_key: onehot
    source_col_name: time:timestamp
    granularity_key: hour
'''

cyclicality_weekly_features_v03 = '''
  - feature_key: time_in_week
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: day
  - feature_key: time_in_week
    encoding_key: sincos
    source_col_name: time:timestamp
    encoding_params: {period: 604800}
    granularity_key: day
  - feature_key: time_in_week
    encoding_key: onehot
    source_col_name: time:timestamp
    granularity_key: day
'''

cyclicality_monthly_features_v03 = '''
  - feature_key: time_in_month
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: day
  - feature_key: time_in_month
    encoding_key: sincos
    source_col_name: time:timestamp
    encoding_params: {period: 2630016}
    granularity_key: day
  - feature_key: time_in_month
    encoding_key: onehot
    source_col_name: time:timestamp
    granularity_key: day
'''

cyclicality_yearly_features_v03 = '''
  - feature_key: time_in_year
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: week
  - feature_key: time_in_year
    encoding_key: sincos
    source_col_name: time:timestamp
    encoding_params: {period: 31557600}
    granularity_key: week
  - feature_key: time_in_year
    encoding_key: onehot
    source_col_name: time:timestamp
    granularity_key: month
'''

In [25]:
## v04 features

always_features_v04 = '''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot

  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: minute
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
'''

cyclicality_daily_features_v04 = ""

cyclicality_weekly_features_v04 = ""

cyclicality_monthly_features_v04 = ""

cyclicality_yearly_features_v04 = ""


In [26]:
## v05 features

always_features_v05 = '''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot

  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: minute
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
'''

cyclicality_daily_features_v05 = '''
  - feature_key: time_in_day
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

cyclicality_weekly_features_v05 = '''
  - feature_key: time_in_week
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

cyclicality_monthly_features_v05 = '''
  - feature_key: time_in_month
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

cyclicality_yearly_features_v05 = '''
  - feature_key: time_in_year
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

In [27]:
## v06 features

always_features_v06 = '''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot

  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: minute
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: day

  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: minute
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: day
'''

cyclicality_daily_features_v06 = '''
  - feature_key: time_in_day
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

cyclicality_weekly_features_v06 = '''
  - feature_key: time_in_week
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

cyclicality_monthly_features_v06 = '''
  - feature_key: time_in_month
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

cyclicality_yearly_features_v06 = '''
  - feature_key: time_in_year
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: minute
'''

In [32]:
## v07 features

always_features_v07 = '''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot

  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: minute
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: day

  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: minute
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: hour
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: day
'''

cyclicality_daily_features_v07 = '''
  - feature_key: time_in_day
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: second
'''

cyclicality_weekly_features_v07 = '''
  - feature_key: time_in_week
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: hour
'''

cyclicality_monthly_features_v07 = '''
  - feature_key: time_in_month
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: day
'''

cyclicality_yearly_features_v07 = '''
  - feature_key: time_in_year
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: month
'''

In [None]:
always_features = always_features_v06
cyclicality_daily_features = cyclicality_daily_features_v06
cyclicality_weekly_features = cyclicality_weekly_features_v06
cyclicality_monthly_features = cyclicality_monthly_features_v06
cyclicality_yearly_features = cyclicality_weekly_features_v06

In [7]:
## benchmark features

tax_etal_features ='''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_in_day
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: second
  - feature_key: time_in_week
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: second
    '''

verenich_etal_features = '''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_in_day
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: hour
  - feature_key: time_in_week
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: day
'''

oyamada_etal_features = '''
  - feature_key: activity
    source_col_name: concept:name
    encoding_key: onehot
  - feature_key: time_since_last_event
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_in_day
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: second
  - feature_key: time_in_week
    encoding_key: numeric
    source_col_name: time:timestamp
    granularity_key: second
  
  - feature_key: time_since_start
    encoding_key: numeric
    source_col_name: time:timestamp
    case_source: case:concept:name
    granularity_key: second
  - feature_key: time_in_week
    encoding_key: onehot
    source_col_name: time:timestamp
    granularity_key: day
  '''

## Generate configs for synthetic datasets

### Guided feature selection

In [34]:
folder_path = Path("../../../data/raw/syn")
configs_batch1 = "experiments:\n"
configs_batch2 = "experiments:\n"

# prepare iterator and total number of iterations
files = [f for f in folder_path.iterdir() if f.is_file()]
#files = [f for f in files if f.name.startswith("syn_complex")] # only for complex datasets
total_iterations = len(files)
files_iterator = iter(files)

for i, file in enumerate(files_iterator, start=1):
    # insert dataset path
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative))

    # insert always used features
    config = config.replace("<always_features>", always_features)
    
    # insert cyclicality-based features
    cyclicality_features = ""
    if "complex" not in file.name:
        if "daily" in file.name:
            cyclicality_features += cyclicality_daily_features
        if "weekly" in file.name:
            cyclicality_features += cyclicality_weekly_features
        if "monthly" in file.name:
            cyclicality_features += cyclicality_monthly_features
        if "yearly" in file.name:
            cyclicality_features += cyclicality_yearly_features   
    
    config = config.replace("<cyclicality_features>", cyclicality_features)
    
    # insert config name
    config = config.replace("<name>", f"validation_guided_{file.name}")

    if i % 2 == 0:
        configs_batch1 = configs_batch1 + "\n#-------\n#-------\n" + config
    else:
        configs_batch2 = configs_batch2 + "\n#-------\n#-------\n" + config
    
    print(f"({i}/{total_iterations}) config generated for: {file.name}")



(1/60) config generated for: syn_complex_3-5d_monthly0-8_0-50.xes
(2/60) config generated for: syn_3-5d_daily0-6_0-25.xes
(3/60) config generated for: syn_complex_3-5d_yearly0-4_0-25_7y.xes
(4/60) config generated for: syn_complex_12h_daily0-9_0-50.xes
(5/60) config generated for: syn_complex_12h_yearly0-4_0-25_7y.xes
(6/60) config generated for: syn_complex_2w_none_0-25.xes
(7/60) config generated for: syn_2w_monthly0-7_0-25.xes
(8/60) config generated for: syn_12h_none_0-50.xes
(9/60) config generated for: syn_2w_daily0-5_0-25.xes
(10/60) config generated for: syn_2w_yearly0-4_0-50_7y.xes
(11/60) config generated for: syn_complex_12h_weekly0-5_0-25.xes
(12/60) config generated for: syn_3-5d_yearly0-35_0-25_7y.xes
(13/60) config generated for: syn_2w_weekly0-5_0-25.xes
(14/60) config generated for: syn_complex_12h_monthly0-5_0-50.xes
(15/60) config generated for: syn_12h_weekly0-2_0-25.xes
(16/60) config generated for: syn_12h_daily0-3_0-50.xes
(17/60) config generated for: syn_comple

In [None]:
''''with open("internal/guided_v07/validation_guided_v07_syn_batch1.yaml", "w") as f:
    f.write(configs_batch1)

with open("internal/guided_v07/validation_guided_v07_syn_batch2.yaml", "w") as f:
    f.write(configs_batch2)''''

### Benchmark feature selection

In [10]:
folder_path = Path("../../../data/raw/syn")
configs_tax_etal = "experiments:\n"
configs_verenich_etal = "experiments:\n"
configs_oyamada_etal = "experiments:\n"
count = 0

# prepare iterator and total number of iterations
files = [f for f in folder_path.iterdir() if f.is_file()]
#files = [f for f in files if f.name.startswith("syn_complex")] # only for complex datasets
total_iterations = len(files)
files_iterator = iter(files)

for i, file in enumerate(files_iterator, start=1):
    ## TAX et al. feature set
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative)) # insert dataset path
    config = config.replace("<always_features>", tax_etal_features) # insert Tax et al. features
    config = config.replace("<cyclicality_features>", "") # no cyclicality features
    config = config.replace("<name>", f"validation_tax_{file.name}") # insert config name
    configs_tax_etal = configs_tax_etal + "\n#-------\n#-------\n" + config

    ## VERENICH et al. feature set
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative)) # insert dataset path
    config = config.replace("<always_features>", verenich_etal_features) # insert Verenich et al. features
    config = config.replace("<cyclicality_features>", "") # no cyclicality features
    config = config.replace("<name>", f"validation_verenich_{file.name}") # insert config name
    configs_verenich_etal = configs_verenich_etal + "\n#-------\n#-------\n" + config

    ## OYAMDADA et al. feature set
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative)) # insert dataset path
    config = config.replace("<always_features>", oyamada_etal_features) # insert Oyamada et al. features
    config = config.replace("<cyclicality_features>", "") # no cyclicality features
    config = config.replace("<name>", f"validation_oyamada_{file.name}") # insert config name
    configs_oyamada_etal = configs_oyamada_etal + "\n#-------\n#-------\n" + config

    print(f"({i}/{total_iterations}) config generated for: {file.name}")



(1/60) config generated for: syn_complex_3-5d_monthly0-8_0-50.xes
(2/60) config generated for: syn_3-5d_daily0-6_0-25.xes
(3/60) config generated for: syn_complex_3-5d_yearly0-4_0-25_7y.xes
(4/60) config generated for: syn_complex_12h_daily0-9_0-50.xes
(5/60) config generated for: syn_complex_12h_yearly0-4_0-25_7y.xes
(6/60) config generated for: syn_complex_2w_none_0-25.xes
(7/60) config generated for: syn_2w_monthly0-7_0-25.xes
(8/60) config generated for: syn_12h_none_0-50.xes
(9/60) config generated for: syn_2w_daily0-5_0-25.xes
(10/60) config generated for: syn_2w_yearly0-4_0-50_7y.xes
(11/60) config generated for: syn_complex_12h_weekly0-5_0-25.xes
(12/60) config generated for: syn_3-5d_yearly0-35_0-25_7y.xes
(13/60) config generated for: syn_2w_weekly0-5_0-25.xes
(14/60) config generated for: syn_complex_12h_monthly0-5_0-50.xes
(15/60) config generated for: syn_12h_weekly0-2_0-25.xes
(16/60) config generated for: syn_12h_daily0-3_0-50.xes
(17/60) config generated for: syn_comple

In [11]:
'''with open("internal/validation_tax_etal_syn.yaml", "w") as f:
    f.write(configs_tax_etal)

with open("internal/validation_verenich_etal_syn.yaml", "w") as f:
    f.write(configs_verenich_etal)

with open("internal/validation_oyamada_etal_syn.yaml", "w") as f:
    f.write(configs_oyamada_etal)'''

'with open("internal/validation_tax_etal_syn.yaml", "w") as f:\n    f.write(configs_tax_etal)\n\nwith open("internal/validation_verenich_etal_syn.yaml", "w") as f:\n    f.write(configs_verenich_etal)\n\nwith open("internal/validation_oyamada_etal_syn.yaml", "w") as f:\n    f.write(configs_oyamada_etal)'

## Generate configs for real-world datasets

In [None]:
datasets = {#"BPI_2012_W_sample": ("daily", "weekly", "yearly"),
            #"BPI_2015_2": ("daily", "weekly", "monthly", "yearly"),
            #"BPI_2017_W_sample": ("daily", "weekly", "monthly", "yearly"),
            
            
            
            "BPI_2012_A": ("daily", "weekly", "yearly"),
            "BPI_2012_O": ("daily", "weekly", "yearly"),
            "BPI_2012_W": ("daily", "weekly", "monthly", "yearly"),
            "BPI_2013_closed_problems": ("daily", "weekly", "yearly"),
            #"environmental_permit": ("daily", "weekly"),
            "helpdesk": ("daily", "weekly", "monthly", "yearly"),
            "prepaid_travel_cost": ("daily", "weekly", "yearly"),
            "request_for_payment": ("daily", "weekly", "yearly"),
            }

### Guided feature selection

In [17]:
folder_path = Path("../../../data/raw")
configs = "experiments:\n"

# prepare iterator and total number of iterations
files = [f for f in folder_path.iterdir() if f.is_file() and f.stem in datasets]
total_iterations = len(files)
files_iterator = iter(files)

for i, file in enumerate(files_iterator, start=1):
    # insert dataset path
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative))

    # insert always used features
    config = config.replace("<always_features>", always_features)
    
    # insert cyclicality-based features
    cyclicality_features = ""
    if "daily" in datasets.get(file.stem, []):
        cyclicality_features += cyclicality_daily_features
    if "weekly" in datasets.get(file.stem, []):
        cyclicality_features += cyclicality_weekly_features
    if "monthly" in datasets.get(file.stem, []):
        cyclicality_features += cyclicality_monthly_features
    if "yearly" in datasets.get(file.stem, []):
        cyclicality_features += cyclicality_yearly_features   
    
    config = config.replace("<cyclicality_features>", cyclicality_features)
    
    # insert config name
    config = config.replace("<name>", f"validation_guided_{file.name}")

    configs = configs + "\n#-------\n#-------\n" + config
    print(f"({i}/{total_iterations}) config generated for: {file.name}")



(1/8) config generated for: BPI_2012_A.xes
(2/8) config generated for: environmental_permit.xes
(3/8) config generated for: request_for_payment.xes
(4/8) config generated for: helpdesk.xes
(5/8) config generated for: BPI_2012_W.xes
(6/8) config generated for: BPI_2013_closed_problems.xes
(7/8) config generated for: prepaid_travel_cost.xes
(8/8) config generated for: BPI_2012_O.xes


In [None]:
'''with open("external/v04/validation_guided_v05_external.yaml", "w") as f:
    f.write(configs)'''

### Benchmark feature selection

In [15]:
folder_path = Path("../../../data/raw")
configs_tax_etal = "experiments:\n"
configs_verenich_etal = "experiments:\n"
configs_oyamada_etal = "experiments:\n"

# prepare iterator and total number of iterations
files = [f for f in folder_path.iterdir() if f.is_file() and f.stem in datasets]
total_iterations = len(files)
files_iterator = iter(files)


for i, file in enumerate(files_iterator, start=1):
    
    ## TAX et al. feature set
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative)) # insert dataset path
    config = config.replace("<always_features>", tax_etal_features) # insert Tax et al. features
    config = config.replace("<cyclicality_features>", "") # no cyclicality features
    config = config.replace("<name>", f"validation_tax_{file.name}") # insert config name
    configs_tax_etal = configs_tax_etal + "\n#-------\n#-------\n" + config

    ## VERENICH et al. feature set
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative)) # insert dataset path
    config = config.replace("<always_features>", verenich_etal_features) # insert Verenich et al. features
    config = config.replace("<cyclicality_features>", "") # no cyclicality features
    config = config.replace("<name>", f"validation_verenich_{file.name}") # insert config name
    configs_verenich_etal = configs_verenich_etal + "\n#-------\n#-------\n" + config

    ## OYAMDADA et al. feature set
    config = base_config
    relative = file.relative_to(Path("../../../"))
    config = config.replace("<dataset>", str(relative)) # insert dataset path
    config = config.replace("<always_features>", oyamada_etal_features) # insert Oyamada et al. features
    config = config.replace("<cyclicality_features>", "") # no cyclicality features
    config = config.replace("<name>", f"validation_oyamada_{file.name}") # insert config name
    configs_oyamada_etal = configs_oyamada_etal + "\n#-------\n#-------\n" + config

    print(f"({i}/{total_iterations}) configs generated for: {file.name}")


(1/8) configs generated for: BPI_2012_A.xes
(2/8) configs generated for: environmental_permit.xes
(3/8) configs generated for: request_for_payment.xes
(4/8) configs generated for: helpdesk.xes
(5/8) configs generated for: BPI_2012_W.xes
(6/8) configs generated for: BPI_2013_closed_problems.xes
(7/8) configs generated for: prepaid_travel_cost.xes
(8/8) configs generated for: BPI_2012_O.xes


In [16]:
'''
with open("external/validation_tax_etal_external.yaml", "w") as f:
    f.write(configs_tax_etal)

with open("external/validation_verenich_etal_external.yaml", "w") as f:
    f.write(configs_verenich_etal)

with open("external/validation_oyamada_etal_external.yaml", "w") as f:
    f.write(configs_oyamada_etal)
'''


'\nwith open("external/validation_tax_etal_external.yaml", "w") as f:\n    f.write(configs_tax_etal)\n\nwith open("external/validation_verenich_etal_external.yaml", "w") as f:\n    f.write(configs_verenich_etal)\n\nwith open("external/validation_oyamada_etal_external.yaml", "w") as f:\n    f.write(configs_oyamada_etal)\n'