In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None


<IPython.core.display.Javascript object>

In [2]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import os
import pandas as pd


# Canonical column names used for all logs, so later code can refer to these
# constants instead of repeating the header strings.
case_id_col = "case_id"
activity_col = "activity"
resource_col = "resource"
timestamp_col = "timestamp"
label_col = "label"
treatment_col = "Treatment1"


<IPython.core.display.Javascript object>

In [13]:
import pandas as pd


def custom_encode(value):
    """Map a conformal prediction-set string to a binary code.

    ``'[0]'`` becomes ``0``. ``'[]'`` and ``'[1]'`` (treated as high
    uncertainty: the empty set or the set with the positive outcome) become
    ``1``. Any other value is returned unchanged.
    """
    if value == '[0]':
        return 0
    if value in ('[]', '[1]'):
        return 1
    # Anything that is not one of the three recognised strings passes through.
    return value

def encode_conformal_data(data):
    """Encode the conformal prediction-set columns of ``data`` in place.

    Every column whose name starts with ``"alpha"`` is passed element-wise
    through ``custom_encode``; all other columns are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned so callers can reassign it.
    """
    # Only string labels can carry the "alpha" prefix; skipping other label
    # types avoids an AttributeError on e.g. integer column names.
    columns_to_encode = [
        col for col in data.columns
        if isinstance(col, str) and col.startswith("alpha")
    ]
    # Series.map is applied per column instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases. Looping also makes the
    # "no matching columns" case a plain no-op.
    for col in columns_to_encode:
        data[col] = data[col].map(custom_encode)
    return data



# Function to read and preprocess data
def read_and_preprocess_data(data_type, sample_nr=0, log_name="bpic2012"):
    """Assemble, encode and save the combined result table for one data split.

    Reads the prepared event log, its encoded counterpart, the sampled
    dataset and the conformal, conformal-causal and survival result files for
    ``log_name`` / ``data_type``. The frames are concatenated column-wise by
    row position, rows with missing values are dropped, the conformal columns
    are encoded, causal class labels are assigned and the result is sorted by
    timestamp and written to
    ``./results/{log_name}/{data_type}_{log_name}_all.csv``.

    Parameters
    ----------
    data_type : str
        Split identifier used in the input and output file names
        (the notebook calls this with ``"test"`` and ``"valid"``).
    sample_nr : int, default 0
        Number of the sample file to read.
    log_name : str, default "bpic2012"
        Event-log name used in directory and file names.

    Returns
    -------
    pandas.DataFrame
        The combined frame, sorted by ``timestamp_col`` with a fresh index.
    """
    # Event log, restricted to the standard columns. The shared constants are
    # used throughout (including treatment_col) instead of literal headers.
    event_columns = [case_id_col, activity_col, timestamp_col, resource_col, label_col, treatment_col]
    data_csv = pd.read_csv(f"./prepared_data/{log_name}/{data_type}_{log_name}.csv", sep=';')[event_columns]

    data_encoded = pd.read_csv(f"./prepared_data/{log_name}/{data_type}_encoded_{log_name}.csv", sep=";")

    sample_df = pd.read_csv(f"./realcause_datasets_{log_name}/{log_name}_sample{sample_nr}.csv")

    # Join the encoded log with the sample on every column they share and keep
    # only the last five columns of the merge result.
    common_columns = data_encoded.columns.intersection(sample_df.columns)
    merged_df = pd.merge(data_encoded, sample_df, on=list(common_columns)).iloc[:, -5:]

    # Read predictive + preds conformal
    data_preds_conformal = pd.read_csv(f"./results/conformal/{log_name}/conformal_{data_type}_{log_name}.csv", sep=";")

    # Read causal + conformal_causal (last 24 columns only)
    data_conformal_causal = pd.read_csv(f"./results/conformal_causal/{log_name}/conformalizedTE_{log_name}_1_{data_type}.csv", sep=",").iloc[:, -24:]

    # Read Survival (last 27 columns only)
    data_survival = pd.read_csv(f"./results/survival/{log_name}/survival_{data_type}_{log_name}.csv", sep=";").iloc[:, -27:]

    # NOTE(review): the frames are aligned purely by row position after the
    # index reset; this presumes all sources list the same rows in the same
    # order (including the merge result) - confirm against the data.
    frames = [data_csv, data_preds_conformal, data_conformal_causal, data_survival, merged_df]
    data_all = pd.concat([frame.reset_index(drop=True) for frame in frames], axis=1)
    data_all = data_all.dropna()

    # Encode conformal data
    data_all = encode_conformal_data(data_all)
    data_all = assign_causal_class_labels(data_all)

    sorting_cols = [timestamp_col]
    data_all = data_all.sort_values(by=sorting_cols).reset_index(drop=True)

    # Make sure the output folder exists. exist_ok=True makes repeated calls
    # for the same log safe; the previous existence check looked at
    # "./results{log_name}" (missing "/"), so a second call raised
    # FileExistsError.
    output_dir = f"./results/{log_name}"
    os.makedirs(output_dir, exist_ok=True)

    # save data
    data_all.to_csv(f"{output_dir}/{data_type}_{log_name}_all.csv", sep=";", index=False)

    return data_all

# Build and save the combined test and validation tables for every log.
logs = ["bpic2012", "bpic2017"]
sample_nr = 0  # the same sample number is used for every log
for log_name in logs:
    test_data = read_and_preprocess_data("test", sample_nr=sample_nr, log_name=log_name)
    valid_data = read_and_preprocess_data("valid", sample_nr=sample_nr, log_name=log_name)

    print("Done!")


Shape of test data: (235189, 80)

Shape of valid data: (339430, 80)

Statistics for test data:
Number of unique cases: 6283
Number of unique activities: 25
Number of unique resources: 110
Number of unique timestamps: 235177
Number of unique labels: 2
Number of unique treatments: 2

Number of persuadable cases: 3433
Number of doNotDistirub cases: 0
Number of sureThing cases: 0
Number of lostCause cases: 6282

Statistics for valid data:
Number of unique cases: 9423
Number of unique activities: 25
Number of unique resources: 118
Number of unique timestamps: 339413
Number of unique labels: 2
Number of unique treatments: 2

Number of persuadable cases: 5167
Number of doNotDistirub cases: 0
Number of sureThing cases: 0
Number of lostCause cases: 9421



<IPython.core.display.Javascript object>