In [1]:
# !pip install pandas pyarrow

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, Trials
from tqdm import tqdm
import threading
import random
import time
import gc

# show all columns in pandas
pd.set_option('display.max_columns', None)

# standardize column names: ['case_id', 'activity', 'resource', 'timestamp'] for all logs
case_id_col = 'case_id'
activity_col = 'activity'
resource_col = 'resource'
timestamp_col = 'timestamp'
label_col = 'label'
treatment_col = "Treatment1"

# Define positive_activities_dict
positive_activities_dict = {
    'bpic2017': ["A_Pending"],
    'bpic2012': ["A_APPROVED", "A_REGISTERED", "A_ACTIVATED"],
    # 'trafficFines': ["Send for Credit Collection"]
}

negative_activities_dict = {
    
    'bpic2017': ["A Denied", "A Cancelled"],
    'bpic2012': ["A_CANCELLED", "A_DECLINED"],
}

<IPython.core.display.Javascript object>

In [3]:
# Inter-case 1
def add_nr_ongoing_cases(df):
    # Function to get the number of cases that have been started and did not finish

    # Sort the DataFrame by 'timestamp' column
    df = df.sort_values(['timestamp', 'event_nr'])#.reset_index(drop=True)
    
    # Extract relevant columns as NumPy arrays
    case_ids = df['case_id'].values
    event_nrs = df['event_nr'].values 
    case_lengths = df['case_length'].values
    
    # Initialize an array to store the inter-case feature values
    inter_case_feature = np.zeros_like(case_ids)
    
    # Keep track of the cases that have been started but not finished
    started_cases = set()    
    # Keep track of the cases that have been finished
    finished_cases = set()   
    
    
    # Iterate over each row in the DataFrame
    for i in range(len(df)):
        case_id = case_ids[i]
        # Check if the case has been started but not finished
        if case_id in started_cases:
            inter_case_feature[i] = len(started_cases) 
        else:
            started_cases.add(case_id)
            inter_case_feature[i] = len(started_cases)   # Exclude the current case
        
        # Check if the case has been finished

        if event_nrs[i] == case_lengths[i]:
            finished_cases.add(case_id)
            started_cases.remove(case_id)

            inter_case_feature[i] = len(started_cases) #- 1
    
    # Add the inter-case feature as a new column to the DataFrame
    df['nr_ongoing_cases'] = inter_case_feature
    return df

# Inter-case 2
def add_nr_past_events_in_60_minute_intervals(df):
    # Function to calculate the number of events per 60 minute intervals

    # Convert timestamp to datetime if not already in datetime format
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # Sort the DataFrame by 'timestamp' column
    df = df.sort_values(['timestamp', 'event_nr'])
    
    # Get the start and end timestamps
    start_timestamp = df['timestamp'].iloc[0]
    end_timestamp = df['timestamp'].iloc[-1]
    
    # Create intervals of 60 minutes
    intervals = pd.date_range(start=start_timestamp.floor('60min'), end=end_timestamp.ceil('60min'), freq='60min')
    
    # Bin the timestamps into intervals
    df['interval'] = pd.cut(df['timestamp'], bins=intervals, right=False, labels=False)
    
    # Group by intervals and sum the events
    event_counts = df.groupby('interval').size()
    #print(event_counts)
    
    # Fill missing intervals with 0 events
    event_counts = event_counts.reindex(range(len(intervals)), fill_value=0)
    
    # Calculate cumulative sum of event counts
    cumulative_events = event_counts
    
    # Map cumulative events to the DataFrame
    df['nr_past_events'] = df['interval'].map(cumulative_events)
    
    return df


# Demand intesity features:
# arrival rate, case creation rate, and case completion rate
def calculate_arrival_rate(df, time_window='1H'):
    # Convert 'timestamp' to datetime if not already in datetime format
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    
    # Sort the DataFrame by 'timestamp'
    df = df.sort_values('timestamp')
    
    # Calculate the number of cases created within the specified time window
    df['arrival_rate'] = df.resample(time_window, on='timestamp').size().fillna(0).reset_index(drop=True)
    # Calculate the number of cases created and completed within the specified time window
    df['case_creation_rate'] = df.resample(time_window, on='timestamp')['case_id'].nunique().diff().fillna(0).reset_index(drop=True)
    df['case_completion_rate'] = df.resample(time_window, on='timestamp')['case_id'].nunique().shift(-1).diff().fillna(0).reset_index(drop=True)
    
    # Set negative values to 0
    df['case_creation_rate'] = df['case_creation_rate'].clip(lower=0)
    df['case_completion_rate'] = df['case_completion_rate'].clip(lower=0)
    df['arrival_rate'] = df['arrival_rate'].clip(lower=0)
    
    
    df.fillna(0, inplace=True)
    
    return df






<IPython.core.display.Javascript object>

In [4]:
# Add remaining and elapsed time columns to the DataFrame
# remaining time: the remaining time untill last event occurs
# elapsed time: the elapsed time from the first event to the current event
# the remaining time is different from the time_to_last_event.
# the time_to_last_event is used for survivial analysis.
# the time_to_last_event represents the time from the first event to the negative event, i.e., the event that represents the negative outcome.

def add_remtime_column(data):
    # Convert timestamp_col to NumPy array
    timestamps = data[timestamp_col].values
    
    # Find start and end dates for each case
    start_dates = data.groupby(case_id_col)[timestamp_col].transform('min')
    end_dates = data.groupby(case_id_col)[timestamp_col].transform('max')
        
    
    # Calculate elapsed time for the current case
    elapsed = timestamps - start_dates
    elapsed[np.isnan(elapsed)] = np.timedelta64(0)  # Replace NaN values with 0
    elapsed = elapsed / np.timedelta64(1, 'D')  # Convert to days
    
    # Calculate remaining time for the current case
    remtime = end_dates - timestamps
    remtime[np.isnan(remtime)] = np.timedelta64(0)  # Replace NaN values with 0
    remtime = remtime / np.timedelta64(1, 'D')  # Convert to days

    
    # Assign elapsed and remaining time to DataFrame
    data["elapsed"] = elapsed
    data["remtime"] = remtime # this column will work with bot remaining time and time-to-negative-event
    
    #data['time_to_negative_event'] = 
    
    #data['time_to_negative_event'] = np.where(data['label'] == 'deviant', data['remtime'], 0)
    
    # Sort data by timestamp_col in ascending order
    data.sort_values(timestamp_col, ascending=True, inplace=True)
    
    return data

<IPython.core.display.Javascript object>

In [5]:

def label_cases(data, p_activities):
    # Create a set of all positive activities
    positive_activities = set()
    for activities in {index: [value] for index, value in enumerate(p_activities)}.values():        
        positive_activities.update(activities)
    
    # Convert positive activities to numpy array for efficient masking
    positive_activities_array = np.array(list(positive_activities))
    
    # Create a dictionary to store case activities
    case_activities_dict = dict(data.groupby(case_id_col)[activity_col].apply(lambda x: tuple(np.unique(x))))
    
    # Create a boolean mask to check if any positive activities exist for each case ID
    has_positive_activity = np.array([np.any(np.isin(case_activities, positive_activities_array)) 
                                      for case_activities in case_activities_dict.values()])
    
    # Assign labels based on the boolean mask
    labels = np.where(has_positive_activity, 'regular', 'deviant')
    
    # Map labels back to case IDs
    label_map = dict(zip(case_activities_dict.keys(), labels))
    # positive outcome: regular
    # negative outcome: deviant
    data['label'] = data[case_id_col].map(label_map).fillna('deviant')
    
    # The event column is 1 for regular cases (where the negative event do not occurr) and 0 for deviant cases.
    data['event'] = np.where(data['label'] == 'deviant', 1, 0)
    
    
    return data

<IPython.core.display.Javascript object>

In [6]:
def get_columns_details(data):
    
    print(data.shape)


    # Get the list of all column names
    all_columns = data.columns
    print(len(all_columns))

    # Identify the column types
    numeric_cols = data.select_dtypes(include='number').columns
    categorical_cols = data.select_dtypes(include='object').columns

    # Exclude specified columns from consideration
    exclude_cols = [case_id_col, label_col, treatment_col, timestamp_col, 'remtime']

    # Remove specified columns from all columns
    considered_cols = [col for col in all_columns if col not in exclude_cols]

    # Create empty lists to store inferred static and dynamic columns
    static_num_cols = []
    dynamic_num_cols = []
    static_cat_cols = []
    dynamic_cat_cols = []

    # Define threshold for unique values to consider a column as static categorical
    unique_value_threshold = 5

    # Group data by 'case_id'
    grouped_data = data.groupby(case_id_col)

    # Calculate standard deviation for numeric columns within each group
    std_dev = grouped_data[numeric_cols].std()

    # Calculate number of unique values for categorical columns within each group
    unique_values = grouped_data[categorical_cols].nunique()

    # Identify static and dynamic columns based on characteristics
    for col in considered_cols:
        if col in numeric_cols:
            # If the standard deviation is close to zero, consider it static
            if std_dev[col].max() < 1e-6:
                static_num_cols.append(col)
            else:
                dynamic_num_cols.append(col)
        elif col in categorical_cols:
            # If the number of unique values is below the threshold, consider it static
            if unique_values[col].max() <= unique_value_threshold:
                static_cat_cols.append(col)
            else:
                dynamic_cat_cols.append(col)

    # Print or use the results as needed
    print("Static Numeric Columns:", static_num_cols)
    print("Dynamic Numeric Columns:", dynamic_num_cols)
    print("Static Categorical Columns:", static_cat_cols)
    print("Dynamic Categorical Columns:", dynamic_cat_cols)
    print("Total columns:", len(static_num_cols) + len(static_cat_cols) + len(dynamic_num_cols) + len(dynamic_cat_cols) + len(exclude_cols))


<IPython.core.display.Javascript object>

In [7]:
import os
from pandas import read_csv

dfs = []

logs = [
        'bpic2012',
        'bpic2017', 
        #'trafficFines',
        ]



for log_name in logs:
    print(f"Log...{log_name}")
   
    log = pd.read_parquet(os.path.join(f"./../prepared_data/{log_name}/date_with_treatments_{log_name}.parquet"))
    
    # read the log
    #log = read_csv(os.path.join(f"./../prepared_data/{log_name}/date_with_treatments_{log_name}.csv"), sep=';')
    print(log.shape)
    print("Add number of cases that started but not finished...")
    df = add_nr_ongoing_cases(log)
    print(df.shape, "1")
    print("Add number of past events that occured in the last 60 minutes...")
    df = add_nr_past_events_in_60_minute_intervals(df)
    print(df.shape, "2")
    print("Add demand intensity inter-case features...")
    df = calculate_arrival_rate(df)
    print(df.shape, "3")
    print("Add remaining and elapsed times...")
    df = add_remtime_column(df)
    print(df.shape, "4")
    print("Label cases...")
    df = label_cases(df,  positive_activities_dict[log_name])
    print(df.shape, "5")
    dfs.append(df)
    
    df.name = "data_with_inter_case_features_%s.parquet" % log_name
    
    print("Getting columns details...")
    get_columns_details(df)
    
    
    
    print("Saving csv file...")
    results_dir = "./../prepared_data/%s/" % log_name
    import os

    if not os.path.exists(os.path.join(results_dir)):
        os.makedirs(os.path.join(results_dir))
        
    df.to_parquet( os.path.join(
            results_dir, df.name 
        ))

    print("Done!\n")    
    print("")



Log...bpic2012


(156962, 14)
Add number of cases that started but not finished...
(156962, 15) 1
Add number of past events that occured in the last 60 minutes...
(156962, 17) 2
Add demand intensity inter-case features...


  df['arrival_rate'] = df.resample(time_window, on='timestamp').size().fillna(0).reset_index(drop=True)
  df['case_creation_rate'] = df.resample(time_window, on='timestamp')['case_id'].nunique().diff().fillna(0).reset_index(drop=True)
  df['case_completion_rate'] = df.resample(time_window, on='timestamp')['case_id'].nunique().shift(-1).diff().fillna(0).reset_index(drop=True)


(156962, 20) 3
Add remaining and elapsed times...
(156962, 22) 4
Label cases...
(156962, 24) 5
Getting columns details...
(156962, 24)
24
Static Numeric Columns: ['amount_req', 'case_length', 'Total_Offers', 'event']
Dynamic Numeric Columns: ['event_nr', 'hour_of_day', 'day_of_week', 'day_of_month', 'month_of_year', 'time_to_last_event_days', 'nr_ongoing_cases', 'interval', 'nr_past_events', 'arrival_rate', 'case_creation_rate', 'case_completion_rate', 'elapsed']
Static Categorical Columns: []
Dynamic Categorical Columns: ['activity', 'resource']
Total columns: 24
Saving csv file...
Done!


Log...bpic2017
(1198319, 33)
Add number of cases that started but not finished...
(1198319, 34) 1
Add number of past events that occured in the last 60 minutes...
(1198319, 36) 2
Add demand intensity inter-case features...


  df['arrival_rate'] = df.resample(time_window, on='timestamp').size().fillna(0).reset_index(drop=True)
  df['case_creation_rate'] = df.resample(time_window, on='timestamp')['case_id'].nunique().diff().fillna(0).reset_index(drop=True)
  df['case_completion_rate'] = df.resample(time_window, on='timestamp')['case_id'].nunique().shift(-1).diff().fillna(0).reset_index(drop=True)


(1198319, 39) 3
Add remaining and elapsed times...
(1198319, 41) 4
Label cases...
(1198319, 42) 5
Getting columns details...
(1198319, 42)
42
Static Numeric Columns: ['requestedamount', 'case_length', 'Mail_and_Online_Count', 'Online_Only_Count', 'Total_Offers', 'event']
Dynamic Numeric Columns: ['firstwithdrawalamount', 'monthlycost', 'numberofterms', 'offeredamount', 'creditscore', 'event_nr', 'month', 'weekday', 'hour', 'open_cases', 'hour_of_day', 'day_of_week', 'day_of_month', 'month_of_year', 'time_to_last_event_days', 'interval', 'nr_past_events', 'arrival_rate', 'case_creation_rate', 'case_completion_rate', 'elapsed']
Static Categorical Columns: ['applicationtype', 'loangoal', 'action', 'eventorigin', 'accepted', 'selected']
Dynamic Categorical Columns: ['activity', 'resource', 'lifecycle:transition', 'nr_ongoing_cases']
Total columns: 42
Saving csv file...
Done!




<IPython.core.display.Javascript object>

In [8]:
df17 = pd.read_parquet("./../prepared_data/bpic2017/data_with_inter_case_features_bpic2017.parquet")
df17.head()
df17.Treatment1.value_counts()

Treatment1
Treatment    1195809
Control         2510
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [9]:
df17.dtypes

applicationtype                    object
loangoal                           object
requestedamount                   float64
case_id                            object
label                              object
activity                           object
resource                           object
action                             object
eventorigin                        object
lifecycle:transition               object
accepted                           object
selected                           object
firstwithdrawalamount             float64
monthlycost                       float64
numberofterms                     float64
offeredamount                     float64
creditscore                       float64
event_nr                            int64
month                               int64
weekday                             int64
hour                                int64
open_cases                          int64
timestamp                  datetime64[ns]
case_length                       

<IPython.core.display.Javascript object>

In [10]:
#df17[df17[case_id_col]=="Application_652823628"]

<IPython.core.display.Javascript object>

In [11]:

df12 = pd.read_parquet("./../prepared_data/bpic2012/data_with_inter_case_features_bpic2012.parquet")
df12.Treatment1.value_counts()


Treatment1
Treatment    115125
Control       41837
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [12]:
# df12.dtypes

<IPython.core.display.Javascript object>

In [13]:
# def get_columns_details(data):
    
#     print(data.shape)


#     # Get the list of all column names
#     all_columns = data.columns
#     print(len(all_columns))

#     # Identify the column types
#     numeric_cols = data.select_dtypes(include='number').columns
#     categorical_cols = data.select_dtypes(include='object').columns

#     # Exclude specified columns from consideration
#     exclude_cols = [case_id_col, label_col, treatment_col, timestamp_col, 'remtime']

#     # Remove specified columns from all columns
#     considered_cols = [col for col in all_columns if col not in exclude_cols]

#     # Create empty lists to store inferred static and dynamic columns
#     static_num_cols = []
#     dynamic_num_cols = []
#     static_cat_cols = []
#     dynamic_cat_cols = []

#     # Define threshold for unique values to consider a column as static categorical
#     unique_value_threshold = 5

#     # Group data by 'case_id'
#     grouped_data = data.groupby(case_id_col)

#     # Calculate standard deviation for numeric columns within each group
#     std_dev = grouped_data[numeric_cols].std()

#     # Calculate number of unique values for categorical columns within each group
#     unique_values = grouped_data[categorical_cols].nunique()

#     # Identify static and dynamic columns based on characteristics
#     for col in considered_cols:
#         if col in numeric_cols:
#             # If the standard deviation is close to zero, consider it static
#             if std_dev[col].max() < 1e-6:
#                 static_num_cols.append(col)
#             else:
#                 dynamic_num_cols.append(col)
#         elif col in categorical_cols:
#             # If the number of unique values is below the threshold, consider it static
#             if unique_values[col].max() <= unique_value_threshold:
#                 static_cat_cols.append(col)
#             else:
#                 dynamic_cat_cols.append(col)

#     # Print or use the results as needed
#     print("Static Numeric Columns:", static_num_cols)
#     print("Dynamic Numeric Columns:", dynamic_num_cols)
#     print("Static Categorical Columns:", static_cat_cols)
#     print("Dynamic Categorical Columns:", dynamic_cat_cols)
#     print("Total columns:", len(static_num_cols) + len(static_cat_cols) + len(dynamic_num_cols) + len(dynamic_cat_cols) + len(exclude_cols))


<IPython.core.display.Javascript object>

In [14]:
# import pandas as pd
# import numpy as np

# # Assuming 'df12' is your DataFrame
# data = df17.copy(deep=True)
# print(data.shape)


# # Get the list of all column names
# all_columns = data.columns
# print(len(all_columns))

# # Identify the column types
# numeric_cols = data.select_dtypes(include='number').columns
# categorical_cols = data.select_dtypes(include='object').columns

# # Exclude specified columns from consideration
# exclude_cols = [case_id_col, label_col, treatment_col, timestamp_col]

# # Remove specified columns from all columns
# considered_cols = [col for col in all_columns if col not in exclude_cols]

# # Create empty lists to store inferred static and dynamic columns
# static_num_cols = []
# dynamic_num_cols = []
# static_cat_cols = []
# dynamic_cat_cols = []

# # Define threshold for unique values to consider a column as static categorical
# unique_value_threshold = 5

# # Group data by 'case_id'
# grouped_data = data.groupby(case_id_col)

# # Calculate standard deviation for numeric columns within each group
# std_dev = grouped_data[numeric_cols].std()

# # Calculate number of unique values for categorical columns within each group
# unique_values = grouped_data[categorical_cols].nunique()

# # Identify static and dynamic columns based on characteristics
# for col in considered_cols:
#     if col in numeric_cols:
#         # If the standard deviation is close to zero, consider it static
#         if std_dev[col].max() < 1e-6:
#             static_num_cols.append(col)
#         else:
#             dynamic_num_cols.append(col)
#     elif col in categorical_cols:
#         # If the number of unique values is below the threshold, consider it static
#         if unique_values[col].max() <= unique_value_threshold:
#             static_cat_cols.append(col)
#         else:
#             dynamic_cat_cols.append(col)

# # Print or use the results as needed
# print("Static Numeric Columns:", static_num_cols)
# print("Dynamic Numeric Columns:", dynamic_num_cols)
# print("Static Categorical Columns:", static_cat_cols)
# print("Dynamic Categorical Columns:", dynamic_cat_cols)
# print("Total columns:", len(static_num_cols) + len(static_cat_cols) + len(dynamic_num_cols) + len(dynamic_cat_cols) + len(exclude_cols))


<IPython.core.display.Javascript object>

In [15]:
# import pandas as pd

# # Assuming 'df12' is your DataFrame
# data = df12.copy(deep=True)
# print(data.shape)

# # Assuming 'case_id_col', 'timestamp_col', 'label_col', and 'treatment_col' are defined elsewhere in your code
# case_id_col = 'case_id'
# timestamp_col = 'timestamp'
# label_col = 'label'
# treatment_col = 'treatment'

# # Get the list of all column names
# all_columns = data.columns

# # Identify the column types
# numeric_cols = data.select_dtypes(include='number').columns
# categorical_cols = data.select_dtypes(include='object').columns

# # Create empty lists to store inferred static and dynamic columns
# static_num_cols = []
# dynamic_num_cols = []
# static_cat_cols = []
# dynamic_cat_cols = []

# # Define threshold for unique values to consider a column as static categorical
# unique_value_threshold = 5

# # Group data by 'case_id'
# grouped_data = data.groupby(case_id_col)

# # Iterate over each group (case)
# for case_id, group in grouped_data:
#     # Identify static and dynamic columns based on characteristics
#     for col in all_columns:
#         if col in numeric_cols:
#             # If the standard deviation is close to zero, consider it static
#             if group[col].std() < 1e-6:
#                 static_num_cols.append(col)
#             else:
#                 dynamic_num_cols.append(col)
#         elif col in categorical_cols:
#             # If the number of unique values is below the threshold, consider it static
#             if group[col].nunique() <= unique_value_threshold:
#                 static_cat_cols.append(col)
#             else:
#                 dynamic_cat_cols.append(col)

# # Remove duplicates from static and dynamic column lists
# static_num_cols = list(set(static_num_cols))
# dynamic_num_cols = list(set(dynamic_num_cols))
# static_cat_cols = list(set(static_cat_cols))
# dynamic_cat_cols = list(set(dynamic_cat_cols))

# # Print or use the results as needed
# print("Static Numeric Columns:", static_num_cols)
# print("Dynamic Numeric Columns:", dynamic_num_cols)
# print("Static Categorical Columns:", static_cat_cols)
# print("Dynamic Categorical Columns:", dynamic_cat_cols)
# print("Total columns:", len(static_num_cols) + len(static_cat_cols) + len(dynamic_num_cols) + len(dynamic_cat_cols))


<IPython.core.display.Javascript object>

In [16]:
# data = df12.copy(deep=True)
# print(data.shape)
# # Assuming your dataset is loaded into a DataFrame named 'data'
# # Assuming 'all_columns' is a list of all column names
# all_columns = data.columns
# # Identify the column types
# numeric_cols = data.select_dtypes(include='number').columns
# categorical_cols = data.select_dtypes(include='object').columns

# # Create empty lists to store inferred static and dynamic columns
# static_cols = []
# dynamic_cols = []
# # Define threshold for unique values to consider a column as static categorical
# unique_value_threshold = 5

# # Infer static and dynamic columns based on characteristics
# for col in all_columns:
#     if col in numeric_cols:
#         # If the standard deviation is close to zero, consider it static
#         if data[col].std() < 1e-6:
#             static_cols.append(col)
#         else:
#             dynamic_cols.append(col)
#     elif col in categorical_cols:
#         # If the number of unique values is below the threshold, consider it static
#         if data[col].nunique() <= unique_value_threshold:
#             static_cols.append(col)
#         else:
#             dynamic_cols.append(col)

# # Exclude specified columns from static categorical columns
# exclude_static_cat_cols = [case_id_col] + [label_col] + [treatment_col] #['label', 'Treatment1', 'Treatment2', 'Treatment3', 'Treatment4']
# static_cat_cols = [col for col in static_cols if col in categorical_cols and col not in exclude_static_cat_cols]

# # Exclude specified columns from dynamic categorical columns
# exclude_dynamic_cat_cols = case_id_col + timestamp_col #['caseid', 'timestamp']
# dynamic_cat_cols = [col for col in dynamic_cols if col in categorical_cols and col not in exclude_dynamic_cat_cols]

# # Now, separate numeric and categorical columns for static and dynamic features
# static_num_cols = [col for col in static_cols if col in numeric_cols]
# dynamic_num_cols = [col for col in dynamic_cols if col in numeric_cols]

# # Print or use the results as needed
# print("Static Numeric Columns:", static_num_cols)
# print("Static Categorical Columns:", static_cat_cols)
# print("Dynamic Numeric Columns:", dynamic_num_cols)
# print("Dynamic Categorical Columns:", dynamic_cat_cols)

<IPython.core.display.Javascript object>

In [17]:
# # file_path = "./../prepared_data/bpic2017/date_with_treatments_bpic2017.csv"
# # data = pd.read_csv(file_path, sep=";")


# import os
# import pandas as pd
# from sklearn.pipeline import FeatureUnion

# # Define a function to encode prefixes
# def encode_prefixes(prefixes, feature_combiner, treatment_cols, label_col, results_dir, data_type, log_name):
#     x = feature_combiner.fit_transform(prefixes)
#     y = prefixes[label_col]
#     y_numeric = [1 if label == 'deviant' else 0 for label in y]

#     T_numeric = []
    
#     if log_name == "bpic2017":
#         treatment_cols = ['Treatment1', 'Treatment2', 'Treatment3', 'Treatment4',]
#     else:
#         treatment_cols = ["Treatment1"]

#     for treatment in treatment_cols:
#         y_treatment = prefixes[treatment]
#         y_treatment_numeric = [1 if label == "Treatment" else 0 for label in y_treatment]
#         T_numeric.append(y_treatment_numeric)
#         #y_numeric = np.vstack((y_numeric, y_treatment_numeric)).T

#     print("Read encoded data...")
#     df_agg = pd.read_csv(os.path.join(results_dir, 'dt_transformed_agg_%s.csv'%dataset_name), low_memory=False,  sep=';')
#     try:
#         df_static = pd.read_csv(os.path.join(results_dir, 'dt_transformed_static_%s.csv'%dataset_name), low_memory=False,  sep=';')
#         static_agg_df = pd.concat([df_static, df_agg], axis=1)
#     except:
#         static_agg_df = df_agg#pd.concat([df_static, df_agg], axis=1)
#     #static_agg_df = pd.concat([df_static, df_agg], axis=1)
#     print(list(static_agg_df.columns))

#     data = pd.concat([
#         pd.DataFrame(x, columns=list(static_agg_df.columns)),
#         pd.DataFrame(y_numeric, columns=['Outcome']),
#         pd.DataFrame(np.array(T_numeric).T, columns=treatment_cols),
#         #pd.DataFrame(prefixes.groupby('case_id').first()[treatment_cols].values, columns=treatment_cols),
#     ], axis=1)
    
#     data.to_pickle(os.path.join(results_dir, f"{data_type}_.pkl"))

#     return data
# # data

# for log_name in logs:
#     print("\n==================\n Log: %s\n==================\n" % (log_name,))
#     dataset_name = log_name
#     results_dir = "./../prepared_data/%s/" % dataset_name

#     data = results_data3[logs.index(log_name)]



#     # Convert timestamp column to datetime
#     data[timestamp_col] = pd.to_datetime(data[timestamp_col], format="mixed", infer_datetime_format=True)

#     import pandas as pd

#     # Assuming your dataset is loaded into a DataFrame named 'data'
#     # Assuming 'all_columns' is a list of all column names
#     all_columns = data.columns
#     # Identify the column types
#     numeric_cols = data.select_dtypes(include='number').columns
#     categorical_cols = data.select_dtypes(include='object').columns

#     # Create empty lists to store inferred static and dynamic columns
#     static_cols = []
#     dynamic_cols = []

#     # Define threshold for unique values to consider a column as static categorical
#     unique_value_threshold = 5

#     # Infer static and dynamic columns based on characteristics
#     for col in all_columns:
#         if col in numeric_cols:
#             # If the standard deviation is close to zero, consider it static
#             if data[col].std() < 1e-6:
#                 static_cols.append(col)
#             else:
#                 dynamic_cols.append(col)
#         elif col in categorical_cols:
#             # If the number of unique values is below the threshold, consider it static
#             if data[col].nunique() <= unique_value_threshold:
#                 static_cols.append(col)
#             else:
#                 dynamic_cols.append(col)

#     # Exclude specified columns from static categorical columns
#     exclude_static_cat_cols = [label_col] + treatment_cols #['label', 'Treatment1', 'Treatment2', 'Treatment3', 'Treatment4']
#     static_cat_cols = [col for col in static_cols if col in categorical_cols and col not in exclude_static_cat_cols]

#     # Exclude specified columns from dynamic categorical columns
#     exclude_dynamic_cat_cols = case_id_col + timestamp_col #['caseid', 'timestamp']
#     dynamic_cat_cols = [col for col in dynamic_cols if col in categorical_cols and col not in exclude_dynamic_cat_cols]

#     # Now, separate numeric and categorical columns for static and dynamic features
#     static_num_cols = [col for col in static_cols if col in numeric_cols]
#     dynamic_num_cols = [col for col in dynamic_cols if col in numeric_cols]

#     # Print or use the results as needed
#     print("Static Numeric Columns:", static_num_cols)
#     print("Static Categorical Columns:", static_cat_cols)
#     print("Dynamic Numeric Columns:", dynamic_num_cols)
#     print("Dynamic Categorical Columns:", dynamic_cat_cols)

#     dataset_name = "bpic2017"
#     results_dir = "./../prepared_data/%s/" % dataset_name
#     cls_encoder_args = {'case_id_col': case_id_col,
#                         'static_cat_cols': static_cat_cols,
#                         'static_num_cols': static_num_cols,
#                         'dynamic_cat_cols': dynamic_cat_cols,
#                         'dynamic_num_cols': dynamic_num_cols,
#                         'fillna': True,
#                         'dataset_name':dataset_name,
#                         "results_dir":results_dir}

#     min_prefix_length = 1
#     max_prefix_length = int(
#         np.ceil(data.groupby(case_id_col).size().quantile(1))
#     )


#     import numpy as np

#     def split_data(data, train_ratio, val_ratio, split_type="temporal", seed=22):
#         # Split data into train, val, and test sets based on the specified ratios
#         grouped = data.groupby(case_id_col)
#         start_timestamps = grouped[timestamp_col].min().reset_index()

#         # Sort start_timestamps based on the split_type
#         if split_type == "temporal":
#             start_timestamps = start_timestamps.sort_values(timestamp_col, ascending=True, kind="mergesort")
#         elif split_type == "random":
#             np.random.seed(seed)
#             start_timestamps = start_timestamps.reindex(np.random.permutation(start_timestamps.index))

#         train_size = int(train_ratio * len(start_timestamps))
#         val_size = int(val_ratio * len(start_timestamps))
#         test_size = len(start_timestamps) - train_size - val_size

#         train_ids = list(start_timestamps[case_id_col])[:train_size]
#         val_ids = list(start_timestamps[case_id_col])[train_size:train_size + val_size]
#         test_ids = list(start_timestamps[case_id_col])[train_size + val_size:]

#         train = data[data[case_id_col].isin(train_ids)].sort_values(sorting_cols, ascending=True, kind='mergesort').reset_index(drop=True)
#         val = data[data[case_id_col].isin(val_ids)].sort_values(sorting_cols, ascending=True, kind='mergesort').reset_index(drop=True)
#         test = data[data[case_id_col].isin(test_ids)].sort_values(sorting_cols, ascending=True, kind='mergesort').reset_index(drop=True)

#         return train, val, test

#     # 
#     sorting_cols = [timestamp_col, activity_col]


#     # Specify the desired ratios
#     train_ratio = 0.5
#     val_ratio = 0.3
#     test_ratio = 0.2

#     # Split the data into train, val, and test
#     train, val, test = split_data(data, train_ratio, val_ratio, split_type="temporal", seed=22)
#     print("Number of training cases: ", train[case_id_col].nunique())
#     print("Number of validation cases: ", val[case_id_col].nunique())
#     print("Number of test cases: ", test[case_id_col].nunique())

#     print("shape of train: ", train.shape)
#     print("shape of val: ", val.shape)
#     print("shape of test: ", test.shape)

#     # Save the train, val, and test sets to separate files
#     print("Saving the train, val, and test sets to separate files...")
#     train.to_csv(results_dir + "train_%s.csv"%dataset_name, sep=";", index=False)
#     val.to_csv(results_dir + "val_%s.csv"%dataset_name, sep=";", index=False)
#     test.to_csv(results_dir + "test_%s.csv"%dataset_name, sep=";", index=False)

#     print("Done!")

#     # Create a FeatureUnion with specified methods
#     feature_combiner = FeatureUnion([
#         (method, EncoderFactory.get_encoder(method, **cls_encoder_args))
#         for method in ["static", "agg"]
#     ])
#     print("Start encoding...")

#     # train = pd.read_csv(results_dir + "train_%s.csv"%dataset_name, sep=";")
#     # val = pd.read_csv(results_dir + "val_%s.csv"%dataset_name, sep=";")
#     # test = pd.read_csv(results_dir + "test_%s.csv"%dataset_name, sep=";")
#     # print(train.columns)

#     if log_name == "bpic2017":
#         treatment_cols = ['Treatment1', 'Treatment2', 'Treatment3', 'Treatment4',]
#     else:
#         treatment_cols = ["Treatment1"]


#     # Encode training data
#     train_encoded = encode_prefixes(train, feature_combiner, treatment_cols, label_col, results_dir, "train", log_name)
#     val_encoded = encode_prefixes(val, feature_combiner, treatment_cols, label_col, results_dir, "valid", log_name)
#     test_encoded = encode_prefixes(test, feature_combiner, treatment_cols, label_col, results_dir, "test", log_name)

#     print("Train Encoded data shape: ", train_encoded.shape)
#     print("Val Encoded data shape: ",  val_encoded.shape)
#     print("Test Encoded data shape: ",  test_encoded.shape)

#     # save encoded data
#     print("Save encoded data...")
#     train_encoded.to_csv(results_dir + "train_encoded_%s.csv"%dataset_name, sep=";", index=False)
#     val_encoded.to_csv(results_dir + "val_encoded_%s.csv"%dataset_name, sep=";", index=False)
#     test_encoded.to_csv(results_dir + "test_encoded_%s.csv"%dataset_name, sep=";", index=False)



<IPython.core.display.Javascript object>

In [18]:
# df12.dtypes

<IPython.core.display.Javascript object>