In [1]:
# !pip install pandas
# !pip install modin[all]
# !pip install bokeh==2.4.2
# !pip install psutil memory-profiler
# !pip install jupyter-server-proxy
# !pip install pm4py


<IPython.core.display.Javascript object>

In [2]:

# Event logs handled by this notebook; every enabled name needs an entry in
# each per-log dictionary below.
logs = [
    'bpic2012',
    'bpic2017',
    #'trafficFines',
]

# Canonical column names that every log is renamed to:
# ['case_id', 'activity', 'resource', 'timestamp'].
case_id_col, activity_col, resource_col, timestamp_col = (
    'case_id',
    'activity',
    'resource',
    'timestamp',
)

# dataset_name -> raw column names, in the fixed order
# [case_id_col, activity_col, resource_col, timestamp_col].
# NOTE - these raw names are supplied by the user; they are mapped
# positionally onto the canonical names above when a log is read.
dataset_dict = dict(
    bpic2012=['case_id', 'activity', 'resource', 'start_time'],
    bpic2017=['case_id', 'activity', 'org:resource', 'time:timestamp'],
    #trafficFines=['case:concept:name', 'concept:name', 'org:resource', "time:timestamp"],
)

# Per log: activities looked up under the name `incomplete_dict`. In the
# visible cells they are used to keep only cases containing at least one of
# them, and as the activities tracked for the time-to-last-event feature.
incomplete_dict = dict(
    bpic2012=["A_APPROVED", "A_REGISTERED", "A_ACTIVATED", "A_CANCELLED", "A_DECLINED"],
    bpic2017=["A_Pending", "A_Denied", "A_Cancelled"],
    #trafficFines=["Send for Credit Collection"],
)

# Per log: activities treated as negative events (not read by any active cell
# visible here).
negative_events = dict(
    bpic2012=["A_CANCELLED", "A_DECLINED"],
    bpic2017=["A_Denied", "A_Cancelled"],
    #trafficFines=["Send for Credit Collection"],
)

# Label column name and the two label values.
label_old, neg_label, pos_label = "label", "deviant", "regular"

# Per log: activities that mark a case as positive.
positive_activities_dict = dict(
    bpic2017=["A_Pending"],
    bpic2012=["A_APPROVED", "A_REGISTERED", "A_ACTIVATED"],
    #trafficFines=["Send for Credit Collection"],
)

# Row count per chunk (only referenced by the commented-out labelling cell).
chunk_s = 10_000

<IPython.core.display.Javascript object>

In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
import csv

import psutil
import time
from memory_profiler import profile
from pandas import Timestamp
import pm4py
import pandas as pd
import numpy as np

from concurrent.futures import ProcessPoolExecutor, as_completed


class DataPreprocessorModinDask:
    """Read, clean and enrich a single event log using pandas.

    The class relies on notebook-level configuration: ``dataset_dict`` and
    ``incomplete_dict`` (keyed by ``log_name``) and the canonical column names
    ``case_id_col``, ``activity_col``, ``resource_col`` and ``timestamp_col``.

    Parameters
    ----------
    log_name : str
        Key used to look up the per-log configuration.
    input_data_folder : str
        Folder containing the raw log file.
    output_data_folder : str
        Folder intended for prepared data; stored but not used by the methods
        of this class.
    """

    def __init__(self, log_name, input_data_folder, output_data_folder):
        self.log_name = log_name
        self.input_data_folder = input_data_folder
        self.output_data_folder = output_data_folder

    def _print_metrics(self, elapsed_time):
        """Sample CPU and memory usage for a finished step.

        The print statements are commented out, so this currently samples the
        values and reports nothing.
        """
        cpu_percent = psutil.cpu_percent()
        memory_info = psutil.virtual_memory()
        # print(f"Time: {elapsed_time} seconds")
        # print(f"CPU Usage: {cpu_percent}%")
        # print(f"Memory Usage: {memory_info.percent}%")

    def _rename_to_standard(self, log_file):
        """Rename this log's raw columns to the canonical column names."""
        standard_names = dict(
            zip(dataset_dict[self.log_name], [case_id_col, activity_col, resource_col, timestamp_col])
        )
        return log_file.rename(columns=standard_names)

    @profile
    def read_log(self):
        """Load the first file (alphabetically) of the input folder.

        CSV files are read with a sniffed delimiter and their headers are
        normalised (stripped, lower-cased, spaces replaced by underscores)
        before the canonical renaming. XES files are read through pm4py.

        Returns
        -------
        pandas.DataFrame
            The raw log with canonical column names.

        Raises
        ------
        FileNotFoundError
            If the input folder is empty.
        ValueError
            If the selected file is neither ``.csv`` nor ``.xes``/``.xes.gz``.
        """
        start_time = time.time()
        print("Reading log...")

        # Sort the listing: os.listdir returns entries in arbitrary order, so
        # "the first file" would otherwise differ between machines.
        entries = sorted(os.listdir(self.input_data_folder))
        if not entries:
            raise FileNotFoundError(f"No log file found in {self.input_data_folder}")
        log_path = os.path.join(self.input_data_folder, entries[0])
        lowered_path = log_path.lower()

        if lowered_path.endswith('.csv'):
            with open(log_path, 'r') as file:
                # Use Sniffer to infer the delimiter
                dialect = csv.Sniffer().sniff(file.read(10000))
            try:
                # First attempt forces two known columns to object dtype
                log_file = pd.read_csv(log_path, sep=dialect.delimiter, dtype={'Resource': 'object', 'article': 'object'})
            except Exception:
                # Best-effort fallback without the dtype hints
                log_file = pd.read_csv(log_path, sep=dialect.delimiter)
            log_file = log_file.rename(columns=lambda x: x.strip().lower().replace(' ', '_'))
            log_file = self._rename_to_standard(log_file)
        elif lowered_path.endswith(('.xes', '.xes.gz')):
            # convert_to_dataframe accepts both an EventLog and a DataFrame, so
            # this works whichever of the two read_xes returns.
            log_file = pm4py.convert_to_dataframe(pm4py.read_xes(log_path))
            log_file = self._rename_to_standard(log_file)
        else:
            raise ValueError("Unsupported file extension. Supported extensions: .csv, .xes")

        # Calculate elapsed time and print resource usage metrics
        end_time = time.time()
        elapsed_time = end_time - start_time
        self._print_metrics(elapsed_time)
        return log_file

    @profile
    def clean_data(self, log_file):
        """Clean a raw log and keep only cases that reached a tracked activity.

        Steps, in order: parse and sort by timestamp; strip whitespace from
        string cells; replace resource values by ``res1``, ``res2``, ... in
        order of first appearance; drop columns with more than 25% missing
        values; drop rows with any remaining missing value; keep only cases
        containing at least one activity from ``incomplete_dict[self.log_name]``.

        The timestamp column of the frame passed in is converted in place.
        The resource column is cast to ``str`` before missing values are
        handled, so a missing resource is anonymised like any other value.

        Parameters
        ----------
        log_file : pandas.DataFrame
            Log with canonical column names, as returned by ``read_log``.

        Returns
        -------
        pandas.DataFrame
            The cleaned log.
        """
        start_time = time.time()
        print("Cleaning data...")

        # Convert timestamp column to datetime. format="mixed" infers the
        # format per element; the deprecated infer_datetime_format flag is
        # not needed alongside it.
        log_file[timestamp_col] = pd.to_datetime(log_file[timestamp_col], format="mixed")
        log_file = log_file.sort_values(by=[timestamp_col])

        # Remove white spaces from column values
        log_file = log_file.applymap(lambda x: x.strip() if isinstance(x, str) else x)

        # Convert the 'resource' column to string
        log_file[resource_col] = log_file[resource_col].astype(str)

        # Replace unique resource values with 'res{i}' format. Every value is
        # a key of the mapping, so map() is a complete, linear-time lookup.
        unique_resources = log_file[resource_col].unique()
        resource_mapping = {original_value: f'res{i + 1}' for i, original_value in enumerate(unique_resources)}
        log_file[resource_col] = log_file[resource_col].map(resource_mapping)

        # Drop columns that are missing in more than 25% of the rows
        threshold_percentage = 25
        threshold = len(log_file) * (threshold_percentage / 100)
        columns_to_drop = log_file.columns[log_file.isna().sum() > threshold].tolist()
        log_file = log_file.drop(columns=columns_to_drop)

        log_file = log_file.dropna()

        # Use this instance's log name rather than a notebook global, so the
        # result does not depend on which loop variable happens to be set.
        activities_to_check = incomplete_dict[self.log_name]
        case_has_activity = (
            log_file[activity_col]
            .isin(activities_to_check)
            .groupby(log_file[case_id_col])
            .transform('any')
        )
        log_file = log_file[case_has_activity]

        # Calculate elapsed time and print resource usage metrics
        end_time = time.time()
        elapsed_time = end_time - start_time
        self._print_metrics(elapsed_time)
        return log_file

    @profile
    def extract_temporal_features(self, log_file):
        """Add event-position and calendar features, then drop other time columns.

        Adds ``event_nr`` (1-based position within the case, in current row
        order), ``case_length``, ``hour_of_day``, ``day_of_week`` (Monday is
        1, Sunday is 7), ``day_of_month`` and ``month_of_year``. Afterwards
        every column whose name contains 'time' or 'date' is deleted, except
        the canonical timestamp column.

        The frame passed in is modified in place and also returned.

        Parameters
        ----------
        log_file : pandas.DataFrame
            Cleaned log whose timestamp column is already datetime-typed.

        Returns
        -------
        pandas.DataFrame
            The same frame with the feature columns added.
        """
        start_time = time.time()
        print("Extracting timestamp features...")

        # Calculate event_nr
        log_file['event_nr'] = log_file.groupby(case_id_col).cumcount() + 1

        # Calculate case_length
        log_file['case_length'] = log_file.groupby(case_id_col)['event_nr'].transform('max')

        # Extract temporal context information
        log_file['hour_of_day'] = log_file[timestamp_col].dt.hour
        log_file['day_of_week'] = log_file[timestamp_col].dt.dayofweek + 1  # Monday is 1, Sunday is 7
        log_file['day_of_month'] = log_file[timestamp_col].dt.day
        log_file['month_of_year'] = log_file[timestamp_col].dt.month

        # Calculate elapsed time and print resource usage metrics
        end_time = time.time()
        elapsed_time = end_time - start_time
        self._print_metrics(elapsed_time)

        # Get the list of columns to check
        columns_to_check = log_file.columns
        print(columns_to_check)

        # Delete columns containing 'time' or 'date', except the timestamp.
        # Iterate over a snapshot because columns are removed inside the loop.
        for col in list(columns_to_check):
            lowered = str(col).lower()
            if ('time' in lowered or 'date' in lowered) and col != timestamp_col:
                print(f"column to be deleted: {col}")
                del log_file[col]

        return log_file



<IPython.core.display.Javascript object>

In [4]:
# Usage example: run read -> clean -> feature extraction for every configured
# log, keep the result in memory and write it to CSV and Parquet.
results_data = []
for log_name in logs:
    print("\n==================\n Log: %s\n==================\n" % (log_name,))
    input_data_folder = "./../data/%s" % (log_name,)
    output_data_folder = "./../prepared_data/%s" % log_name

    data_preprocessor = DataPreprocessorModinDask(log_name, input_data_folder, output_data_folder)
    log_file = data_preprocessor.read_log()
    print(log_file.shape)
    cleaned_data = data_preprocessor.clean_data(log_file)
    features_data = data_preprocessor.extract_temporal_features(cleaned_data)
    results_data.append(features_data)

    print("Saving csv file...")
    results_dir = "./../prepared_data/%s/" % log_name
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(results_dir, exist_ok=True)

    # File names are kept in plain variables: assigning to `features_data.name`
    # would overwrite a column called "name" if the log had one.
    csv_name = "prepared_features_data_%s.csv" % log_name
    features_data.to_csv(
        os.path.join(results_dir, csv_name),
        index=False,
        sep=";",
    )
    print(features_data.shape)

    parquet_name = "prepared_features_data_%s.parquet" % log_name
    features_data.to_parquet(os.path.join(results_dir, parquet_name))

    print("Done!")


 Log: bpic2012

ERROR: Could not find file /tmp/ipykernel_336143/3469093111.py
Reading log...
(164509, 7)
ERROR: Could not find file /tmp/ipykernel_336143/3469093111.py
Cleaning data...
ERROR: Could not find file /tmp/ipykernel_336143/3469093111.py
Extracting timestamp features...
Index(['case_id', 'timestamp', 'end_time', 'amount_req', 'reg_date',
       'activity', 'resource', 'event_nr', 'case_length', 'hour_of_day',
       'day_of_week', 'day_of_month', 'month_of_year'],
      dtype='object')
column to be deleted: end_time
column to be deleted: reg_date
Saving csv file...
(156962, 11)
Done!

 Log: bpic2017

ERROR: Could not find file /tmp/ipykernel_336143/3469093111.py
Reading log...
(1198366, 26)
ERROR: Could not find file /tmp/ipykernel_336143/3469093111.py
Cleaning data...
ERROR: Could not find file /tmp/ipykernel_336143/3469093111.py
Extracting timestamp features...
Index(['applicationtype', 'loangoal', 'requestedamount', 'case_id', 'label',
       'activity', 'resource', 'act

<IPython.core.display.Javascript object>

# Possible Interventions for BPIC2017 (from the winning student report):

1. **Sending Another Loan Offer:**
   - *Intervention:* Send offers to clients as soon as possible. For all case endpoints, this has been shown to have the greatest effect on cancellation rates. Sending offers to clients within 4 days may decrease cancellation rates by 5% to 10%.
   - *Treatment 1:* Cases that receive only one offer are in the control group, while cases that receive more than one offer are in the treatment group.



In [5]:
# import pandas as pd
# from concurrent.futures import ProcessPoolExecutor, as_completed


# results_data2 = []
# for log_name in logs:
#     print("\n==================\n Log: %s\n==================\n" % (log_name,))
#     positive_activities = positive_activities_dict[log_name]

#     # Initialize a global state to keep track of positive activities across cases
#     global_positive_cases = set()

#     # Define a function to label each group
#     def label_group(chunk):
#         # Check if any positive activity exists in the case
#         chunk['is_positive'] = chunk.groupby('case_id')[activity_col].transform(lambda x: any(activity in positive_activities for activity in x))
        
#         # Update the global positive cases state
#         positive_cases_from_chunk = set(chunk.loc[chunk['is_positive'], 'case_id'])
#         global_positive_cases.update(positive_cases_from_chunk)
        
#         # Assign label based on the result
#         chunk['label'] = chunk.apply(lambda row: pos_label if row['is_positive'] else neg_label, axis=1)
        
#         return chunk.drop('is_positive', axis=1)

#     # Set the chunk size based on your system's memory constraints
#     chunk_size = chunk_s

#     # Split the DataFrame into chunks
#     features_data = results_data[logs.index(log_name)]

#     num_chunks = len(features_data) // chunk_size
#     chunks = [features_data.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks + 1)]

#     # Initialize a ProcessPoolExecutor for parallel processing
#     with ProcessPoolExecutor() as executor:
#         futures = [executor.submit(label_group, chunk) for chunk in chunks]

#         # Collect the results as they become available
#         results = []
#         for future in as_completed(futures):
#             results.append(future.result())

#     # Concatenate the results of all chunks into a final DataFrame
#     labeled_data = pd.concat(results, ignore_index=True)
#     results_data2.append(labeled_data)
    

#     labeled_data.name = "labeled_data_%s.csv" % log_name

#     print("Saving csv file...")
#     results_dir = "./../prepared_data/%s/" % log_name
#     import os

#     if not os.path.exists(os.path.join(results_dir)):
#         os.makedirs(os.path.join(results_dir))

#     labeled_data.to_csv(
#         os.path.join(
#             results_dir, labeled_data.name
#         ),
#         index=False,
#         sep=";",
#     )

#     print("Done!\n")


<IPython.core.display.Javascript object>

In [6]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm


# Collects the treated DataFrame produced for each log by the loop below.
results_data3 = list()

# Per log: the activities regarded as relevant to the treatment
# (offer-sending events for the BPIC logs).
relevant_activities_dict = dict(
    bpic2017=['O_Sent (mail and online)', 'O_Sent (online only)'],
    bpic2012=['O_SENT'],
    trafficFines=['Add penalty'],
)

# Function for Treatment 1: Increase the number of offers
def apply_treatment1(group, log_name, activity_column=None):
    """Label one case as 'Treatment' or 'Control' by its number of offers.

    A case that was sent more than one offer is in the treatment group; a
    case with at most one offer is in the control group. The decision uses
    the scalar offer count of the case. Summing the per-row count column
    instead would multiply the count by the number of events and label nearly
    every case with a single offer as treated.

    Parameters
    ----------
    group : pandas.DataFrame
        All events of a single case. It is modified in place: the count and
        label columns are added and the index is reset.
    log_name : str
        ``"bpic2017"`` or ``"bpic2012"``; selects which activities count as
        an offer being sent.
    activity_column : str, optional
        Name of the activity column. Defaults to the notebook-level
        ``activity_col``.

    Returns
    -------
    pandas.DataFrame
        ``group`` with ``Total_Offers`` and ``Treatment1`` added (plus
        ``Mail_and_Online_Count`` and ``Online_Only_Count`` for bpic2017).

    Raises
    ------
    ValueError
        If ``log_name`` is not one of the supported logs.
    """
    if activity_column is None:
        activity_column = activity_col

    activities = group[activity_column]

    if log_name == "bpic2017":
        # Count offers sent by mail and online versus online only
        mail_and_online = int((activities == 'O_Sent (mail and online)').sum())
        online_only = int((activities == 'O_Sent (online only)').sum())
        total_offers = mail_and_online + online_only
        group['Mail_and_Online_Count'] = mail_and_online
        group['Online_Only_Count'] = online_only
    elif log_name == "bpic2012":
        total_offers = int((activities == 'O_SENT').sum())
    else:
        # Fail loudly instead of returning None, which would make the case
        # vanish silently when the results are concatenated.
        raise ValueError(f"apply_treatment1 does not support log {log_name!r}")

    group['Total_Offers'] = total_offers
    group['Treatment1'] = 'Treatment' if total_offers > 1 else 'Control'

    # Reset the index for each group
    group.reset_index(drop=True, inplace=True)
    return group








# Treatment-assignment callables run for every case; only the first is enabled.
treatments_functions = [apply_treatment1]  # apply_treatment2, apply_treatment3, apply_treatment4

# Maximum number of cases per chunk (not read by the visible cells)
max_cases_per_chunk = 100

# Maximum number of worker processes; adjust to the capabilities of the machine
max_workers = 7

# Handles of the submitted tasks (re-created for every log by the loop below)
futures = list()

import pandas as pd

def add_time_to_last_event_column(df, case_id_col, timestamp_col, activity_col, activities_to_track):
    """
    Add a column with the time (in days) from each event to the last occurrence
    of the specified activities within the same case.

    The value is ``last occurrence - event timestamp``, so it is negative for
    events that happen after the last tracked activity. Cases that contain none
    of the tracked activities get -1. Note that -1 can therefore also be a
    genuine value (an event exactly one day after the last tracked activity).

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the event log.
    - case_id_col (str): The column name representing case IDs.
    - timestamp_col (str): The column name representing timestamps.
    - activity_col (str): The column name representing activity names.
    - activities_to_track (list): List of activity names to track the time to the last occurrence.

    Returns:
    - pd.DataFrame: A copy of ``df`` with datetime timestamps, the new
      ``time_to_last_event_days`` column and a fresh 0..n-1 index.
    """
    # Name of the helper column is derived from the timestamp column, so the
    # function works for any timestamp column name, not only 'timestamp'.
    last_col = f"{timestamp_col}_last"

    # Work on a copy so the caller's frame keeps its original timestamp values
    df = df.copy()
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

    # Last occurrence of any tracked activity, one row per case
    last_occurrences = (
        df.loc[df[activity_col].isin(activities_to_track)]
        .groupby(case_id_col)[timestamp_col]
        .max()
        .rename(last_col)
        .reset_index()
    )

    # Attach the last occurrence to every event of the case
    df = df.merge(last_occurrences, how='left', on=case_id_col)

    # Time difference in days; assign the filled result back instead of
    # calling fillna(inplace=True) on a column selection, which does not
    # update the frame when pandas Copy-on-Write is active.
    seconds_to_last = (df[last_col] - df[timestamp_col]).dt.total_seconds()
    df['time_to_last_event_days'] = (seconds_to_last / (60 * 60 * 24)).fillna(-1)

    # Drop the auxiliary column used for the calculation
    return df.drop(columns=[last_col]).reset_index(drop=True)


def determine_overall_treatment(row):
    """Return the name of the first treatment column flagged 'Treatment'.

    Columns are checked in the order Treatment1..Treatment4. A column that is
    absent from ``row`` is treated as not applied, so the function also works
    on data where only some treatments were computed.

    Parameters
    ----------
    row : pandas.Series or dict
        One record holding zero or more ``TreatmentN`` entries.

    Returns
    -------
    str
        The first matching column name, or 'Controle' when none matches.
    """
    # NOTE(review): the fallback is spelled 'Controle' while the per-treatment
    # columns use 'Control' - confirm which spelling consumers expect.
    for treatment in ('Treatment1', 'Treatment2', 'Treatment3', 'Treatment4'):
        if row.get(treatment) == 'Treatment':
            return treatment
    return 'Controle'

# For every log: label each case with its treatment group in worker processes,
# add the time-to-last-event feature, and save the result as CSV and Parquet.
for log_name in logs:
    print("\n==================\n Log: %s\n==================\n" % (log_name,))
    relevant_activities = relevant_activities_dict[log_name]
    labeled_data = results_data[logs.index(log_name)]

    futures = []
    results = []

    # Initialize a ProcessPoolExecutor for parallel processing. Leaving the
    # `with` block waits for every submitted task to finish.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        for treatment_function in treatments_functions:
            # One task per case: split the DataFrame by case_id_col
            grouped = labeled_data.groupby(case_id_col, as_index=False)
            for name, group in tqdm(grouped, desc="Submitting Tasks"):
                # Each task gets its own frame with a fresh 0..n-1 index
                future = executor.submit(treatment_function, group.reset_index(drop=True), log_name,)
                futures.append(future)

    # Collect in submission order, not completion order, so the row order of
    # the output is the same on every run.
    for future in tqdm(futures, total=len(futures), desc="Collecting Results"):
        try:
            results.append(future.result())
        except Exception as e:
            print(f"Error collecting result for {future}: {e}")

    try:
        final_result = pd.concat(results, ignore_index=True)
    except Exception as e:
        print(f"Error concatenating results into final DataFrame: {e}")
        # Stop here: continuing would fail later on a missing or stale
        # `final_result` with a far less helpful error.
        raise

    # add time to last event column
    activities_to_track = incomplete_dict[log_name]
    final_result = add_time_to_last_event_column(final_result, case_id_col, timestamp_col, activity_col, activities_to_track)

    # Display the final result
    print("\nDone! - Final DataFrame:\n")

    print("Saving csv file...")
    results_dir = "./../prepared_data/%s/" % log_name

    results_data3.append(final_result)

    os.makedirs(results_dir, exist_ok=True)

    # File names live in plain variables: assigning to `final_result.name`
    # would overwrite a column called "name" if the log had one.
    csv_name = "date_with_treatments_%s.csv" % log_name
    final_result.to_csv(
        os.path.join(results_dir, csv_name),
        index=False,
        sep=";",
    )
    print(final_result.shape)

    parquet_name = "date_with_treatments_%s.parquet" % log_name
    final_result.to_parquet(os.path.join(results_dir, parquet_name))

    print("Done!\n")
    final_result = None



 Log: bpic2012



Submitting Tasks: 100%|██████████| 12688/12688 [01:04<00:00, 195.78it/s]
Collecting Results: 100%|██████████| 12688/12688 [00:00<00:00, 65651.53it/s]



Done! - Final DataFrame:

Saving csv file...
(156962, 14)
Done!


 Log: bpic2017



Submitting Tasks: 100%|██████████| 31411/31411 [03:08<00:00, 166.46it/s]
Collecting Results: 100%|██████████| 31411/31411 [00:00<00:00, 79203.47it/s]



Done! - Final DataFrame:

Saving csv file...
(1198319, 33)
Done!



<IPython.core.display.Javascript object>

In [7]:
# import pandas as pd
# import numpy as np
# import os



# # Define the label column
# label_col = 'label'


# def split_data(data, train_ratio, val_ratio, split_type="temporal", seed=22):
#     # Split data into train, val, and test sets based on the specified ratios
#     grouped = data.groupby(case_id_col)
#     start_timestamps = grouped[timestamp_col].min().reset_index()

#     # Sort start_timestamps based on the split_type
#     if split_type == "temporal":
#         start_timestamps = start_timestamps.sort_values(timestamp_col, ascending=True, kind="mergesort")
#     elif split_type == "random":
#         np.random.seed(seed)
#         start_timestamps = start_timestamps.reindex(np.random.permutation(start_timestamps.index))

#     train_size = int(train_ratio * len(start_timestamps))
#     val_size = int(val_ratio * len(start_timestamps))
#     test_size = len(start_timestamps) - train_size - val_size

#     train_ids = list(start_timestamps[case_id_col])[:train_size]
#     val_ids = list(start_timestamps[case_id_col])[train_size:train_size + val_size]
#     test_ids = list(start_timestamps[case_id_col])[train_size + val_size:]

#     train = data[data[case_id_col].isin(train_ids)].sort_values(sorting_cols, ascending=True, kind='mergesort').reset_index(drop=True)
#     val = data[data[case_id_col].isin(val_ids)].sort_values(sorting_cols, ascending=True, kind='mergesort').reset_index(drop=True)
#     test = data[data[case_id_col].isin(test_ids)].sort_values(sorting_cols, ascending=True, kind='mergesort').reset_index(drop=True)

#     return train, val, test

# def encode_data(data, treatment_cols, label_col, data_type):
#     y_numeric = np.where(data[label_col] == 'deviant', 0, 1)

#     T_numeric = [np.where(data[treatment] == "Treatment", 1, 0) for treatment in treatment_cols]

#     numeric_cols = data.select_dtypes(include='number').columns
#     # Exclude label_col and treatment_cols from categorical_cols
#     exclude_cols = [label_col] + treatment_cols
#     categorical_cols = data.select_dtypes(exclude='number').columns.difference(exclude_cols)


#     # Convert numeric columns to float
#     data[numeric_cols] = data[numeric_cols].astype(float)

#     # Convert categorical columns to categorical type
#     data[categorical_cols] = data[categorical_cols].astype('category')

#     # Apply cat.codes only to categorical columns
#     data[categorical_cols] = data[categorical_cols].apply(lambda x: x.cat.codes)

#     # One-hot encode categorical columns
#     data_encoded = pd.concat([
#         pd.DataFrame(data[numeric_cols]),  # Keep numeric columns
#         pd.DataFrame(y_numeric, columns=['Outcome']),
#         pd.DataFrame(np.array(T_numeric).T, columns=treatment_cols),
#         pd.DataFrame(data[categorical_cols], columns=categorical_cols),
#     ], axis=1)


#     return data_encoded

# def save_data(data, results_dir, dataset_name, data_type):
#     data.to_csv(os.path.join(results_dir, f"{data_type}_{dataset_name}.csv"), sep=";", index=False)

# def save_data_encoded(data_encoded, results_dir, dataset_name, data_type):
#     data_encoded.to_csv(os.path.join(results_dir, f"{data_type}_encoded_{dataset_name}.csv"), sep=";", index=False)


# sorting_cols = [timestamp_col, activity_col]


# # Specify the desired ratios
# train_ratio = 0.5
# val_ratio = 0.3
# test_ratio = 0.2



# for log_name in logs:
#     print("\n==================\n Log: %s\n==================\n" % (log_name,))
#     treatments_data = results_data3[logs.index(log_name)]

#     data = treatments_data.copy()
#     all_columns = data.columns

#     # Split the data into train, val, and test
#     train, val, test = split_data(data, train_ratio, val_ratio, split_type="temporal", seed=22)

   
#     treatment_cols = ['Treatment1',]

#     #dataset_name = log_name
#     results_dir = "./../prepared_data/%s/" % log_name


#     save_data(train, results_dir, log_name, "train")
#     save_data(test, results_dir, log_name, "test")
#     save_data(val, results_dir, log_name, "valid")


#     train_data_encoded = encode_data(train, treatment_cols, label_col, "train")
#     test_data_encoded = encode_data(test, treatment_cols, label_col, "test")
#     val_data_encoded = encode_data(val, treatment_cols, label_col, "valid")

    
    
#     print("Saving the train, val, and test sets to separate files...")
#     save_data_encoded(train_data_encoded, results_dir, log_name, "train")
#     save_data_encoded(test_data_encoded, results_dir, log_name, "test")
#     save_data_encoded(val_data_encoded, results_dir, log_name, "valid")




<IPython.core.display.Javascript object>

In [8]:
# import pandas as pd
# train_encoded_17 = pd.read_csv("./../prepared_data/bpic2017/train_encoded_bpic2017.csv", sep=";")
# test_encoded_17 = pd.read_csv("./../prepared_data/bpic2017/test_encoded_bpic2017.csv", sep=";")
# valid_encoded_17 = pd.read_csv("./../prepared_data/bpic2017/valid_encoded_bpic2017.csv", sep=";")
# bpic2017_encoded = pd.concat([train_encoded_17, test_encoded_17, valid_encoded_17], axis=0)
# # save the encoded data to separate files

# bpic2017_encoded.to_csv("./../prepared_data/bpic2017/bpic2017_encoded.csv", sep=";", index=False)

<IPython.core.display.Javascript object>

In [9]:
# import pandas as pd
# train_encoded_12 = pd.read_csv("./../prepared_data/bpic2012/train_encoded_bpic2012.csv", sep=";")
# test_encoded_12 = pd.read_csv("./../prepared_data/bpic2012/test_encoded_bpic2012.csv", sep=";")
# valid_encoded_12 = pd.read_csv("./../prepared_data/bpic2012/valid_encoded_bpic2012.csv", sep=";")

# bpic2012_encoded = pd.concat([train_encoded_12, valid_encoded_12, test_encoded_12], axis=0)
# # save the encoded data to separate files

# bpic2012_encoded.to_csv("./../prepared_data/bpic2012/bpic2012_encoded.csv", sep=";", index=False)

<IPython.core.display.Javascript object>