In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import eventlog file
filename = ('SampleData.csv')
data = pd.read_csv(filename)

In [3]:
class Enrich:
    def __init__(self, input_log):
        if not isinstance(input_log, pd.DataFrame):
            raise ValueError("EventLog must be a pandas DataFrame.")
        self.data = input_log
        
    def clean_log(self, CaseID='caseid_column', Activity='activity_column', StartTimeStamp='start_column', EndTimeStamp='end_column'):
        
        # create a dictionary to map old column names to new column names
        rename_dict = {CaseID: 'CaseID', Activity: 'Activity', StartTimeStamp: 'StartTimeStamp', EndTimeStamp: 'EndTimeStamp'}
        
        # use the rename method to rename the columns in the DataFrame
        self.data = self.data.rename(columns=rename_dict)
       
        # fix columns data types in Python
        # Convert CaseID column to string
        self.data['CaseID'] = self.data['CaseID'].astype(str)
        
        # convert datetime column and set the correct format (dd-mm)
        self.data['StartTimeStamp'] = pd.to_datetime(self.data['StartTimeStamp'], dayfirst=True)
        self.data['EndTimeStamp'] = pd.to_datetime(self.data['EndTimeStamp'], dayfirst=True)
        
        
    def add_start_event(self):
        data = self.data
        
        app_numbers = data['CaseID'].drop_duplicates().tolist()

        for app in app_numbers:
            app_events = data[data['CaseID']==app]

            #Sort events based on StartTimestamp
            app_events = app_events.sort_values (by='StartTimeStamp', ascending = True)

            # Duplicate first raw to dataset
            row_to_duplicate = app_events.iloc[0]
            data = data.append(row_to_duplicate, ignore_index=True)

            # Get and sort app events again
            app_events = data[data['CaseID']==app]

            #Sort events based on StartTimestamp
            app_events = app_events.sort_values (by='StartTimeStamp', ascending = True)    

            # Replace values in the newly created row
            new_index = app_events.index[0]
            new_activity_name = 'start'
        #     new_role_name = 'start'
            new_end_timestamp = app_events ['StartTimeStamp'].iloc[0]

            data.at[data.index[new_index],'Activity'] = new_activity_name
            data.at[data.index[new_index],'EndTimeStamp'] = new_end_timestamp
        #     data.at[data.index[new_index],'Role'] = new_role_nameevents 

        # Generate AHT table
        # Create a list of undublicated activities in a DataFrame
        AHT_DataEntry = data['Activity'].drop_duplicates().to_frame()

        # Create a column to fill AHT data
        AHT_DataEntry['AHT (min) - please fill']=None
        AHT_DataEntry.to_csv('AHT_DataEntry.csv', index = False)

        print('please fill AHT data in the [AHT_DataEntry.csv] file ')
        
        self.log_with_start_activity = data

        return self.log_with_start_activity
    

    def enrich_with_AHT_simple(self, AHT_DataEntry_path = 'AHT_DataEntry.csv'):
        
        """This function will take the log enriched with Start Step in addition to AHT values provided by the user and add AHT for each activity
        Simple means this function will change start time stamp to accommodate the exact AHT provided by the user
        (unless the data duration is lower than the AHT)
        """
        
        data = self.log_with_start_activity
        AHT_DataEntry = pd.read_csv(AHT_DataEntry_path)

        # Add AHT from table to data
        dataset = pd.merge(data, AHT_DataEntry, on='Activity', how='left')

        # Convert AHT to TimeDelta
        dataset['AHT (min) - please fill'] = pd.to_timedelta(dataset['AHT (min) - please fill'], unit="m")

        # Change "StartTimeStamp" to "AssignedTimeStamp"
        dataset.rename(columns={'StartTimeStamp': 'AssignedTimeStamp'}, inplace=True)

        # Create activity duration column
        dataset["activity_duration"] = dataset['EndTimeStamp'] - dataset['AssignedTimeStamp']

        # Generate the new StartTimeStamps
        dataset['StartTimeStamp'] = dataset['EndTimeStamp'] - dataset[['AHT (min) - please fill', 'activity_duration']].min(axis=1)

        # Drop unneeded columns
        dataset = dataset.drop(['AHT (min) - please fill', 'activity_duration'], axis=1)

        self.enriched_log = dataset
        
        return self.enriched_log
   

In [4]:
log = Enrich(data)

In [5]:
log.clean_log(CaseID= "CaseId", 
              Activity = 'ActivityName', 
              StartTimeStamp= 'StartTimestamp',
              EndTimeStamp= 'EndTimestamp' )

In [6]:
log.add_start_event()

please fill AHT data in the [AHT_DataEntry.csv] file 


Unnamed: 0,Location,Role,VendorID,InvoiceValue,Resource,StartTimeStamp,EndTimeStamp,CaseID,Activity
0,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-28 09:23:00,2022-07-28 09:23:00,0,start
1,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-28 17:59:00,2022-07-29 00:43:00,0,Check Customer Payment
2,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-29 09:43:00,2022-07-29 15:04:00,0,Credit Memo Entry
3,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-29 15:04:00,2022-07-29 20:49:00,0,Refund Customer
4,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-08-01 09:49:00,2022-08-01 18:08:00,0,Re-issuing the invoice
...,...,...,...,...,...,...,...,...,...
1132,"San Francisco, USA",Invoice Processor,Vendor1,1109.71,Aiden,2022-08-15 10:35:00,2022-08-15 12:15:00,128,Invoice Entry
1133,"San Francisco, USA",Invoice Processor,Vendor2,1770.58,Aiden,2022-08-15 10:55:00,2022-08-15 13:03:00,129,Invoice Entry
1134,"San Francisco, USA",Invoice Processor,Vendor4,190.83,Aiden,2022-07-14 10:09:00,2022-07-14 12:09:00,130,Invoice Entry
1135,"San Francisco, USA",Invoice Processor,Vendor2,2549.04,Aiden,2022-08-23 09:38:00,2022-08-23 11:34:00,131,Invoice Entry


In [7]:
log.enrich_with_AHT_simple(AHT_DataEntry_path = 'AHT_DataEntry.csv')

Unnamed: 0,Location,Role,VendorID,InvoiceValue,Resource,AssignedTimeStamp,EndTimeStamp,CaseID,Activity,StartTimeStamp
0,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-28 09:23:00,2022-07-28 09:23:00,0,start,2022-07-28 09:23:00
1,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-28 17:59:00,2022-07-29 00:43:00,0,Check Customer Payment,2022-07-29 00:33:00
2,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-29 09:43:00,2022-07-29 15:04:00,0,Credit Memo Entry,2022-07-29 14:52:00
3,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-07-29 15:04:00,2022-07-29 20:49:00,0,Refund Customer,2022-07-29 20:36:00
4,"San Diego, USA",Invoice Processor,Vendor2,729.27,Casey,2022-08-01 09:49:00,2022-08-01 18:08:00,0,Re-issuing the invoice,2022-08-01 17:54:00
...,...,...,...,...,...,...,...,...,...,...
1132,"San Francisco, USA",Invoice Processor,Vendor1,1109.71,Aiden,2022-08-15 10:35:00,2022-08-15 12:15:00,128,Invoice Entry,2022-08-15 11:49:06
1133,"San Francisco, USA",Invoice Processor,Vendor2,1770.58,Aiden,2022-08-15 10:55:00,2022-08-15 13:03:00,129,Invoice Entry,2022-08-15 12:37:06
1134,"San Francisco, USA",Invoice Processor,Vendor4,190.83,Aiden,2022-07-14 10:09:00,2022-07-14 12:09:00,130,Invoice Entry,2022-07-14 11:43:06
1135,"San Francisco, USA",Invoice Processor,Vendor2,2549.04,Aiden,2022-08-23 09:38:00,2022-08-23 11:34:00,131,Invoice Entry,2022-08-23 11:08:06
