In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw event log from CSV into a DataFrame
filename = 'SampleData.csv'
data = pd.read_csv(filename)

In [3]:
class Filter:
    """Case-level filtering helpers for a process-mining event log.

    The event log is a pandas DataFrame with one row per activity instance.
    Intended call order: ``clean_log`` -> ``generate_case_log`` ->
    ``filter_cases_by_activity``.

    Attributes:
        data: the event log (standardised in place of the input by ``clean_log``).
        case_log: one row per case; created by ``generate_case_log``.
    """

    def __init__(self, input_log):
        """Store the event log.

        Raises:
            ValueError: if ``input_log`` is not a pandas DataFrame.
        """
        if not isinstance(input_log, pd.DataFrame):
            raise ValueError("EventLog must be a pandas DataFrame.")
        self.data = input_log

    def clean_log(self, CaseID='caseid_column', Activity='activity_column', StartTimeStamp='start_column', EndTimeStamp='end_column'):
        """Rename the key columns to the standard schema and fix their dtypes.

        Args:
            CaseID: name of the column holding the case identifier.
            Activity: name of the column holding the activity name.
            StartTimeStamp: name of the column holding the activity start time.
            EndTimeStamp: name of the column holding the activity end time.

        After the call ``self.data`` has the columns ``CaseID`` (as str),
        ``Activity``, ``StartTimeStamp`` and ``EndTimeStamp`` (both parsed as
        day-first datetimes). The DataFrame passed to the constructor is not
        modified.

        Raises:
            KeyError: if a requested column is absent from the log.
        """
        # Map old column names to the standard names
        rename_dict = {CaseID: 'CaseID', Activity: 'Activity', StartTimeStamp: 'StartTimeStamp', EndTimeStamp: 'EndTimeStamp'}

        # A column counts as present if it exists under its source name or is
        # already standardised, so re-running this method on a cleaned log works.
        existing = self.data.columns
        missing = [source for source, target in rename_dict.items()
                   if source not in existing and target not in existing]
        if missing:
            raise KeyError(f"Columns not found in the event log: {missing}")

        # Work on a local frame and publish it only once every conversion has
        # succeeded, so a parsing error cannot leave self.data half-converted.
        cleaned = self.data.rename(columns=rename_dict)
        cleaned['CaseID'] = cleaned['CaseID'].astype(str)
        # Timestamps are day-first (dd-mm)
        cleaned['StartTimeStamp'] = pd.to_datetime(cleaned['StartTimeStamp'], dayfirst=True)
        cleaned['EndTimeStamp'] = pd.to_datetime(cleaned['EndTimeStamp'], dayfirst=True)
        self.data = cleaned

    def generate_case_log(self, case_attributes=None):
        """Build ``self.case_log`` with one summary row per case.

        Args:
            case_attributes: optional column name or list of column names that
                are constant per case; the value from the first row of each
                case is carried into the case log.

        Columns of ``self.case_log``: ``CaseID``, ``first_start_stamp``,
        ``last_end_stamp`` (end time of the event that started last),
        ``activity_instances``, ``variant`` (activities joined by ``->`` in
        start-time order), the requested attributes, and ``case_duration_days``.
        """
        # None instead of a mutable [] default; also accept a single column name
        if case_attributes is None:
            case_attributes = []
        elif isinstance(case_attributes, str):
            case_attributes = [case_attributes]
        else:
            case_attributes = list(case_attributes)

        # Case attributes without duplicates (first row per case wins)
        case_attributes_table = self.data[['CaseID'] + case_attributes].drop_duplicates(subset=['CaseID'])

        rows = []
        # One groupby pass instead of re-scanning the whole log per case;
        # sort=False keeps cases in order of first appearance.
        for case, case_activities in self.data.groupby('CaseID', sort=False):
            # Stable sort: events with identical start times keep their log order
            case_activities = case_activities.sort_values(by='StartTimeStamp', ascending=True, kind='stable')

            rows.append({
                'CaseID': case,
                'first_start_stamp': case_activities['StartTimeStamp'].iloc[0],
                'last_end_stamp': case_activities['EndTimeStamp'].iloc[-1],
                'activity_instances': len(case_activities),
                'variant': case_activities['Activity'].str.cat(sep='->'),
            })

        # Explicit columns so an empty log still yields a frame with 'CaseID'
        case_log = pd.DataFrame(rows, columns=['CaseID', 'first_start_stamp', 'last_end_stamp',
                                               'activity_instances', 'variant'])

        # Add case attributes
        case_log = pd.merge(case_log, case_attributes_table, on='CaseID', how='left')

        # Add case duration in days (to_timedelta is a no-op for real timedeltas
        # and gives an empty object column a proper dtype)
        elapsed = pd.to_timedelta(case_log['last_end_stamp'] - case_log['first_start_stamp'])
        case_log['case_duration_days'] = elapsed.dt.total_seconds() / (60 * 60 * 24)

        self.case_log = case_log

    def filter_cases_by_activity(self, activities_list=None, action='retain', matching='any value'):
        """Return the events of the cases selected by the activities they contain.

        Args:
            activities_list: activity name or list of activity names to look
                for; defaults to ``['New Case', 'General Enquiries']``.
            action: ``'retain'`` keeps the matching cases, ``'remove'`` drops them.
            matching: ``'any value'`` matches cases containing at least one of
                the activities; ``'all values'`` matches cases containing every
                one of them.

        Activities are compared by exact name against the ``Activity`` column of
        the event log, so names containing regex metacharacters or names that
        are substrings of other names are handled correctly.

        Returns:
            The rows of ``self.data`` belonging to the selected cases.

        Raises:
            ValueError: on an unknown ``action``/``matching`` or an empty
                ``activities_list``.
        """
        if activities_list is None:
            activities_list = ['New Case', 'General Enquiries']
        elif isinstance(activities_list, str):
            activities_list = [activities_list]
        else:
            activities_list = list(activities_list)

        if action not in ('retain', 'remove'):
            raise ValueError(f"action must be 'retain' or 'remove', got {action!r}.")
        if matching not in ('any value', 'all values'):
            raise ValueError(f"matching must be 'any value' or 'all values', got {matching!r}.")

        wanted = set(activities_list)
        if not wanted:
            raise ValueError("activities_list must contain at least one activity.")

        verb = 'Retain' if action == 'retain' else 'Remove'
        quantifier = 'any of' if matching == 'any value' else 'all'
        print(f'{verb} cases that contain {quantifier} the following activities ({activities_list})')

        # Events whose activity is one of the requested ones
        hits = self.data[self.data['Activity'].isin(wanted)]

        if matching == 'any value':
            matching_cases = hits['CaseID'].unique()
        else:
            # A case contains all activities when its hits cover every distinct name
            distinct_hits = hits.groupby('CaseID')['Activity'].nunique()
            matching_cases = distinct_hits[distinct_hits == len(wanted)].index

        in_matching_case = self.data['CaseID'].isin(matching_cases)
        keep = in_matching_case if action == 'retain' else ~in_matching_case

        filtered_log = self.data[keep]
        return filtered_log

In [4]:
# Wrap the raw event log in a Filter (raises ValueError if `data` is not a DataFrame)
log = Filter(data)

In [5]:
# Map this file's column names onto the standard event-log schema
log.clean_log(
    CaseID='CaseId',
    Activity='ActivityName',
    StartTimeStamp='StartTimestamp',
    EndTimeStamp='EndTimestamp',
)

In [6]:
# Build the per-case summary (stored on log.case_log), carrying Location and VendorID as case attributes
log.generate_case_log(case_attributes=['Location', 'VendorID'])

In [7]:
# Preview the first three rows of the case log
log.case_log.head(3)

Unnamed: 0,CaseID,first_start_stamp,last_end_stamp,activity_instances,variant,Location,VendorID,case_duration_days
0,0,2022-07-28 09:23:00,2022-08-01 18:08:00,5,Invoice Entry->Check Customer Payment->Credit ...,"San Diego, USA",Vendor2,4.364583
1,1,2022-07-21 09:43:00,2022-07-21 19:05:00,5,Invoice Entry->Check Customer Payment->Credit ...,"San Francisco, USA",Vendor2,0.390278
2,2,2022-07-19 10:03:00,2022-07-25 22:57:00,8,Invoice Entry->Confirm Payment Received->Refun...,"San Diego, USA",Vendor2,6.5375


In [8]:
# Drop every case that contains at least one of the listed activities
filtered_log = log.filter_cases_by_activity(
    activities_list=['Check Customer Payment', 'Reject Invoice'],
    action='remove',
    matching='any value',
)

Remove cases that contain any of the following activities (['Check Customer Payment', 'Reject Invoice'])


In [9]:
# Number of rows (events) in the full event log, before filtering
len(log.data)

1004

In [10]:
# Number of rows (events) left after the activity filter
len(filtered_log)

328

## Explore rework filter

In [11]:
# Occurrences of 'Check Customer Payment' in each case's variant string.
# NOTE(review): str.count interprets its argument as a regex, so activity names
# with metacharacters would need re.escape; presumably a count > 1 is what
# will flag rework - confirm.
log.case_log['variant'].str.count('Check Customer Payment')

0      1
1      1
2      0
3      0
4      0
      ..
128    0
129    1
130    1
131    0
132    1
Name: variant, Length: 133, dtype: int64