In [None]:
%pip install torch~=2.2.2 transformers~=4.39.3 pm4py~=2.7.11.4 pandas~=2.2.1 matplotlib~=3.8.3 numpy~=1.25.2 tqdm~=4.66.2 Pillow~=10.0.0

In [69]:
# A notebook kernel does not define __file__, yet later cells call
# os.path.dirname(__file__) to locate the 'tokenizers', 'data', 'model' and 'db'
# directories, so it is set by hand here.
# NOTE(review): this absolute path is specific to one environment; point it at
# the notebook's real location before running elsewhere.
__file__ = '/home/jupyter/datasphere/project/application.ipynb'

In [70]:
import os
import warnings

warnings.filterwarnings("ignore")


class FileManager:
    """Helpers for deriving file names and asking the user for file paths."""

    @staticmethod
    def get_filename(xes_file):
        '''
        :param xes_file: path to a file
        :return: the file's base name without directory and without extension
        '''
        base_name = os.path.basename(xes_file)
        file_name, _ = os.path.splitext(base_name)
        return file_name

    @staticmethod
    def get_save_path(default_filename, default_extension):
        '''
        Ask the user where a file should be written.

        :param default_filename: file name with extension; used when the user
            just presses Enter
        :param default_extension: str extension with dot in front, like ".png"
            (accepted for interface compatibility, not used)
        :return: path to file
        '''
        save_path = input('Provide a file path with extension to save the file:').strip()
        # An empty answer used to be returned as '' and made the later write
        # fail; fall back to the suggested default name instead.
        return save_path or default_filename

    @staticmethod
    def get_in_path(base_dir, default_extension):
        '''
        Ask the user for the path of an input file.

        :param base_dir: directory to start searching files (not used)
        :param default_extension: str extension with dot in front, like ".txt"
            (not used)
        :return: path to file, with surrounding whitespace removed
        '''
        # Pasted paths often carry a stray leading/trailing blank.
        in_path = input('Provide a file path: ').strip()
        return in_path

In [71]:
import pm4py
import pandas as pd

from tqdm import tqdm


class XESTracesProcessor:
    """Extract per-thread traces of event names from an XES log.

    Events are grouped by ``ManagedThreadId``. The ``num_needed_indexes`` most
    frequent thread ids (id ``-1`` is never selected) become traces; events of
    every other thread are treated as "outliers" and merged into each trace
    whose [start, end] time window contains them.
    """

    def __init__(self, input_filepath, output_filepath=None, num_needed_indexes=25):
        """
        :param input_filepath: path of the XES file to read
        :param output_filepath: optional text file the traces are appended to
        :param num_needed_indexes: how many of the busiest thread ids to keep
        """
        self.input_filepath = input_filepath
        self.output_filepath = output_filepath
        self.num_needed_indexes = num_needed_indexes

    def process_file(self):
        """Run the whole pipeline.

        :return: list of traces, each a list of event names ordered by time
        """
        print('\033[93m' + "processing log..." + '\033[0m')

        log = self.__get_log(self.input_filepath)
        event_log = self.__get_needed_columns_log(log)

        # Thread id -1 must never be picked as a trace of its own.
        needed_indexes = self.__get_needed_indexes(event_log[event_log["ManagedThreadId"] != -1])
        outliers = self.__get_outliers(event_log, needed_indexes)

        event_log = event_log[event_log['ManagedThreadId'].isin(needed_indexes)]
        traces_log = self.__get_traces_log(event_log)
        final_trace_log = self.__get_final_log(traces_log, outliers)

        if self.output_filepath is not None:
            self.__write_traces_to_file(final_trace_log, self.output_filepath)

        return self.__get_list_of_traces(final_trace_log)

    def __get_needed_indexes(self, event_log):
        """Return the ``num_needed_indexes`` most frequent thread ids."""
        # value_counts() is already sorted by descending frequency.
        value_counts = event_log['ManagedThreadId'].value_counts()
        return list(value_counts.iloc[:self.num_needed_indexes].index)

    def __get_log(self, filepath):
        """Read the XES file with pm4py."""
        return pm4py.read_xes(filepath)

    def __get_needed_columns_log(self, log):
        """Keep thread id, event name and timestamp, and normalise their types."""
        event_log = log[['ManagedThreadId', 'concept:name', 'time:timestamp']]
        event_log = event_log.astype({"ManagedThreadId": int, 'concept:name': str, 'time:timestamp': str})

        # NOTE(review): the pattern keeps only the time of day plus a positive
        # UTC offset, so the calendar date is discarded and values without six
        # fractional digits do not match (they become NaT). Confirm this is
        # intended for the logs being processed.
        regex = r'(\d{2}:\d{2}:\d{2}.\d{6}\+\d{2}:\d{2})'
        event_log['time:timestamp'] = event_log['time:timestamp'].str.extract(regex)
        event_log['time:timestamp'] = pd.to_datetime(event_log['time:timestamp'])
        return event_log

    def __get_outliers(self, event_log, needed_indexes):
        """Return the events whose thread id was not selected as a trace."""
        return event_log[~event_log['ManagedThreadId'].isin(needed_indexes)]

    def __get_traces_log(self, event_log):
        """Build one row per thread id.

        :return: DataFrame with columns ``ManagedThreadId``, ``Trace`` (list of
            ``[event name, timestamp]`` pairs in log order), ``Start Time`` and
            ``End Time`` (earliest/latest timestamp of the thread)
        """
        event_log = event_log.copy()
        event_log['time:timestamp'] = event_log['time:timestamp'].dt.tz_localize(None)

        rows = []
        # groupby iterates thread ids in sorted order.
        for thread_id, group in event_log.groupby('ManagedThreadId'):
            stamps = group['time:timestamp']
            rows.append({
                'ManagedThreadId': thread_id,
                # zip over the two columns replaces a much slower iterrows().
                'Trace': [[name, stamp] for name, stamp in zip(group['concept:name'], stamps)],
                'Start Time': stamps.min(),
                'End Time': stamps.max(),
            })

        return pd.DataFrame(rows, columns=['ManagedThreadId', 'Trace', 'Start Time', 'End Time'])

    def __get_final_log(self, traces_log, outliers):
        """Merge outlier events into the traces and order every trace by time.

        :param traces_log: frame produced by ``__get_traces_log``
        :param outliers: events of the threads that were not selected
        :return: copy of ``traces_log`` whose ``Trace`` column holds plain lists
            of event names sorted by timestamp; events without a timestamp are
            dropped
        """
        traces_log = traces_log.copy()
        if traces_log.empty:
            # No thread qualified as a trace, so there is nothing to merge into.
            return traces_log

        # DataFrame.copy() still shares the list objects in 'Trace'; copy them
        # so that appending outliers does not modify the caller's frame.
        traces_log['Trace'] = traces_log['Trace'].apply(lambda events: list(events))

        intervals = pd.arrays.IntervalArray.from_arrays(traces_log['Start Time'], traces_log['End Time'], closed='both')
        outliers = outliers.copy()
        outliers['time:timestamp'] = outliers['time:timestamp'].dt.tz_localize(None)

        outlier_events = zip(outliers['concept:name'], outliers['time:timestamp'])
        for concept_name, timestamp in tqdm(outlier_events, desc="creating dataframe", total=len(outliers)):
            pair = [concept_name, timestamp]
            mask = intervals.contains(timestamp)
            # An outlier is added to every trace whose time window contains it.
            for trace_idx in traces_log.index[mask]:
                traces_log.at[trace_idx, 'Trace'].append(pair)

        def sort_trace(trace):
            # pandas represents a missing timestamp as NaT, which is not None,
            # so the missing values have to be detected with pd.notna().
            dated = [event for event in trace if pd.notna(event[1])]
            dated.sort(key=lambda elem_pair: elem_pair[1])
            return [name for name, _ in dated]

        traces_log['Trace'] = traces_log['Trace'].apply(sort_trace)

        return traces_log

    def __write_traces_to_file(self, log, filepath):
        """Append every trace to ``filepath`` as one space-separated line."""
        # Explicit UTF-8 keeps non-ASCII event names independent of the locale.
        with open(filepath, "a", encoding="utf-8") as file:
            for trace in log["Trace"]:
                file.write(' '.join(trace) + "\n")

    def __get_list_of_traces(self, log):
        """Return the ``Trace`` column as a plain Python list."""
        return list(log["Trace"])


In [72]:
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import pm4py


class LogProcessor:
    """Build a tokenized pm4py event log from the traces of an XES file.

    Traces are extracted once with ``XESTracesProcessor`` and cached. The last
    tokenized log is cached together with the level of abstraction (LoA) it
    was built for.
    """

    # Highest supported level of abstraction.
    max_LoA = 13

    def __init__(self,
                 xes_path,
                 case_id='ManagedThreadId',
                 activity_key='concept:name',
                 timestamp_key='time:timestamp'
                 ):
        """
        :param xes_path: path of the XES file to process
        :param case_id: stored on the instance; not read by the methods below
        :param activity_key: stored on the instance; not read by the methods below
        :param timestamp_key: stored on the instance; not read by the methods below
        """
        self.xes_path = xes_path

        self.case_id = case_id
        self.activity_key = activity_key
        self.timestamp_key = timestamp_key

        self.tokenized_log = None
        self.LoA = None

        self.traces_processor = TracesProcessor()
        self.traces = None

    def __extract_traces(self):
        """Read the XES file and cache its traces on the instance."""
        processor = XESTracesProcessor(self.xes_path)
        self.traces = processor.process_file()

    def __process_trace(self, case_id, tokens, start_time):
        """Turn one token sequence into event records.

        Consecutive repetitions of the same token are collapsed into a single
        event. Events get synthetic timestamps one minute apart, starting at
        ``start_time``.

        :return: list of dicts with keys ``case:concept:name``,
            ``concept:name`` and ``time:timestamp``
        """
        time = start_time
        prev_act = None
        events = []
        for activity in tokens:
            if activity != prev_act:
                events.append({
                    "case:concept:name": case_id,
                    "concept:name": activity,
                    "time:timestamp": time
                })
                time += timedelta(minutes=1)  # +1 minute to time of past event
                prev_act = activity
        return events

    def create_tokenized_event_log(self, LoA):
        """Return the event log tokenized at level of abstraction ``LoA``.

        :param LoA: integer between 1 and ``max_LoA``
        :return: pm4py event log; case ids are the 1-based trace positions
        :raises ValueError: if ``LoA`` is out of range or no events are produced
        """
        if LoA <= 0 or LoA > self.max_LoA:
            raise ValueError(f"LOA must be between 1 and {self.max_LoA}")

        if self.tokenized_log is not None and self.LoA == LoA:
            return self.tokenized_log

        if not self.traces:
            self.__extract_traces()

        processed_traces = self.traces_processor.process_traces(self.traces, LoA)
        # Every trace starts at the same synthetic instant.
        start_time = datetime.now()

        print('\033[93m' + "finishing dataframe creation ..." + '\033[0m')

        events = []
        for case_id, tokens in enumerate(processed_traces, start=1):
            # The per-trace logic lives in one place instead of being duplicated here.
            events.extend(self.__process_trace(case_id, tokens, start_time))

        if not events:
            # An empty frame has none of the columns pm4py needs; fail with a
            # message that names the actual problem.
            raise ValueError("no events to build a tokenized event log from")

        log_df = pd.DataFrame(events)

        format_df = pm4py.format_dataframe(log_df, case_id='case:concept:name',
                                           activity_key='concept:name',
                                           timestamp_key='time:timestamp')

        self.tokenized_log = pm4py.convert_to_event_log(format_df)
        self.LoA = LoA

        return self.tokenized_log

    def get_traces(self):
        """Return the traces of the XES file, extracting them on first use."""
        if not self.traces:
            self.__extract_traces()

        return self.traces

In [73]:
import json
import os

from transformers import PreTrainedTokenizerFast


class TokenizerManager:
    """Lazily loads and caches the tokenizer and vocabulary mapping of each level."""

    base_dir = os.path.dirname(__file__)
    # Directory that holds one sub-directory per level.
    _base_dir = os.path.join(base_dir, 'tokenizers')
    # Level -> name of the tokenizer sub-directory.
    _paths = {
        1: "fast_bpe_l_5_v_512",
        2: "fast_bpe_l_10_v_1109",
        3: "fast_bpe_l_20_v_1707",
        4: "fast_bpe_l_25_v_2304",
        5: "fast_bpe_l_30_v_2901",
        6: "fast_bpe_l_40_v_3499",
        7: "fast_bpe_l_60_v_4096",
        8: "fast_bpe_l_NONE_v_5000",
        9: "fast_bpe_l_NONE_v_12500",
        10: "fast_bpe_l_NONE_v_20000",
        11: "fast_bpe_l_100_v_5000",
        12: "fast_bpe_l_200_v_12500",
        13: "fast_bpe_l_300_v_20000"
    }

    # Caches, keyed by level.
    _tokenizers = {}

    _mappings = {}

    @staticmethod
    def _level_dir(level, what):
        """Return the directory of ``level``.

        :param what: noun used in the error message ("tokenizer" or "mapping")
        :raises ValueError: if ``level`` has no entry in ``_paths``
        """
        dir_name = TokenizerManager._paths.get(level)
        # The lookup must be checked before joining: os.path.join() raises
        # TypeError on None, which used to hide the intended ValueError.
        if not dir_name:
            raise ValueError(f"No {what} defined for level {level}")
        return os.path.join(TokenizerManager._base_dir, dir_name)

    @staticmethod
    def get_tokenizer(level):
        """Return the cached ``PreTrainedTokenizerFast`` of ``level``.

        :raises ValueError: if no tokenizer is defined for ``level``
        """
        if level not in TokenizerManager._tokenizers:
            tokenizer_dir = TokenizerManager._level_dir(level, "tokenizer")
            TokenizerManager._tokenizers[level] = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
        return TokenizerManager._tokenizers[level]

    @staticmethod
    def get_mapping(level):
        """Return the cached vocabulary mapping of ``level``.

        The mapping is read from ``new_vocab_mapping_uni_rep.json`` inside the
        level's tokenizer directory.

        :raises ValueError: if no mapping is defined for ``level``
        """
        if level not in TokenizerManager._mappings:
            mapping_path = os.path.join(TokenizerManager._level_dir(level, "mapping"),
                                        "new_vocab_mapping_uni_rep.json")
            TokenizerManager._mappings[level] = TokenizerManager.__read_dict_from_json_file(mapping_path)
        return TokenizerManager._mappings[level]

    @staticmethod
    def __read_dict_from_json_file(filepath):
        """Load a JSON file and return its content."""
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(filepath, 'r', encoding='utf-8') as json_file:
            dictionary = json.load(json_file)
        return dictionary

In [74]:
import hashlib
import sqlite3
import json

class TraceDatabase:
    """SQLite cache of anomaly scores, keyed by a hash of the trace.

    Every public method opens its own connection and closes it before
    returning.
    """

    # The three score tables share one schema; names are checked against this
    # tuple before being interpolated into SQL.
    _SCORE_TABLES = ("probs_scores", "error_scores", "brier_scores")

    def __init__(self, db_path):
        """
        :param db_path: path of the SQLite database file (created if missing)
        """
        self.db_path = db_path
        self._create_tables()

    def _create_tables(self):
        """Create the ``traces`` table and the three score tables if missing."""
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            cursor.execute('''
            CREATE TABLE IF NOT EXISTS traces (
                id INTEGER PRIMARY KEY,
                trace_hash TEXT UNIQUE,
                trace TEXT
            )
            ''')
            for table in self._SCORE_TABLES:
                cursor.execute(f'''
                CREATE TABLE IF NOT EXISTS {table} (
                    id INTEGER PRIMARY KEY,
                    score REAL,
                    token TEXT,
                    trace_id INTEGER,
                    FOREIGN KEY (trace_id) REFERENCES traces (id)
                )
                ''')
            conn.commit()
        finally:
            if conn:
                conn.close()

    def hash_trace(self, trace):
        """Return the SHA-256 hex digest of the JSON serialisation of ``trace``."""
        trace_str = json.dumps(trace)
        return hashlib.sha256(trace_str.encode('utf-8')).hexdigest()

    def get_trace(self, trace):
        """Look up the stored scores of ``trace``.

        :return: ``(trace_id, (probs_anomalies, error_anomalies, brier_scores))``
            when the trace is known, otherwise ``(None, None)``
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            trace_hash = self.hash_trace(trace)
            cursor.execute("SELECT id FROM traces WHERE trace_hash = ?", (trace_hash,))
            result = cursor.fetchone()
            if not result:
                return None, None

            trace_id = result[0]
            probs_anomalies = self._get_scores(cursor, "probs_scores", trace_id)
            error_anomalies = self._get_scores(cursor, "error_scores", trace_id)
            brier_scores = self._get_scores(cursor, "brier_scores", trace_id)
            return trace_id, (probs_anomalies, error_anomalies, brier_scores)
        finally:
            if conn:
                conn.close()

    def _check_table(self, table):
        """Reject table names other than the known score tables."""
        if table not in self._SCORE_TABLES:
            raise ValueError(f"Unknown score table: {table}")

    def _get_scores(self, cursor, table, trace_id):
        """Read the score rows of one trace from ``table``.

        :return: for ``brier_scores`` a list of ``(score, [tokens])`` where rows
            with the same score are grouped; for the other tables a list of
            ``(score, token)``
        """
        self._check_table(table)
        cursor.execute(f"SELECT score, token FROM {table} WHERE trace_id = ?", (trace_id,))
        rows = cursor.fetchall()
        if table == "brier_scores":
            # One row was written per masked token; regroup them by score.
            scores_dict = {}
            for score, token in rows:
                scores_dict.setdefault(score, []).append(token)
            return list(scores_dict.items())
        return [(row[0], row[1]) for row in rows]

    def save_trace(self, trace, probs_anomalies, error_anomalies, brier_scores):
        """Store ``trace`` and its three score lists.

        Saving a trace that is already stored replaces its previous scores
        instead of adding duplicate rows.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            trace_hash = self.hash_trace(trace)
            trace_str = json.dumps(trace)
            cursor.execute("INSERT OR IGNORE INTO traces (trace_hash, trace) VALUES (?, ?)", (trace_hash, trace_str))
            trace_id = cursor.execute("SELECT id FROM traces WHERE trace_hash = ?", (trace_hash,)).fetchone()[0]

            for table in self._SCORE_TABLES:
                cursor.execute(f"DELETE FROM {table} WHERE trace_id = ?", (trace_id,))

            self._save_scores(cursor, "probs_scores", trace_id, probs_anomalies)
            self._save_scores(cursor, "error_scores", trace_id, error_anomalies)
            self._save_scores(cursor, "brier_scores", trace_id, brier_scores)
            conn.commit()
        finally:
            if conn:
                conn.close()

    def _save_scores(self, cursor, table, trace_id, scores):
        """Insert ``(score, sample)`` pairs into ``table``.

        ``sample`` is either one token or a list of tokens; a list is written
        as one row per token, all with the same score.
        """
        self._check_table(table)
        insert_sql = f"INSERT INTO {table} (score, token, trace_id) VALUES (?, ?, ?)"
        for score, sample in scores:
            # sqlite3 cannot bind numpy scalars such as numpy.float32.
            score = float(score)
            tokens = sample if isinstance(sample, list) else [sample]
            for token in tokens:
                cursor.execute(insert_sql, (score, token, trace_id))


In [75]:
import torch
import numpy as np
import torch.nn.functional as F
from transformers import BatchEncoding
from tqdm import tqdm
import copy


class TraceEvaluatorDB:
    """Scores traces with a masked-language model using three checks.

    1. probability of the true token when it is masked (below ``APT`` is flagged),
    2. cross-entropy of the true token when it is masked (above ``AET`` is flagged),
    3. Brier score of randomly masked token subsets (above ``BS`` is flagged).

    When ``db_path`` is given, results are cached in a ``TraceDatabase``.
    """

    def __init__(self, model, tokenizer, AET=0.05, APT=0.85, BS=0.5, mask_share=0.2, db_path=None):
        """
        :param model: masked-LM model; moved to CUDA when available, else CPU
        :param tokenizer: tokenizer providing ``mask_token_id`` and ``pad_token_id``
        :param AET: abnormal error (cross-entropy) threshold
        :param APT: abnormal probability threshold
        :param BS: Brier score threshold
        :param mask_share: fraction of tokens masked per Brier sample
        :param db_path: optional path of the SQLite cache
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.mask_token_id = self.tokenizer.mask_token_id
        self.pad_token_id = self.tokenizer.pad_token_id
        self.abnormal_error_threshold = AET
        self.abnormal_prob_threshold = APT
        self.brier_score_threshold = BS
        self.mask_share = mask_share
        self.db = TraceDatabase(db_path) if db_path else None

    def evaluate_traces(self, traces: list[list[str]]):
        """Evaluate every trace with the three checks.

        :param traces: list of traces, each a list of event-name strings
        :return: one entry per trace: a list of three ``[status, details]``
            pairs (probability, error, Brier) where status is ``"normal"`` or
            ``"abnormal"``
        :raises ValueError: if ``traces`` is not a list of lists of str
        """
        if not all(isinstance(trace, list) and all(isinstance(elem, str) for elem in trace) for trace in traces):
            raise ValueError("traces must be list of str lists")

        str_abnormal = "abnormal"
        eval_results = []

        for trace in tqdm(traces, desc='analyzing anomalies by model'):
            trace_results = [["normal", []] for _ in range(3)]

            db_results = None
            if self.db:
                _, db_results = self.db.get_trace(trace)

            if db_results:
                probs_anomalies, error_anomalies, brier_scores = db_results
            else:
                tokenized_trace_init = self.__preprocess_trace(trace)
                tokens = self.tokenizer.convert_ids_to_tokens(tokenized_trace_init['input_ids'][0])
                brier_scores = self.evaluate_trace_brier(tokenized_trace_init, tokens)
                probs_anomalies, error_anomalies = self.evaluate_trace_by_tokens(tokenized_trace_init, tokens)

                if self.db:
                    self.db.save_trace(trace, probs_anomalies, error_anomalies, brier_scores)

            if len(probs_anomalies) > 0:
                trace_results[0] = [str_abnormal, probs_anomalies]
            if len(error_anomalies) > 0:
                trace_results[1] = [str_abnormal, error_anomalies]
            if len(brier_scores) > 0:
                trace_results[2] = [str_abnormal, brier_scores]

            eval_results.append(trace_results)

        return eval_results

    def __preprocess_trace(self, trace):
        """Encode the trace as a character sequence and tokenize it."""
        sequence = TraceProcessor.get_chars(trace)
        tokenized_trace_init = self.tokenizer(sequence, return_tensors="pt").to(self.device)

        return tokenized_trace_init

    def brier_multi(self, targets, probs):
        """Multi-class Brier score: mean over samples of the squared error summed over classes.

        :param targets: one-hot true labels, shape (samples, classes)
        :param probs: predicted probabilities, same shape
        :return: Python float; ``0.0`` when there are no samples
        """
        targets = np.asarray(targets, dtype=float)
        probs = np.asarray(probs, dtype=float)
        if probs.size == 0:
            return 0.0
        # Sum over the class axis only. Summing over every axis made the mean
        # a no-op and let the score grow with the number of masked tokens.
        per_sample = np.sum((probs - targets) ** 2, axis=-1)
        # A plain float (not a numpy scalar) can be stored by sqlite3.
        return float(np.mean(per_sample))

    def mask_tokens_and_evaluate(self, tokenized_trace_init, num_tokens, mask_indices):
        """Mask the given positions in a copy of the encoding and score the predictions.

        :param tokenized_trace_init: encoding with batch size 1; left unmodified
        :param num_tokens: unused, kept for interface compatibility
        :param mask_indices: positions to replace with the mask token
        :return: Brier score of the model's predictions at the masked positions
        """
        tokenized_trace = copy.deepcopy(tokenized_trace_init)
        true_ids = [tokenized_trace['input_ids'][0][idx].item() for idx in mask_indices]

        for idx in mask_indices:
            tokenized_trace['input_ids'][0][idx] = self.mask_token_id

        with torch.no_grad():
            logits = self.model(**tokenized_trace).logits

        predicted_probs = []
        true_labels = []
        for idx, true_idx in zip(mask_indices, true_ids):
            mask_token_probs = F.softmax(logits[0, idx, :], dim=-1).cpu().numpy()
            predicted_probs.append(mask_token_probs)

            true_label = np.zeros_like(mask_token_probs)
            true_label[true_idx] = 1
            true_labels.append(true_label)

        return self.brier_multi(np.array(true_labels), np.array(predicted_probs))

    def evaluate_trace_brier(self, tokenized_trace_init, tokens):
        """Draw 10 random mask sets per chunk and collect those above the Brier threshold.

        :param tokenized_trace_init: encoding of the whole trace (batch size 1)
        :param tokens: token strings of the whole trace, aligned with its input ids
        :return: list of ``(brier_score, masked_tokens)``
        """
        brier_scores = []
        batch_encodings = [tokenized_trace_init] if len(
            tokenized_trace_init['input_ids'][0]) <= 510 else self.split_trace_to_batch_encoding(tokenized_trace_init)

        # Position of the current chunk's first token inside the full trace.
        offset = 0
        for tokenized_trace in batch_encodings:
            num_tokens = len(tokenized_trace['input_ids'][0])
            num_masked = int(num_tokens * self.mask_share)
            # With nothing to mask the score is 0 and can never exceed the threshold.
            if num_masked > 0:
                for _ in range(10):
                    mask_indices = np.random.choice(num_tokens, num_masked, replace=False)
                    brier_score = self.mask_tokens_and_evaluate(tokenized_trace, num_tokens, mask_indices)
                    if brier_score > self.brier_score_threshold:
                        # mask_indices are chunk-local; `tokens` covers the whole trace.
                        masked_tokens = [tokens[offset + idx] for idx in mask_indices]
                        brier_scores.append((brier_score, masked_tokens))
            offset += num_tokens
        return brier_scores

    def evaluate_token(self, tokenized_trace, idx):
        """Mask position ``idx`` and rate how well the model recovers the true token.

        Note: ``tokenized_trace`` is modified in place (the token at ``idx`` is
        replaced by the mask token); pass a copy if the original is needed.

        :return: ``(prob_anomaly, error_anomaly)``; each is ``(value, token)``
            when the respective threshold is violated, otherwise ``None``
        """
        true_idx = tokenized_trace['input_ids'][0][idx].item()
        tokenized_trace['input_ids'][0][idx] = self.mask_token_id
        with torch.no_grad():
            logits = self.model(**tokenized_trace).logits
        mask_token_logits = logits[0, idx, :]
        abnormal_error = F.cross_entropy(mask_token_logits.view(1, -1).to(self.device),
                                         torch.tensor([true_idx]).to(self.device)).item()
        abnormal_prob = F.softmax(mask_token_logits, dim=-1)[true_idx].item()
        token_value = self.tokenizer.convert_ids_to_tokens(true_idx)

        error_anomaly = None
        if abnormal_error > self.abnormal_error_threshold:
            error_anomaly = (abnormal_error, token_value)
        prob_anomaly = None
        if abnormal_prob < self.abnormal_prob_threshold:
            prob_anomaly = (abnormal_prob, token_value)
        return prob_anomaly, error_anomaly

    def split_trace_to_batch_encoding(self, tokenized_trace):
        """Split a long encoding into consecutive chunks of at most 510 tokens.

        :return: list of ``BatchEncoding`` objects, each with batch size 1
        """
        # NOTE(review): 510 presumably leaves room for two special tokens in a
        # 512-token model input, but the chunks are passed on as they are.
        max_length = 510
        input_ids = tokenized_trace['input_ids'][0]
        token_type_ids = tokenized_trace['token_type_ids'][0]
        attention_mask = tokenized_trace['attention_mask'][0]

        def split_component(component):
            return [component[i:i + max_length] for i in range(0, len(component), max_length)]

        chunked = zip(split_component(input_ids),
                      split_component(token_type_ids),
                      split_component(attention_mask))

        batch_encodings = []
        for ids_chunk, type_chunk, mask_chunk in chunked:
            batch_encodings.append(BatchEncoding({
                'input_ids': ids_chunk.unsqueeze(0),
                'token_type_ids': type_chunk.unsqueeze(0),
                'attention_mask': mask_chunk.unsqueeze(0)
            }, tensor_type='pt'))
        return batch_encodings

    def evaluate_trace_by_tokens(self, tokenized_trace_init, tokens):
        """Mask every token in turn and collect probability and error anomalies.

        :param tokenized_trace_init: encoding of the whole trace (batch size 1)
        :param tokens: unused, kept for interface compatibility
        :return: ``(probs_anomalies, error_anomalies)``
        """
        error_anomalies = []
        probs_anomalies = []
        batch_encodings = [tokenized_trace_init] if len(
            tokenized_trace_init['input_ids'][0]) <= 510 else self.split_trace_to_batch_encoding(tokenized_trace_init)

        for tokenized_trace in batch_encodings:
            num_tokens = len(tokenized_trace['input_ids'][0])
            for idx in range(num_tokens):
                # Stop at the first padding token of the chunk.
                if tokenized_trace['input_ids'][0][idx] == self.pad_token_id:
                    break
                # evaluate_token() masks in place, so it gets its own copy.
                tokenized_trace_copy = copy.deepcopy(tokenized_trace)
                prob_anomaly, error_anomaly = self.evaluate_token(tokenized_trace_copy, idx)
                if error_anomaly:
                    error_anomalies.append(error_anomaly)
                if prob_anomaly:
                    probs_anomalies.append(prob_anomaly)
        return probs_anomalies, error_anomalies


In [76]:
import json
import os

class TraceProcessor:
    """Encodes a trace as a string with one character per known event.

    The event-name -> code-point table is read from ``data/event_codes.json``
    next to the notebook, the first time it is needed.
    """

    base_dir = os.path.dirname(__file__)
    file_path_e_v = os.path.join(base_dir, 'data', 'event_codes.json')
    # Loaded lazily by initialize_event_codes().
    event_codes = None

    @classmethod
    def initialize_event_codes(cls):
        """Load the event-code table from ``file_path_e_v``."""
        with open(cls.file_path_e_v, 'r', encoding='utf-8') as file:
            cls.event_codes = json.load(file)

    @staticmethod
    def get_event_code(event_name):
        """Return the code of ``event_name`` or ``None`` if it is unknown."""
        # Load on demand so that a direct call before get_chars() does not
        # fail on the still-empty table.
        if TraceProcessor.event_codes is None:
            TraceProcessor.initialize_event_codes()
        return TraceProcessor.event_codes.get(event_name, None)

    @classmethod
    def get_chars(cls, trace):
        """Return the character sequence of ``trace``.

        :param trace: iterable of event names
        :return: str with ``chr(code)`` for every known event, in order;
            unknown events are skipped
        """
        if cls.event_codes is None:
            cls.initialize_event_codes()
        codes = (cls.get_event_code(event) for event in trace)
        return ''.join(chr(code) for code in codes if code is not None)


In [77]:
import json
import os

from tqdm import tqdm

class TracesProcessor:
    """Converts traces of event names into traces of mapped tokenizer tokens.

    Every accepted event is assigned one character; a trace becomes a string,
    which is tokenized by the tokenizer of the requested level, and each token
    is translated through that level's vocabulary mapping.
    """

    def __init__(self):
        """Load ``data/accepted_events.json`` and assign a code point to every event."""
        base_dir = os.path.dirname(__file__)
        accepted_events_path = os.path.join(base_dir, 'data', 'accepted_events.json')

        with open(accepted_events_path, 'r', encoding='utf-8') as json_file:
            self.accepted_events = json.load(json_file)

        # The first 94 events use code points 33..126; any further events
        # continue from 256.
        accepted_indexes = list(range(33, 127))
        to_add = len(self.accepted_events) - len(accepted_indexes)
        accepted_indexes.extend(range(256, 256 + to_add))

        self.event_codes = {event: accepted_indexes[index] for index, event in
                            enumerate(self.accepted_events)}

    def __get_event_code(self, event_name):
        """Return the code point of ``event_name`` or ``None`` if it is not accepted."""
        return self.event_codes.get(event_name, None)

    def __get_sequence(self, trace):
        """Return the character sequence of ``trace``.

        Events that are not in the accepted list are skipped, as
        ``TraceProcessor.get_chars`` does; previously they raised ``TypeError``
        from ``chr(None)``.
        """
        codes = (self.__get_event_code(event) for event in trace)
        return ''.join(chr(code) for code in codes if code is not None)

    def __traces2seqs(self, traces):
        """Convert every trace to its character sequence."""
        return [self.__get_sequence(trace) for trace in traces]

    def process_traces(self, traces, LoA):
        """Tokenize all traces at level of abstraction ``LoA``.

        :param traces: list of traces, each a list of event names
        :param LoA: level passed to ``TokenizerManager``
        :return: list of traces, each a list of mapped tokens
        """
        sequences = self.__traces2seqs(traces)

        tokenizer = TokenizerManager.get_tokenizer(LoA)
        mapper = TokenizerManager.get_mapping(LoA)

        processed_traces = []
        for sequence in tqdm(sequences, desc="tokenizing traces"):
            tokens = tokenizer.tokenize(sequence)
            processed_traces.append([mapper[token] for token in tokens])

        return processed_traces


In [78]:
import os

import matplotlib.pyplot as plt
from PIL import Image

import warnings
warnings.filterwarnings("ignore")

def visualize_event_traces(dataframe, xes_file, LoA, stride=0.5):
    """Draw every case as a row of coloured unit blocks, one block per event.

    The figure is saved as ``plots/<log name>_LoA_<LoA>.png`` and shown.

    :param dataframe: event log with ``case:concept:name`` and ``concept:name`` columns
    :param xes_file: path of the source log; its base name is used in the file name
    :param LoA: level of abstraction, used in the file name
    :param stride: offset into the colormap, as a fraction of its range
    """
    activities = dataframe['concept:name'].unique()
    n_activities = max(len(activities), 1)
    # Wrap around with modulo: a colormap clips inputs above 1.0, so without it
    # every activity past 1.0 - stride would get the same colour.
    activity_colors = {activity: plt.cm.nipy_spectral((i / n_activities + stride) % 1.0)
                       for i, activity in enumerate(activities)}

    fig, ax = plt.subplots(figsize=(15, 5))

    # sort=False keeps the cases in order of appearance, and the tick labels
    # are collected in the same loop so that they always match their bars.
    case_labels = []
    for row, (case_id, group) in enumerate(dataframe.groupby('case:concept:name', sort=False)):
        colors = [activity_colors[activity] for activity in group['concept:name']]
        # One barh call per case instead of one per event.
        ax.barh([row] * len(colors), 1, left=list(range(len(colors))), color=colors, edgecolor='none')
        case_labels.append(case_id)

    ax.set_yticks(range(len(case_labels)))
    ax.set_yticklabels(case_labels)
    ax.set_xlabel('Events in Trace')
    ax.set_title('Event Trace Visualization')
    ax.invert_yaxis()

    base_dir = "plots"
    os.makedirs(base_dir, exist_ok=True)

    filename = FileManager.get_filename(xes_file)
    default_filename = f"{filename}_LoA_{LoA}.png"
    output_path = os.path.join(base_dir, default_filename)

    fig.savefig(output_path)
    # Report success only after the file has been written.
    print(f"Saved to default {base_dir} dir.")

    plt.show()
    plt.close(fig)


In [None]:
import os

import pm4py

from transformers import SqueezeBertForMaskedLM, PreTrainedTokenizerFast
from pm4py.objects.conversion.log import converter as log_converter

import warnings

warnings.filterwarnings("ignore")


class UserInteractionHandler:
    """Console menu that ties the log selection, pattern view and anomaly check together."""

    def __init__(self):
        # Set before anything else so that run() can test it even when the
        # first file selection fails.
        self.xes_file = None
        self.log_processor = None
        self.model, self.tokenizer = self.__load_model_and_tokenizer()

    def __load_model_and_tokenizer(self):
        """Load the SqueezeBERT model and its tokenizer from the ``model`` directory."""
        base_dir = os.path.dirname(__file__)
        model_dir = os.path.join(base_dir, 'model', 'squeezebert')
        tokenizer_dir = os.path.join(base_dir, 'model', 'tokenizer')
        model = SqueezeBertForMaskedLM.from_pretrained(model_dir)
        tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
        return model, tokenizer

    def __get_log_processor(self):
        """Return the ``LogProcessor`` of the current log, creating it on first use."""
        if not self.log_processor:
            self.log_processor = LogProcessor(self.xes_file)
        return self.log_processor

    def __find_patterns(self, LoA):
        """Tokenize the log at ``LoA``, plot it and offer to save it."""
        tokenized_log = self.__get_log_processor().create_tokenized_event_log(LoA=LoA)
        dataframe = log_converter.apply(tokenized_log, variant=log_converter.Variants.TO_DATA_FRAME)
        visualize_event_traces(dataframe, self.xes_file, LoA)
        self.save_tokenized_event_log(dataframe, LoA)

    def __find_anomalies(self):
        """Evaluate all traces of the log and print the per-trace verdicts."""
        traces = self.__get_log_processor().get_traces()

        base_dir = os.path.dirname(__file__)
        db_dir = os.path.join(base_dir, 'db')
        os.makedirs(db_dir, exist_ok=True)
        db_path = os.path.join(db_dir, "trace_evaluator.db")

        te = TraceEvaluatorDB(self.model, self.tokenizer, db_path=db_path)
        eval_results = te.evaluate_traces(traces)
        self.display_anomalies(eval_results)

    def save_tokenized_event_log(self, dataframe, LoA):
        """Ask whether the tokenized log should be saved and write it as XES if so.

        Any answer other than ``1`` (including non-numeric input) means "no".
        """
        print("Do you want to save tokenized event log? ")
        print("1. Yes")
        print("2. No")
        try:
            action = int(input("Enter 1, 2: "))
        except ValueError:
            # A typo should not abort the session.
            action = None

        if action == 1:
            filename = FileManager.get_filename(self.xes_file)
            default_filename = f"{filename}_LoA_{LoA}.xes"
            output_path = FileManager.get_save_path(default_filename, ".xes")
            pm4py.write_xes(dataframe, output_path)

    @staticmethod
    def display_anomalies(results):
        """Print the status of each method per trace and a majority verdict.

        A trace is reported as abnormal when at least two of its methods
        report ``'abnormal'``.

        :param results: list with one entry per trace; each entry is a list of
            ``(status, details)`` pairs
        """
        for trace_idx, trace_result in enumerate(results, 1):
            print(f"Trace {trace_idx}:")
            rate = 0
            for method_idx, (status, details) in enumerate(trace_result, 1):
                if status == 'abnormal':
                    rate += 1
                print(f"  Method {method_idx} - Status: {status}")

            if rate >= 2:
                print("\n  Final trace status: abnormal")
            else:
                print("\n  Final trace status: normal")
            print()

    @staticmethod
    def request_level_of_abstraction():
        """Prompt until the user enters an integer between 1 and 13 and return it."""
        while True:
            try:
                LoA = int(input("Please enter the level of abstraction (1-13): "))
                if LoA < 1 or LoA > 13:
                    raise ValueError
                return LoA
            except ValueError:
                print("Invalid level of abstraction. Please enter a number between 1 and 13.")

    @staticmethod
    def validate_file_path(xes_file):
        """Return True if ``xes_file`` exists; otherwise print a message and return False."""
        if not os.path.exists(xes_file):
            print(f"File '{xes_file}' does not exist. Please try again.")
            return False
        return True

    def __get_log(self):
        """Ask for a log file; on success make it the current log."""
        print("Please select XES log file: ")
        xes_file = FileManager.get_in_path("xes-anomaly-detector", ".xes")
        if self.validate_file_path(xes_file):
            if xes_file != self.xes_file:
                # The cached processor belongs to the previous log.
                self.log_processor = None
            self.xes_file = xes_file
        else:
            print("File is not valid")

    def process_action(self, action):
        """Dispatch one menu choice (1 patterns, 2 anomalies, 3 change log, 4 exit)."""
        if action == 1:
            LoA = self.request_level_of_abstraction()
            self.__find_patterns(LoA)
        elif action == 2:
            self.__find_anomalies()
        elif action == 3:
            self.__get_log()
        elif action == 4:
            print("Exiting the program.")
        else:
            print("Invalid choice. Please enter 1, 2, 3, or 4.")

    def run(self):
        """Select a log, then show the menu until the user chooses to exit."""
        print("Welcome to the XES Anomaly Detector!")
        self.__get_log()

        if not self.xes_file:
            return

        while True:
            print("Choose an action: ")
            print("1. Find patterns")
            print("2. Find anomalies")
            print("3. Change log")
            print("4. Exit")
            try:
                action = int(input("Enter 1, 2, 3, or 4: "))
            except ValueError:
                # Non-numeric input is just another invalid choice.
                print("Invalid choice. Please enter 1, 2, 3, or 4.")
                continue

            self.process_action(action)

            if action == 4:
                return


def main():
    """Build the interactive handler and hand control to its menu loop."""
    UserInteractionHandler().run()


# Start the application only when this code is executed as the main module.
if __name__ == '__main__':
    main()

Welcome to the XES Anomaly Detector!
Please select XES log file: 


Provide a file path:  /home/jupyter/datasphere/project/logs/example_log.xes


Choose an action: 
1. Find patterns
2. Find anomalies
3. Change log
4. Exit


Enter 1, 2, 3, or 4:  2


[93mprocessing log...[0m


parsing log, completed traces :: 100%|██████████| 50/50 [00:01<00:00, 38.90it/s]
creating dataframe: 100%|██████████| 31138/31138 [00:12<00:00, 2395.25it/s]
analyzing anomalies by model: 100%|██████████| 25/25 [00:06<00:00,  3.85it/s]


Trace 1:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 2:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 3:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 4:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 5:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 6:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 7:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 8:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 9:

Enter 1, 2, 3, or 4:  2


analyzing anomalies by model: 100%|██████████| 25/25 [00:00<00:00, 77.59it/s]


Trace 1:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 2:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 3:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 4:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 5:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 6:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 7:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 8:
  Method 1 - Status: normal
  Method 2 - Status: normal
  Method 3 - Status: normal

  Final trace status: normal

Trace 9: