### Import Packages

In [None]:
#Import packages

from cmath import nan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyparsing import col # For plotting data
import seaborn as sns # For plotting data

# Selection of ml algorithms for prediction
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.dummy import DummyRegressor

#import packages for text preprocessing
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import string
import re

# For setting up pipeline
from sklearn.pipeline import Pipeline 

# Various pre-processing steps
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#hyperparametertuning
from optuna.integration import OptunaSearchCV
from optuna.distributions import *

#Feature Selection
from sklearn.feature_selection import RFE

#Target Transformation
from sklearn.compose import TransformedTargetRegressor

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import FunctionTransformer

#Metrics
from sklearn.metrics import mean_absolute_error

#Save Model
from joblib import dump, load

import os
from pathlib import Path

### Define Process Types and T-Times

In [None]:
# Project root used to build every input/output path in this notebook:
# two directory levels above the current working directory.
# The conversion to forward slashes happens AFTER the Path arithmetic, because
# str(Path(...)) would put the platform separator (backslashes on Windows) back in.
ROOT_DIR = Path(os.path.abspath(os.curdir)).parent.parent.as_posix()
print(ROOT_DIR)

In [None]:
# Points in time t at which a prediction is made.
# Adapt the upper bound to the number of gates considered.
# Example: range(4) corresponds to a KIP with its beginning (t=0) and three gates.
TIMES = range(4)

# Prediction targets for every t (one independent list per t).
# The total lead time has to be calculated beforehand and added as an
# attribute at the trace level of the event log.
targets_T0, targets_T1, targets_T2, targets_T3 = (
    ["trace:total_leadtime"] for _ in range(4)
)
targets = [targets_T0, targets_T1, targets_T2, targets_T3]

# Result table (one row per t and model) and the index of its next free row.
t_with_score = pd.DataFrame(columns=["Time", "MAE", "Model"])
row = 0

### Extract Text Information

In [None]:
# Read the event log that carries the textual information and parse it as XML.
# The file event_log.xml has to be placed in the "Input Data" folder beforehand.
event_log_path = Path(ROOT_DIR + "/Input Data/event_log.xml")
logs = event_log_path.read_text(encoding='utf-8')
logs_data = BeautifulSoup(markup=logs, features="xml")

In [None]:
def get_texts_for_t(tn):
    """Collect the free-text snippets found below one parsed element.

    The snippets are gathered in this order: the ``remarks`` of every ``doc``,
    the ``descde`` of every ``part``, and the ``description`` of every
    ``task_event`` whose ``event_type`` text equals "Aufgabe Ereignis".

    Args:
        tn: Parsed element offering ``find_all`` and ``find``, or ``None``
            when no element exists for the requested t.

    Returns:
        str: All snippets concatenated, each one prefixed with "; ".
        An empty string if ``tn`` is ``None`` or nothing was found.
    """
    # A missing element simply contributes no text instead of raising.
    if tn is None:
        return ""

    snippets = []

    # doc -> remarks and part -> descde follow the same "parent/child" pattern.
    for parent_name, child_name in (("doc", "remarks"), ("part", "descde")):
        for parent in tn.find_all(parent_name):
            child = parent.find(child_name)
            if child is not None:
                snippets.append(child.text)

    for event in tn.find_all("task_event"):
        event_type = event.find("event_type")
        # Events without an event_type, or of another type, are skipped.
        if event_type is None or event_type.text != "Aufgabe Ereignis":
            continue
        description = event.find("description")
        if description is not None:
            snippets.append(description.text)

    return "".join("; " + snippet for snippet in snippets)

In [None]:
# Build one record per case: the case id plus, for every t in TIMES, the header
# text followed by all texts available up to and including that t.
header_fields = ("name", "description", "notice", "compatibility", "distribution", "reason")

texts = []
cases = logs_data.find_all("case")
for c in cases:
    # The key has to be "trace:case_id": this is the column the train/test
    # feature tables are merged on in the prediction pipeline.
    # NOTE(review): the id is taken from an XML attribute - confirm that its
    # dtype matches the id column of the feature CSVs.
    entry = {"trace:case_id": c.attrs["case_id"]}

    # Header fields joined with "; "; a missing field contributes an empty string.
    header_parts = []
    for field in header_fields:
        node = c.find(field)
        header_parts.append(node.text if node is not None else "")
    header = "; ".join(header_parts)

    # Texts accumulate over time, so each <time> element is processed only once.
    cumulative = header
    for t in TIMES:
        time_node = c.find("time", {"T": str(t)})
        if time_node is not None:
            cumulative = cumulative + get_texts_for_t(time_node)
        entry["t" + str(t)] = cumulative

    texts.append(entry)

In [None]:
# Turn the collected per-case records into a table (one row per record).
texts = pd.DataFrame(data=texts)

### Text Preprocessing

In [None]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources needed for tokenizing and stop-word removal.
# 'punkt_tab' is the tokenizer data loaded by word_tokenize in recent NLTK
# releases; 'punkt' is kept for older releases.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# German stemmer and stop-word set used by clean_text.
stemmer = SnowballStemmer("german")
stop_words = set(stopwords.words("german"))

def clean_text(text, for_embedding=False):
    """Normalize a raw text for vectorizing or embedding.

    Steps applied in order (each match is replaced by a single space):
        1. HTML-like tags (``<...>``)
        2. every character that is not a letter (A-Z, a-z, À-ž) or a space;
           with ``for_embedding=True`` the characters ``,.!?`` are kept too
        3. stand-alone single characters
        4. runs of whitespace

    The result is tokenized. With ``for_embedding=True`` the tokens are
    returned as they are; otherwise they are lower-cased, stop words are
    removed and the remaining tokens are stemmed.

    Args:
        text: Input text; it is converted with ``str()`` first.
        for_embedding: Keep punctuation, casing and stop words if True.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    if for_embedding:
        # Keep punctuation
        unwanted_chars = re.compile(r"[^A-Za-zÀ-ž,.!? ]", re.IGNORECASE)
        lone_char = re.compile(r"\b[A-Za-zÀ-ž,.!?]\b", re.IGNORECASE)
    else:
        unwanted_chars = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
        lone_char = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
    html_tag = re.compile(r"<[^>]+>")
    whitespace = re.compile(r"\s+", re.IGNORECASE)

    # The order matters: tags first, whitespace collapsing last.
    cleaned = str(text)
    for pattern in (html_tag, unwanted_chars, lone_char, whitespace):
        cleaned = pattern.sub(" ", cleaned)

    tokens = word_tokenize(cleaned)

    if for_embedding:
        # no stemming, lowering and punctuation / stop words removal
        return " ".join(tokens)

    lowered = (token.lower() for token in tokens)
    return " ".join(stemmer.stem(token) for token in lowered if token not in stop_words)

### Prediction Pipeline

In [None]:
# Prediction pipeline: for every t, load the train/test features, attach the
# cleaned case texts, tune each regressor with Optuna, store the best model and
# record its MAE on the test set in t_with_score.

# LatentDirichletAllocation is required by the topic-modeling step below and is
# not provided by the notebook's import cell, so it is imported here.
from sklearn.decomposition import LatentDirichletAllocation

# Dtypes treated as categorical; every other dtype counts as numeric.
categoric_datatypes = ['bool', 'object']

#HYPERPARAMETER SEARCH SPACES FOR OPTUNA (identical for every t, so defined once)
#one dictionary of value ranges per algorithm
RandomForestDict = {
    'regression__regressor__n_features_to_select': FloatDistribution(low=0.2, high=0.7),
    'regression__regressor__estimator__n_estimators': IntDistribution(low=100, high=2000, step=1),
    'regression__regressor__estimator__max_depth': IntDistribution(low=10, high=20, step=1),
    'regression__regressor__estimator__max_features': FloatDistribution(low=0.1, high=0.7),
}
RidgeDict = {
    'regression__regressor__n_features_to_select': FloatDistribution(low=0.2, high=0.7),
    'regression__regressor__estimator__alpha': FloatDistribution(low=0, high=10),
}
GradientBoostingDict = {
    'regression__regressor__n_features_to_select': FloatDistribution(low=0.2, high=0.7),
    'regression__regressor__estimator__n_estimators': IntDistribution(low=100, high=2000, step=1),
    'regression__regressor__estimator__learning_rate': FloatDistribution(low=1e-5, high=0.2),
    'regression__regressor__estimator__max_depth': IntDistribution(low=10, high=14, step=1),
}
MLPDict = {
    'regression__regressor__hidden_layer_sizes': IntDistribution(low=20, high=512),
    'regression__regressor__learning_rate_init': FloatDistribution(low=1e-5, high=0.2),
    'regression__regressor__max_iter': IntDistribution(low=5, high=250),
}
Dummy = {'regression__regressor__strategy': CategoricalDistribution(['mean'])}

#the dictionaries and names are kept as lists in the same order as the models,
#so that every algorithm is paired with its own search space
param_list = [
    RandomForestDict,
    RidgeDict,
    GradientBoostingDict,
    MLPDict,
    Dummy,
    ]

name_list = [
    "RandomForest",
    "Ridge",
    "GradientBoosting",
    "MLP",
    "Dummy",
    ]

# Models that are wrapped in recursive feature elimination.
rfe_model_names = ("RandomForest", "Ridge", "GradientBoosting")

# Make sure the folder for the fitted models exists before the first dump.
model_dir = ROOT_DIR + "/Prediction Models"
os.makedirs(model_dir, exist_ok=True)

for TIME in TIMES:
    print(TIME)
    text_column = "t" + str(TIME)

    #LOAD TRAIN AND TEST DATASET
    df_train_raw = pd.read_csv(ROOT_DIR + "/Input Data/Train_Features_t" + str(TIME) + ".csv")
    df_test_raw = pd.read_csv(ROOT_DIR + "/Input Data/Test_Features_t" + str(TIME) + ".csv")

    #MERGE TRAIN AND TEST DATASET WITH TEXTUAL INFORMATION based on the case ID
    text_features = texts[[text_column, "trace:case_id"]]
    df_train_raw = pd.merge(left=df_train_raw, right=text_features, on='trace:case_id', how='left')
    df_test_raw = pd.merge(left=df_test_raw, right=text_features, on='trace:case_id', how='left')

    df_train_raw = df_train_raw.drop('trace:concept:name', axis=1)
    df_test_raw = df_test_raw.drop('trace:concept:name', axis=1)

    #DEFINE TARGETS
    target = targets[TIME][0]
    # A new list is built on purpose: appending to targets[TIME] would alter
    # the global target definition every time this cell is executed.
    to_drop = targets[TIME] + ["trace:case_id"]

    #DEFINE DATAFRAME FOR TRAIN SET (rows with any missing value are discarded)
    df_train = df_train_raw.dropna()
    y_train = df_train[target].to_frame()
    train_ids = df_train["trace:case_id"]
    X_train = df_train.drop(to_drop, axis=1)

    #DEFINE DATAFRAME FOR TEST SET (rows with any missing value are discarded)
    df_test = df_test_raw.dropna()
    y_test = df_test[target].to_frame()
    test_ids = df_test["trace:case_id"]
    X_test = df_test.drop(to_drop, axis=1)

    #CLEAN TEXT
    # Texts of 20 characters or fewer are discarded. They are replaced by an
    # empty string rather than NaN, because CountVectorizer rejects NaN documents.
    for features in (X_train, X_test):
        raw_text = features[text_column]
        features[text_column] = raw_text.where(raw_text.str.len() > 20, "").map(
            lambda doc: clean_text(doc, for_embedding=False)
        )

    #SPLIT COLUMNS INTO NUMERIC / CATEGORICAL / TEXT
    numeric_columns_train = X_train.select_dtypes(exclude=categoric_datatypes).columns.to_list()
    categoric_columns_train = (
        X_train.select_dtypes(include=categoric_datatypes).columns.drop(text_column).to_list()
    )
    numeric_columns_test = X_test.select_dtypes(exclude=categoric_datatypes).columns.to_list()
    categoric_columns_test = (
        X_test.select_dtypes(include=categoric_datatypes).columns.drop(text_column).to_list()
    )

    # Keep only the columns that train and test share with the same role.
    # The train column order is preserved so the feature order (and therefore
    # the fitted models) is reproducible; a plain set intersection has no stable order.
    numeric_test_lookup = set(numeric_columns_test)
    categoric_test_lookup = set(categoric_columns_test)
    numeric_columns_train_new = [
        column for column in numeric_columns_train if column in numeric_test_lookup
    ]
    categoric_columns_shared = [
        column for column in categoric_columns_train if column in categoric_test_lookup
    ]
    s_intersect = numeric_columns_train_new + categoric_columns_shared + [text_column]

    X_test = X_test[s_intersect]
    X_train = X_train[s_intersect]

    #DEFINE REGRESSION MODELS (same order as param_list and name_list)
    # NOTE(review): tol=float(2) for the MLP looks unusually large - confirm it is intended.
    regressionModels = [
        RandomForestRegressor(random_state=0),
        Ridge(random_state=0),
        GradientBoostingRegressor(random_state=0),
        MLPRegressor(random_state=0, max_iter=10000, learning_rate='adaptive', early_stopping=True, tol=float(2), batch_size=32),
        DummyRegressor(),
        ]

    #DEFINE TOPIC MODELING
    topic_modeling = Pipeline([
        ("vectorizer", CountVectorizer(max_df=0.7, min_df=0.2)),
        ("TopicModeling", LatentDirichletAllocation(n_components=10, random_state=0)),
    ])

    #DEFINE COLUMN TRANSFORMER
    ct = ColumnTransformer([
        ("OneHot", OneHotEncoder(handle_unknown='ignore'), categoric_columns_shared),
        ("scale", MinMaxScaler(), numeric_columns_train_new),
        ("topic_modeling", topic_modeling, text_column),
        ])

    #DEFINE PIPELINE
    for regressor, param_dict, name in zip(regressionModels, param_list, name_list):
        # Tree-based and linear models get recursive feature elimination,
        # the MLP and the dummy baseline are used directly.
        if name in rfe_model_names:
            inner_regressor = RFE(estimator=regressor, step=10)
        else:
            inner_regressor = regressor
        # The target is log1p-transformed for fitting and transformed back with expm1.
        y_transformer = TransformedTargetRegressor(regressor=inner_regressor, func=np.log1p, inverse_func=np.expm1)
        pipe = Pipeline(steps=[
            ('encoding_scaling', ct),
            ('regression', y_transformer)
            ])

        #PERFORM OPTUNA SEARCH
        gs = OptunaSearchCV(estimator=pipe, param_distributions=param_dict, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, n_trials=15, random_state=42)

        gs.fit(X_train, y_train.values.ravel()) #the fit is performed on the Optuna search, not on the bare pipeline
        model = gs.best_estimator_ #best pipeline found by the search
        dump(model, model_dir + "/" + name + "_" + str(TIME) + "_withTopicModeling.joblib")
        predictions = model.predict(X_test)
        score_mae = mean_absolute_error(y_test, predictions)

        #STORE RESULT
        t_with_score.loc[row] = [TIME, score_mae, str(regressor)]
        row = row + 1

### Results and Visualization

In [None]:
# Persist the MAE table. The Results folder is created first so that the write
# cannot fail on a missing directory after the models have been trained.
os.makedirs(ROOT_DIR + "/Results", exist_ok=True)
t_with_score.to_csv(ROOT_DIR + "/Results/PredictionMetrics_withTopicModeling.csv", index=False)

In [None]:
# Line plot of the MAE over t, one line per model.
pl = sns.relplot(data=t_with_score, x="Time", y="MAE", hue="Model", kind="line")
# Fixed y-range and one x-tick per t.
pl.set(ylim=(0, 200), xticks=list(TIMES))
plt.title("MAE for Predictions with Topic Modeling")