In [1]:
import os
import pickle
from datetime import datetime

import nltk
import pandas as pd

In [4]:
def preprocessing(file, n_bins=10):
    """Prepare an event log for NLP-style learning methods.

    The log is read from a ';'-separated CSV without a header row; its first three
    columns are used as timestamp, trace id and event name. Every timestamp is
    replaced by the time in seconds since the previous event of the same trace
    (0.0 for the first event of a trace), and these durations are categorised into
    equal-width bins. Finally every row is turned into a list of word tokens.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV log. For backward compatibility with callers that pass a
        plain label such as 'ecommerce_log', a value that does not point to an
        existing file falls back to '../../logs/ecommerce_anomalies.csv'.
    n_bins : int, default 10
        Number of equal-width bins used to categorise the durations.

    Returns
    -------
    df : pandas.DataFrame
        Columns 'timestamp_diff' (bin number), 'id' and 'event' (lower-cased).
    trace_list : list
        One entry per trace id, in order of first appearance in the log; each entry
        holds one token list per event of that trace.
    token_list : list
        One token list per row of ``df``.

    Raises
    ------
    ValueError
        If a timestamp does not match the format '%Y-%m-%d %H:%M:%S'.
    """
    fallback_path = '../../logs/ecommerce_anomalies.csv'
    is_existing_file = isinstance(file, (str, os.PathLike)) and os.path.isfile(file)
    log_path = file if is_existing_file else fallback_path

    # Only the first three columns are needed; any further column is ignored.
    df = pd.read_csv(log_path, delimiter=';', header=None, usecols=[0, 1, 2])
    df.columns = ['timestamp', 'id', 'event']

    # Seconds since the previous event of the same trace. groupby().diff() keeps the
    # original row index, so every difference stays attached to its own row even
    # when the events of several traces are interleaved in the log.
    fmt = '%Y-%m-%d %H:%M:%S'  # timestamp format
    stamps = pd.to_datetime(df['timestamp'], format=fmt)
    seconds = stamps.groupby(df['id']).diff().dt.total_seconds().fillna(0.0)

    # Categorise the durations into equal-width bins and swap them in for the
    # raw timestamps.
    binned = pd.cut(seconds, n_bins, labels=False).rename('timestamp_diff')
    df = pd.concat([binned, df.drop(columns=['timestamp'])], axis=1)
    df['event'] = df['event'].str.lower()

    def _tokenize(frame):
        # One text line per row, each line split into lower-cased word tokens.
        lines = frame.to_string(header=False, index=False, index_names=False).split('\n')
        return [nltk.word_tokenize(line.lower()) for line in lines]

    token_list = _tokenize(df)
    # sort=False keeps the traces in order of first appearance.
    trace_list = [_tokenize(trace) for _, trace in df.groupby('id', sort=False)]

    return df, trace_list, token_list

In [5]:
# Run the preprocessing and unpack its three results: the categorised frame,
# the per-trace token lists and the flat list with one token list per event.
df, traceToken, tokenList = preprocessing('ecommerce_log')

In [6]:
# Spot check: third token of the sixth event in the third trace.
traceToken[2][5][2]

'order'

In [31]:
# Persist the preprocessed frame and the per-trace token lists for later notebooks.
# `pickle` is imported in the import cell at the top of the notebook.
df.to_pickle('../data/df_pre.pkl')
with open('../data/trace_WordList.pkl', 'wb') as f:
    pickle.dump(traceToken, f)

## Timestamps in Preprocessing
Zeitstempel in Event Logs stellen eine Herausforderung im Hinblick auf die Verarbeitung in NLP-Verfahren dar.
- Durch die hohe Granularität (bis auf Sekunden genau) ist die Chance gering, sich wiederholende Werte zu erhalten.
- Die Modelle können lediglich den Kontext der Wörter "erkennen", jedoch nicht deren Bedeutung. Es ist davon auszugehen, dass ihnen der Unterschied zwischen den verschiedenen Dauern der Events nicht klar ist, insbesondere da in den Logs selbst nur die eigentliche Zeit und nicht die Dauer festgehalten wird.

Im Sinne der weiteren Verwertbarkeit ist daher eine Kategorisierung der jeweiligen Dauer der Events am sinnvollsten. Es gibt noch andere Ansätze, wie z.B. das Ausschreiben der Zahlen, um die Verarbeitung durch die NLP-Modelle zu verbessern. Bei der Betrachtung bereits trainierter Modelle sieht man allerdings, dass numerische und ausgeschriebene Werte keine beachtenswerte "Korrelation" aufweisen.

Ein weiterer Vorteil der kategorischen Werte zeigt sich vor allem beim späteren Bilden der Trace-Vektoren. Da in diesem Schritt die jeweilige Inverse Document Frequency (IDF) der einzelnen Werte angegeben wird, bieten Kategorien hier durch die Einschränkung der Ausprägungen eine höhere Aussagekraft. Konkrete Zeiten oder die Dauer im Sekundenformat sind als spätere Werte zu unterschiedlich, als dass hier mittels IDF eine repräsentative Aussage getroffen werden könnte.

In [9]:
def preprocessing(file, n_bins=10):
    """Build an inspection frame showing how timestamps are turned into categories.

    Note: this definition reuses the name of the earlier ``preprocessing`` in this
    notebook and replaces it in the namespace once the cell has run. Unlike the
    earlier one it returns a single DataFrame and does no tokenization.

    The log is read from a ';'-separated CSV without a header row; its first three
    columns are used as timestamp, trace id and event name. For every event the
    time in seconds since the previous event of the same trace is computed (0.0 for
    the first event of a trace) and categorised into equal-width bins.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV log. For backward compatibility with callers that pass a
        plain label such as 'ecommerce_log', a value that does not point to an
        existing file falls back to '../../logs/ecommerce_anomalies.csv'.
    n_bins : int, default 10
        Number of equal-width bins used to categorise the durations.

    Returns
    -------
    pandas.DataFrame
        Six columns, in this order: bin number, bin interval, difference in seconds
        (all three labelled 'timestamp_diff'), then 'timestamp', 'id' and 'event'.
        Because of the repeated label, select the first three columns by position.

    Raises
    ------
    ValueError
        If a timestamp does not match the format '%Y-%m-%d %H:%M:%S'.
    """
    fallback_path = '../../logs/ecommerce_anomalies.csv'
    is_existing_file = isinstance(file, (str, os.PathLike)) and os.path.isfile(file)
    log_path = file if is_existing_file else fallback_path

    # Only the first three columns are needed; any further column is ignored.
    df = pd.read_csv(log_path, delimiter=';', header=None, usecols=[0, 1, 2])
    df.columns = ['timestamp', 'id', 'event']

    # Seconds since the previous event of the same trace. groupby().diff() keeps the
    # original row index, so every difference stays attached to its own row even
    # when the events of several traces are interleaved in the log.
    fmt = '%Y-%m-%d %H:%M:%S'  # timestamp format
    stamps = pd.to_datetime(df['timestamp'], format=fmt)
    seconds = stamps.groupby(df['id']).diff().dt.total_seconds().fillna(0.0)
    df = pd.concat([seconds.rename('timestamp_diff'), df], axis=1)

    # Same equal-width binning twice: once as interval labels, once as bin numbers.
    interval_cat = pd.cut(df['timestamp_diff'], n_bins)
    bin_number = pd.cut(df['timestamp_diff'], n_bins, labels=False)
    return pd.concat([bin_number, interval_cat, df], axis=1)

# Preview the first rows of the frame returned by preprocessing().
preprocessing('ecommerce_log').head()
# Kept for reference: prints the same preview as LaTeX source.
#print(preprocessing('ecommerce_log').head().style.to_latex())

Unnamed: 0,timestamp_diff,timestamp_diff.1,timestamp_diff.2,timestamp,id,event
0,0,"(-7.115, 711.5]",0.0,2022-02-28 08:23:29,854,Website Request served
1,7,"(4980.5, 5692.0]",5233.0,2022-02-28 09:50:42,854,User logged in
2,1,"(711.5, 1423.0]",1014.0,2022-02-28 10:07:36,854,Item added to cart
3,8,"(5692.0, 6403.5]",5716.0,2022-02-28 11:42:52,854,Item added to cart
4,3,"(2134.5, 2846.0]",2694.0,2022-02-28 12:27:46,854,Hermes chosen for shipping
