In [1]:
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import nltk
import numpy as np
#from preprocessing import loadLog, tokenize

In [2]:
df = pd.read_csv('../logs/ecommerce_log.csv', delimiter=';', header=None)
df = df.drop([df.columns[3]], axis=1)
df

Unnamed: 0,0,1,2
0,2022-02-23 08:25:12,502,Website Request served
1,2022-02-23 09:51:22,502,User logged in
2,2022-02-23 10:31:19,502,Item added to cart
3,2022-02-23 11:56:22,502,Item added to cart
4,2022-02-23 12:03:37,502,DHL chosen for shipping
...,...,...,...
2940,2022-06-25 20:01:17,906,User logged in
2941,2022-06-25 21:42:25,906,Item added to cart
2942,2022-06-25 22:04:14,906,DHL chosen for shipping
2943,2022-06-25 23:53:55,906,Paypal chosen for payment


In [3]:
x = df.to_string(header=False,index=False,index_names=False).split('\n')
x[0]

'2022-02-23 08:25:12 502                Website Request served'

In [4]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nbloe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
xVec = [nltk.word_tokenize(event) for event in x]
print(xVec[0])

['2022-02-23', '08:25:12', '502', 'Website', 'Request', 'served']


In [6]:
model = Word2Vec(sentences=xVec, vector_size=100, window=4, min_count=1, workers=4)

In [7]:
model.wv.get_vector('User')

array([-0.5940518 ,  0.49130073,  0.07868639,  0.10935442, -0.13178326,
       -0.431945  ,  0.02251341,  1.0315506 , -0.56948674, -0.35486963,
        0.05497612, -0.56463194, -0.29749495,  0.19819899,  0.2664801 ,
       -0.19225816,  0.13266654, -0.38223517, -0.11779381, -0.7427251 ,
        0.14546604,  0.19215912,  0.3897603 , -0.09711373, -0.29192543,
        0.14073837, -0.36542735, -0.22322379, -0.27785334,  0.20179437,
        0.40273815, -0.16835815, -0.04083753, -0.35680038, -0.0665308 ,
        0.27712876, -0.05660896,  0.04790317, -0.02677449, -0.3702755 ,
        0.46117857, -0.21365255, -0.4898719 ,  0.3015555 ,  0.39348957,
       -0.26181734, -0.31789762, -0.19376549,  0.1341014 ,  0.2694371 ,
        0.18957081, -0.42710918, -0.11305144,  0.14431651, -0.2135126 ,
        0.00783498,  0.07564534, -0.24786736, -0.07033429,  0.10731077,
        0.03983046,  0.12221579,  0.26641056,  0.17812373, -0.42232525,
        0.2735623 ,  0.16427834,  0.28725433, -0.32933068,  0.59

In [8]:
model.wv.similar_by_word('User')

[('logged', 0.9993377923965454),
 ('in', 0.999204695224762),
 ('loged', 0.9991512298583984),
 ('Order', 0.9988253712654114),
 ('succesful', 0.9988229274749756),
 ('served', 0.9988051056861877),
 ('place', 0.998736560344696),
 ('Website', 0.9986993074417114),
 ('Request', 0.9986844062805176),
 ('for', 0.9986718893051147)]

### Calculate Trace Vectors
2 Possibilities: 
- Simply summing up word vectors produced by w2c
- Using tf-idf to weigh in the importance of certain events

##### Sort log for corresponding trace values 

In [9]:
traceids = df[1].unique()
trace_list = []

# Filter for each tradeid an convert each trade into lists, containing the events.
for id in traceids:
    df_trace= df.loc[df[1]==id] # df holding the entries for the specific trace ID
    df_traceString = df_trace.to_string(header=False,index=False,index_names=False).split('\n') # Converting each event into a string
    df_traceToken = [nltk.word_tokenize(event) for event in df_traceString]
    trace_list.append(df_traceToken)

In [10]:
trace_list[1] # trace_list now holds the tokenized entries to each trace trace_list[trace][event][token]

[['2022-02-23', '15:38:35', '503', 'Website', 'Request', 'served'],
 ['2022-02-23', '16:25:54', '503', 'User', 'loged', 'in'],
 ['2022-02-23', '17:22:00', '503', 'Item', 'added', 'cart'],
 ['2022-02-23', '18:39:16', '503', 'DPD', 'chosen', 'for', 'shipping'],
 ['2022-02-23',
  '19:35:03',
  '503',
  'Sofortueberweisung',
  'chosen',
  'for',
  'payment'],
 ['2022-02-23', '20:09:58', '503', 'Order', 'place', 'succesful']]

In [11]:
trace_VecWordList = [[['x' for token in trace_list[trace][event]] for event in range(len(trace_list[trace]))] for trace in range(len(trace_list))]

for trace in range(len(trace_VecWordList)):
    for event in range(len(trace_VecWordList[trace])):
        for token in range(len(trace_VecWordList[trace][event])):
            vec_temp = model.wv.get_vector(trace_list[trace][event][token])
            trace_VecWordList[trace][event][token] = vec_temp

In [12]:
trace_VecWordList[0][0][0] # trace_VecWordList now holds the word vector in the same structure als trace_list

array([-0.09779584,  0.10035191,  0.01301485,  0.02387704, -0.02146112,
       -0.0830856 ,  0.01357802,  0.18856898, -0.104498  , -0.05786083,
        0.01003336, -0.10818177, -0.06077594,  0.03826899,  0.04851247,
       -0.02240507,  0.01443088, -0.06951978, -0.01539576, -0.14299472,
        0.02469563,  0.03862039,  0.07722629, -0.0178235 , -0.05580613,
        0.01824606, -0.06417972, -0.04892108, -0.05909598,  0.03217707,
        0.07843202, -0.03724485, -0.01099197, -0.05886261, -0.00464725,
        0.04172616, -0.01530062,  0.00681002,  0.0024635 , -0.06930793,
        0.08149738, -0.03793086, -0.08037446,  0.06095518,  0.07939834,
       -0.03946851, -0.06137856, -0.04129384,  0.02535676,  0.05448697,
        0.03239105, -0.0726501 , -0.01895668,  0.03568915, -0.03879852,
        0.00349782,  0.01426152, -0.05146905, -0.00984154,  0.01052658,
        0.00421686,  0.0328161 ,  0.04911625,  0.02142886, -0.07391262,
        0.04310221,  0.02399166,  0.05606858, -0.05610271,  0.11

##### Summing up vector values
Using equal weight for each token

In [25]:
arr = np.array([trace_VecWordList[0][0][0],trace_VecWordList[0][0][1],trace_VecWordList[0][0][2],trace_VecWordList[0][0][3]])
np.add.reduce(trace_VecWordList[0][0])

array([-1.7732208 ,  1.5061026 ,  0.20985326,  0.39580762, -0.41969952,
       -1.3350701 ,  0.09739044,  3.0525699 , -1.7508745 , -1.0546958 ,
        0.21530071, -1.698124  , -0.8177281 ,  0.5815058 ,  0.8513632 ,
       -0.49165756,  0.37394083, -1.1920189 , -0.37086987, -2.2271066 ,
        0.42595804,  0.5865429 ,  1.2017881 , -0.33780292, -0.8551176 ,
        0.40224898, -1.0706888 , -0.7530372 , -0.8244256 ,  0.55899143,
        1.2573339 , -0.54879624, -0.10750362, -0.98149943, -0.16743213,
        0.8630176 , -0.10521385,  0.19304293, -0.07075709, -1.1196117 ,
        1.3656691 , -0.61478734, -1.466968  ,  0.88194764,  1.2041999 ,
       -0.73984355, -1.0110309 , -0.53025854,  0.39333916,  0.8233538 ,
        0.5281168 , -1.2220569 , -0.36653662,  0.47723958, -0.6036889 ,
        0.01350018,  0.22801465, -0.7620679 , -0.2077476 ,  0.25326818,
        0.10117194,  0.4012277 ,  0.81374156,  0.49761674, -1.2957555 ,
        0.8091037 ,  0.38076127,  0.8024878 , -0.943316  ,  1.76

##### Using TF-IDF to weigh in on event importance

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
event_text = df[2].values.tolist() # Store all text values of events in a list for IDF Vectorizer

In [16]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(event_text)
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
dense_list = dense.tolist()
pd.DataFrame(dense_list, columns = feature_names).head()

Unnamed: 0,added,cart,chosen,dhl,dpd,for,fr,hermes,in,item,...,request,served,shipping,sofortueberweisung,sofortüberweisung,succesful,succesfull,to,user,website
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.562409,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.562409,0.0
2,0.494288,0.494288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.494288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516757,0.0,0.0
3,0.494288,0.494288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.494288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516757,0.0,0.0
4,0.0,0.0,0.376952,0.687425,0.0,0.380025,0.0,0.0,0.0,0.0,...,0.0,0.0,0.49085,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
tfid[0].toarray()

NameError: name 'tfid' is not defined