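# VMSP - Data Prep and Workflow
Item IDs (`aid`) are bucketed into groups of 100k (`aid_group`), and each event is encoded as `<aid_group>_<type>`. The resulting input data file is `spm_seq_grouped_data.txt`.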

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the cleaned event data; the relative path is resolved against the
# notebook's working directory.
events_csv = 'cleaned_otto_event_data.csv'
df = pd.read_csv(filepath_or_buffer=events_csv)
# Preview the first rows as a quick check on the loaded columns.
df.head()

Unnamed: 0,session,aid,ts,type,dt,aid_group,event
0,0,1517085,1659304800025,clicks,2022-07-31 22:00:00.025,15,15_clicks
1,0,1563459,1659304904511,clicks,2022-07-31 22:01:44.511,15,15_clicks
2,0,1309446,1659367439426,clicks,2022-08-01 15:23:59.426,13,13_clicks
3,0,16246,1659367719997,clicks,2022-08-01 15:28:39.997,0,0_clicks
4,0,1781822,1659367871344,clicks,2022-08-01 15:31:11.344,17,17_clicks


In [3]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct event label to an integer code, then store the code as
# text so it can be joined into sequence strings later on.
# NOTE(review): LabelEncoder codes start at 0 - confirm that the downstream
# pattern miner accepts 0 as an item id.
label_encoder = LabelEncoder()
event_codes = label_encoder.fit_transform(df['event'])
df['event_num'] = pd.Series(event_codes, index=df.index).astype(str)
df.head()

Unnamed: 0,session,aid,ts,type,dt,aid_group,event,event_num
0,0,1517085,1659304800025,clicks,2022-07-31 22:00:00.025,15,15_clicks,19
1,0,1563459,1659304904511,clicks,2022-07-31 22:01:44.511,15,15_clicks,19
2,0,1309446,1659367439426,clicks,2022-08-01 15:23:59.426,13,13_clicks,13
3,0,16246,1659367719997,clicks,2022-08-01 15:28:39.997,0,0_clicks,1
4,0,1781822,1659367871344,clicks,2022-08-01 15:31:11.344,17,17_clicks,25


In [4]:
# Save the lookup table that maps each encoded id (event_num) back to its
# original event label, so that mined patterns can be decoded later.
# The destination directory was a hard-coded absolute path on one machine; it
# can now be overridden with the OTTO_RS_DIR environment variable, while the
# previous location stays the default so existing runs behave the same.
event_dict_dir = os.environ.get(
    'OTTO_RS_DIR',
    '/Users/jinglyng/Documents/MSBA/Capstone (UOB)/otto_rs',
)
# One row per distinct (event_num, event) pair.
event_dict = df[['event_num', 'event']].drop_duplicates()
event_dict.to_csv(
    os.path.join(event_dict_dir, 'spm_seq_grouped_data_event_dict.csv'),
    index=False,
)

In [5]:
# Build one sequence line per session for the sequential pattern miner.
# sort_values returns a new DataFrame rather than sorting in place, so the
# result must be kept; previously it was discarded and events were joined in
# whatever order the rows had in the file.
df_sorted = df.sort_values(['session', 'dt'])
# groupby preserves the row order inside each group, so the events of a
# session are joined chronologically; ' -1 ' separates consecutive events.
seq_df = df_sorted.groupby('session', as_index=False).agg(sequence=('event_num', ' -1 '.join))
# SPMF's documented sequence format closes every itemset, including the last
# one, with -1 and then marks the end of the sequence with -2.
seq_df['sequence'] = seq_df['sequence'] + ' -1 -2'
seq_df.head()

Unnamed: 0,session,sequence
0,0,19 -1 19 -1 13 -1 1 -1 25 -1 7 -1 21 -1 39 -1 ...
1,1,39 -1 16 -1 15 -1 55 -1 54 -1 16 -1 15 -1 40 -...
2,2,49 -1 31 -1 43 -1 31 -1 49 -1 37 -1 49 -1 1 -1...
3,3,15 -1 16 -1 13 -1 13 -1 12 -1 16 -1 13 -1 28 -...
4,4,46 -1 34 -1 35 -1 37 -1 34 -1 28 -1 16 -1 28 -...


In [6]:
# save as .txt file, one sequence per line
# The file has a single column, so no field delimiter is needed. Writing the
# lines directly avoids passing a newline as the CSV delimiter (sep='\n'),
# which is not a meaningful separator and depends on the csv writer accepting
# it. It also guarantees that no CSV quoting is ever applied to a sequence.
with open('spm_seq_grouped_data.txt', 'w', encoding='utf-8') as seq_file:
    seq_file.writelines(f'{sequence}\n' for sequence in seq_df['sequence'])

## 10% sample (1,000 sessions, `random_state=123`)

In [7]:
# Draw a reproducible random sample of 1000 session sequences (fixed seed) and
# renumber the rows from 0. reset_index is chained instead of using
# inplace=True so that re-running the cell always starts from seq_df.
sample_seq_df = seq_df.sample(1000, random_state=123).reset_index(drop=True)
# Write one sequence per line. Plain line writes are used because the file has
# a single column; this avoids passing a newline as the CSV delimiter
# (sep='\n') and guarantees that no CSV quoting is applied.
with open('spm_seq_grouped_data_sample.txt', 'w', encoding='utf-8') as sample_file:
    sample_file.writelines(f'{sequence}\n' for sequence in sample_seq_df['sequence'])

In [8]:
# Run the vmsp.py script on the sampled sequence file.
# NOTE(review): the arguments appear to be <input file> <output file>
# <relative minimum support>; the captured output reports "minsup 410", which
# matches 0.41 of the 1000 sampled sequences - confirm against vmsp.py.
!python3 vmsp.py spm_seq_grouped_data_sample.txt spm_seq_grouped_sample_output.txt 0.41

>/Users/jinglyng/Documents/MSBA/Capstone (UOB)/maximal-sequential-patterns-mining/spmf.jar
 Total time ~ 217 ms
 Frequent sequences count : 19
 Max memory (mb) : 7.9566040039062519
minsup 410
Intersection count 23 

[['55', '#SUP: 563'], ['46', '#SUP: 576'], ['34', '#SUP: 547'], ['28', '#SUP: 417'], ['19', '#SUP: 578'], ['10', '#SUP: 574'], ['4', '#SUP: 582'], ['1', '#SUP: 569'], ['52', '52', '#SUP: 432'], ['49', '49', '#SUP: 413'], ['43', '43', '#SUP: 414'], ['40', '40', '#SUP: 425'], ['37', '37', '#SUP: 433'], ['31', '31', '#SUP: 425'], ['25', '25', '#SUP: 416'], ['22', '22', '#SUP: 423'], ['16', '16', '#SUP: 415'], ['13', '13', '#SUP: 415'], ['7', '7', '#SUP: 420']]


In [1]:
# Same run as the previous cell, with the last argument lowered from 0.41 to
# 0.35; it writes to the same output file name as the 0.41 run.
# NOTE(review): this cell's execution count restarted at 1 and its captured
# output shows only the spmf.jar path - re-run the notebook top to bottom to
# confirm that this run completes and to capture its results.
!python3 vmsp.py spm_seq_grouped_data_sample.txt spm_seq_grouped_sample_output.txt 0.35


>/Users/jinglyng/Documents/MSBA/Capstone (UOB)/maximal-sequential-patterns-mining/spmf.jar
