In [1]:
# imports
import os
import numpy as np
import pandas as pd
import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw browsing events: a comma-separated file whose first row holds the column names.
raw_df = pd.read_csv(
    'data/training_data_subset.csv',
    header=0,
    sep=',',
)
raw_df.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
0,da2a3169afe2f7c5e1d172abbd418483056bae902f6585...,pageview,,,1550358404388,d37f603d410e900ee39a2994170670cde3e55a8a35efcd...
1,da2a3169afe2f7c5e1d172abbd418483056bae902f6585...,event_product,detail,59b2c0292b47455092d07c27e80da9130faca4e79f0189...,1550358404388,d37f603d410e900ee39a2994170670cde3e55a8a35efcd...
2,da2a3169afe2f7c5e1d172abbd418483056bae902f6585...,event_product,detail,7f5626cf0780cb6be28edd5b99cdaed4e65badb2166e72...,1550358413563,92b77633d47e4b2c570a40eb6030c771ab29c94a6bf8c3...
3,da2a3169afe2f7c5e1d172abbd418483056bae902f6585...,pageview,,,1550358413563,92b77633d47e4b2c570a40eb6030c771ab29c94a6bf8c3...
4,da2a3169afe2f7c5e1d172abbd418483056bae902f6585...,pageview,,,1550358439013,d37f603d410e900ee39a2994170670cde3e55a8a35efcd...


In [3]:
# One line per column: the distinct values for the low-cardinality columns,
# and only the number of distinct values for the hashed identifiers.
column_summaries = [
    ('event types: {}', 'event_type', set),
    ('product actions: {}', 'product_action', set),
    ('unique session IDs: {}', 'session_id_hash', lambda column: len(set(column))),
    ('unique product SKUs: {}', 'product_sku_hash', lambda column: len(set(column))),
]
for template, column_name, summarise in column_summaries:
    print(template.format(summarise(raw_df[column_name])))

event types: {'pageview', 'event_product'}
product actions: {'detail', nan, 'remove', 'purchase', 'add'}
unique session IDs: 1974586
unique product SKUs: 43552


# 0. Pre-processing

## 0.1 Filter out sessions with no add-to-cart

In [4]:
# Rows without a product action get the explicit label 'view'.
raw_df['product_action'] = raw_df['product_action'].fillna('view')

# Collapse every session into the list of its actions.
# The trimming done later in this notebook ("before the first purchase") only makes sense
# if each list is in chronological order, so order the events by timestamp explicitly
# instead of trusting the row order of the CSV. mergesort is stable: events sharing a
# timestamp keep their file order, and an already-chronological file is left unchanged.
# groupby preserves the order of rows within each group.
df = (
    raw_df[['session_id_hash', 'server_timestamp_epoch_ms', 'product_action']]
    .sort_values('server_timestamp_epoch_ms', kind='mergesort')
    .groupby('session_id_hash')['product_action']
    .agg(list)
    .reset_index()
)
print(len(df))
df.head()

1974586


Unnamed: 0,session_id_hash,product_action
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi..."
1,00000277639fc5c6f816654b78bf3654ece7fd53a7338f...,"[view, view, view, view, view, view]"
2,00001355930ff05e66ab30bccff221c33eba90e1517397...,"[view, detail]"
3,000024f4f0071f59b3fde8d0bdf2517a3767f29ef25a90...,"[view, view, view, detail, view]"
4,000033a81d6ae0e9b6ad1952348a2759f14fd688095b6d...,"[view, view, view, view, view, view, view, vie..."


In [5]:
# Keep only the sessions whose action list contains at least one add-to-cart.
has_add = df['product_action'].map(lambda actions: 'add' in actions)
# .copy() makes the filtered result an independent frame, so adding columns to it
# (here and in later cells) does not raise pandas' SettingWithCopyWarning.
df = df[has_add].copy()
# Number of events in each session at this point of the pipeline.
df['len_before'] = df['product_action'].map(len)
print(len(df))
df.sort_values(by='len_before').head()

86122


Unnamed: 0,session_id_hash,product_action,len_before
548932,47337e13234b321b60396b353ed79b5167e5416b9da3fe...,[add],1
734012,5f350b64e104fb244dca77fbe19b6c7456a9428f58081f...,[add],1
288795,256ed9b1f85b0b6791a138d770a79fd6defe44a15ac270...,[add],1
1711558,dddd7196296aae784fb3e9c7df518c975a0edf79a94f90...,[add],1
1134281,93111adcd183fffbe39e459b032face5677a83b6c0e11c...,[add],1


## 0.2 Label the sessions (1: conversion; 0: cart abandonment)

In [6]:
def _contains_purchase(actions):
    """Return True when the action list includes a 'purchase' event."""
    return 'purchase' in actions


# 1 for sessions with a purchase, 0 otherwise.
df['convert'] = df['product_action'].map(_contains_purchase).astype(int)
df.head()

Unnamed: 0,session_id_hash,product_action,len_before,convert
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi...",18,0
32,00010d84aca1294479304044207fd268f63228844779c6...,"[view, view, view, detail, view, view, view, v...",41,0
79,0002629c632e1d3211649ba4498194a13a280c35c6bd57...,"[view, detail, view, view, view, view, view, v...",58,1
140,0004be10f232d2ebd86be1c1c46dcf741ee77651bc550d...,"[view, view, view, view, view, view, view, det...",15,0
168,00057b97a81dbf743e2687b0fd43b67e86d5ced77c828d...,"[detail, view, add, view]",4,0


In [7]:
# Number of distinct sessions in each class (0 = no purchase, 1 = purchase).
# dropna=False counts a missing id as a value, as building a set of the ids does.
counts = df.groupby('convert')['session_id_hash'].nunique(dropna=False)
print(counts)

convert
0    67693
1    18429
Name: session_id_hash, dtype: int64


## 0.3 Trim the purchase sessions to the last event before the first purchase

In [8]:
target_action = 'purchase'

def filter_purchase(session_list, target):
    """Return the events of a session that come before the first ``target`` event.

    Args:
        session_list: ordered list of action names for one session.
        target: the action at which the session is cut.

    Returns:
        A new list holding everything before the first occurrence of ``target``
        (the target itself is excluded). If ``target`` does not occur, the
        original list object is returned as is.
    """
    # Guard clause: nothing to trim when the target never happens.
    if target not in session_list:
        return session_list
    cut = session_list.index(target)
    return session_list[:cut]

# Cut every session at its first purchase event.
df['product_action'] = df['product_action'].map(
    lambda actions: filter_purchase(actions, target_action)
)

# Show the number of sessions and a preview of the trimmed frame.
print(df.shape[0])
print(df.head(n=5))


86122
                                       session_id_hash  \
0    00000114e1075962f022114fcfc17f2d874e694ac5d201...   
32   00010d84aca1294479304044207fd268f63228844779c6...   
79   0002629c632e1d3211649ba4498194a13a280c35c6bd57...   
140  0004be10f232d2ebd86be1c1c46dcf741ee77651bc550d...   
168  00057b97a81dbf743e2687b0fd43b67e86d5ced77c828d...   

                                        product_action  len_before  convert  
0    [view, detail, add, view, view, view, view, vi...          18        0  
32   [view, view, view, detail, view, view, view, v...          41        0  
79   [view, detail, view, view, view, view, view, v...          58        1  
140  [view, view, view, view, view, view, view, det...          15        0  
168                          [detail, view, add, view]           4        0  


## 0.4 Filter out sessions that are too short (fewer than 5 events) or too long (more than 100 events)

In [9]:
# Bounds (in events, both inclusive) on the length of the sessions we keep.
MIN_SESSION_LEN = 5
MAX_SESSION_LEN = 100

session_lengths = df['product_action'].map(len)
within_bounds = (session_lengths >= MIN_SESSION_LEN) & (session_lengths <= MAX_SESSION_LEN)
# .copy() makes the filtered result an independent frame, so the column assignment
# below does not raise pandas' SettingWithCopyWarning.
df = df[within_bounds].copy()
# Session length after the filtering above, stored next to len_before.
df['len_after'] = df['product_action'].map(len)
print(len(df))
print(df.sort_values(by='len_before', ascending=False).head(10))

78011
                                           session_id_hash  \
1227584  9f242e4ffc641acac17c2f0be166bd5a02b4108d045186...   
75067    09b1777bb7fa6ef9f89bc98db4e59bac3d139f72f9da44...   
1540946  c7ba467397609db892395005cd269c92a1115a4cd21446...   
124293   1010dd31f9cce114c10bfd8c97f0377fc0b5b2c96a4c62...   
1212861  9d413a65650ee806b287fe70c1e5ba7d1433c3a65c4fc7...   
843643   6d6489dcb21a34025b64c1f2885df20ada092c2f993335...   
1473160  befaf387209b430910f023644813640363bd3ccd19f637...   
240398   1f26062cc448daf8970bc392d5fbd685fd5c393925f5cb...   
1330347  ac7968f0dc2ab16c2a92aeb6abfbd5b516eda254d504e0...   
1608155  d0749996c4bd85825cd0309589ea0af32caf2d8667d5fd...   

                                            product_action  len_before  \
1227584  [view, detail, detail, view, view, detail, det...         200   
75067    [view, view, view, view, detail, view, view, v...         200   
1540946  [view, view, view, detail, view, detail, detai...         199   
124293   [view,

## 0.5 Symbolise the sessions

In [10]:
from collections import Counter

# Pull the action lists and the conversion labels out of the frame as plain Python lists.
# Both come from the same frame, so sessions[i] and labels[i] belong to the same row.
labels = df['convert'].tolist()
sessions = df['product_action'].tolist()
print(sessions[:5], labels[:5], sep='\n')

[['view', 'detail', 'add', 'view', 'view', 'view', 'view', 'view', 'view', 'view', 'detail', 'view', 'view', 'detail', 'view', 'view', 'detail', 'view'], ['view', 'view', 'view', 'detail', 'view', 'view', 'view', 'view', 'view', 'view', 'detail', 'view', 'view', 'view', 'view', 'detail', 'add', 'view', 'view', 'view', 'view', 'view', 'view', 'view', 'view', 'detail', 'view', 'add', 'view', 'view', 'detail', 'view', 'view', 'view', 'detail', 'view', 'detail', 'view', 'view', 'view', 'view'], ['view', 'detail', 'view', 'view', 'view', 'view', 'view', 'view', 'view', 'view', 'view', 'view', 'detail', 'add', 'remove', 'view', 'remove', 'add', 'remove', 'view', 'remove', 'view', 'remove', 'remove', 'remove', 'view', 'view', 'remove', 'view', 'view', 'view', 'view', 'remove', 'view', 'remove', 'view', 'remove', 'remove', 'remove', 'remove', 'remove', 'remove', 'remove', 'remove'], ['view', 'view', 'view', 'view', 'view', 'view', 'view', 'detail', 'view', 'add', 'view', 'view', 'detail', 'add

In [11]:
# Count how often each action occurs across all sessions.
counts = Counter()
for session in sessions:
    counts.update(session)

# Give every action an integer id, most frequent action first.
# Ids start at 1: 0 is not used here and stays free as a padding value.
ranked_symbols = sorted(counts, key=counts.get, reverse=True)
symbol2idx = {symbol: position + 1 for position, symbol in enumerate(ranked_symbols)}
print(symbol2idx)

# Replace every action name by its id, session by session.
symbolised_sessions = [[symbol2idx[s] for s in session] for session in sessions]
print(symbolised_sessions[:5])

{'view': 1, 'detail': 2, 'add': 3, 'remove': 4}
[[1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1], [1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1], [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 3, 4, 1, 4, 1, 4, 4, 4, 1, 1, 4, 1, 1, 1, 1, 4, 1, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4], [1, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 2, 3, 1], [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1]]


In [12]:
# Preview the first five rows of the session frame.
df.head(n=5)

Unnamed: 0,session_id_hash,product_action,len_before,convert,len_after
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi...",18,0,18
32,00010d84aca1294479304044207fd268f63228844779c6...,"[view, view, view, detail, view, view, view, v...",41,0,41
79,0002629c632e1d3211649ba4498194a13a280c35c6bd57...,"[view, detail, view, view, view, view, view, v...",58,1,44
140,0004be10f232d2ebd86be1c1c46dcf741ee77651bc550d...,"[view, view, view, view, view, view, view, det...",15,0,15
179,0005ea00d00f64fbfaaa8b110f268025ba8ad963d8fdd3...,"[view, view, view, view, view, view, detail, v...",50,0,50


In [13]:
import json

# Checkpoint the symbolised sessions and their labels as JSON.
# Each `with` block closes its file as soon as the dump is done, so the data is
# flushed to disk and no file handle is left open in the kernel.
with open("data/symbolised_sessions_group27.json", "w") as sessions_file:
    json.dump(symbolised_sessions, sessions_file)
with open("data/labels_group27.json", "w") as labels_file:
    json.dump(labels, labels_file)

## 0.6 Trim each session to the first add-to-cart plus the 9 events that follow it (only for the experimental model)

In [14]:
# target_event = 'add'

# def filter_add_to_cart(session_list, target_event):
#     try:
#         add_idx = session_list.index(target_event)
#         return session_list[:add_idx + 10]  # Keep everything up to the first 'add', the 'add' itself and the 9 events after it
#     except ValueError:
#         return session_list  # Return the original list if the target event is not found

# # Applying the function to each group (session)
# df['product_action'] = df['product_action'].apply(filter_add_to_cart, target_event=target_event)

# # Displaying the result
# print(len(df))
# print(df.head(5))

In [15]:
#Export csv if you want to see the information more intuitively, but when training the model, we will use 2 json files above
# exported_df = df[['session_id_hash','product_action','convert', 'len_after']]
# exported_df.to_csv('preprocessing_group27.csv',sep=',')

## 0.7 Split into train and test sets

In [16]:
# train, test splits: 80% of the sessions, drawn at random without replacement, go to
# training; the remaining 20% go to evaluation. The seed makes the split reproducible.
np.random.seed(2727)
n_sessions = len(symbolised_sessions)
train_ids = np.random.choice(
    n_sessions, int(np.floor(n_sessions * 0.8)), replace=False
)
# Membership tests against a NumPy array scan the whole array on every check, which made
# the loop below quadratic in the number of sessions. A set gives constant-time lookups
# and selects exactly the same indices.
train_id_set = set(train_ids.tolist())

training_sessions, training_labels, eval_sessions, eval_labels = [], [], [], []
for idx, (session, label) in enumerate(zip(symbolised_sessions, labels)):
    if idx in train_id_set:
        training_sessions.append(session)
        training_labels.append(label)
    else:
        eval_sessions.append(session)
        eval_labels.append(label)

# Every session must land in exactly one of the two splits.
assert len(training_sessions) + len(eval_sessions) == n_sessions
assert len(training_sessions) == len(training_labels)
assert len(eval_sessions) == len(eval_labels)

In [17]:
# a further checkpoint to save our progress
import json

# Destination file -> object to serialise.
split_checkpoints = {
    "data/symbolised_sessions_training_group27.json": training_sessions,
    "data/labels_training_group27.json": training_labels,
    "data/symbolised_sessions_eval_group27.json": eval_sessions,
    "data/labels_eval_group27.json": eval_labels,
}
for checkpoint_path, payload in split_checkpoints.items():
    # The `with` block closes the file right after the dump, so the JSON is flushed
    # to disk and no file handle is left open in the kernel.
    with open(checkpoint_path, "w") as checkpoint_file:
        json.dump(payload, checkpoint_file)