# EDA

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("training_data.csv")
df.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
0,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885210881,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
1,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,61ef3869355b78e11011f39fc7ac8f8dfb209b3442a9d5...,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
2,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
3,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
4,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...


In [3]:
df.shape

(30456445, 6)

In [4]:
print('event types: {}'.format(set(df['event_type'])))
print('product actions: {}'.format(set(df['product_action'])))

event types: {'event_product', 'pageview'}
product actions: {nan, 'add', 'detail', 'purchase', 'remove'}


## Pre-processing

### Sessionization

In [5]:
# derive sessions from action by action dataset
df['product_action'] = df['product_action'].fillna('view')
df = df.groupby('session_id_hash')['product_action'].agg(list).reset_index()
df.head()

Unnamed: 0,session_id_hash,product_action
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi..."
1,000009f36a40de1d557afc083dbb3fc03eef2473337bad...,"[view, view]"
2,00000e812c3076d18245710a31b348d3f23314b7d0dc90...,[view]
3,00001355930ff05e66ab30bccff221c33eba90e1517397...,"[view, detail]"
4,0000162d1dad0beb867c191ab2c8c7c06086cc57d9ebe2...,"[view, view, view, view, detail, view, detail,..."


In [6]:
# select sessions with at least one add-to-cart
df = np.where(df.product_action.map(set(['add']).issubset), 1, 0)
df_add = df[df["add"] == 1]

Unnamed: 0,session_id_hash,product_action,add
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi...",1
1,000009f36a40de1d557afc083dbb3fc03eef2473337bad...,"[view, view]",0
2,00000e812c3076d18245710a31b348d3f23314b7d0dc90...,[view],0
3,00001355930ff05e66ab30bccff221c33eba90e1517397...,"[view, detail]",0
4,0000162d1dad0beb867c191ab2c8c7c06086cc57d9ebe2...,"[view, view, view, view, detail, view, detail,...",0


In [10]:
# add class label to each session (BUY V. NO-BUY)
df_add['purchase'] = np.where(df_add.product_action.map(set(['purchase']).issubset), 1, 0)
df_add.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_add['purchase'] = np.where(df_add.product_action.map(set(['purchase']).issubset), 1, 0)


Unnamed: 0,session_id_hash,product_action,add,purchase
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi...",1,0
37,0000913afa22ba9c31efb992bcf6388b0bbfe28056bef3...,"[view, view, view, detail, view, view, detail,...",1,0
64,00010d84aca1294479304044207fd268f63228844779c6...,"[view, view, view, detail, view, view, view, v...",1,0
84,0001368d732951035a7ef7ef42b345a5c50b7d66966749...,"[view, detail, add, view, view, detail, add, v...",1,0
119,0001c180fb742f96ff388ba8f67a568e6fa66aed30d0d2...,"[view, view, view, view, detail, add, remove, ...",1,1


In [44]:
# cut sessions after add
# cut BUY sessions to last event before purchase

def after_add(lst):
    '''
    remove the first 'add' and everything before in a given list
    '''
    for i, action in enumerate(lst):
        if action == "add":
            lst = lst[i+1:]
            break
    return lst

def before_purchase(lst):
    '''
    1.remove 'purchase' and everything after in a given list
    2.if purchase is not in the lst, return the whole list
    '''
    length = len(lst)
    for i, action in enumerate(lst):
        if action == "purchase":
            length = i
            break
    return lst[:length]  


In [52]:
df_add["product_action"]=df_add["product_action"].apply(after_add)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_add["product_action"]=df_add["product_action"].apply(after_add)


In [53]:
df_add["product_action"]=df_add["product_action"].apply(before_purchase)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_add["product_action"]=df_add["product_action"].apply(before_purchase)


In [54]:
df_add.head()

Unnamed: 0,session_id_hash,product_action,add,purchase
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, view, view, view, view, view, view, det...",1,0
37,0000913afa22ba9c31efb992bcf6388b0bbfe28056bef3...,"[view, detail, view, view, view, detail, view,...",1,0
64,00010d84aca1294479304044207fd268f63228844779c6...,"[view, view, view, view, view, view, view, vie...",1,0
84,0001368d732951035a7ef7ef42b345a5c50b7d66966749...,"[view, view, detail, add, view, view, view, vi...",1,0
119,0001c180fb742f96ff388ba8f67a568e6fa66aed30d0d2...,"[remove, view, detail, view, view, view, remov...",1,1
