In [1]:
import pandas as pd
import numpy as np

from datetime import datetime, timezone, timedelta

### Load datasets

In [2]:
# Load the pickled events frame; every later cell derives its data from it.
# NOTE(review): the "01_" prefix suggests an earlier pipeline step wrote this file - confirm.
# Pickle files can execute code on load, so only read them from a trusted location.
events_df = pd.read_pickle("data/01_events_df.pkl")

In [3]:
# Load the pickled category tree frame (not referenced again in the cells visible here).
category_tree_df = pd.read_pickle("data/01_category_tree_df.pkl")

In [4]:
# Load the pickled item properties frame (not referenced again in the cells visible here).
item_properties_df = pd.read_pickle("data/01_item_properties_df.pkl")

### Data Preparation

In [5]:
# Column names used throughout the notebook; change them here if the schema differs.
TIME_KEY = "timestamp"  # event time; converted from milliseconds to seconds further down
USER_KEY = "visitorid"  # user identifier; rows without one are dropped
ITEM_KEY = "itemid"  # item identifier
SESSION_KEY = "sessionid"  # column created by the sessionization cells, not present in the input

In [6]:
# Drop events that carry no user id.
# Series.notna() is the pandas-native null check; unlike np.isnan it does not
# raise on object or nullable columns.
# .copy() gives an independent frame, so later column assignments do not touch events_df.
events_prepared_df = events_df[events_df[USER_KEY].notna()].copy()

In [7]:
# Convert timestamps from milliseconds to whole seconds.
# Floor division avoids the float round-trip of `/ 1000`, and the explicit int64
# avoids the width of a plain `int` cast, which can be 32-bit on some
# platform / NumPy combinations.
# NOTE: this overwrites the column, so run the cell only once per kernel session.
events_prepared_df[TIME_KEY] = (events_prepared_df[TIME_KEY] // 1000).astype("int64")

#### Introduce Sessions

In [8]:
# Order the events per user and, within a user, chronologically.
# The sessionization cells compare each row with the row directly before it.
events_prepared_df = events_prepared_df.sort_values([USER_KEY, TIME_KEY])

In [9]:
# Time gap between every event and the event in the row before it.
# Gaps that straddle two different users are meaningless; the user-change mask covers those rows.
tdiff = np.diff(events_prepared_df[TIME_KEY].to_numpy())

In [10]:
# A gap larger than the threshold (30 minutes, expressed in seconds) ends the running session.
SESSION_THRESHOLD = 30 * 60
# The first row has no predecessor, so it is always marked as a session start.
split_session = np.concatenate(([True], tdiff > SESSION_THRESHOLD))

In [11]:
# Mark the rows where the user id differs from the user id of the previous row.
# shift() leaves a missing value in the first slot, so the first row compares
# unequal and is marked as well.
new_user = events_prepared_df[USER_KEY].ne(events_prepared_df[USER_KEY].shift()).to_numpy()

In [12]:
# A new session starts when at least one of the two conditions holds:
# the user changed, or the inactivity gap exceeded the threshold.
new_session = new_user | split_session

In [13]:
# The running count of session starts is the session id of each row
# (ids start at 1 because the first row is always a session start).
session_ids = new_session.cumsum()
events_prepared_df[SESSION_KEY] = session_ids

In [14]:
# Preview the first rows to sanity-check the newly added session id column.
events_prepared_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,sessionid
1361687,1442004589,0,view,285930,,1
1367212,1442004759,0,view,357564,,1
1367342,1442004917,0,view,67045,,1
830385,1439487966,1,view,72028,,2
742616,1438969904,2,view,325215,,3


In [15]:
def print_stats(data):
    """Print a short summary of an events frame.

    Reports the number of events (rows), the number of distinct users,
    sessions and items, and the UTC calendar dates of the earliest and the
    latest event.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the TIME_KEY, USER_KEY, SESSION_KEY and ITEM_KEY columns.
        TIME_KEY values are passed to ``datetime.fromtimestamp``, i.e. they
        are read as POSIX timestamps.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    if len(data) == 0:
        # min()/max() of an empty column is NaN, which datetime.fromtimestamp
        # rejects; report the span as unavailable instead of raising.
        span = "n/a"
    else:
        first_event = datetime.fromtimestamp(data[TIME_KEY].min(), timezone.utc)
        last_event = datetime.fromtimestamp(data[TIME_KEY].max(), timezone.utc)
        span = "{} / {}".format(first_event.date().isoformat(),
                                last_event.date().isoformat())
    print('\tEvents: {}\n\tUsers: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {}'.
          format(len(data), data[USER_KEY].nunique(),
                 data[SESSION_KEY].nunique(), data[ITEM_KEY].nunique(),
                 span))

In [16]:
# Summarize the sessionized events before any of the filters are applied.
print("Raw dataset:")
print_stats(events_prepared_df)

Raw dataset:
	Events: 2756101
	Users: 1407580
	Sessions: 1761660
	Items: 235061
	Span: 2015-05-03 / 2015-09-18


#### Filter

In [17]:
# Drop rarely seen items: only items with at least MIN_ITEM_SUPPORT events survive.
MIN_ITEM_SUPPORT = 5
item_pop = events_prepared_df[ITEM_KEY].value_counts()
good_items = item_pop.index[item_pop >= MIN_ITEM_SUPPORT]
events_prepared_df = events_prepared_df.loc[events_prepared_df[ITEM_KEY].isin(good_items)]

In [18]:
# Drop sessions that are left with fewer than MIN_SESSION_LENGTH events.
MIN_SESSION_LENGTH = 2
session_length = events_prepared_df[SESSION_KEY].value_counts()
good_sessions = session_length.index[session_length >= MIN_SESSION_LENGTH]
events_prepared_df = events_prepared_df.loc[events_prepared_df[SESSION_KEY].isin(good_sessions)]

In [19]:
# Keep only returning users with at least MIN_USER_SESSIONS sessions.
# The minimum is 3 because at least one session per user is needed for each of
# the training, validation and test sets.
MIN_USER_SESSIONS = 3
# Optional exclusive upper bound on sessions per user; None disables it.
MAX_USER_SESSIONS = None
sess_per_user = events_prepared_df.groupby(USER_KEY)[SESSION_KEY].nunique()
good_users = sess_per_user[sess_per_user >= MIN_USER_SESSIONS]
if MAX_USER_SESSIONS is not None:
    good_users = good_users[good_users < MAX_USER_SESSIONS]
good_users = good_users.index
events_prepared_df = events_prepared_df.loc[events_prepared_df[USER_KEY].isin(good_users)]

In [20]:
# Summarize what is left after the item, session and user filters.
print("Filtered dataset:")
print_stats(events_prepared_df)

Filtered dataset:
	Events: 308642
	Users: 9474
	Sessions: 51101
	Items: 41969
	Span: 2015-05-03 / 2015-09-18


### Create single train / test split

In [21]:
# Default for the `clean_test` argument of last_session_out_split.
CLEAN_TEST = True
def last_session_out_split(data, min_session_length, clean_test=None):
    """
    Last-session-out split.

    Assigns the last session of every user to the test set and the remaining
    ones to the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Events with the USER_KEY, TIME_KEY, SESSION_KEY and ITEM_KEY columns.
    min_session_length : int
        Minimum number of events a test session must still have after
        cleaning; only used when cleaning is enabled.
    clean_test : bool, optional
        When true, test events whose item never occurs in the training set
        are removed, and test sessions that end up shorter than
        `min_session_length` are dropped. When omitted (None), the
        module-level CLEAN_TEST flag decides.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both frames are independent of `data` and carry a fresh 0..n-1 index.
    """
    if clean_test is None:
        clean_test = CLEAN_TEST

    # After the chronological sort, the last row of each user holds the id of
    # that user's final session.
    last_session = (
        data.sort_values(by=[USER_KEY, TIME_KEY])
        .groupby(USER_KEY)[SESSION_KEY]
        .last()
    )

    # Compute the membership mask once and reuse it for both halves.
    in_test = data[SESSION_KEY].isin(last_session.values)
    train = data[~in_test].copy()
    test = data[in_test].copy()

    if clean_test:
        # Keep only test events whose item also occurs in the training set.
        train_items = train[ITEM_KEY].unique()
        test = test[test[ITEM_KEY].isin(train_items)]

        # Remove sessions in test shorter than min_session_length
        slen = test[SESSION_KEY].value_counts()
        good_sessions = slen[slen >= min_session_length].index
        test = test[test[SESSION_KEY].isin(good_sessions)]

    # Reset unconditionally so the index layout of the result does not depend
    # on whether cleaning ran.
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    return train, test

In [22]:
# Outer split: the last session of every user goes to the test set, the remaining ones to the training set.
train_sessions, test_sessions = last_session_out_split(events_prepared_df, MIN_SESSION_LENGTH)
# Inner split: apply the same rule to the training set, so each user's last
# remaining session becomes the validation test set.
validation_train_sessions, validation_test_sessions = last_session_out_split(train_sessions, MIN_SESSION_LENGTH)

In [23]:
# Summary of the outer training set (every session except each user's last one).
print("Training set:")
print_stats(train_sessions)

Training set:
	Events: 269317
	Users: 9474
	Sessions: 41627
	Items: 39974
	Span: 2015-05-03 / 2015-09-18


In [24]:
# Summary of the outer test set (each user's last session, after test cleaning).
print("Test set:")
print_stats(test_sessions)

Test set:
	Events: 36023
	Users: 8761
	Sessions: 8761
	Items: 14262
	Span: 2015-05-03 / 2015-09-18


In [25]:
# Summary of the inner training set, produced by splitting train_sessions again.
print("Validation training set:")
print_stats(validation_train_sessions)

Validation training set:
	Events: 224600
	Users: 9474
	Sessions: 32153
	Items: 37290
	Span: 2015-05-03 / 2015-09-17


In [26]:
# Summary of the inner test set (the last session per user within train_sessions).
print("Validation test set:")
print_stats(validation_test_sessions)

Validation test set:
	Events: 40320
	Users: 8712
	Sessions: 8712
	Items: 14923
	Span: 2015-05-03 / 2015-09-18


### Store Datasets

In [27]:
# Persist the filtered, sessionized events that both splits were derived from.
events_prepared_df.to_pickle("data/02_events_prepared_df.pkl")

In [28]:
# Persist the outer training set.
train_sessions.to_pickle("data/02_train_sessions.pkl")

In [29]:
# Persist the outer test set.
test_sessions.to_pickle("data/02_test_sessions.pkl")

In [30]:
# Persist the inner (validation) training set.
validation_train_sessions.to_pickle("data/02_validation_train_sessions.pkl")

In [31]:
# Persist the inner (validation) test set.
validation_test_sessions.to_pickle("data/02_validation_test_sessions.pkl")