In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta

import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide plot styling: white grid background, "notebook" scaling context
# (font scale is left at the seaborn default).
sns.set_theme(style="whitegrid", context="notebook")

# Diginetica
This notebook preprocesses the diginetica dataset and saves it in a format that can be used by the next notebooks. It also takes care of splitting the data into training, validation and test sets that can be found in 'data/processed datasets/diginetica/'.

To conclude, the notebook contains an exploratory data analysis of the dataset.

## Loading views dataset

In [2]:
# Load the item-view events. Four columns are selected by position
# (0, 2, 3, 4) and renamed to the SessionId / ItemId / Time / Date schema
# that the rest of the notebook works with.
# The path uses forward slashes so it resolves on Windows, Linux and macOS
# alike (backslash separators only work on Windows).
diginetica_views = pd.read_csv('data/downloaded datasets/diginetica/train-item-views.csv',
                               sep=';',
                               usecols=[0, 2, 3, 4])

diginetica_views.columns = ['SessionId', 'ItemId', 'Time', 'Date']

# Preview the first rows and report the (rows, columns) shape.
display(diginetica_views.head())
display(diginetica_views.shape)

Unnamed: 0,SessionId,ItemId,Time,Date
0,1,81766,526309,2016-05-09
1,1,31331,1031018,2016-05-09
2,1,32118,243569,2016-05-09
3,1,9654,75848,2016-05-09
4,1,32627,1112408,2016-05-09


(1235380, 4)

## Loading purchases dataset

In [3]:
# Load the purchase events, keeping only the columns needed downstream.
# The path uses forward slashes so it resolves on Windows, Linux and macOS
# alike (backslash separators only work on Windows).
diginetica_purchases = pd.read_csv('data/downloaded datasets/diginetica/train-purchases.csv',
                                   sep=';',
                                   usecols=['sessionId', 'itemId', 'timeframe', 'eventdate'])

# Rename by name instead of by position: `usecols` keeps the columns in the
# order they appear in the file (not in the order listed above), so a
# positional `.columns = [...]` assignment would silently mislabel columns
# if the file layout ever differed.
diginetica_purchases = diginetica_purchases.rename(columns={
    'sessionId': 'SessionId',
    'itemId': 'ItemId',
    'timeframe': 'Time',
    'eventdate': 'Date',
})

# Preview the first rows and report the (rows, columns) shape.
display(diginetica_purchases.head())
display(diginetica_purchases.shape)

Unnamed: 0,SessionId,Time,Date,ItemId
0,150,17100868,2016-05-06,25911
1,151,6454547,2016-05-06,175874
2,156,1721689387,2016-05-27,35324
3,179,343001,2016-05-09,31233
4,246,2311046,2016-05-09,34677


(18025, 4)

Concatenate the views and purchases events into a single dataframe. 

In [4]:
# Stack both event tables; the dict keys become the outer index level that
# labels each row with its event type.
stacked_events = pd.concat({'view': diginetica_views,
                            'purchase': diginetica_purchases},
                           axis=0)

# Discard the per-table row labels, then turn the event-type level into a
# regular 'Type' column with a fresh 0..n-1 index.
data = (stacked_events
        .reset_index(level=1, drop=True)
        .rename_axis('Type')
        .reset_index())

display(data.head())
display(data.shape)

Unnamed: 0,Type,SessionId,ItemId,Time,Date
0,view,1,81766,526309,2016-05-09
1,view,1,31331,1031018,2016-05-09
2,view,1,32118,243569,2016-05-09
3,view,1,9654,75848,2016-05-09
4,view,1,32627,1112408,2016-05-09


(1253405, 5)

Clean the data and prepare it so that it can be used in the session-rec framework.

In [5]:
# code originally from https://github.com/rn5l/session-rec/blob/master/preprocessing/session_based/preprocess_diginetica.py load_data function (type 2)

# The raw 'Time' column holds a millisecond offset that is added to the event
# date below; missing offsets are treated as 0.
time_offset_ms = data['Time'].fillna(0).astype(np.int64)

# Parse the 'YYYY-MM-DD' date strings in one vectorized call.
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')

# Unix timestamp (seconds) of midnight on the event date, interpreted as UTC.
data['Datestamp'] = (data['Date'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

# Absolute event time as a Unix timestamp in seconds.
data['Time'] = time_offset_ms / 1000 + data['Datestamp']

# Same instant as a timezone-aware (UTC) datetime; built from the integer
# millisecond offset so no float rounding is involved.
data['TimeO'] = data['Date'].dt.tz_localize('UTC') + pd.to_timedelta(time_offset_ms, unit='ms')


# output
data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

print('Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
        format(len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(),
                data_end.date().isoformat()))

# Order events by session, and chronologically within each session. A plain
# multi-column sort keeps the original row labels and avoids
# groupby(...).apply(...), which warns on recent pandas versions and drops
# the grouping column ('SessionId') once grouping columns are excluded from
# apply.
data = data.sort_values(['SessionId', 'Time'])

display(data.head())

Loaded data set
	Events: 1253405
	Sessions: 310486
	Items: 123273
	Span: 2016-01-01 / 2016-11-09




  data = data.groupby('SessionId').apply(lambda x: x.sort_values('Time'))     # data = data.sort_values(['SessionId'],['Time'])


Unnamed: 0,Type,SessionId,ItemId,Time,Date,Datestamp,TimeO
3,view,1,9654,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:01:15.848000+00:00
5,view,1,33043,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:02:53.912000+00:00
2,view,1,32118,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:04:03.569000+00:00
6,view,1,12352,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:05:29.870000+00:00
7,view,1,35077,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:06:30.072000+00:00


Filter the data on a minimum item support of 5 and a minimum session length of 2. This means that we only keep sessions with at least 2 events and items that occur in at least 5 events.

In [6]:
# code originally from https://github.com/rn5l/session-rec/blob/master/preprocessing/session_based/preprocess_diginetica.py filter data function

def filter_min_date(data, min_date='2016-05-07'):
    """Keep only sessions whose latest event is after ``min_date``.

    Parameters
    ----------
    data : pandas.DataFrame
        Events with at least the columns ``SessionId``, ``ItemId`` and
        ``Time`` (Unix timestamp in seconds).
    min_date : str
        Cutoff date formatted as ``YYYY-MM-DD``; the cutoff is midnight UTC
        at the start of that date.

    Returns
    -------
    pandas.DataFrame
        All rows of the sessions whose maximum ``Time`` is strictly greater
        than the cutoff. The input frame is not modified.
    """
    print('filter_min_date')

    # Pin the cutoff to UTC. A naive datetime would make .timestamp() use the
    # machine's local timezone, shifting the cutoff by the local UTC offset
    # although the reported spans below are computed in UTC.
    min_datetime = datetime.strptime(min_date + ' 00:00:00', '%Y-%m-%d %H:%M:%S')
    min_datetime = min_datetime.replace(tzinfo=timezone.utc)

    # filter
    session_max_times = data.groupby('SessionId').Time.max()
    session_keep = session_max_times[session_max_times > min_datetime.timestamp()].index

    # Series.isin replaces the deprecated np.in1d.
    data = data[data['SessionId'].isin(session_keep)]

    # output
    data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
    data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

    print('Filtered data set min date \n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
          format(len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(),
                 data_end.date().isoformat()))

    return data

# Drop every session whose latest event is not after the default cutoff
# date of filter_min_date ('2016-05-07').
data = filter_min_date(data)

display(data.head())

filter_min_date
Filtered data set min date 
	Events: 325993
	Sessions: 81057
	Items: 63250
	Span: 2016-01-08 / 2016-11-09




Unnamed: 0,Type,SessionId,ItemId,Time,Date,Datestamp,TimeO
3,view,1,9654,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:01:15.848000+00:00
5,view,1,33043,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:02:53.912000+00:00
2,view,1,32118,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:04:03.569000+00:00
6,view,1,12352,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:05:29.870000+00:00
7,view,1,35077,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:06:30.072000+00:00


In [7]:
def filter_data(data, min_item_support=5, min_session_length=2):
    """Remove short sessions and rarely seen items.

    The filter runs in three passes: drop sessions shorter than
    ``min_session_length``, drop events of items that occur fewer than
    ``min_item_support`` times, then drop sessions that became too short
    because of the item filter.

    Parameters
    ----------
    data : pandas.DataFrame
        Events with at least the columns ``SessionId``, ``ItemId`` and
        ``Time`` (Unix timestamp in seconds).
    min_item_support : int
        Minimum number of events an item must occur in to be kept.
    min_session_length : int
        Minimum number of events a session must contain to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered events with an added ``ItemSupport`` column holding the
        per-item event count measured after the first session-length pass.
        The input frame is not modified.
    """
    # filter session length (Series.isin replaces the deprecated np.in1d)
    session_lengths = data.groupby('SessionId').size()
    long_sessions = session_lengths[session_lengths >= min_session_length].index
    data = data[data['SessionId'].isin(long_sessions)]

    # filter item support; assign() returns a new frame, so no column is
    # written onto a filtered slice (which raised SettingWithCopyWarning)
    item_support = data.groupby('ItemId')['ItemId'].transform('count')
    data = data.assign(ItemSupport=item_support)
    data = data[data['ItemSupport'] >= min_item_support]

    # filter session length again, after filtering items
    session_lengths = data.groupby('SessionId').size()
    long_sessions = session_lengths[session_lengths >= min_session_length].index
    data = data[data['SessionId'].isin(long_sessions)]

    # output
    data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
    data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

    print('Filtered data set default \n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
          format(len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(),
                 data_end.date().isoformat()))

    return data

# Apply the default thresholds of filter_data
# (min_item_support=5, min_session_length=2).
data = filter_data(data)

data.head()

Filtered data set default 
	Events: 216241
	Sessions: 47846
	Items: 15120
	Span: 2016-01-08 / 2016-10-22




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ItemSupport'] = data.groupby('ItemId')['ItemId'].transform('count')


Unnamed: 0,Type,SessionId,ItemId,Time,Date,Datestamp,TimeO,ItemSupport
3,view,1,9654,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:01:15.848000+00:00,74
5,view,1,33043,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:02:53.912000+00:00,41
2,view,1,32118,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:04:03.569000+00:00,19
6,view,1,12352,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:05:29.870000+00:00,79
7,view,1,35077,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:06:30.072000+00:00,47


Here, we split the data. We use 7 days of data as the test set, and the remaining data is used for training. This approach is based on the GRU4Rec paper, which is often used as a benchmark for session-based recommendation.

The (commented-out) function below creates an *incorrect* train-test split in which only two sessions end up in the test set. This is not a good split, but it is kept to demonstrate the data preparation process. In the next steps, we will create a correct train-test split. First, run the instructions that are explained [here](https://github.com/rn5l/session-rec/tree/5dcd583cbd8d44703a5248b9a308945f24b91390?tab=readme-ov-file#--------dataset-preprocessing--------) and store the resulting files in 'data/premade datasets/diginetica'. The default values, which were also mentioned above, are used for the preprocessing parameters. Unfortunately, the preprocessing of this repo does not take the event type column into account. Therefore, we will add it manually: we extract the session ids from the correctly generated train, validation and test splits and use them to filter the concatenated dataframe, which gives the final train, validation and test dataframes with an event type (`Type`) column.

In [8]:
# code originally from https://github.com/rn5l/session-rec/blob/master/preprocessing/session_based/preprocess_diginetica.py load_data function (type 2)
# def split_data(data, output_file, days_test=7):
#     # Ensure 'Time' column is in datetime format
#     data['Time'] = pd.to_datetime(data['Time'], unit='s')
    
#     # Determine the end of the dataset
#     data_end = data['Time'].max()
#     print(f'Data end: {data_end}')
#     test_from = data_end - timedelta(days=days_test)
#     print(f'Test from: {test_from}')

#     # Identify sessions for training and testing
#     session_max_times = data.groupby('SessionId')['Time'].max()
#     session_train = session_max_times[session_max_times <= test_from].index
#     session_test = session_max_times[session_max_times > test_from].index
    
#     print(f'Train set {session_train}')
#     print(f'test set {session_test}')
    
#     # Split the data into train and test sets
#     train = data[data['SessionId'].isin(session_train)]
#     trlength = train.groupby('SessionId').size()
#     train = train[train['SessionId'].isin(trlength[trlength >= 2].index)]
    
#     test = data[data['SessionId'].isin(session_test)]
#     test = test[test['ItemId'].isin(train['ItemId'])]
#     tslength = test.groupby('SessionId').size()
#     test = test[test['SessionId'].isin(tslength[tslength >= 2].index)]
    
#     print(f'Full train set\n\tEvents: {len(train)}\n\tSessions: {train["SessionId"].nunique()}\n\tItems: {train["ItemId"].nunique()}')
#     train.to_csv(f'{output_file}_train_full.txt', sep='\t', index=False)
    
#     print(f'Test set\n\tEvents: {len(test)}\n\tSessions: {test["SessionId"].nunique()}\n\tItems: {test["ItemId"].nunique()}')
#     test.to_csv(f'{output_file}_test.txt', sep='\t', index=False)

#     # Determine the end of the training set
#     data_end = train['Time'].max()
#     valid_from = data_end - timedelta(days=days_test)
    
#     # Identify sessions for training and validation within the training set
#     session_max_times = train.groupby('SessionId')['Time'].max()
#     session_train = session_max_times[session_max_times < valid_from].index
#     session_valid = session_max_times[session_max_times >= valid_from].index
    
#     train_tr = train[train['SessionId'].isin(session_train)]
#     valid = train[train['SessionId'].isin(session_valid)]
#     valid = valid[valid['ItemId'].isin(train_tr['ItemId'])]
#     tslength = valid.groupby('SessionId').size()
#     valid = valid[valid['SessionId'].isin(tslength[tslength >= 2].index)]
    
#     print(f'Train set\n\tEvents: {len(train_tr)}\n\tSessions: {train_tr["SessionId"].nunique()}\n\tItems: {train_tr["ItemId"].nunique()}')
#     train_tr.to_csv(f'{output_file}_train_tr.txt', sep='\t', index=False)
    
#     print(f'Validation set\n\tEvents: {len(valid)}\n\tSessions: {valid["SessionId"].nunique()}\n\tItems: {valid["ItemId"].nunique()}')
#     valid.to_csv(f'{output_file}_train_valid.txt', sep='\t', index=False)


# path_proc = 'data\processed datasets\diginetica\interactions'

# split_data(data, output_file=path_proc)

In [9]:
# NOTE(review): the next cell repeats these reads and writes and additionally
# keeps the three splits in variables, so this cell looks redundant.

# Forward slashes keep the paths valid on Windows, Linux and macOS alike
# (backslash separators only work on Windows).
premade_dir = 'data/premade datasets/diginetica'
processed_dir = 'data/processed datasets/diginetica'

# Session ids of each premade split, keyed by the file-name suffix that is
# shared between the premade input files and the processed output files.
split_ids = {}
for split in ('test', 'train_valid', 'train_tr'):
    split_ids[split] = pd.read_csv(f'{premade_dir}/train-item-views_{split}.txt',
                                   sep='\t',
                                   usecols=['SessionId'])['SessionId'].unique()

test_ids = split_ids['test']
validation_ids = split_ids['train_valid']
train_ids = split_ids['train_tr']
print(f'Test ids: {test_ids}')

# use the ids to split the data into train, validation and test sets and save them to files
for split, ids in split_ids.items():
    data[data['SessionId'].isin(ids)].to_csv(f'{processed_dir}/interactions_{split}.txt',
                                             sep='\t',
                                             index=False)

Test ids: [   289    290    302 ... 600404 600608 600674]


In [10]:
# Forward slashes keep the paths valid on Windows, Linux and macOS alike
# (backslash separators only work on Windows).
premade_dir = 'data/premade datasets/diginetica'
processed_dir = 'data/processed datasets/diginetica'

# Session ids that belong to each of the premade splits.
test_ids = pd.read_csv(f'{premade_dir}/train-item-views_test.txt',
                       sep='\t',
                       usecols=['SessionId'])['SessionId'].unique()
print(f'Test ids: {test_ids}')

validation_ids = pd.read_csv(f'{premade_dir}/train-item-views_train_valid.txt',
                             sep='\t',
                             usecols=['SessionId'])['SessionId'].unique()

train_ids = pd.read_csv(f'{premade_dir}/train-item-views_train_tr.txt',
                        sep='\t',
                        usecols=['SessionId'])['SessionId'].unique()

# use the ids to split the data into train, validation and test sets and save them to files
test = data[data['SessionId'].isin(test_ids)]
test.to_csv(f'{processed_dir}/interactions_test.txt', sep='\t', index=False)
validation = data[data['SessionId'].isin(validation_ids)]
validation.to_csv(f'{processed_dir}/interactions_train_valid.txt', sep='\t', index=False)
train = data[data['SessionId'].isin(train_ids)]
train.to_csv(f'{processed_dir}/interactions_train_tr.txt', sep='\t', index=False)

# Preview and shape of each split.
display('Test set:')
display(test.head())
display(test.shape)
display('Validation set:')
display(validation.head())
display(validation.shape)
display('Train set:')
display(train.head())
display(train.shape)

Test ids: [   289    290    302 ... 600404 600608 600674]


'Test set:'

Unnamed: 0,Type,SessionId,ItemId,Time,Date,Datestamp,TimeO,ItemSupport
1044,view,289,125013,1464221000.0,2016-05-26,1464221000.0,2016-05-26 00:00:18.301000+00:00,7
1045,view,289,64068,1464222000.0,2016-05-26,1464221000.0,2016-05-26 00:14:07.735000+00:00,6
1046,view,289,133346,1464222000.0,2016-05-26,1464221000.0,2016-05-26 00:14:38.934000+00:00,6
1050,view,289,198930,1464222000.0,2016-05-26,1464221000.0,2016-05-26 00:18:48.607000+00:00,7
1066,view,302,36202,1464221000.0,2016-05-26,1464221000.0,2016-05-26 00:00:45.583000+00:00,23


(64761, 8)

'Validation set:'

Unnamed: 0,Type,SessionId,ItemId,Time,Date,Datestamp,TimeO,ItemSupport
742,view,199,234568,1464134000.0,2016-05-25,1464134000.0,2016-05-25 00:00:09.275000+00:00,130
741,view,199,85571,1464134000.0,2016-05-25,1464134000.0,2016-05-25 00:00:45.383000+00:00,119
743,view,200,16486,1464135000.0,2016-05-25,1464134000.0,2016-05-25 00:01:48.048000+00:00,56
744,view,200,16486,1464135000.0,2016-05-25,1464134000.0,2016-05-25 00:02:22.826000+00:00,56
757,view,203,36271,1464134000.0,2016-05-25,1464134000.0,2016-05-25 00:00:37.001000+00:00,64


(47875, 8)

'Train set:'

Unnamed: 0,Type,SessionId,ItemId,Time,Date,Datestamp,TimeO,ItemSupport
3,view,1,9654,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:01:15.848000+00:00,74
5,view,1,33043,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:02:53.912000+00:00,41
2,view,1,32118,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:04:03.569000+00:00,19
6,view,1,12352,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:05:29.870000+00:00,79
7,view,1,35077,1462752000.0,2016-05-09,1462752000.0,2016-05-09 00:06:30.072000+00:00,47


(102272, 8)

In [11]:
# save the full data set to a file for EDA
# (forward slashes keep the path valid on Windows, Linux and macOS alike)
data.to_pickle('data/processed datasets/diginetica/interactions_full.pkl')