# Baseline Dataset

- This dataset is used to implement the baseline from the paper "Personalizing Session-based Recommendations with Hierarchical Recurrent Neural Networks" (see resources/papers/personalizing_session_based_rec.pdf).
- The dataset is generated from the OnlineShopTrafficTracking table in BigQuery.
- Bot traffic is removed using the user-agent list at https://github.com/monperrus/crawler-user-agents/blob/master/crawler-user-agents.json

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

TESTMODE = False
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=ResourceWarning)
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
import time
from io import StringIO
from dg_ml_core.datastores import gcs_utils
from dg_ml_core.file import get_file_handle, get_paths_with_prefix, save_to_file, file_exists, copy_file
from dg_ml_core.collections import dict_ops
from dg_ml_core.datastores import gcs_utils
import requests
import json
import csv
import random
import pprint
from statistics import mean, median, stdev

In [30]:
def get_bots_list():
    """Download the list of known crawler user-agent strings.

    The list is read from the ``crawler-user-agents`` project on GitHub. Each
    entry of the JSON document carries an ``instances`` list with concrete
    user-agent strings; all of them are flattened into a single list.

    Returns:
        list: every user-agent string found in the ``instances`` fields.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    url = 'https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json'
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    return [instance for entry in data for instance in entry.get('instances', [])]

## Execute Query in BQ

- Here we extract the relevant features out of the large collection of visits

In [4]:
# Select the rows whose controller is 'product' and action is 'show' and whose
# UserId is positive. The product id is taken from the 'id' entry of the
# ActionParameters key/value array.
query = """
SELECT (SELECT Value FROM UNNEST(ActionParameters) WHERE Key = 'id') as ProductId, LastLoggedInUserId, UserId, SessionId, UserAgent, Timestamp
FROM `dg-prod-personalization.PersonalizationDataV2.OnlineShopTrafficTracking` 
WHERE LOWER(ControllerName) = 'product' AND LOWER(ActionName) = 'show' AND UserId > 0
"""

# In TESTMODE only the 2019-02-11 partition is read; otherwise every
# partition before that date.
if TESTMODE:
    query += ' AND _PARTITIONTIME = TIMESTAMP("2019-02-11")'
else:
    query += ' AND _PARTITIONTIME < TIMESTAMP("2019-02-11")'
    
# Show the final query and leave a short window to interrupt the cell before
# the job is submitted.
print('Executing query {}. \nYou have 5 seconds to cancel...'.format(query))
time.sleep(5)

client = bigquery.Client()
dataset_ref = client.dataset('MAMuy', project='machinelearning-prod')
table_ref = dataset_ref.table('baseline_dataset')

# The result is written to the destination table; WRITE_TRUNCATE replaces any
# previous content of that table.
job_config = bigquery.job.QueryJobConfig(
    allow_large_results=True, 
    destination=table_ref,
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)

query_job = client.query(query, job_config=job_config, job_id_prefix='baseline_dataset_query_', location='EU')
print('Running Job {}'.format(query_job.job_id))
# result() waits for the query job to complete.
query_job.result()

print('Query execution done')

Executing query 
SELECT (SELECT Value FROM UNNEST(ActionParameters) WHERE Key = 'id') as ProductId, LastLoggedInUserId, UserId, SessionId, UserAgent, Timestamp
FROM `dg-prod-personalization.PersonalizationDataV2.OnlineShopTrafficTracking` 
WHERE LOWER(ControllerName) = 'product' AND LOWER(ActionName) = 'show' AND UserId > 0
 AND _PARTITIONTIME < TIMESTAMP("2019-02-11"). 
You have 5 seconds to cancel...
Running Job baseline_dataset_query_df2b2811-b0bd-440f-9371-988a33ccd80e


<google.cloud.bigquery.table.RowIterator at 0x7fad64ce9da0>

Query execution done


## Extract to GCS

- Extract the table containing the relevant features to GCS

In [5]:
# The wildcard in the URI allows BigQuery to split the export into several
# CSV files under the raw/ prefix.
destination_uri = 'gs://ma-muy/baseline_dataset/raw/*.csv'

client = bigquery.Client()
dataset_ref = client.dataset('MAMuy', project='machinelearning-prod')
table_ref = dataset_ref.table('baseline_dataset')

# Export the table written by the query step to Cloud Storage.
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    location='EU',
    job_id_prefix='baseline_dataset_extract_')

print('Running Job {}'.format(extract_job.job_id))
# result() waits for the extract job to complete.
extract_job.result()
print('Extraction done')

Running Job baseline_dataset_extract_42af8c64-687a-4897-a333-1ec49119f06c


<google.cloud.bigquery.job.ExtractJob at 0x7fad64d8c630>

Extraction done


## Clean data

- Here we clean the data.
- Specifically there are two steps:
  - Clean out bot visits
  - Merge LastLoggedInUserId and UserId

In [None]:
def clean_dataset(source, target, bots=None):
    """Clean one raw CSV shard and write the reduced CSV to ``target``.

    Steps, in order:
      1. Drop rows that have no ProductId.
      2. Drop rows whose UserAgent is one of the known bot user agents.
      3. Replace missing numeric values with -1.
      4. Where UserId is missing (-1), fall back to LastLoggedInUserId.
      5. Write the columns UserId, ProductId, SessionId, Timestamp.

    Rows without a ProductId must be dropped *before* missing values are
    replaced by -1; otherwise the drop never matches anything and such rows
    end up in the output with ProductId -1.

    Args:
        source: path or file-like object accepted by ``pd.read_csv``.
        target: path or file-like object accepted by ``DataFrame.to_csv``.
        bots: optional iterable of bot user-agent strings. When omitted the
            list is downloaded via ``get_bots_list()``; pass it explicitly to
            avoid one download per shard.

    Returns:
        The ``target`` argument, unchanged (callers read the written content
        back from it when it is a ``StringIO``).
    """
    col_types = {"ProductId": 'Float64',
                 "UserId": 'Float64',
                 "UserAgent": str,
                 "LastLoggedInUserId": 'Float64',
                 "SessionId": 'Float64',
                 "Timestamp": 'Float64'}
    numeric_cols = ["ProductId", "UserId", "LastLoggedInUserId", "SessionId", "Timestamp"]

    if bots is None:
        bots = get_bots_list()

    df = pd.read_csv(source, dtype=col_types)

    # Must happen while missing values are still NA (see docstring).
    df = df.dropna(subset=["ProductId"])

    # A missing UserAgent never matches a bot entry, so such rows are kept.
    # copy() gives an independent frame so the assignments below are unambiguous.
    df = df[~df.UserAgent.isin(bots)].copy()

    # Only the numeric columns get the -1 sentinel; UserAgent is not written out.
    df = df.fillna({col: -1 for col in numeric_cols})

    no_user_id_mask = df.UserId == -1
    df.loc[no_user_id_mask, 'UserId'] = df.loc[no_user_id_mask, 'LastLoggedInUserId']

    df.to_csv(target, index=False, columns=['UserId', 'ProductId', 'SessionId', 'Timestamp'])

    return target

##################################################################

# Run the cleaning step: on a local example file in TESTMODE, otherwise on
# every raw shard in GCS, writing one cleaned CSV per shard.
if TESTMODE:
    print('Processing example.csv')
    _ = clean_dataset('example.csv', 'example_clean.csv')

else:
    raw_data_prefix = 'gs://ma-muy/baseline_dataset/raw/'
    cleaned_data_prefix = 'gs://ma-muy/baseline_dataset/clean/'

    client = storage.Client.from_service_account_json('../../service-account.json')

    raw_paths = get_paths_with_prefix(raw_data_prefix)
    # Progress bar header sized from the shards of THIS step. The previous
    # code used `paths`, which is only defined by a later cell and raised a
    # NameError on a fresh kernel.
    print('Processing' + ' '*(len(raw_paths) - 11) + '|')
    for raw_path in raw_paths:
        clean_path = cleaned_data_prefix + gcs_utils.get_file_name(raw_path)

        print('=', end='', flush=True)
        source = get_file_handle(raw_path, gcs_client=client)

        # clean_dataset writes into the in-memory buffer and returns it.
        target = clean_dataset(source, StringIO())

        save_to_file(clean_path, target.getvalue(), gcs_client=client)

## Merge Sessions

- In this step we merge the individual visit events into sessions.
- Furthermore, we group all sessions by the user they belong to.

In [None]:
def merge_sessions(reader):
    """Group event rows into sessions, and sessions by user.

    Args:
        reader: iterable of mappings with the keys 'UserId', 'SessionId',
            'ProductId' and 'Timestamp' (e.g. a ``csv.DictReader``). Values
            may be float-formatted strings such as '12.0'; they are truncated
            to integers.

    Returns:
        dict: ``{user_id: {session_id: {'Events': [...], 'StartTime': int}}}``
        where user and session ids are strings, every event is
        ``{'ProductId': int, 'Timestamp': int}`` in input order, and
        'StartTime' is the smallest event timestamp of the session.

    Only the nested dict is returned. The previous version additionally
    returned three names that are not defined inside this function, which
    raised a NameError unless they happened to exist as globals, while the
    callers store the result as a single dict.
    """
    sessions_by_user = dict()
    for row in reader:
        user_id = str(int(float(row['UserId'])))
        session_id = str(int(float(row['SessionId'])))
        product_id = int(float(row['ProductId']))
        timestamp = int(float(row['Timestamp']))

        user_sessions = sessions_by_user.setdefault(user_id, dict())
        session = user_sessions.setdefault(session_id, {'Events': []})

        session['Events'].append(
            {
                "ProductId": product_id,
                "Timestamp": timestamp
            })

        # Running minimum instead of rescanning all events for every row.
        session['StartTime'] = min(session.get('StartTime', timestamp), timestamp)
    return sessions_by_user

##################################################################

# Run the session merge: on the local cleaned example in TESTMODE, otherwise
# on every cleaned shard in GCS, writing one JSON file per shard.
if TESTMODE:
    # NOTE(review): these sets are never filled in this cell; they are kept
    # only because other code may still reference the global names.
    unique_products = set()
    unique_users = set()
    unique_sessions = set()

    # The context manager closes the file once all rows are consumed.
    with open('example_clean.csv', newline='') as clean_file:
        sessions_by_user = merge_sessions(csv.DictReader(clean_file))

    dict_ops.save_dict('example_merged.json', sessions_by_user)

else:
    cleaned_data_prefix = 'gs://ma-muy/baseline_dataset/clean/'
    merged_data_prefix = 'gs://ma-muy/baseline_dataset/merged/'

    client = storage.Client.from_service_account_json('../../service-account.json')

    clean_paths = get_paths_with_prefix(cleaned_data_prefix)
    # Progress bar header sized from the shards of THIS step. The previous
    # code used `paths`, which is only defined by a later cell and raised a
    # NameError on a fresh kernel.
    print('Processing' + ' '*(len(clean_paths) - 11) + '|')
    for clean_path in clean_paths:
        merged_path = (merged_data_prefix + gcs_utils.get_file_name(clean_path)).replace('csv', 'json')

        print('=', end='', flush=True)
        source = get_file_handle(clean_path, gcs_client=client)
        reader = csv.DictReader(source)

        sessions_by_user = merge_sessions(reader)

        dict_ops.save_dict(merged_path, sessions_by_user, gcs_client=client)

## Merge shards

- At this point we have several shards, each containing sessions aggregated to the user level.
- Merging the shards is the most time-consuming part of the data generation process.
- We need to merge all sessions of a specific user into one data structure.
- In production we will be dealing with daily shards, which makes generating the dataset easier.
- In this case, however, we are dealing with full exports, so we cannot assume that a shard covers exactly one day.

In [None]:
def _merge_unique_events(existing_events, new_events):
    """Concatenate two event lists, drop exact duplicates, order by Timestamp."""
    seen = set()
    merged = []
    for event in existing_events + new_events:
        # Events are plain dicts and therefore unhashable; their canonical
        # JSON form serves as the de-duplication key.
        key = json.dumps(event, sort_keys=True)
        if key not in seen:
            seen.add(key)
            merged.append(event)
    # Deterministic, chronological order. De-duplicating through a set of
    # strings left the events in arbitrary order.
    merged.sort(key=lambda event: int(event['Timestamp']))
    return merged


def generate_sessions_by_user(shard, merged_shards_prefix, num_target_files):
    """Distribute the users of one shard over ``num_target_files`` JSON files.

    A user is written to file ``<merged_shards_prefix><i>.json`` with
    ``i = int(user_id) % num_target_files``, so all sessions of a user from
    all shards end up in the same file. Users whose id is not positive are
    skipped. Every target file is loaded if it exists, updated, and saved
    again, including files that receive no user from this shard.

    When a session id is already present for a user, the event lists are
    merged: exact duplicates are removed, the events are ordered by
    Timestamp, and 'StartTime' is set to the smallest timestamp.

    Args:
        shard: ``{user_id: {session_id: {'Events': [...], 'StartTime': ...}}}``
            with string user ids that parse as integers.
        merged_shards_prefix: path prefix of the target files.
        num_target_files: number of target files (buckets).
    """
    gcs_client = storage.Client.from_service_account_json('../../service-account.json')

    # Bucket the user ids in one pass instead of re-filtering all shard keys
    # once per target file.
    user_ids_by_bucket = dict()
    for user_id in shard:
        numeric_id = int(user_id)
        if numeric_id > 0:
            user_ids_by_bucket.setdefault(numeric_id % num_target_files, []).append(user_id)

    for i in range(num_target_files):
        path = merged_shards_prefix + str(i) + '.json' # Add a datestamp hierarchy

        # NOTE(review): file_exists is called without a GCS client while
        # load_dict/save_dict receive one; fine for local prefixes, confirm
        # for gs:// prefixes.
        if file_exists(path):
            output_dict = dict_ops.load_dict(path, gcs_client=gcs_client)
        else:
            output_dict = dict()

        for user_id in user_ids_by_bucket.get(i, []):
            for session_id, session in shard[user_id].items():
                user_sessions = output_dict.setdefault(user_id, dict())

                if session_id not in user_sessions:
                    user_sessions[session_id] = session
                else:
                    merged_events = _merge_unique_events(
                        user_sessions[session_id]['Events'], session['Events'])
                    user_sessions[session_id]['Events'] = merged_events
                    user_sessions[session_id]['StartTime'] = min(
                        int(event['Timestamp']) for event in merged_events)

        dict_ops.save_dict(path, output_dict, gcs_client=gcs_client)
    
##################################################################

# Number of output files the users are distributed over (user id modulo).
NUM_TARGET_FILES = 100
if TESTMODE:
    # The context manager closes the example file after parsing.
    with open('example_merged.json') as merged_file:
        shard = json.load(merged_file)
    generate_sessions_by_user(shard, 'sessions_by_user/', NUM_TARGET_FILES)
else:
    merged_data_prefix = 'gs://ma-muy/baseline_dataset/merged/'
    sessions_by_user_prefix = 'gs://ma-muy/baseline_dataset/sessions_by_user/'
    temp_sessions_by_user_prefix = 'temp_sessions_by_user/'

    client = storage.Client.from_service_account_json('../../service-account.json')

    merged_paths = get_paths_with_prefix(merged_data_prefix)
    # Progress bar header sized from the shards of THIS step. The previous
    # code used `paths`, which is only defined by a later cell and raised a
    # NameError on a fresh kernel.
    print('Processing' + ' '*(len(merged_paths) - 11) + '|')
    # NOTE(review): only the last two shards are processed. This looks like a
    # leftover from resuming an interrupted run; iterate over all of
    # `merged_paths` when building the dataset from scratch.
    for merged_path in merged_paths[-2:]:

        print('=', end='', flush=True)
        # Pass the authenticated client, as the other stages do for gs:// paths.
        source = dict_ops.load_dict(merged_path, gcs_client=client)

        generate_sessions_by_user(source, temp_sessions_by_user_prefix, NUM_TARGET_FILES)

    # The per-bucket files are built under the local temp prefix first and
    # copied to GCS once all shards have been merged.
    temp_files = get_paths_with_prefix(temp_sessions_by_user_prefix)

    print('Uploading Files')

    for temp_file in temp_files:
        print('=', end='')
        file_name = temp_file.rsplit('/', 1)[1]
        # Skip entries whose name contains 'ipynb' (e.g. notebook checkpoints).
        if 'ipynb' not in file_name:
            target_uri = sessions_by_user_prefix + file_name
            copy_file(temp_file, target_uri, gcs_client=client)

## Preprocess Data & Collect Statistics

- At this point we have the data in a form that is nice for preprocessing.
- During preprocessing we can also collect dataset stats
- According to the paper there are several steps to preprocess the session data:
    - Remove Items with low support (which threshold to use? 10 vs. 20)
    - Remove sessions with less than 3 items
    - Remove users with less than 5 sessions

In [3]:
def filter_sessions_and_users(input_dict, 
                              min_sessions_per_user, 
                              min_events_per_session, 
                              events_per_products):
    """Drop short sessions and users with too few sessions; count product events.

    Args:
        input_dict: ``{user_id: {session_id: {'Events': [...], ...}}}``.
        min_sessions_per_user: a user is kept only if at least this many of
            their sessions survive the session filter.
        min_events_per_session: a session is kept only if it has at least
            this many events.
        events_per_products: running ``{product_id: count}`` dict; it is
            updated in place with the events of every kept session.

    Returns:
        tuple: ``(output_dict, events_per_products)`` where ``output_dict``
        has the same nesting as ``input_dict`` restricted to the kept users
        and sessions, and ``events_per_products`` is the object passed in.
    """
    output_dict = {}

    for user_id, sessions in input_dict.items():
        # Sessions of this user that are long enough.
        long_enough = {
            session_id: session
            for session_id, session in sessions.items()
            if len(session['Events']) >= min_events_per_session
        }

        # Users with too few remaining sessions contribute nothing.
        if len(long_enough) < min_sessions_per_user:
            continue

        output_dict[user_id] = long_enough
        for session in long_enough.values():
            for event in session['Events']:
                product_id = event['ProductId']
                events_per_products[product_id] = events_per_products.get(product_id, 0) + 1

    return output_dict, events_per_products

##################################################################

# Running product -> event count, accumulated over all input files.
events_per_product = dict()

sessions_by_user_prefix = 'gs://ma-muy/baseline_dataset/sessions_by_user/'
    
client = storage.Client.from_service_account_json('../../service-account.json')
paths = get_paths_with_prefix(sessions_by_user_prefix)

# Thresholds passed to filter_sessions_and_users.
min_sessions_per_user = 5
min_events_per_session = 3
print('Processing' + ' '*(len(paths) - 11) + '|')
for path in paths:
    
    print('=', end='', flush=True)
    input_dict = dict_ops.load_dict(path, gcs_client=client)
    
    output_dict, events_per_product = filter_sessions_and_users(input_dict,
                                                            min_sessions_per_user,
                                                            min_events_per_session,
                                                            events_per_product)
    
    # Same file name, written under the 'filtered_users_and_sessions' folder.
    output_path = path.replace('sessions_by_user', 'filtered_users_and_sessions')
    dict_ops.save_dict(output_path, output_dict, gcs_client=client)
    
# Saved locally; the next cell loads it to find products with low support.
dict_ops.save_dict('events_per_product.json', events_per_product)

Processing                                                                                        |

In [11]:
def filter_products_and_collect_stats(input_dict, dataset_stats, products_to_filter, min_events_per_session, events_per_product):
    """Remove events of filtered products, drop sessions that become too short, collect stats.

    Args:
        input_dict: ``{user_id: {session_id: {'Events': [...], ...}}}``.
        dataset_stats: dict with the keys 'num_users' (int),
            'events_per_session' (list) and 'sessions_per_user' (list);
            updated in place.
        products_to_filter: container of product ids whose events are removed.
        min_events_per_session: a session is kept only if at least this many
            events remain after removing the filtered products.
        events_per_product: ``{str(product_id): count}``; for every dropped
            session the count of each of its products still present in this
            dict is decremented, in place.

    Returns:
        tuple: ``(output_dict, dataset_stats, events_per_product)``. Every
        kept session holds the remaining 'Events' and a 'StartTime' equal to
        the smallest remaining timestamp.

    NOTE(review): users left with zero sessions still get an (empty) entry in
    the output and are counted in 'num_users' and 'sessions_per_user'.
    """
    output_dict = {}
    dataset_stats['num_users'] += len(input_dict)

    for user_id, sessions in input_dict.items():
        kept_sessions = {}
        output_dict[user_id] = kept_sessions

        for session_id, session in sessions.items():
            remaining = [
                event for event in session['Events']
                if event['ProductId'] not in products_to_filter
            ]

            if len(remaining) < min_events_per_session:
                # The whole session is dropped, so the counts contributed by
                # all of its original events are taken back.
                for event in session['Events']:
                    product_key = str(event['ProductId'])
                    if product_key in events_per_product:
                        events_per_product[product_key] -= 1
                continue

            kept_sessions[session_id] = {
                'Events': remaining,
                'StartTime': min(int(event['Timestamp']) for event in remaining),
            }
            dataset_stats['events_per_session'].append(len(remaining))

        dataset_stats['sessions_per_user'].append(len(kept_sessions))

    return output_dict, dataset_stats, events_per_product

##################################################################

# Thresholds: products with fewer events than min_events_per_product are
# removed; sessions must keep at least min_events_per_session events.
min_events_per_product = 5
min_events_per_session = 3

client = storage.Client.from_service_account_json('../../service-account.json')
preprocessed_prefix = 'gs://ma-muy/baseline_dataset/filtered_users_and_sessions/'
events_per_product = dict_ops.load_dict('events_per_product.json')
# Keys of the loaded dict are converted with int() so they can be compared
# with the ProductId values of the events.
products_to_filter = set(list(map(lambda x: int(x), filter(lambda x: events_per_product[x] < min_events_per_product, events_per_product))))

print("Total Products:", len(events_per_product.keys()))
print("Products with low support:", len(products_to_filter))
      
# Remove the low-support products from the counts as well.
for product_id in products_to_filter:
    _ = events_per_product.pop(str(product_id), None)
    
print("Products with enough support:", len(events_per_product.keys()))

# Accumulators filled by filter_products_and_collect_stats across all files.
dataset_stats = dict()
dataset_stats['num_users'] = 0
dataset_stats['events_per_session'] = []
dataset_stats['sessions_per_user'] = []

paths = get_paths_with_prefix(preprocessed_prefix)
print('Processing' + ' '*(len(paths) - 11) + '|')
for path in paths:
    print('=', end='', flush=True)
    input_dict = dict_ops.load_dict(path, gcs_client=client)
    
    output_dict, dataset_stats, events_per_product = filter_products_and_collect_stats(
        input_dict, 
        dataset_stats, 
        products_to_filter, 
        min_events_per_session,
        events_per_product)
    
    # Same file name, written under the 'filtered_products' folder.
    output_path = path.replace('filtered_users_and_sessions', 'filtered_products')
    dict_ops.save_dict(output_path, output_dict, gcs_client=client)
print('')

# Dropping sessions decrements product counts; products whose count reached
# zero (or below) are removed before the statistics are computed.
products_with_no_events = list(filter(lambda x: events_per_product[x] <= 0, events_per_product))
for product_id in products_with_no_events:
    if events_per_product[product_id] <= 0:
        _ = events_per_product.pop(product_id, None)

dataset_stats['events_per_product'] = events_per_product

# Each raw list is reduced to summary values and then deleted so that only
# scalars remain in the printed JSON.
# NOTE: statistics.stdev raises StatisticsError for fewer than two values.
dataset_stats['num_sessions'] = sum(dataset_stats['sessions_per_user'])
dataset_stats['median_sessions_per_user'] = median(dataset_stats['sessions_per_user'])
dataset_stats['mean_sessions_per_user'] = mean(dataset_stats['sessions_per_user'])
dataset_stats['std_sessions_per_user'] = stdev(dataset_stats['sessions_per_user'])
del dataset_stats['sessions_per_user']

dataset_stats['median_events_per_product'] = median(dataset_stats['events_per_product'].values())
dataset_stats['mean_events_per_product'] = mean(dataset_stats['events_per_product'].values())
dataset_stats['std_events_per_product'] = stdev(dataset_stats['events_per_product'].values())
dataset_stats['num_products'] = len(dataset_stats['events_per_product'].keys())
del dataset_stats['events_per_product']

dataset_stats['num_events'] = sum(dataset_stats['events_per_session'])
dataset_stats['median_events_per_session'] = median(dataset_stats['events_per_session'])
dataset_stats['mean_events_per_session'] = mean(dataset_stats['events_per_session'])
dataset_stats['std_events_per_session'] = stdev(dataset_stats['events_per_session'])
del dataset_stats['events_per_session']

print(json.dumps(dataset_stats, indent=2))

Total Products: 1126156
Products with low support: 583796
Products with enough support: 542360
Processing                                                                                         |
{
  "num_users": 307526,
  "num_sessions": 6297389,
  "median_sessions_per_user": 11.0,
  "mean_sessions_per_user": 20.47758238327816,
  "std_sessions_per_user": 31.808828447449194,
  "median_events_per_product": 18.0,
  "mean_events_per_product": 84.3949950769435,
  "std_events_per_product": 441.8918137355163,
  "num_products": 542346,
  "num_events": 45771288,
  "median_events_per_session": 5,
  "mean_events_per_session": 7.268296114469028,
  "std_events_per_session": 87.38013699177742
}


## Generate Embedding Dictionary

- Since we will be using a one-hot encoding or later an embedding for the products we need to map those to a contiguous Id space
- Therefore we need to go through all the products and generate an embedding dictionary, and add this feature to the features generated in the next step

In [12]:
def generate_embedding_ids(input_dict, embedding_dict, next_embedding_id):
    """Attach a contiguous 'EmbeddingId' to every event.

    Product ids are mapped to consecutive integers in order of first
    appearance. The mapping is kept in both directions in ``embedding_dict``,
    which is updated in place.

    Args:
        input_dict: ``{user_id: {session_id: {'StartTime': ..., 'Events': [...]}}}``.
            The event dicts themselves are modified (they gain 'EmbeddingId').
        embedding_dict: dict with the sub-dicts 'ToEmbedding'
            (``str(product_id) -> embedding id``) and 'FromEmbedding'
            (``str(embedding id) -> product_id``).
        next_embedding_id: first id to hand out to a not-yet-seen product.

    Returns:
        tuple: ``(output_dict, embedding_dict, next_embedding_id)`` where
        ``output_dict`` mirrors the input nesting and ``next_embedding_id``
        is the next unused id.
    """
    to_embedding = embedding_dict['ToEmbedding']
    from_embedding = embedding_dict['FromEmbedding']

    output_dict = {}
    for user_id, sessions in input_dict.items():
        user_out = {}
        output_dict[user_id] = user_out

        for session_id, session in sessions.items():
            tagged_events = []
            user_out[session_id] = {
                'StartTime': session['StartTime'],
                'Events': tagged_events,
            }

            for event in session['Events']:
                product_key = str(event['ProductId'])
                if product_key not in to_embedding:
                    # First time this product is seen: register both directions.
                    to_embedding[product_key] = next_embedding_id
                    from_embedding[str(next_embedding_id)] = event['ProductId']
                    next_embedding_id += 1
                event['EmbeddingId'] = to_embedding[product_key]
                tagged_events.append(event)

    return output_dict, embedding_dict, next_embedding_id

##################################################################

filtered_products_prefix = 'gs://ma-muy/baseline_dataset/filtered_products/'
embedded_prefix = 'gs://ma-muy/baseline_dataset/embedded_products/'

client = storage.Client.from_service_account_json('../../service-account.json')
    
# One mapping shared across all files, so a product gets the same embedding
# id no matter which file it appears in.
embedding_dict = dict()
embedding_dict['ToEmbedding'] = dict()
embedding_dict['FromEmbedding'] = dict()
next_embedding_id = 0

paths = get_paths_with_prefix(filtered_products_prefix)
print('Processing' + ' '*(len(paths) - 11) + '|')
for path in paths:

    print('=', end='', flush=True)
    file_name = gcs_utils.get_file_name(path)

    input_dict = dict_ops.load_dict(path, gcs_client=client)

    # The mapping and the id counter are carried over to the next file.
    output_dict, embedding_dict, next_embedding_id = generate_embedding_ids(input_dict, embedding_dict, next_embedding_id)

    dict_ops.save_dict(embedded_prefix + file_name, output_dict, gcs_client=client)
print('')
# Ids start at 0, so this value equals the number of distinct products mapped.
print('Next EmbeddingId:', next_embedding_id)
dict_ops.save_dict('gs://ma-muy/baseline_dataset/embedding_dict.json', embedding_dict, gcs_client=client)

Processing                                                                                         |
Next EmbeddingId: 542346


## Generate Train, Validation and Test Dataset

- For each user keep the last session in the test set
- The rest represents the training set
- The last session of users in the training set is extracted again and used as the validation set

In [13]:
def split_training_set(input_dict):
    """Split every user's sessions chronologically into train / eval / test.

    Sessions are ordered by 'StartTime'. For each user the last session goes
    to the test set, the second-to-last session to the eval set, and all
    earlier sessions to the train set. A user with a single session only
    appears (non-empty) in the test set.

    Args:
        input_dict: ``{user_id: {session_id: {'StartTime': ..., ...}}}``.

    Returns:
        tuple: ``(train_set, eval_set, test_set)``, each with the same
        nesting as ``input_dict``. Every user id is present in all three
        dicts, possibly with an empty session dict.
    """
    train_set = dict()
    eval_set = dict()
    test_set = dict()
    for user_id, sessions in input_dict.items():
        train_set[user_id] = dict()
        eval_set[user_id] = dict()
        test_set[user_id] = dict()

        sorted_sessions = sorted(sessions.items(), key=lambda item: item[1]['StartTime'])
        last_idx = len(sorted_sessions) - 1
        for idx, (session_id, session) in enumerate(sorted_sessions):
            if idx == last_idx:
                test_set[user_id][session_id] = session
            # Compare against the number of sessions. The previous code used
            # the length of the (id, session) tuple, which is always 2, so
            # the FIRST session was sent to the eval set.
            elif idx == last_idx - 1:
                eval_set[user_id][session_id] = session
            else:
                train_set[user_id][session_id] = session

    return train_set, eval_set, test_set

##################################################################

# Input folder and the three output folders of the split.
embedded_prefix = 'gs://ma-muy/baseline_dataset/embedded_products/'
train_prefix = 'gs://ma-muy/baseline_dataset/train/'
eval_prefix = 'gs://ma-muy/baseline_dataset/eval/'
test_prefix = 'gs://ma-muy/baseline_dataset/test/'

client = storage.Client.from_service_account_json('../../service-account.json')
paths = get_paths_with_prefix(embedded_prefix)
print('Processing' + ' '*(len(paths) - 11) + '|')
for path in paths:
    print('=', end='', flush=True)
    file_name = gcs_utils.get_file_name(path)
    
    input_dict = dict_ops.load_dict(path, gcs_client=client)
    
    train_set, eval_set, test_set = split_training_set(input_dict)
    
    # Each input file yields one file of the same name in every output folder.
    dict_ops.save_dict(train_prefix + file_name, train_set, gcs_client=client)
    dict_ops.save_dict(eval_prefix + file_name, eval_set, gcs_client=client)
    dict_ops.save_dict(test_prefix + file_name, test_set, gcs_client=client)

Processing                                                                                         |