# Baseline Dataset

- This dataset is used to implement the baseline from the paper "Personalizing Session based Recommendations with Hierarchical RNNs" -> resources/papers/personalizing_session_based_rec.pdf
- This dataset is generated from the OnlineShopTrafficTracking Table in BigQuery
- Clean out bots using: https://github.com/monperrus/crawler-user-agents/blob/master/crawler-user-agents.json

In [34]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Switch read by the cells below: when True the query is limited to a single
# partition and the clean/merge steps work on local example files instead of GCS.
TESTMODE = True
import warnings
# Suppress UserWarning and ResourceWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=ResourceWarning)
from google.cloud import bigquery
from google.cloud import storage
import pandas as pd
import time
from io import StringIO
# Project helper module used below for listing, downloading and uploading GCS objects.
from dg_ml_core.datastores import gcs_utils
import requests
import json
import csv

In [36]:
def get_bots_list():
    """Download the list of known crawler user-agent strings.

    Fetches ``crawler-user-agents.json`` from the monperrus/crawler-user-agents
    repository and flattens the ``instances`` list of every entry into one list.

    Returns:
        list: every element of every entry's ``instances`` list, in file order.

    Raises:
        requests.RequestException: if the download fails, times out or is
            answered with an HTTP error status.
    """
    url = 'https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json'
    # Bound the wait and fail on 4xx/5xx responses instead of handing an
    # error page to the JSON parser.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = json.loads(response.content)
    return [instance for entry in data for instance in entry['instances']]

def clean_dataset(source, target, bots=None):
    """Remove bot traffic from a raw export and resolve missing user ids.

    Args:
        source: path or file-like object readable by ``pandas.read_csv``. It
            must provide the columns ProductId, UserId, UserAgent,
            LastLoggedInUserId, SessionId and Timestamp.
        target: path or file-like object the cleaned CSV is written to.
        bots: optional collection of user-agent strings to drop. When omitted
            the list is downloaded with ``get_bots_list()``; pass it explicitly
            to avoid one download per processed file.

    Returns:
        The ``target`` argument, unchanged.
    """
    int_columns = ["ProductId", "UserId", "LastLoggedInUserId", "SessionId", "Timestamp"]

    df = pd.read_csv(source)
    # -1 marks a missing value in the integer columns only. UserAgent is left
    # as read: a missing user agent can never match a bot entry anyway.
    df = df.fillna({column: -1 for column in int_columns})
    df = df.astype({column: int for column in int_columns})

    if bots is None:
        bots = get_bots_list()
    # Copy explicitly so the assignment below modifies an independent frame
    # rather than a filtered view of the original one.
    df = df[~df["UserAgent"].isin(bots)].copy()

    # Rows without a UserId fall back to the last logged-in user id.
    no_user_id_mask = df["UserId"] == -1
    df.loc[no_user_id_mask, "UserId"] = df.loc[no_user_id_mask, "LastLoggedInUserId"]

    df.to_csv(target, index=False, columns=["UserId", "ProductId", "SessionId", "Timestamp"])

    return target

def merge_sessions(reader):
    """Group single visit events into sessions, and sessions by user.

    Args:
        reader: iterable of mappings with the keys ``UserId``, ``SessionId``,
            ``ProductId`` and ``Timestamp`` (for example a ``csv.DictReader``).

    Returns:
        dict: ``{user_id: {session_id: {"Events": [...], "StartTime": int}}}``
        where each event is ``{"ProductId": int, "Timestamp": int}`` in input
        order and ``StartTime`` is the smallest event timestamp of the session.
        User and session ids are kept exactly as the reader yields them.
    """
    sessions_by_user = dict()
    for row in reader:
        timestamp = int(row['Timestamp'])
        event = {
            "ProductId": int(row['ProductId']),
            "Timestamp": timestamp
        }

        user_sessions = sessions_by_user.setdefault(row['UserId'], dict())
        session = user_sessions.get(row['SessionId'])
        if session is None:
            # First event of this session: it defines the initial start time.
            user_sessions[row['SessionId']] = {'Events': [event], 'StartTime': timestamp}
            continue

        session['Events'].append(event)
        # Keep a running minimum instead of rescanning all events per row.
        if timestamp < session['StartTime']:
            session['StartTime'] = timestamp
    return sessions_by_user

## Execute Query in BQ

- Here we extract the relevant features out of the large collection of visits

In [29]:
# Select product-page views ('product' controller, 'show' action) together with
# the user, session, user-agent and timestamp columns needed downstream.
query = """
SELECT (SELECT Value FROM UNNEST(ActionParameters) WHERE Key = 'id') as ProductId, LastLoggedInUserId, UserId, SessionId, UserAgent, Timestamp
FROM `dg-prod-personalization.PersonalizationDataV2.OnlineShopTrafficTracking` 
WHERE LOWER(ControllerName) = 'product' AND LOWER(ActionName) = 'show'
"""

# In test mode restrict the query to a single partition. The base query ends
# with a newline, so the appended condition extends its WHERE clause.
if TESTMODE:
    query += ' AND _PARTITIONTIME = TIMESTAMP("2019-02-11")'
    
# Show the final query and pause so the run can be interrupted before it starts.
print('Executing query {}. \nYou have 5 seconds to cancel...'.format(query))
time.sleep(5)

client = bigquery.Client()
dataset_ref = client.dataset('MAMuy', project='machinelearning-prod')
table_ref = dataset_ref.table('baseline_dataset')

# Write the result into the destination table, replacing its previous contents.
job_config = bigquery.job.QueryJobConfig(
    allow_large_results=True, 
    destination=table_ref,
    write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)

query_job = client.query(query, job_config=job_config, job_id_prefix='baseline_dataset_query_', location='EU')
print('Running Job {}'.format(query_job.job_id))
# Block until the job has finished.
query_job.result()

print('Query execution done')

Executing query 
SELECT (SELECT Value FROM UNNEST(ActionParameters) WHERE Key = 'id') as ProductId, LastLoggedInUserId, UserId, SessionId, UserAgent, Timestamp
FROM `dg-prod-personalization.PersonalizationDataV2.OnlineShopTrafficTracking` 
WHERE LOWER(ControllerName) = 'product' AND LOWER(ActionName) = 'show'
 AND _PARTITIONTIME = TIMESTAMP("2019-02-11"). 
You have 5 seconds to cancel...
Running Job baseline_dataset_query_46a37433-e183-435b-aa6b-ca40170d95ab


<google.cloud.bigquery.table.RowIterator at 0x7fe32584ee48>

Query execution done


## Extract to GCS

- Extract the table containing the relevant features to GCS

In [30]:
# Wildcard destination for the exported CSV files.
# NOTE(review): presumably BigQuery writes one numbered file per shard here - confirm.
destination_uri = 'gs://ma-muy/baseline_dataset/raw/*.csv'

client = bigquery.Client()
# Same table the query cell writes its result to.
dataset_ref = client.dataset('MAMuy', project='machinelearning-prod')
table_ref = dataset_ref.table('baseline_dataset')

extract_job = client.extract_table(
    table_ref,
    destination_uri,
    location='EU',
    job_id_prefix='baseline_dataset_extract_')

print('Running Job {}'.format(extract_job.job_id))
# Block until the export has finished.
extract_job.result()
print('Extraction done')

Running Job baseline_dataset_extract_78ed9046-82b1-4de6-9707-8cbcec1225e0


<google.cloud.bigquery.job.ExtractJob at 0x7fe3268db2b0>

Extraction done


## Clean data

- Here we clean the data.
- Specifically there are two steps:
  - Clean out bot visits
  - Merge LastLoggedInUserId and UserId

In [None]:
# Clean either the local example file (test mode) or every raw shard in GCS.
if TESTMODE:
    print('Processing example.csv')
    # clean_dataset returns its target argument, so `df` holds the output path
    # 'example_clean.csv' here, not a DataFrame.
    df = clean_dataset('example.csv', 'example_clean.csv')
    
else:
    raw_data_prefix = 'gs://ma-muy/baseline_dataset/raw/'
    cleaned_data_prefix = 'gs://ma-muy/baseline_dataset/clean/'
    
    client = storage.Client()
    
    # NOTE(review): presumably returns one URI per object under the prefix - confirm in gcs_utils.
    raw_uris = gcs_utils.get_uris_with_prefix(raw_data_prefix, storage_client=client)
    
    for raw_uri in raw_uris:
        # The cleaned object keeps the raw object's file name under the clean/ prefix.
        clean_uri = cleaned_data_prefix + gcs_utils.get_file_name(raw_uri)

        print('Downloading {}'.format(raw_uri))
        # Process the shard in memory: wrap the downloaded text and collect
        # the cleaned CSV in a second in-memory buffer.
        source = StringIO(gcs_utils.download_string(raw_uri))
        target = StringIO()
        
        clean_dataset(source, target)
        
        print('Uploading {}'.format(clean_uri))
        gcs_utils.upload_string(target.getvalue(), clean_uri)

## Merge Sessions

- In this step we merge the individual visit events into sessions.
- Furthermore, we group all sessions by the user they belong to.

In [37]:
# Merge either the local cleaned example file (test mode) or every cleaned
# shard in GCS into per-user session structures.
if TESTMODE:
    # Context managers close both files deterministically, so the JSON output
    # is fully flushed to disk even while the kernel keeps running.
    # newline='' is the mode the csv module documents for files passed to a reader.
    with open('example_clean.csv', newline='') as clean_file:
        reader = csv.DictReader(clean_file)

        sessions_by_user = merge_sessions(reader)

    with open('example_merged.json', 'w') as merged_file:
        json.dump(sessions_by_user, merged_file, indent=2)

else:
    cleaned_data_prefix = 'gs://ma-muy/baseline_dataset/clean/'
    merged_data_prefix = 'gs://ma-muy/baseline_dataset/merged/'
    
    clean_uris = gcs_utils.get_uris_with_prefix(cleaned_data_prefix)
    for clean_uri in clean_uris:
        # NOTE(review): the merged object reuses the cleaned shard's file name,
        # although its content is JSON - confirm this naming is intended.
        merged_uri = merged_data_prefix + gcs_utils.get_file_name(clean_uri)
        
        print('Downloading {}'.format(clean_uri))
        source = StringIO(gcs_utils.download_string(clean_uri))
        reader = csv.DictReader(source)
        
        sessions_by_user = merge_sessions(reader)
        
        # Serialize into an in-memory buffer and upload the resulting text.
        target = StringIO()
        json.dump(sessions_by_user, target, indent=2)
        
        print('Uploading {}'.format(merged_uri))
        gcs_utils.upload_string(target.getvalue(), merged_uri)

## Merge shards

- As of now we have several shards for each day, containing the sessions aggregated to the user level.
- The merging of the shards is the most time consuming part of the data generation process. 
- We need to merge all sessions of a specific user into one data structure.
- However, since we work with daily exports, we can aggregate the data over a single day at a time. This saves a lot of time, and the data generation process can be extended iteratively, day by day.
- For now we will not track sessions that span multiple days; we approximate them by ending the session at midnight and starting a new one the next day.

## Generate User Parallel Mini batches

- Now that we know all the sessions of all the users we can generate the user parallel mini batches.