In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, activations, losses, Model, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import SparseCategoricalAccuracy, SparseTopKCategoricalAccuracy
# import tensorflow_ranking as tfr
from sklearn.model_selection import train_test_split
import os, time, gc, json
from tqdm.notebook import tqdm, trange
from matplotlib import pyplot as plt
from joblib import Parallel, delayed
from ast import literal_eval
import multiprocessing
import itertools

In [2]:
def create_shift_features(df, c, lags=[1,2,3,4,5]):
    try:
        naid = df[c[:-3]].cat.categories.tolist().index('_')
    except ValueError:
        naid = df[c].max() + 1
    
    for i in lags:
        if (c + '_lag%d'%i) in df.columns:
            continue
        tmp = df[['session_id_hash', c]].shift(i)
        tmp.loc[df.session_id_hash != tmp.session_id_hash, c] = naid   
        tmp[c] = tmp[c].astype(df[c].dtype)
        df[c + '_lag%d'%i] = tmp[c]
        print('Created\t' + c + '_lag%d'%i)
    print()

In [3]:
# read training data
df_browse = pd.read_csv("./data/browsing_train.csv")
df_browse['train'] = np.int8(1)

In [4]:
df_browse.server_timestamp_epoch_ms = pd.to_datetime(df_browse.server_timestamp_epoch_ms, unit='ms')
df_browse['server_day'] = df_browse.server_timestamp_epoch_ms.dt.date
print(df_browse.server_timestamp_epoch_ms.min())
print(df_browse.server_timestamp_epoch_ms.max())
val_session_id = df_browse.session_id_hash[df_browse.server_day > pd.to_datetime('2019-03-20')].unique()

2019-01-15 05:02:44.513000
2019-04-15 03:59:58.560000


In [5]:
df_browse.memory_usage().sum()/1024**2

1961.2509031295776

In [6]:
# read testing data
with open('./data/rec_test_phase_1.json') as f:
    df_test = json.load(f)
df_test = pd.json_normalize(df_test, record_path =['query'])
df_test['train'] = np.int8(0)
# todo: should take search event into account
df_test = df_test[~df_test.is_search]

df_test.server_timestamp_epoch_ms = pd.to_datetime(df_test.server_timestamp_epoch_ms, unit='ms')
df_test['server_day'] = df_test.server_timestamp_epoch_ms.dt.date
print(df_test.server_timestamp_epoch_ms.min())
print(df_test.server_timestamp_epoch_ms.max())

df_test = df_test[df_browse.columns]
assert df_test.session_id_hash.isin(df_browse.session_id_hash).sum() == 0
gc.collect()

2019-04-15 04:00:00.991000
2019-05-14 03:57:58.517000


49596

In [7]:
# set items which only appears in test set as minority
for c in ['product_sku_hash', 'hashed_url']:
    tmp = df_test[c][~df_test[c].isna()]
    test_only_sku = tmp[~tmp.isin(df_browse[c])]
    df_test.loc[df_test[c].isin(test_only_sku), c] = 'minority'
    del tmp, test_only_sku
    gc.collect()

In [8]:
# combine training and testing data
df_browse = pd.concat((df_browse, df_test), ignore_index=True)
df_browse.sort_values(['session_id_hash', 'server_timestamp_epoch_ms'], inplace=True, ignore_index=True)
del df_test
gc.collect()
df_browse.memory_usage().sum()/1024**2

1991.6658773422241

In [9]:
# combine values with few records in training
for c in ['product_sku_hash', 'hashed_url']:
    tmp = df_browse[c].value_counts()
    df_browse.loc[df_browse[c].isin(tmp[tmp<=1].index), c] = 'minority'

In [10]:
# factorize columns
for c in ['product_sku_hash', 'hashed_url', 'event_type', 'product_action']:
    # set NaN to _
    if df_browse[c].isnull().values.any():
        df_browse.loc[df_browse[c].isna(), c] = '_'
    # add new column, id start from 0
    df_browse[c] = df_browse[c].astype('category')
    df_browse[c + '_id'] = df_browse[c].cat.codes

In [11]:
df_browse.memory_usage().sum()/1024**2

1583.5668745040894

In [12]:
# add shift features
max_lag = 5
for c in ['product_sku_hash_id', 'hashed_url_id', 'event_type_id', 'product_action_id']:
    create_shift_features(df_browse, c, lags=[i+1 for i in range(max_lag)])

Created	product_sku_hash_id_lag1
Created	product_sku_hash_id_lag2
Created	product_sku_hash_id_lag3
Created	product_sku_hash_id_lag4
Created	product_sku_hash_id_lag5

Created	hashed_url_id_lag1
Created	hashed_url_id_lag2
Created	hashed_url_id_lag3
Created	hashed_url_id_lag4
Created	hashed_url_id_lag5

Created	event_type_id_lag1
Created	event_type_id_lag2
Created	event_type_id_lag3
Created	event_type_id_lag4
Created	event_type_id_lag5

Created	product_action_id_lag1
Created	product_action_id_lag2
Created	product_action_id_lag3
Created	product_action_id_lag4
Created	product_action_id_lag5



In [13]:
# add continuous features
# df_browse['sess_step'] = df_browse.groupby('session_id_hash').cumcount().astype(np.int16)

In [14]:
# add target features (this column only useful for train set)
create_shift_features(df_browse, 'product_sku_hash_id', lags=[-1])
df_browse.rename(columns={'product_sku_hash_id_lag-1': 'next_sku'}, inplace=True)

Created	product_sku_hash_id_lag-1



In [None]:
naid = df_browse.product_sku_hash.cat.categories.tolist().index('_')
minorid = df_browse.product_sku_hash.cat.categories.tolist().index('minority')

# add next_interacted_sku
df_browse['next_interacted_sku'] = df_browse.next_sku.copy()
df_browse.loc[df_browse.next_sku==naid, 'next_interacted_sku'] = np.nan
df_browse.next_interacted_sku = df_browse.groupby('session_id_hash')['next_interacted_sku'].apply(lambda x: x.bfill())
df_browse.loc[df_browse.next_interacted_sku.isna(), 'next_interacted_sku'] = naid
gc.collect()

In [None]:
# set kfold based on date
df_browse['kfoldidx'] = 1
df_browse.loc[df_browse.session_id_hash.isin(val_session_id), 'kfoldidx'] = 0
df_browse.loc[df_browse.train==0, 'kfoldidx'] = -1

In [None]:
# set a random id per group, used for calculating testing metric later
df_browse = df_browse.sample(frac=1, random_state=0).reset_index(drop=True)
df_browse['rand_id'] = df_browse.groupby('session_id_hash').cumcount().astype(np.int16)
df_browse.sort_values(['session_id_hash', 'server_timestamp_epoch_ms'], inplace=True, ignore_index=True)

# pick a row per group with min rand_id
# df_browse.head(1000)[df_browse.head(1000).groupby(['session_id_hash'])['rand_id'].transform(min) == df_browse.head(1000)['rand_id']]

In [3]:
# df_browse.to_parquet('df_browse_v2_update_0516')

# can simply reuse df_browse_v3_update
df_browse = pd.read_parquet('df_browse_v3_update')

In [4]:
df_browse['sample_weights'] = pd.to_datetime(df_browse.server_day).dt.month

In [5]:
# get num_x from x_lag1 because function create_shift_features
# may created a new_id in lag columns
num_sku = df_browse.product_sku_hash_id_lag1.max() + 1
num_url = df_browse.hashed_url_id_lag1.max() + 1
# num_action = df_browse.product_action_id_lag1.max() + 1
# num_event = df_browse.event_type_id_lag1.max() + 1

In [6]:
fea_prod = ['product_sku_hash_id_lag5', 'product_sku_hash_id_lag4', 'product_sku_hash_id_lag3', 
            'product_sku_hash_id_lag2', 'product_sku_hash_id_lag1', 'product_sku_hash_id']
fea_url = ['hashed_url_id_lag5', 'hashed_url_id_lag4', 'hashed_url_id_lag3', 
           'hashed_url_id_lag2', 'hashed_url_id_lag1', 'hashed_url_id']
# fea_event = ['event_type_id_lag5', 'event_type_id_lag4', 'event_type_id_lag3',
#              'event_type_id_lag2', 'event_type_id_lag1', 'event_type_id']
# fea_action = ['product_action_id_lag5', 'product_action_id_lag4', 'product_action_id_lag3', 'product_action_id_lag2',
#               'product_action_id_lag1', 'product_action_id',]
# fea_step = ['sess_step']

features = fea_prod + fea_url # + fea_event + fea_action + fea_step

In [7]:
naid = df_browse.product_sku_hash.cat.categories.tolist().index('_')
minorid = df_browse.product_sku_hash.cat.categories.tolist().index('minority')

df_train = df_browse[df_browse.train==1]
df_train = df_train[(df_train.next_sku!=naid)].reset_index(drop=True)

df_test = df_browse[df_browse.train==0]
df_test = df_test[(df_test.next_sku!=naid)].reset_index(drop=True)
gc.collect()

18

In [8]:
df_train.drop(columns='kfoldidx', inplace=True)
df_test['kfoldidx'] = 1

In [9]:
def strafiedkfold(df, idcol, k=5):
    """
    sklearn kfold will do unnessary sorting
    so build my own function
    """
    df_kidx = df_train.session_id_hash.unique()
    np.random.seed(123)
    np.random.shuffle(df_kidx)
    df_kidx = pd.DataFrame({idcol: df_kidx})
    df_kidx['kfoldidx'] = df_kidx.index % k
    df = df.merge(df_kidx, on=idcol, copy=False)
    print(df['kfoldidx'].value_counts())
    return df

df_train = strafiedkfold(df_train, 'session_id_hash', k=5)
df_train = pd.concat([df_train, df_test])
x_train = []
x_val = []
x_one = []
for f in [fea_prod, fea_url]:
    x_train.append(np.array(df_train.loc[df_train.kfoldidx!=0, f]))
    x_val.append(np.array(df_train.loc[df_train.kfoldidx==0, f]))
    x_one.append(x_train[-1][0:100,])
x_train_sessid = df_train.loc[df_train.kfoldidx!=0].session_id_hash.reset_index(drop=True)
x_val_sessid = df_train.loc[df_train.kfoldidx==0].session_id_hash.reset_index(drop=True)

y_train = np.array(df_train.loc[df_train.kfoldidx!=0, 'next_sku'])
y_val = np.array(df_train.loc[df_train.kfoldidx==0, 'next_sku'])
y_one = y_train[0:100]

1    1870935
2    1869633
3    1869217
4    1867049
0    1866106
Name: kfoldidx, dtype: int64


In [10]:
x_train[0].shape

(7651082, 6)

In [11]:
x_train_weights =  df_train.loc[df_train.kfoldidx!=0].sample_weights.reset_index(drop=True)

In [12]:
# model architecture
class MLP(Model):
    def __init__(self, num_sku, num_url, embed_dim=312):
        super().__init__()
        
        self.normal_init = keras.initializers.RandomNormal(mean=0., stddev=0.01)
        
        self.sku_embed = layers.Embedding(num_sku, embed_dim, self.normal_init)
        self.url_embed = layers.Embedding(num_url, embed_dim, self.normal_init)
        
        self.dense1 = layers.Dense(1024)
        self.norm1 = layers.BatchNormalization()
        self.activate1 = layers.ReLU()
        self.dropout1 = layers.Dropout(0.2)
        
        self.dense2 = layers.Dense(1024)
        self.norm2 = layers.BatchNormalization()
        self.activate2 = layers.ReLU()
        self.dropout2 = layers.Dropout(0.2)
        
        self.dense3 = layers.Dense(embed_dim)
        self.norm3 = layers.BatchNormalization()
        self.activate3 = layers.ReLU(name='sess_embed')
        self.dropout3 = layers.Dropout(0.2)
        
        self.output_bias = tf.random.normal((num_sku,), 0., 0.01)
    
    def call(self, inputs):
        lag_sku, lag_url = inputs['sku'], inputs['url']
        
        sku_embed = layers.Flatten()(self.sku_embed(lag_sku))
        url_embed = layers.Flatten()(self.url_embed(lag_url))
        
        x = layers.concatenate([sku_embed, url_embed])
        x = self.activate1(self.norm1(self.dense1(x)))
        x = self.dropout1(x)

        x = self.activate2(self.norm2(self.dense2(x)))
        x = self.dropout2(x)

        sess_embed = self.activate3(self.norm3(self.dense3(x)))
        x = self.dropout3(sess_embed)
        
        x = tf.matmul(x, tf.transpose(self.sku_embed.weights[0]))
        logits = tf.nn.bias_add(x, self.output_bias, name='logits')

        return {'logits': logits, 'embed': sess_embed}
    
    def build_graph(self):
        x = {'sku': Input(shape=(6)), 'url': Input(shape=(6))}
        return Model(inputs=x, outputs=self.call(x))
    
    def predict_subset(self, x, u, l):
        _x = []
        for i in range(len(x)):
            _x.append(x[i][u:l])
        return self.predict(_x)
    
keras.utils.plot_model(MLP(num_sku, num_url).build_graph(), show_shapes=False)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [13]:
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of devices: 1


In [14]:
batch_size = 2000
tf_train = tf.data.Dataset.from_tensor_slices(({'sku':x_train[0], 'url':x_train[1]}, y_train, x_train_weights)).batch(batch_size)
tf_val = tf.data.Dataset.from_tensor_slices(({'sku':x_val[0], 'url':x_val[1]}, y_val)).batch(batch_size)
tf_one = tf.data.Dataset.from_tensor_slices(({'sku':x_one[0], 'url':x_one[1]}, y_one)).batch(10)


In [15]:
dist_train = strategy.experimental_distribute_dataset(tf_train)
dist_val = strategy.experimental_distribute_dataset(tf_val)

In [16]:
with strategy.scope():
    model_mlp = MLP(num_sku, num_url)
    LossFunc = {'logits':keras.losses.SparseCategoricalCrossentropy(from_logits=True), 'embed':None}
    metrics = {'logits': [keras.metrics.SparseCategoricalAccuracy(name='top1_acc'), 
                      keras.metrics.SparseTopKCategoricalAccuracy(k=20, name='top20_acc')]}
    model_mlp.compile(optimizer='adam', loss=LossFunc, metrics=metrics)
    

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

In [22]:
es = EarlyStopping(monitor='val_logits_loss', mode='min', verbose=1, patience=1)
mc = ModelCheckpoint('model_update8.h5', monitor='val_logits_loss', mode='min', verbose=1, save_best_only=True, save_weights_only=True)
history = model_mlp.fit(tf_train.shuffle(batch_size), epochs=10, validation_data=tf_val, callbacks=[es, mc])


Epoch 1/10

Epoch 00001: val_logits_loss improved from inf to 1.69220, saving model to model_update8.h5
Epoch 2/10

Epoch 00002: val_logits_loss improved from 1.69220 to 1.64941, saving model to model_update8.h5
Epoch 3/10

Epoch 00003: val_logits_loss improved from 1.64941 to 1.63874, saving model to model_update8.h5
Epoch 4/10

Epoch 00004: val_logits_loss did not improve from 1.63874
Epoch 00004: early stopping


In [18]:
model_mlp.load_weights('model_update8.h5')

In [19]:
naid = df_browse.product_sku_hash.cat.categories.tolist().index('_')
minorid = df_browse.product_sku_hash.cat.categories.tolist().index('minority')

@tf.function
def mrr(i, model, x, y, topk=tf.constant(20)):
    # this is for tf dataset version
    interval = tf.shape(y)[0]
    i = tf.cast(i, tf.int32)
    _u = interval * i
    _l = interval * (i+1)
    
    y_pred = model(x, training=False)
    col_to_zero = [naid, minorid]
    tnsr_shape=tf.shape(y_pred['logits'])
    mask = [tf.one_hot(col_num*tf.ones((tnsr_shape[0], ), dtype=tf.int32), tnsr_shape[-1]) for col_num in col_to_zero]
    mask = tf.reduce_sum(mask, axis=0) * -9999
    y_pred['logits'] = tf.add(y_pred['logits'], mask)
    
    # topk items' id for each session, 2d array
    r = tf.math.top_k(y_pred['logits'], k=topk).indices
    # True indicate that item is the correct prediction
    r = tf.cast(tf.equal(r, tf.expand_dims(tf.cast(y, tf.int32), 1)), tf.float32)
    # rank of the correct prediction, rank = 9999999+1 if no correction prediction within topk
    r = tf.add((tf.reduce_sum(r, 1)-1) * -9999999, tf.cast(tf.argmax(r, 1) + 1, tf.float32))
    return 1/r

In [20]:
# training MRR

s = time.time()
rr = []
# Iterate over the `tf.distribute.DistributedDataset`
i = tf.constant(0)
topk = tf.constant(20)
for x, y, z in dist_train:
    # process dataset elements
    rr.append(strategy.run(mrr, args=(i, model_mlp, x, y, topk)))
    i += 1
    if i % 1000 == 0:
        print(time.time()-s)

rr = np.append(np.array(rr[0:-1]).reshape(-1), np.array(rr[-1]))
print('MRR=%.4f'%np.mean(rr))
out_rr = (rr<1/topk.numpy())
print('%d out of %d records (%.2f%%) with prediction outside top%d'%(
    out_rr.sum(), out_rr.shape[0], (out_rr.mean())*100., topk.numpy()), flush=True)

df_train['rr'] = -1
df_train['r'] = -1
df_train.loc[df_train.kfoldidx!=0, 'rr'] = rr
df_train.loc[df_train.kfoldidx!=0, 'r'] = 1/rr

print('---------------------------')
print('random pick one per session')
cond = df_train.loc[df_train.kfoldidx!=0].groupby(['session_id_hash'])['rand_id'].transform(min) == df_train.loc[df_train.kfoldidx!=0, 'rand_id']
print('MRR=%.4f ' % np.mean(rr[cond]))
print('%d out of %d sessions (%.2f%%) with prediction outside top%d'%(
    out_rr[cond].sum(), out_rr[cond].shape[0], (out_rr[cond].mean())*100., topk.numpy()), flush=True)



58.72942757606506
115.52186465263367
173.78733730316162
MRR=0.7535
600733 out of 7651082 records (7.85%) with prediction outside top20
---------------------------
random pick one per session
MRR=0.7902 
166926 out of 2157060 sessions (7.74%) with prediction outside top20


In [21]:
# validation MRR

s = time.time()
rr = []
# Iterate over the `tf.distribute.DistributedDataset`
i = tf.constant(0)
for x, y in dist_val:
    # process dataset elements
    rr.append(strategy.run(mrr, args=(i, model_mlp, x, y, topk)))
    i += 1
    if i % 1000 == 0:
        print(time.time()-s)

rr = np.append(np.array(rr[0:-1]).reshape(-1), np.array(rr[-1]))
print('MRR=%.4f'%np.mean(rr))
out_rr = (rr<1/topk.numpy())
print('%d out of %d records (%.2f%%) with prediction outside top%d'%(
    out_rr.sum(), out_rr.shape[0], (out_rr.mean())*100., topk.numpy()), flush=True)

df_train.loc[df_train.kfoldidx==0, 'rr'] = rr
df_train.loc[df_train.kfoldidx==0, 'r'] = 1/rr

print('---------------------------')
print('random pick one per session')
cond = df_train.loc[df_train.kfoldidx==0].groupby(['session_id_hash'])['rand_id'].transform(min) == df_train.loc[df_train.kfoldidx==0, 'rand_id']
print('MRR=%.4f ' % np.mean(rr[cond]))
print('%d out of %d sessions (%.2f%%) with prediction outside top%d'%(
    out_rr[cond].sum(), out_rr[cond].shape[0], (out_rr[cond].mean())*100., topk.numpy()), flush=True)



MRR=0.7300
206611 out of 1866106 records (11.07%) with prediction outside top20
---------------------------
random pick one per session
MRR=0.7778 
52125 out of 526101 sessions (9.91%) with prediction outside top20


## MRR of testing set

In [22]:
df_test = df_browse[df_browse.train==0].copy()
df_test = df_test.loc[df_test.next_interacted_sku!=naid]
x_test = [np.array(df_test[fea_prod]), np.array(df_test[fea_url])]
x_test_sessid = df_test.session_id_hash.reset_index(drop=True)
y_test = np.array(df_test.next_interacted_sku)

In [23]:
batch_size = 2000
tf_test = tf.data.Dataset.from_tensor_slices(({'sku':x_test[0], 'url':x_test[1]}, y_test)).batch(batch_size)
dist_test = strategy.experimental_distribute_dataset(tf_test)

In [28]:
s = time.time()
rr = []
# Iterate over the `tf.distribute.DistributedDataset`
i = tf.constant(0)
topk = tf.constant(20)
for x, y in dist_test:
    # process dataset elements
    rr.append(strategy.run(mrr, args=(i, model_mlp, x, y)))
    i += 1
    if i % 1000 == 0:
        print(time.time()-s)

rr = np.append(np.array(rr[0:-1]).reshape(-1), np.array(rr[-1]))
print('MRR=%.4f'%np.mean(rr))
out_rr = (rr<1/topk.numpy())
print('%d out of %d records (%.2f%%) with prediction outside top%d'%(
    out_rr.sum(), out_rr.shape[0], (out_rr.mean())*100., topk.numpy()), flush=True)

df_test['rr'] = rr
df_test['r'] = 1/rr

print('---------------------------')
print('random pick one per session')
cond = df_test.groupby(['session_id_hash'])['rand_id'].transform(min) == df_test['rand_id']
print('MRR=%.4f ' % np.mean(rr[cond]))
print('%d out of %d sessions (%.2f%%) with prediction outside top%d'%(
    out_rr[cond].sum(), out_rr[cond].shape[0], (out_rr[cond].mean())*100., topk.numpy()), flush=True)


MRR=0.2830
136419 out of 312691 records (43.63%) with prediction outside top20
---------------------------
random pick one per session
MRR=0.3318 
18534 out of 52660 sessions (35.20%) with prediction outside top20


In [30]:
df_test.r = df_test.r.astype(np.int32)
df_test.r_next_sku = df_test.r_next_sku.astype(np.int32)

In [37]:
df_test.to_parquet('df_test_update8')

## prepare submission

In [29]:
df_submission = df_browse[df_browse.train==0]
df_submission = df_submission.groupby('session_id_hash').tail(1).reset_index(drop=True)

In [30]:
batch_size = 2000
x_submission = [np.array(df_submission[fea_prod]), np.array(df_submission[fea_url])]
tf_submission = tf.data.Dataset.from_tensor_slices(({'sku':x_submission[0], 'url':x_submission[1]})).batch(batch_size)

In [31]:
naid = df_browse.product_sku_hash.cat.categories.tolist().index('_')
minorid = df_browse.product_sku_hash.cat.categories.tolist().index('minority')

next_sku_all = []
for x in tqdm(tf_submission):
    y = model_mlp.predict(x)
    y['logits'][:,naid] = -99999
    y['logits'][:,minorid] = -99999
    next_sku_id = np.argpartition(y['logits'], range(-20, 0), axis=1)[:, ::-1][:,0:20]
    next_sku_all += np.array(df_browse.product_sku_hash.cat.categories)[next_sku_id].tolist()

  0%|          | 0/72 [00:00<?, ?it/s]

In [32]:
test_file='./data/rec_test_phase_1.json'
with open(test_file) as json_file:
    # read the test cases from the provided file
    test_queries = json.load(json_file)

In [33]:
def set_submission(q):
    sess_id = q['query'][0]['session_id_hash']
    try:
        next_sku = next_sku_all[df_submission.session_id_hash.tolist().index(sess_id)]
    except ValueError:
        # query with only search events not exists in df_test
        next_sku = np.random.choice(df_browse.product_sku_hash.cat.categories, 20, False).tolist()
    
    # copy the test case
    _pred = dict(q)

    # append the label - which needs to be a list
    _pred["label"] = next_sku
    return _pred


In [34]:
my_predictions = Parallel(n_jobs=multiprocessing.cpu_count()//2, backend='multiprocessing')(delayed(set_submission)(q) for q in tqdm(test_queries))
# check for consistency
assert len(my_predictions) == len(test_queries)

  0%|          | 0/142327 [00:00<?, ?it/s]

In [35]:
EMAIL = ''
local_prediction_file = '{}_{}.json'.format(EMAIL.replace('@', '_'), round(time.time() * 1000))
# dump to file
with open(local_prediction_file, 'w') as outfile:
    json.dump(my_predictions, outfile, indent=2)

In [36]:
print(local_prediction_file)

louiskitlung_connect.hku.hk_1621760806767.json


In [37]:
from dotenv import load_dotenv
from datetime import datetime
import boto3
load_dotenv(verbose=True, dotenv_path='./submission/upload.env')

BUCKET_NAME = os.getenv('BUCKET_NAME') # you received it in your e-mail
EMAIL = os.getenv('EMAIL') # the e-mail you used to sign up
PARTICIPANT_ID = os.getenv('PARTICIPANT_ID') # you received it in your e-mail
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY') # you received it in your e-mail
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY') # you received it in your e-mail

def upload_submission(
        local_file: str,
        task: str
):
    """
    Thanks to Alex Egg for catching the bug!

    :param local_file: local path, may be only the file name or a full path
    :param task: rec or cart
    :return:
    """

    print("Starting submission at {}...\n".format(datetime.utcnow()))
    # instantiate boto3 client
    s3_client = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY ,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name='us-west-2'
    )
    s3_file_name = os.path.basename(local_file)
    # prepare s3 path according to the spec
    s3_file_path = '{}/{}/{}'.format(task, PARTICIPANT_ID, s3_file_name)  # it needs to be like e.g. "rec/id/*.json"
    # upload file
    s3_client.upload_file(local_file, BUCKET_NAME, s3_file_path)
    # say bye
    print("\nAll done at {}: see you, space cowboy!".format(datetime.utcnow()))

    return


In [38]:
upload_submission(local_file=local_prediction_file, task='rec')

Starting submission at 2021-05-23 09:07:00.371677...


All done at 2021-05-23 09:07:05.752167: see you, space cowboy!


In [35]:
!date

Mon May 17 04:36:37 UTC 2021
