In [59]:
import pandas as pd
import numpy as np
import polars as pl

### Load data

In [60]:
class config:
    data_path = "../data/"
    validation_path = "../data/local_validation/"


train = pl.read_parquet(config.validation_path + "train.parquet")
test = pl.read_parquet(config.validation_path + "test.parquet")
test_labels = pl.read_parquet(config.validation_path + "test_labels.parquet")

### Create item features

In [61]:
item_features_data = pl.concat([train, test])
user_features_data = test

In [62]:
MILLISECONDS_IN_SECOND = 1000

item_features_data = item_features_data.with_columns(
    [
        (pl.col("ts").cast(pl.Int64) * MILLISECONDS_IN_SECOND)
        .cast(pl.Datetime)
        .dt.with_time_unit("ms")
        .alias("datetime")
    ]
)
item_features_data

session,aid,ts,type,datetime
i32,i32,i32,u8,datetime[ms]
0,1517085,1659304800,0,2022-07-31 22:00:00
0,1563459,1659304904,0,2022-07-31 22:01:44
0,1309446,1659367439,0,2022-08-01 15:23:59
0,16246,1659367719,0,2022-08-01 15:28:39
0,1781822,1659367871,0,2022-08-01 15:31:11
0,1152674,1659367885,0,2022-08-01 15:31:25
0,1649869,1659369893,1,2022-08-01 16:04:53
0,461689,1659369898,1,2022-08-01 16:04:58
0,305831,1659370027,2,2022-08-01 16:07:07
0,461689,1659370027,2,2022-08-01 16:07:07


In [63]:
user_features_data = user_features_data.with_columns(
    [
        (pl.col("ts").cast(pl.Int64) * MILLISECONDS_IN_SECOND)
        .cast(pl.Datetime)
        .dt.with_time_unit("ms")
        .alias("datetime")
    ]
)
user_features_data

session,aid,ts,type,datetime
i32,i32,i32,u8,datetime[ms]
11098528,11830,1661119200,0,2022-08-21 22:00:00
11098529,1105029,1661119200,0,2022-08-21 22:00:00
11098530,264500,1661119200,0,2022-08-21 22:00:00
11098530,264500,1661119288,0,2022-08-21 22:01:28
11098530,409236,1661119369,0,2022-08-21 22:02:49
11098530,409236,1661119441,0,2022-08-21 22:04:01
11098530,409236,1661120165,0,2022-08-21 22:16:05
11098530,409236,1661120532,1,2022-08-21 22:22:12
11098531,452188,1661119200,0,2022-08-21 22:00:00
11098531,1239060,1661119227,0,2022-08-21 22:00:27


In [64]:
one_day = 24 * 60 * 60
seven_days = 7 * one_day

In [65]:
%%time

temporal_features = (
    item_features_data.groupby("aid")
    .agg(
        [
            (pl.col("type") == 0).sum().alias("n_clicks"),
            (pl.col("type") == 1).sum().alias("n_carts"),
            (pl.col("type") == 2).sum().alias("n_orders"),
            ((pl.col("type") == 0) & (pl.col("ts") > pl.col("ts").max() - one_day))
            .sum()
            .alias("item_n_clicks_24h"),
            ((pl.col("type") == 1) & (pl.col("ts") > pl.col("ts").max() - one_day))
            .sum()
            .alias("item_n_carts_24h"),
            ((pl.col("type") == 2) & (pl.col("ts") > pl.col("ts").max() - one_day))
            .sum()
            .alias("item_n_orders_24h"),
            # number of clicks in last 7 days
            ((pl.col("type") == 0) & (pl.col("ts") > pl.col("ts").max() - seven_days))
            .sum()
            .alias("item_n_clicks_7d"),
            # number of carts in last 7 days
            ((pl.col("type") == 1) & (pl.col("ts") > pl.col("ts").max() - seven_days))
            .sum()
            .alias("item_n_carts_7d"),
            # number of orders in last 7 days
            ((pl.col("type") == 2) & (pl.col("ts") > pl.col("ts").max() - seven_days))
            .sum()
            .alias("item_n_orders_7d"),
        ]
    )
    .fill_null(-1)
    .sort("aid")
)
temporal_features

CPU times: user 2min 4s, sys: 3.82 s, total: 2min 8s
Wall time: 12.1 s


aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,item_n_orders_7d
i32,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,38,0,0,2,0,0,7,0,0
1,31,1,0,2,0,0,28,0,0
2,16,0,0,1,0,0,6,0,0
3,1415,93,17,90,9,0,298,27,2
4,143,5,0,4,0,0,14,1,0
5,7,0,0,1,0,0,1,0,0
6,1,0,0,1,0,0,1,0,0
7,3,0,0,2,0,0,2,0,0
8,20,2,0,1,0,0,8,0,0
9,11,0,0,4,0,0,6,0,0


In [66]:
time_features = (
    item_features_data.groupby("aid")
    .agg(
        [
            # average time between clicks
            pl.col("datetime")
            .filter(pl.col("type") == 0)
            .diff()
            .mean()
            .alias("item_avg_time_between_clicks"),
            # average time between carts
            pl.col("datetime")
            .filter(pl.col("type") == 1)
            .diff()
            .mean()
            .alias("item_avg_time_between_carts"),
            # average time between orders
            pl.col("datetime")
            .filter(pl.col("type") == 2)
            .diff()
            .mean()
            .alias("item_avg_time_between_orders"),
            # average click hour
            pl.col("datetime")
            .filter(pl.col("type") == 0)
            .dt.hour()
            .mean()
            .alias("item_avg_click_hour"),
            # average cart hour
            pl.col("datetime")
            .filter(pl.col("type") == 1)
            .dt.hour()
            .mean()
            .alias("item_avg_cart_hour"),
            # average order hour
            pl.col("datetime")
            .filter(pl.col("type") == 2)
            .dt.hour()
            .mean()
            .alias("item_avg_order_hour"),
            # average click day of month
            pl.col("datetime")
            .filter(pl.col("type") == 0)
            .dt.day()
            .mean()
            .alias("item_avg_click_day_of_month"),
            # average cart day of month
            pl.col("datetime")
            .filter(pl.col("type") == 1)
            .dt.day()
            .mean()
            .alias("item_avg_cart_day_of_month"),
            # average order day of month
            pl.col("datetime")
            .filter(pl.col("type") == 2)
            .dt.day()
            .mean()
            .alias("item_avg_order_day_of_month"),
        ]
    )
    .fill_null(-1)
    .sort("aid")
)
time_features

aid,item_avg_time_between_clicks,item_avg_time_between_carts,item_avg_time_between_orders,item_avg_click_hour,item_avg_cart_hour,item_avg_order_hour,item_avg_click_day_of_month,item_avg_cart_day_of_month,item_avg_order_day_of_month
i32,duration[ms],duration[ms],duration[ms],f64,f64,f64,f64,f64,f64
0,7h 54m 8s 28ms,,,13.973684,-1.0,-1.0,12.078947,-1.0,-1.0
1,8h 47m 51s 31ms,,,13.483871,15.0,-1.0,12.709677,10.0,-1.0
2,19h 15m 9s 69ms,,,11.5,-1.0,-1.0,16.6875,-1.0,-1.0
3,12m 25s 0ms,3h 30m 51s 12ms,12h 10m 7s 43ms,13.407774,14.55914,14.529412,19.440989,20.150538,18.529412
4,1h 56m 3s 6ms,1d 22h 4m 20s,,14.916084,14.8,-1.0,12.328671,15.4,-1.0
5,2d 16h 33m 24s 232ms,,,15.857143,-1.0,-1.0,6.142857,-1.0,-1.0
6,,,,18.0,-1.0,-1.0,10.0,-1.0,-1.0
7,-9h -11m -32s,,,16.666667,-1.0,-1.0,9.666667,-1.0,-1.0
8,3h 10m 54s 11ms,4d 19h 55m 14s,,13.95,15.0,-1.0,11.95,7.5,-1.0
9,1d 9h 34m 50s 120ms,,,15.545455,-1.0,-1.0,11.454545,-1.0,-1.0


In [67]:
item_features = temporal_features.join(time_features, on="aid", how="left").sort("aid")

In [68]:
item_features.write_parquet(config.validation_path + "item_features.parquet")

### Create user features

In [81]:
user_temporal_features = (
    user_features_data.groupby("session")
    .agg(
        [
            (pl.col("type") == 0).sum().alias("user_n_clicks"),
            (pl.col("type") == 1).sum().alias("user_n_carts"),
            (pl.col("type") == 2).sum().alias("user_n_orders"),
            ((pl.col("type") == 0) & (pl.col("ts") > pl.col("ts").max() - one_day))
            .sum()
            .alias("user_n_clicks_24h"),
            ((pl.col("type") == 1) & (pl.col("ts") > pl.col("ts").max() - one_day))
            .sum()
            .alias("user_n_carts_24h"),
            ((pl.col("type") == 2) & (pl.col("ts") > pl.col("ts").max() - one_day))
            .sum()
            .alias("user_n_orders_24h"),
            # number of clicks in last 7 days
            ((pl.col("type") == 0) & (pl.col("ts") > pl.col("ts").max() - seven_days))
            .sum()
            .alias("user_n_clicks_7d"),
            # number of carts in last 7 days
            ((pl.col("type") == 1) & (pl.col("ts") > pl.col("ts").max() - seven_days))
            .sum()
            .alias("user_n_carts_7d"),
            # number of orders in last 7 days
            ((pl.col("type") == 2) & (pl.col("ts") > pl.col("ts").max() - seven_days))
            .sum()
            .alias("user_n_orders_7d"),
            # number of unique items in last 7 days
            (pl.col("aid").filter(pl.col("ts") > pl.col("ts").max() - seven_days))
            .n_unique()
            .alias("user_n_unique_items_7d"),
            # number of unique items in last 24 hours
            (pl.col("aid").filter(pl.col("ts") > pl.col("ts").max() - one_day))
            .n_unique()
            .alias("user_n_unique_items_24h"),
            # session length
            pl.col("ts").n_unique().alias("user_session_length"),
            # average click hour
            pl.col("datetime")
            .filter(pl.col("type") == 0)
            .dt.hour()
            .mean()
            .alias("user_avg_click_hour"),
            # average cart hour
            pl.col("datetime")
            .filter(pl.col("type") == 1)
            .dt.hour()
            .mean()
            .alias("user_avg_cart_hour"),
            # average order hour
            pl.col("datetime")
            .filter(pl.col("type") == 2)
            .dt.hour()
            .mean()
            .alias("user_avg_order_hour"),
            # average duration between events
            pl.col("datetime").diff().mean().alias("user_avg_duration_between_events"),
            # average duration between clicks
            pl.col("datetime")
            .filter(pl.col("type") == 0)
            .diff()
            .mean()
            .alias("user_avg_duration_between_clicks"),
        ]
    )
    .fill_null(-1)
    .sort("session")
)
user_temporal_features

session,user_n_clicks,user_n_carts,user_n_orders,user_n_clicks_24h,user_n_carts_24h,user_n_orders_24h,user_n_clicks_7d,user_n_carts_7d,user_n_orders_7d,user_n_unique_items_7d,user_n_unique_items_24h,user_session_length,user_avg_click_hour,user_avg_cart_hour,user_avg_order_hour,user_avg_duration_between_events,user_avg_duration_between_clicks
i32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,duration[ms],duration[ms]
11098528,1,0,0,1,0,0,1,0,0,1,1,1,22.0,-1.0,-1.0,,
11098529,1,0,0,1,0,0,1,0,0,1,1,1,22.0,-1.0,-1.0,,
11098530,5,1,0,5,1,0,5,1,0,2,2,6,22.0,22.0,-1.0,4m 26s 0ms,4m 1s 0ms
11098531,20,0,4,20,0,4,20,0,4,11,11,21,22.0,-1.0,22.0,23s 0ms,18s 0ms
11098532,2,0,0,2,0,0,2,0,0,2,2,2,22.0,-1.0,-1.0,13m 15s,13m 15s
11098533,15,2,0,15,2,0,15,2,0,13,13,17,9.0,22.0,-1.0,42m 5s 2ms,52s 0ms
11098534,7,0,0,7,0,0,7,0,0,5,5,7,22.0,-1.0,-1.0,4m 37s 0ms,4m 37s 0ms
11098535,9,1,0,9,1,0,9,1,0,5,5,10,21.0,22.0,-1.0,1h 40m 30s 6ms,1h 53m 4s
11098536,7,0,0,7,0,0,7,0,0,6,6,7,22.0,-1.0,-1.0,2m 1s 0ms,2m 1s 0ms
11098537,18,3,2,18,3,2,18,3,2,17,17,23,22.222222,22.333333,22.0,2m 52s 0ms,3m 42s 0ms


In [82]:
user_temporal_features.write_parquet(config.validation_path + "user_features.parquet")

### Load candidate dataframe

In [13]:
import os, sys

sys.path.append("../")

from inference import *

os.chdir("../")

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [14]:
data, test = load_data()
covisit_clicks = load_combined_covisitation(type="clicks")
model = load_model()
index, aid2idx = build_index(model)

Size of top_20_clicks: 1825950
Loading word2vec model...
Building index for word2vec model...


In [15]:
top_clicks, top_orders = get_top_clicks_orders(test)

In [16]:
def generate_click_candidates(df):
    products = df.aid.tolist()
    types = df.type.tolist()
    unique_products = list(dict.fromkeys(products[::-1]))

    covisit_products = list(
        itertools.chain(
            *[
                covisit_clicks[product]
                for product in unique_products
                if product in covisit_clicks
            ]
        )
    )

    word2vec_products = list(
        itertools.chain(
            *[
                get_nns(model, index, product, aid2idx)
                for product in unique_products
                if product in covisit_clicks
            ]
        )
    )

    return list(set(unique_products + covisit_products + word2vec_products))

In [28]:
pred_df_clicks = (
    test.sort_values(["session", "ts"])
    .groupby(["session"])
    .parallel_apply(lambda x: generate_click_candidates(x))
)
pred_df_clicks = pred_df_clicks.parallel_apply(lambda x: x[:200])

In [18]:
candidate_df = pred_df_clicks.explode().reset_index()
candidate_df.columns = ["session", "aid"]

In [27]:
candidate_df.head()

Unnamed: 0,session,aid
0,12899779,989185
1,12899779,164098
2,12899779,1163906
3,12899779,1709322
4,12899779,1383306


### Load item-user features and combine with candidate dataframe

In [83]:
item_features = pl.read_parquet(config.validation_path + "item_features.parquet")
user_features = pl.read_parquet(config.validation_path + "user_features.parquet")

In [84]:
display(item_features.head())

aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,item_n_orders_7d,item_avg_time_between_clicks,item_avg_time_between_carts,item_avg_time_between_orders,item_avg_click_hour,item_avg_cart_hour,item_avg_order_hour,item_avg_click_day_of_month,item_avg_cart_day_of_month,item_avg_order_day_of_month
i32,i64,i64,i64,i64,i64,i64,i64,i64,i64,duration[ms],duration[ms],duration[ms],f64,f64,f64,f64,f64,f64
0,38,0,0,2,0,0,7,0,0,7h 54m 8s 28ms,,,13.973684,-1.0,-1.0,12.078947,-1.0,-1.0
1,31,1,0,2,0,0,28,0,0,8h 47m 51s 31ms,,,13.483871,15.0,-1.0,12.709677,10.0,-1.0
2,16,0,0,1,0,0,6,0,0,19h 15m 9s 69ms,,,11.5,-1.0,-1.0,16.6875,-1.0,-1.0
3,1415,93,17,90,9,0,298,27,2,12m 25s 0ms,3h 30m 51s 12ms,12h 10m 7s 43ms,13.407774,14.55914,14.529412,19.440989,20.150538,18.529412
4,143,5,0,4,0,0,14,1,0,1h 56m 3s 6ms,1d 22h 4m 20s,,14.916084,14.8,-1.0,12.328671,15.4,-1.0


In [85]:
# convert time features to unix timestamp in seconds
item_features = item_features.with_columns(
    [
        pl.col("item_avg_time_between_clicks")
        .cast(pl.Int64)
        .alias("item_avg_time_between_clicks"),
        pl.col("item_avg_time_between_carts")
        .cast(pl.Int64)
        .alias("item_avg_time_between_carts"),
        pl.col("item_avg_time_between_orders")
        .cast(pl.Int64)
        .alias("item_avg_time_between_orders"),
    ]
)
item_features = item_features.fill_null(-1)
display(item_features.head())

aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,item_n_orders_7d,item_avg_time_between_clicks,item_avg_time_between_carts,item_avg_time_between_orders,item_avg_click_hour,item_avg_cart_hour,item_avg_order_hour,item_avg_click_day_of_month,item_avg_cart_day_of_month,item_avg_order_day_of_month
i32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64
0,38,0,0,2,0,0,7,0,0,28448351,-1,-1,13.973684,-1.0,-1.0,12.078947,-1.0,-1.0
1,31,1,0,2,0,0,28,0,0,31671933,-1,-1,13.483871,15.0,-1.0,12.709677,10.0,-1.0
2,16,0,0,1,0,0,6,0,0,69309466,-1,-1,11.5,-1.0,-1.0,16.6875,-1.0,-1.0
3,1415,93,17,90,9,0,298,27,2,745331,12651576,43807687,13.407774,14.55914,14.529412,19.440989,20.150538,18.529412
4,143,5,0,4,0,0,14,1,0,6963528,165860000,-1,14.916084,14.8,-1.0,12.328671,15.4,-1.0


In [86]:
display(user_features.head())

session,user_n_clicks,user_n_carts,user_n_orders,user_n_clicks_24h,user_n_carts_24h,user_n_orders_24h,user_n_clicks_7d,user_n_carts_7d,user_n_orders_7d,user_n_unique_items_7d,user_n_unique_items_24h,user_session_length,user_avg_click_hour,user_avg_cart_hour,user_avg_order_hour,user_avg_duration_between_events,user_avg_duration_between_clicks
i32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,duration[ms],duration[ms]
11098528,1,0,0,1,0,0,1,0,0,1,1,1,22.0,-1.0,-1.0,,
11098529,1,0,0,1,0,0,1,0,0,1,1,1,22.0,-1.0,-1.0,,
11098530,5,1,0,5,1,0,5,1,0,2,2,6,22.0,22.0,-1.0,4m 26s 0ms,4m 1s 0ms
11098531,20,0,4,20,0,4,20,0,4,11,11,21,22.0,-1.0,22.0,23s 0ms,18s 0ms
11098532,2,0,0,2,0,0,2,0,0,2,2,2,22.0,-1.0,-1.0,13m 15s,13m 15s


In [87]:
# convert time features to unix timestamp in seconds
user_features = user_features.with_columns(
    [
        pl.col("user_avg_duration_between_events")
        .cast(pl.Int64)
        .alias("user_avg_duration_between_events"),
        pl.col("user_avg_duration_between_clicks")
        .cast(pl.Int64)
        .alias("user_avg_duration_between_clicks"),
    ]
)
user_features = user_features.fill_null(-1)
display(user_features.head())

session,user_n_clicks,user_n_carts,user_n_orders,user_n_clicks_24h,user_n_carts_24h,user_n_orders_24h,user_n_clicks_7d,user_n_carts_7d,user_n_orders_7d,user_n_unique_items_7d,user_n_unique_items_24h,user_session_length,user_avg_click_hour,user_avg_cart_hour,user_avg_order_hour,user_avg_duration_between_events,user_avg_duration_between_clicks
i32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i64,i64
11098528,1,0,0,1,0,0,1,0,0,1,1,1,22.0,-1.0,-1.0,-1,-1
11098529,1,0,0,1,0,0,1,0,0,1,1,1,22.0,-1.0,-1.0,-1,-1
11098530,5,1,0,5,1,0,5,1,0,2,2,6,22.0,22.0,-1.0,266400,241250
11098531,20,0,4,20,0,4,20,0,4,11,11,21,22.0,-1.0,22.0,23739,18052
11098532,2,0,0,2,0,0,2,0,0,2,2,2,22.0,-1.0,-1.0,795000,795000


### Load candidate dataframe

In [94]:
candidate_df = pl.read_parquet(config.validation_path + "candidate_df.parquet")

### Ensure merge columns are the same type

In [95]:
# change type of session and aid to i32
candidate_df = candidate_df.with_columns(
    [pl.col("session").cast(pl.Int32), pl.col("aid").cast(pl.Int32)]
)
display(candidate_df.head())

session,aid
i32,i32
12899779,989185
12899779,164098
12899779,1163906
12899779,1709322
12899779,1383306


In [96]:
assert item_features["aid"].n_unique() > 1800000
assert user_features["session"].n_unique() > 1800000

In [99]:
user_features["session"].min(), user_features["session"].max()

(11098528, 12899778)

In [100]:
candidate_df["session"].min(), candidate_df["session"].max()

(12899779, 14571581)

### Combine candidate_df with item_features and user_features

In [97]:
# combined candidate_df with item_features
candidate_df = candidate_df.join(item_features, on="aid", how="left")
candidate_df = candidate_df.join(user_features, on="session", how="left")
display(candidate_df.head())

session,aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,item_n_orders_7d,item_avg_time_between_clicks,item_avg_time_between_carts,item_avg_time_between_orders,item_avg_click_hour,item_avg_cart_hour,item_avg_order_hour,item_avg_click_day_of_month,item_avg_cart_day_of_month,item_avg_order_day_of_month
i32,i32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64
12899779,989185,8,0,0,8,0,0,8,0,0,1360714,-1,-1,15.5,-1.0,-1.0,6.0,-1.0,-1.0
12899779,164098,903,30,5,6,0,0,48,0,0,2666757,26470068,208357750,13.929125,13.033333,19.2,11.101883,12.9,14.4
12899779,1163906,7,0,0,1,0,0,2,0,0,334310833,-1,-1,14.714286,-1.0,-1.0,11.714286,-1.0,-1.0
12899779,1709322,8,0,0,1,0,0,6,0,0,49035285,-1,-1,16.0,-1.0,-1.0,18.75,-1.0,-1.0
12899779,1383306,4,0,0,1,0,0,1,0,0,-347917666,-1,-1,11.25,-1.0,-1.0,6.5,-1.0,-1.0


In [93]:
candidate_df.write_parquet(
    config.validation_path + "candidate_df_with_user_item_features.parquet"
)