In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict, Counter

import polars as pl
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
class config:
    data_path = "../otto/data/optimised_dataset/"
    local_validation = True
    debug = False
    validation_path = "../otto/data/optimised_dataset/validation/"
    train_file = "train.parquet"
    test_file = "test.parquet"
    test_labels_file = "test_labels.parquet"
    submission_path = "submissions/"
    submission_file = "submission_{:%Y-%m-%d_%H-%M}.csv"
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    type_weight = {0: 1, 1: 6, 2: 3}
    version = 1
    chunk_size = 100_000
    random_state = 42
    fraction = 0.02
    n_samples = 30
    n_top = 15
    diff_clicks = 24 * 60 * 60

In [15]:
train = pl.read_parquet(config.data_path + config.train_file)
test = pl.read_parquet(config.data_path + config.test_file)

In [16]:
sentences_df = (
    pl.concat([train, test]).groupby("session").agg(pl.col("aid").alias("sentence"))
)

In [17]:
sentences = sentences_df["sentence"].to_list()

In [18]:
%%time

word2vec = Word2Vec(
    sentences=sentences, vector_size=100, window=5, negative=10, workers=12
)

CPU times: user 1h 13min 25s, sys: 4 s, total: 1h 13min 29s
Wall time: 10min 52s


In [19]:
word2vec.save("word2vec-windowsize-5-vector-size-100-full-data.model")

In [32]:
%%time

from annoy import AnnoyIndex

aid2idx = {aid: i for i, aid in enumerate(word2vec.wv.index_to_key)}
index = AnnoyIndex(10, "euclidean")

for aid, idx in aid2idx.items():
    index.add_item(idx, word2vec.wv.vectors[idx])

index.build(100)

CPU times: user 6min 27s, sys: 1min 24s, total: 7min 51s
Wall time: 24.4 s


True

In [31]:
session_types = ["clicks", "carts", "orders"]
test_session_products = (
    test.to_pandas().reset_index(drop=True).groupby("session")["aid"].apply(list)
)
test_session_types = (
    test.to_pandas().reset_index(drop=True).groupby("session")["type"].apply(list)
)

labels = []

type_weight_multipliers = {0: 1, 1: 6, 2: 3}

In [27]:
for products, types in zip(test_session_products, test_session_types):
    products = list(dict.fromkeys(products[::-1]))

    most_recent_product = products[0]

    nns = [
        word2vec.wv.index_to_key[i]
        for i in index.get_nns_by_item(aid2idx[most_recent_product], 21)[1:]
    ]

    print(products, types, nns)

    break

[11830] [0] [884502, 1732105, 87442, 1633746, 1125638, 487136, 432989, 1182614, 807298, 876129, 582732, 588923, 1134980, 1481487, 146086, 409620, 517826, 825072, 135833, 1344942]
