In [2]:
from typing import Tuple
import numpy as np
import pandas as pd
import polars as pl
from omegaconf import DictConfig, OmegaConf
import hydra
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
import pathlib
from annoy import AnnoyIndex
from gensim.models import Word2Vec
from typing import Dict, List, Tuple

In [3]:
initialize(config_path="../conf", job_name="features_app")
config = compose(config_name="config", overrides=[
                                                    "data_path=../data/", 
                                                    "validation_path=../data/local_validation/", 
                                                    "ranker_path=../data/ranker/", 
                                                    "artifact_path=../artifacts/",
                                                    "model_path=../models/",
                                                    ]
                                                )
print(OmegaConf.to_yaml(config))

name: ''
tags:
- covisit
- word2vec
description: ' '
data_path: ../data/
validation_path: ../data/local_validation/
ranker_path: ../data/ranker/
artifact_path: ../artifacts/
debug: false
local_validation: true
word2vec: true
train_file: train.parquet
test_file: test.parquet
test_labels_file: test_labels.parquet
submission_path: submissions/
submission_file: submission.csv
model_path: ../models/
curr_model_path: models/word2vec.model
type_labels:
  clicks: 0
  carts: 1
  orders: 2
type_weight:
  0: 0.5
  1: 9
  2: 0.5
version: 1
chunk_size: 200000
random_state: 42
fraction: 0.02
n_top: 15
n_top_clicks: 20
n_samples: 50
time_diff: 7 * 24 * 60 * 60
one_week: 7 * 24 * 60 * 60
vector_size: 100
window: 5
negative: 10
workers: 12
epochs: 2
min_count: 5
n_trees: 100
n_recent: 10



The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path="../conf", job_name="features_app")


In [4]:
def load_data() -> Tuple[pl.DataFrame, pl.DataFrame]:
    """Load data from parquet files."""
    if config.local_validation:
        train = pl.read_parquet(config.validation_path + config.train_file)
        # test = pl.read_parquet(config.validation_path + config.test_file)
        test = pl.read_parquet(config.ranker_path + config.test_file) # for ranker model
    else:
        train = pl.read_parquet(config.data_path + config.train_file)
        test = pl.read_parquet(config.data_path + config.test_file)
    return train, test


def load_model():
    """Load word2vec model."""
    if not config.word2vec:
        return None
    print("Loading word2vec model...")
    model = Word2Vec.load(f"{config.model_path}word2vec_windowl_{config.window}.model")
    print(
        f"Model loaded from path: {config.model_path}word2vec_windowl_{config.window}.model"
    )
    return model


def build_index(model, n_trees=100) -> Tuple[AnnoyIndex, Dict[str, int]]:
    """Build index for word2vec model."""
    if config.word2vec:
        print("Building index for word2vec model...")
        aid2idx = {aid: i for i, aid in enumerate(model.wv.index_to_key)}
        index = AnnoyIndex(model.wv.vector_size, metric="euclidean")
        for idx in aid2idx.values():
            index.add_item(idx, model.wv.vectors[idx])
        index.build(n_trees=n_trees)
        return index, aid2idx
    else:
        return None, None


def load_combined_covisitation(type: str = "clicks") -> pd.DataFrame:
    top_20 = pd.read_pickle(
        f"{config.data_path}top_20_{type}_v{config.version}.pkl"
    )
    print(f"Size of top_20_{type}:", len(top_20))
    return top_20

In [61]:
train, test = load_data()

In [6]:
print("Shape of train and test data:", train.shape, test.shape)

Shape of train and test data: (163955180, 4) (2127742, 4)


In [7]:
covisit_clicks = load_combined_covisitation(type="clicks")


Size of top_20_clicks: 1813303


In [8]:
model = load_model()
index, aid2idx = build_index(model)

Loading word2vec model...
Model loaded from path: ../models/word2vec_windowl_5.model
Building index for word2vec model...


In [23]:
key_to_index = model.wv.key_to_index
vectors = model.wv.vectors

# reduce dimensionality of vectors
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
pca.fit(vectors)
vectors = pca.transform(vectors)

In [83]:
def add_mean_word2vec_feature(aids):
    """Add mean word2vec feature to dataframe."""
    if config.word2vec and len(aids) == 0 or not config.word2vec:
        return np.zeros(config.vector_size)
    else:
        return list(
            np.mean(
            [vectors[key_to_index[aid]] for aid in aids if aid in key_to_index], axis=0
        ))


# how many items (that user has already clicked) have recommended this item with their co-visitation matrix
def apply_covisit_clicks_feature(aids):
    covisit_recomms = [0] * len(aids)
    window_start = 0
    for idx, window_end in enumerate(range(len(aids))):
        user_sequence = aids[window_start:window_end]
        curr_aid = aids[window_end]
        unique_aids = list(set(user_sequence))
        covisit_count = sum(curr_aid in covisit_clicks[aid] for aid in unique_aids if aid in covisit_clicks)
        covisit_recomms[idx] = covisit_count
    assert len(covisit_recomms) == len(aids)
    return covisit_recomms


def explode_word2vec_to_columns(df):
    df = df.to_pandas()
    columns = [f'word2vec_feature_{i}' for i in range(10)]
    word2vec_features = df['word2vec_feature'].apply(pd.Series)
    word2vec_features.columns = columns
    word2vec_features, df = pl.DataFrame(word2vec_features), pl.DataFrame(df)
    df = pl.concat([df, word2vec_features], how='horizontal')
    df = df.drop(columns=['word2vec_feature'])
    return df


In [84]:
# logspace averaged mean of the feature vecture
def add_word2vec_feature(df):
    return df.select(
        [
            pl.col("*"),
            pl.col(["aid"])
            .apply(add_mean_word2vec_feature)
            .list()
            .over("session")
            .alias("word2vec_feature"),
        ]
    )


def add_covisit_clicks_feature(df):
    return df.select(
        [
            pl.col("*"),
            pl.col(["aid"])
            .apply(apply_covisit_clicks_feature)
            .over("session")
            .alias("covisit_clicks_feature"),
        ]
    )


# number of clicks in a session at each timestamp
def add_click_num_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 0)
          .then(pl.col('aid').cumcount().over('session'))
          .otherwise(pl.lit(None)).alias('click_num_chrono')
          .forward_fill()
    ])


# number of clicks in a session at each timestamp in reverse chronological order
def add_click_num_reverse_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 0)
          .then(pl.col('aid').cumcount().reverse().forward_fill().over('session'))
          .otherwise(pl.lit(None)).alias('click_num_reverse_chrono')
          .forward_fill()
    ])


def add_counter(event_types):
    cumcount, counter = [], 0
    for event in event_types:
        if event == 1:
            counter += 1
        cumcount.append(counter)
    return cumcount


# number of carts in a session at each timestamp
def add_cart_num_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 1)
          .then(pl.col(['type']).apply(add_counter).forward_fill().over('session'))
          .otherwise(pl.lit(0)).alias('cart_num_chrono')
    ])


# number of carts in a session at each timestamp
def add_order_num_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 2)
          .then(pl.col(['type']).apply(add_counter).forward_fill().over('session'))
          .otherwise(pl.lit(0)).alias('order_num_chrono')
    ])


# doesn't seem correct - please cross-check
def add_aid_interacted_with_count(df):
    temp_df = df.unique(subset=['session', 'aid'])
    temp_df = temp_df.select([
            pl.col('*'),
            pl.col('session').cumcount().over('session').alias('aid_interacted_with_unique')
        ])
    df = df.join(temp_df, on=['session', 'aid'], how='left')
    return df.select(pl.exclude("^.*right$"))


def add_sec_since_last_action(df):
    return df.select(
        [
            pl.col("*"),
            pl.col("ts")
            .diff()
            .over("session")
            .alias("seconds_since_last_action"),
        ]
    ).fill_null(-1)


def add_sec_since_first_action(df):
    return df.select(
        [
            pl.col("*"),
            (pl.col("ts") - pl.col("ts").min())
            .over("session")
            .alias("seconds_since_first_action"),
        ]
    ).fill_null(-1)


def add_sec_to_session_end(df):
    return df.select([
        pl.col('*'),
        (pl.col('ts').max() - pl.col('ts'))
        .over('session')
        .alias('seconds_to_session_end')
    ]).fill_null(-1)


def add_action_num_reverse_chrono_unique(df):
    temp_df = df.unique(subset=['session', 'aid'])
    temp_df = temp_df.select([
            pl.col('*'),
            pl.col('session').cumcount().reverse().over('session').alias('action_num_reverse_chrono_unique')
        ])
    df = df.join(temp_df, on=['session', 'aid'], how='left')
    return df.select(pl.exclude("^.*right$"))


def add_action_num_reverse_chrono(df):
    return df.select([
        pl.col('*'),
        pl.col('session').cumcount().reverse().over('session').alias('action_num_reverse_chrono')
    ])


def add_session_length(df):
    return df.select([
        pl.col('*'),
        pl.col('session').count().over('session').alias('session_length')
    ])


def add_type_weighted_log_recency_score(df):
    type_weights = {0:0.5, 1:9, 2:0.5}
    type_weighted_log_recency_score = pl.Series(df['log_recency_score'] / df['type'].apply(lambda x: type_weights[x]))
    return df.with_column(type_weighted_log_recency_score.alias('type_weighted_log_recency_score'))


def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1-0.1) / (df['session_length']-1)) * (df['session_length']-df['action_num_reverse_chrono']-1)
    return df.with_columns(pl.Series(2**linear_interpolation - 1).alias('log_recency_score')).fill_nan(1)


def apply(df, pipeline):
    for f in pipeline:
        df = f(df)
    return df

In [85]:
# add features
pipeline = [
    add_covisit_clicks_feature,
    add_click_num_chrono,
    add_click_num_reverse_chrono,
    add_cart_num_chrono,
    add_order_num_chrono,
    add_aid_interacted_with_count,
    add_sec_since_last_action,
    add_sec_since_first_action,
    add_sec_to_session_end,
    add_action_num_reverse_chrono_unique,
    add_action_num_reverse_chrono,
    add_session_length,
    add_log_recency_score,
    add_type_weighted_log_recency_score,
    add_word2vec_feature,
    explode_word2vec_to_columns
]

In [86]:
user_features = apply(test.head(10000), pipeline)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)
  word2vec_features = df['word2vec_feature'].apply(pd.Series)


In [105]:
# user_features.groupby('session').agg([
#     pl.col('type_weighted_log_recency_score').mean().alias('type_weighted_log_recency_score_mean'),
#     pl.col('type_weighted_log_recency_score').sum().alias('type_weighted_log_recency_score_sum'),
#     pl.col('type_weighted_log_recency_score').max().alias('type_weighted_log_recency_score_max'),
#     pl.col('type_weighted_log_recency_score').min().alias('type_weighted_log_recency_score_min'),
#     pl.col('type_weighted_log_recency_score').std().alias('type_weighted_log_recency_score_std'),
#     pl.col('type_weighted_log_recency_score').var().alias('type_weighted_log_recency_score_var'),
#     pl.col('type_weighted_log_recency_score').median().alias('type_weighted_log_recency_score_median'),
#     pl.col('type_weighted_log_recency_score').quantile(0.25).alias('type_weighted_log_recency_score_q1'),
#     pl.col('type_weighted_log_recency_score').quantile(0.75).alias('type_weighted_log_recency_score_q3'),
#     pl.col('type_weighted_log_recency_score').kurtosis().alias('type_weighted_log_recency_score_kurtosis'),
#     pl.col('type_weighted_log_recency_score').count().alias('type_weighted_log_recency_score_count'),
# ]).head(10)

session,type_weighted_log_recency_score_mean,type_weighted_log_recency_score_sum,type_weighted_log_recency_score_max,type_weighted_log_recency_score_min,type_weighted_log_recency_score_std,type_weighted_log_recency_score_var,type_weighted_log_recency_score_median,type_weighted_log_recency_score_q1,type_weighted_log_recency_score_q3,type_weighted_log_recency_score_kurtosis,type_weighted_log_recency_score_count
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32
11098564,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,,1
11098676,0.962832,15.405305,2.0,0.018304,0.615807,0.379218,0.928804,0.531513,1.530812,-1.117153,16
11098669,0.995082,5.970491,2.0,0.143547,0.695332,0.483487,0.933871,0.42839,1.530812,-1.218815,6
11098607,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,,1
11098551,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,,1
11098639,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,,1
11098606,2.0,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,,1
11098739,0.995082,5.970491,2.0,0.143547,0.695332,0.483487,0.933871,0.42839,1.530812,-1.218815,6
11098615,0.535671,2.142683,1.24901,0.111111,0.533399,0.284515,0.391281,0.143547,1.24901,-1.269535,4
11098595,0.978408,38.157907,2.0,0.143547,0.555597,0.308688,0.928171,0.484849,1.450588,-1.148419,39
