In [41]:
from typing import Tuple
import numpy as np
import pandas as pd
import polars as pl
from omegaconf import DictConfig, OmegaConf
import hydra
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
import pathlib
from annoy import AnnoyIndex
from gensim.models import Word2Vec
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

In [42]:
initialize(config_path="../conf", job_name="features_app")
config = compose(config_name="config", overrides=[
                                                    "data_path=../data/", 
                                                    "validation_path=../data/local_validation/", 
                                                    "ranker_path=../data/ranker/", 
                                                    "artifact_path=../artifacts/",
                                                    "model_path=../models/",
                                                    ]
                                                )
print(OmegaConf.to_yaml(config))

ValueError: GlobalHydra is already initialized, call GlobalHydra.instance().clear() if you want to re-initialize

In [43]:
def load_data() -> Tuple[pl.DataFrame, pl.DataFrame]:
    """Load data from parquet files."""
    if config.local_validation:
        train = pl.read_parquet(config.validation_path + config.train_file)
        # test = pl.read_parquet(config.validation_path + config.test_file)
        test = pl.read_parquet(config.ranker_path + config.test_file) # for ranker model
    else:
        train = pl.read_parquet(config.data_path + config.train_file)
        test = pl.read_parquet(config.data_path + config.test_file)
    return train, test


def load_model():
    """Load word2vec model."""
    if not config.word2vec:
        return None
    print("Loading word2vec model...")
    model = Word2Vec.load(f"{config.model_path}word2vec_windowl_{config.window}.model")
    print(
        f"Model loaded from path: {config.model_path}word2vec_windowl_{config.window}.model"
    )
    return model


def build_index(model, n_trees=100) -> Tuple[AnnoyIndex, Dict[str, int]]:
    """Build index for word2vec model."""
    if config.word2vec:
        print("Building index for word2vec model...")
        aid2idx = {aid: i for i, aid in enumerate(model.wv.index_to_key)}
        index = AnnoyIndex(model.wv.vector_size, metric="euclidean")
        for idx in aid2idx.values():
            index.add_item(idx, model.wv.vectors[idx])
        index.build(n_trees=n_trees)
        return index, aid2idx
    else:
        return None, None


def load_combined_covisitation(type: str = "clicks") -> pd.DataFrame:
    top_20 = pd.read_pickle(
        f"{config.data_path}top_20_{type}_v{config.version}.pkl"
    )
    print(f"Size of top_20_{type}:", len(top_20))
    return top_20

In [44]:
train, test = load_data()

In [45]:
print("Shape of train and test data:", train.shape, test.shape)

Shape of train and test data: (163955180, 4) (2127742, 4)


In [15]:
covisit_clicks = load_combined_covisitation(type="clicks")


Size of top_20_clicks: 1813303


In [16]:
model = load_model()
index, aid2idx = build_index(model)

Loading word2vec model...
Model loaded from path: ../models/word2vec_windowl_5.model
Building index for word2vec model...


In [17]:
key_to_index = model.wv.key_to_index
vectors = model.wv.vectors

# reduce dimensionality of vectors
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
pca.fit(vectors)
vectors = pca.transform(vectors)

### USER-ITEM-FEATURES

In [18]:
def add_mean_word2vec_feature(aids):
    """Add mean word2vec feature to dataframe."""
    if config.word2vec and len(aids) == 0 or not config.word2vec:
        return np.zeros(config.vector_size)
    else:
        return list(
            np.mean(
            [vectors[key_to_index[aid]] for aid in aids if aid in key_to_index], axis=0
        ))


def add_word2vec_count_feature(aids):
    pass


def add_avg_cosine_similarity(aids):
    pass


# how many items (that user has already clicked) have recommended this item with their co-visitation matrix
def apply_covisit_clicks_feature(aids):
    covisit_recomms = [0] * len(aids)
    window_start = 0
    for idx, window_end in enumerate(range(len(aids))):
        user_sequence = aids[window_start:window_end]
        curr_aid = aids[window_end]
        unique_aids = list(set(user_sequence))
        covisit_count = sum(curr_aid in covisit_clicks[aid] for aid in unique_aids if aid in covisit_clicks)
        covisit_recomms[idx] = covisit_count
    assert len(covisit_recomms) == len(aids)
    return covisit_recomms


def explode_word2vec_to_columns(df):
    df = df.to_pandas()
    columns = [f'word2vec_feature_{i}' for i in range(10)]
    word2vec_features = df['word2vec_feature'].apply(pd.Series)
    word2vec_features.columns = columns
    word2vec_features, df = pl.DataFrame(word2vec_features), pl.DataFrame(df)
    df = pl.concat([df, word2vec_features], how='horizontal')
    df = df.drop(columns=['word2vec_feature'])
    return df


In [50]:
def cusum_scores(df):
    return df.with_columns([
        pl.col('^log_recency_score|type_weighted_log_recency_score$').cumsum().over(['session', 'aid'])
    ])

# logspace averaged mean of the feature vecture
def add_word2vec_feature(df):
    return df.select(
        [
            pl.col("*"),
            pl.col(["aid"])
            .apply(add_mean_word2vec_feature)
            .list()
            .over("session")
            .alias("word2vec_feature"),
        ]
    )


def add_covisit_clicks_feature(df):
    return df.select(
        [
            pl.col("*"),
            pl.col(["aid"])
            .apply(apply_covisit_clicks_feature)
            .over("session")
            .alias("covisit_clicks_feature"),
        ]
    )


# number of clicks in a session at each timestamp
def add_click_num_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 0)
          .then(pl.col('aid').cumcount().over('session'))
          .otherwise(pl.lit(None)).alias('click_num_chrono')
          .forward_fill()
    ])


# number of clicks in a session at each timestamp in reverse chronological order
def add_click_num_reverse_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 0)
          .then(pl.col('aid').cumcount().reverse().forward_fill().over('session'))
          .otherwise(pl.lit(None)).alias('click_num_reverse_chrono')
          .forward_fill()
    ])


def add_counter(event_types):
    cumcount, counter = [], 0
    for event in event_types:
        if event == 1:
            counter += 1
        cumcount.append(counter)
    return cumcount


# number of carts in a session at each timestamp
def add_cart_num_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 1)
          .then(pl.col(['type']).apply(add_counter).forward_fill().over('session'))
          .otherwise(pl.lit(0)).alias('cart_num_chrono')
    ])


# number of carts in a session at each timestamp
def add_order_num_chrono(df):
    return df.select([
        pl.col('*'),
        pl.when(pl.col('type') == 2)
          .then(pl.col(['type']).apply(add_counter).forward_fill().over('session'))
          .otherwise(pl.lit(0)).alias('order_num_chrono')
    ])


# doesn't seem correct - please cross-check
def add_aid_interacted_with_count(df):
    temp_df = df.unique(subset=['session', 'aid'])
    temp_df = temp_df.select([
            pl.col('*'),
            pl.col('session').cumcount().over('session').alias('aid_interacted_with_unique')
        ])
    df = df.join(temp_df, on=['session', 'aid'], how='left')
    return df.select(pl.exclude("^.*right$"))


def add_sec_since_last_action(df):
    return df.select(
        [
            pl.col("*"),
            pl.col("ts")
            .diff()
            .over("session")
            .alias("seconds_since_last_action"),
        ]
    ).fill_null(-1)


def add_sec_since_first_action(df):
    return df.select(
        [
            pl.col("*"),
            (pl.col("ts") - pl.col("ts").min())
            .over("session")
            .alias("seconds_since_first_action"),
        ]
    ).fill_null(-1)


def add_sec_to_session_end(df):
    return df.select([
        pl.col('*'),
        (pl.col('ts').max() - pl.col('ts'))
        .over('session')
        .alias('seconds_to_session_end')
    ]).fill_null(-1)


def add_action_num_reverse_chrono_unique(df):
    temp_df = df.unique(subset=['session', 'aid'])
    temp_df = temp_df.select([
            pl.col('*'),
            pl.col('session').cumcount().reverse().over('session').alias('action_num_reverse_chrono_unique')
        ])
    df = df.join(temp_df, on=['session', 'aid'], how='left')
    return df.select(pl.exclude("^.*right$"))


def add_action_num_reverse_chrono(df):
    return df.select([
        pl.col('*'),
        pl.col('session').cumcount().reverse().over('session').alias('action_num_reverse_chrono')
    ])


def add_session_length(df):
    return df.select([
        pl.col('*'),
        pl.col('session').count().over('session').alias('session_length')
    ])


def add_type_weighted_log_recency_score(df):
    type_weights = {0:0.5, 1:9, 2:0.5}
    type_weighted_log_recency_score = pl.Series(df['log_recency_score'] / df['type'].apply(lambda x: type_weights[x]))
    return df.with_column(type_weighted_log_recency_score.alias('type_weighted_log_recency_score'))


def add_log_recency_score(df):
    linear_interpolation = 0.1 + ((1-0.1) / (df['session_length']-1)) * (df['session_length']-df['action_num_reverse_chrono']-1)
    return df.with_columns(pl.Series(2**linear_interpolation - 1).alias('log_recency_score')).fill_nan(1)


def apply(df, pipeline):
    for f in pipeline:
        df = f(df)
    return df

In [47]:
# add features
pipeline = [
    add_covisit_clicks_feature,
    add_click_num_chrono,
    add_click_num_reverse_chrono,
    add_cart_num_chrono,
    add_order_num_chrono,
    add_aid_interacted_with_count,
    add_sec_since_last_action,
    add_sec_since_first_action,
    add_sec_to_session_end,
    add_action_num_reverse_chrono_unique,
    add_action_num_reverse_chrono,
    add_session_length,
    add_log_recency_score,
    add_type_weighted_log_recency_score,
    add_word2vec_feature,
    explode_word2vec_to_columns
]

In [52]:
user_item_features = apply(test, pipeline)
user_item_features = cusum_scores(user_item_features)
user_item_features = user_item_features.unique(subset=['session', 'aid'], keep='last')
user_item_features

session,aid,ts,type,covisit_clicks_feature,click_num_chrono,click_num_reverse_chrono,cart_num_chrono,order_num_chrono,aid_interacted_with_unique,seconds_since_last_action,seconds_since_first_action,seconds_to_session_end,action_num_reverse_chrono_unique,action_num_reverse_chrono,session_length,log_recency_score,type_weighted_log_recency_score,word2vec_feature_0,word2vec_feature_1,word2vec_feature_2,word2vec_feature_3,word2vec_feature_4,word2vec_feature_5,word2vec_feature_6,word2vec_feature_7,word2vec_feature_8,word2vec_feature_9
i32,i32,i32,i32,i64,i64,i64,i64,i64,i64,i32,i32,i32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
11098530,264500,1661119288,0,0,1,4,0,0,0,88,88,1244,1,4,6,0.285968,0.571937,3.736949,-1.010742,1.887903,0.122541,0.314799,-0.311605,-0.072674,3.270477,-0.534411,1.142654
11098530,409236,1661120532,1,1,4,1,1,0,1,367,1332,0,0,0,6,2.699277,3.509665,3.736949,-1.010742,1.887903,0.122541,0.314799,-0.311605,-0.072674,3.270477,-0.534411,1.142654
11098531,1239060,1661119227,0,0,1,22,0,0,1,27,27,519,9,22,24,0.101241,0.202482,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458
11098531,1309633,1661119284,0,3,5,18,0,0,4,16,84,462,6,18,24,0.22744,0.454879,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458
11098531,1449555,1661119311,0,4,6,17,0,0,5,27,111,435,5,17,24,0.261187,0.522375,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458
11098531,1557766,1661119323,0,5,7,16,0,0,2,12,123,423,8,16,24,0.427382,0.854764,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458
11098531,1553691,1661119429,0,3,14,9,0,0,9,40,229,317,1,9,24,0.566805,1.133611,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458
11098531,624163,1661119485,0,9,15,8,0,0,10,56,285,261,0,8,24,0.609884,1.219768,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458
11098531,1365569,1661119543,0,9,19,4,0,0,7,8,343,203,3,4,24,1.908827,3.817655,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458
11098531,1728212,1661119746,2,10,19,4,0,0,6,203,546,0,4,3,24,2.280539,4.561078,0.358725,-0.341024,-0.363563,0.089733,0.167635,0.414435,0.203166,0.1015,-0.32519,-0.155458


In [54]:
def add_datetime(df: pl.DataFrame) -> pl.DataFrame:
    MILLISECONDS_IN_SECOND = 1000

    df = df.with_columns(
        [
            (pl.col("ts").cast(pl.Int64) * MILLISECONDS_IN_SECOND)
            .cast(pl.Datetime)
            .dt.with_time_unit("ms")
            .alias("datetime")
        ]
    )
    return df


def add_day_of_week_pandas(df):
    df = df.to_pandas()
    df['day_of_week'] = df['datetime'].dt.dayofweek
    return pl.DataFrame(df)


def add_hour_of_the_day_pandas(df):
    df = df.to_pandas()
    df['hour_of_the_day'] = df['datetime'].dt.hour
    return pl.DataFrame(df)


def add_is_day_pandas(df):
    df = df.to_pandas()
    df['is_day'] = df['hour_of_the_day'].apply(lambda x: 1 if x >= 6 and x <= 18 else 0)
    return pl.DataFrame(df)


test = add_datetime(test)
test = add_day_of_week_pandas(test)
test = add_hour_of_the_day_pandas(test)
test = add_is_day_pandas(test)

In [55]:
test.head()

session,aid,ts,type,datetime,day_of_week,hour_of_the_day,is_day
i32,i32,i32,u8,datetime[ns],i64,i64,i64
11098530,264500,1661119200,0,2022-08-21 22:00:00,6,22,0
11098530,264500,1661119288,0,2022-08-21 22:01:28,6,22,0
11098530,409236,1661119369,0,2022-08-21 22:02:49,6,22,0
11098530,409236,1661119441,0,2022-08-21 22:04:01,6,22,0
11098530,409236,1661120165,0,2022-08-21 22:16:05,6,22,0


### USER FEATURES

In [72]:

real_session_gap = 2 * 60 * 60 


def add_mean_type(df):
    result = df.groupby('session').agg(
        [
            pl.col('type').mean().alias('mean_type')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_type_ratio(df):
    result = df.groupby('session').agg(
        [
            pl.col('type').filter(pl.col('type') == 0).count().alias('click_count'),
            pl.col('type').filter(pl.col('type') == 1).count().alias('cart_count'),
            pl.col('type').filter(pl.col('type') == 2).count().alias('order_count'),
            pl.col('type').count().alias('total_count')
        ]
    ).fill_null(-1)
    result = result.with_columns(
        [
            (pl.col('click_count') / pl.col('total_count')).alias('click_ratio'),
            (pl.col('cart_count') / pl.col('total_count')).alias('cart_ratio'),
            (pl.col('order_count') / pl.col('total_count')).alias('order_ratio')

        ]
    )
    return df.join(result, on='session', how='left')


def add_avg_hour_user_clicks(df):
    result = df.groupby('session').agg(
        [
            pl.col('datetime')
            .filter(pl.col('type') == 0)
            .dt.hour()
            .mean()
            .alias('avg_hour_user_clicks')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_avg_hour_user_carts(df):
    result = df.groupby('session').agg(
        [
            pl.col('datetime')
            .filter(pl.col('type') == 1)
            .dt.hour()
            .mean()
            .alias('avg_hour_user_carts')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_avg_hour_user_orders(df):
    result = df.groupby('session').agg(
        [
            pl.col('datetime')
            .filter(pl.col('type') == 2)
            .dt.hour()
            .mean()
            .alias('avg_hour_user_orders')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_avg_hour_user_interactions(df):
    result = df.groupby('session').agg(
        [
            pl.col('datetime')
            .dt.hour()
            .mean()
            .alias('avg_hour_user_interactions')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_last_day_of_week_user_activity(df):
    result = df.groupby('session').agg(
        [
            pl.col('day_of_week')
            .max()
            .alias('last_day_of_week_user_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_first_day_of_week_user_activity(df):
    result = df.groupby('session').agg(
        [
            pl.col('day_of_week')
            .min()
            .alias('first_day_of_week_user_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_avg_day_of_week_user_activity(df):
    result = df.groupby('session').agg(
        [
            pl.col('day_of_week')
            .mean()
            .alias('avg_day_of_week_user_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


# number of real sessions based on time gap activity
def add_num_real_sessions(df):
    result = df.groupby('session').agg(
        [
            pl.col('ts')
            .diff()
            .filter(pl.col('datetime').diff() > real_session_gap)
            .count()
            .alias('num_real_sessions')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_avg_hour_in_each_real_session(df):
    result = df.groupby('session').agg(
        [
            pl.col('datetime')
            .filter(pl.col('datetime').diff() > real_session_gap)
            .dt.hour()
            .mean()
            .alias('avg_hour_in_each_real_session')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_avg_items_in_each_real_session(df):
    result = df.groupby('session').agg(
        [
            pl.col('aid')
            .filter(pl.col('datetime').diff() > real_session_gap)
            .count()
            # .mean()
            .alias('avg_items_in_each_real_session')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_mean_type_in_real_session(df):
    result = df.groupby('session').agg(
        [
            pl.col('type')
            .filter(pl.col('datetime').diff() > real_session_gap)
            .mean()
            .alias('mean_type_in_each_real_session')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_avg_time_between_clicks(df):
    result = df.groupby('session').agg(
        [
            pl.col('ts')
            .filter(pl.col('type') == 0)
            .diff()
            .mean()
            .alias('avg_time_between_clicks')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_num_of_day_events(df):
    result = df.groupby('session').agg(
        [
            pl.col('is_day')
            .filter(pl.col('is_day') == 1)
            .count()
            .alias('num_day_events')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_num_of_night_events(df):
    result = df.groupby('session').agg(
        [
            pl.col('is_day')
            .filter(pl.col('is_day') == 0)
            .count()
            .alias('num_night_events')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_num_of_weekend_events(df):
    result = df.groupby('session').agg(
        [
            pl.col('day_of_week')
            .filter(pl.col('day_of_week') > 4)
            .count()
            .alias('num_weekend_events')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_num_of_weekday_events(df):
    result = df.groupby('session').agg(
        [
            pl.col('day_of_week')
            .filter(pl.col('day_of_week') < 5)
            .count()
            .alias('num_weekday_events')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')



def add_if_weekday_user(df):
    # if major fraction of events are weekday events
    return df.with_column(
        pl.when(
            pl.col('num_weekday_events') > pl.col('num_weekend_events')
        )
        .then(1)
        .otherwise(0)
        .alias('if_weekday_user')
    )


def add_if_weekend_user(df):
    # if major fraction of events are weekend events
    return df.with_column(
        pl.when(
            pl.col('num_weekday_events') < pl.col('num_weekend_events')
        )
        .then(1)
        .otherwise(0)
        .alias('if_weekend_user')
    )


def add_first_hour_of_user_activity(df):
    result = df.groupby('session').agg(
        [
            pl.col('datetime')
            .dt.hour()
            .min()
            .alias('first_hour_of_user_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_last_hour_of_user_activity(df):
    result = df.groupby('session').agg(
        [
            pl.col('datetime')
            .dt.hour()
            .max()
            .alias('last_hour_of_user_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_first_day_of_week_user_activity(df):
    result = df.groupby('session').agg(
        [
            pl.col('day_of_week')
            .min()
            .alias('first_day_of_week_user_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')


def add_last_day_of_week_user_activity(df):
    result = df.groupby('session').agg(
        [
            pl.col('day_of_week')
            .max()
            .alias('last_day_of_week_user_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='session', how='left')
    


In [70]:
pipeline = [
    add_avg_day_of_week_user_activity,
    add_first_day_of_week_user_activity,
    add_last_day_of_week_user_activity,
    add_first_day_of_week_user_activity,
    add_last_day_of_week_user_activity,
    add_avg_hour_user_interactions,
    add_first_hour_of_user_activity,
    add_last_hour_of_user_activity,
    add_avg_hour_in_each_real_session,
    add_avg_items_in_each_real_session,
    add_avg_time_between_clicks,
    add_num_real_sessions,
    add_num_of_day_events,
    add_num_of_night_events,
    add_num_of_weekend_events,
    add_num_of_weekday_events,
    add_if_weekday_user,
    add_if_weekend_user,
    add_mean_type,
    add_mean_type_in_real_session,
    add_type_ratio
]

In [71]:
user_features = apply(test, pipeline)
user_features = user_features.unique(subset=['session'])
user_features = user_features.drop(['datetime', 'ts', 'day_of_week', 'is_day', 'hour_of_the_day', 'type', 'aid'])
user_features

session,avg_day_of_week_user_activity,first_day_of_week_user_activity,last_day_of_week_user_activity,first_day_of_week_user_activity_right,last_day_of_week_user_activity_right,avg_hour_user_interactions,first_hour_of_user_activity,last_hour_of_user_activity,avg_hour_in_each_real_session,avg_items_in_each_real_session,avg_time_between_clicks,num_real_sessions,num_day_events,num_night_events,num_weekend_events,num_weekday_events,if_weekday_user,if_weekend_user,mean_type,mean_type_in_eacn_real_session,click_count,cart_count,order_count,total_count,click_ratio,cart_ratio,order_ratio
i32,f64,i64,i64,i64,i64,f64,i64,i64,f64,i64,f64,i64,i64,i64,i64,i64,i32,i32,f64,f64,i64,i64,i64,i64,f64,f64,f64
11098530,6.0,6,6,6,6,22.0,22,22,22.0,5,241.25,5,0,6,6,0,0,1,0.166667,0.25,5,1,0,6,0.833333,0.166667,0.0
11098531,6.0,6,6,6,6,22.0,22,22,22.0,24,18.052632,20,0,24,24,0,0,1,0.333333,0.421053,20,0,4,24,0.833333,0.0,0.166667
11098535,5.4,0,6,0,6,21.1,13,22,22.0,4,6784.0,9,1,9,9,1,0,1,0.1,0.1,9,1,0,10,0.9,0.1,0.0
11098539,6.0,6,6,6,6,22.0,22,22,22.0,0,54.5,2,0,3,3,0,0,1,0.0,-1.0,3,0,0,3,1.0,0.0,0.0
11098542,6.0,6,6,6,6,22.0,22,22,22.0,14,50.928571,21,0,22,22,0,0,1,0.318182,0.375,15,7,0,22,0.681818,0.318182,0.0
11098551,6.0,6,6,6,6,22.0,22,22,22.0,1,-1.0,0,0,1,1,0,0,1,0.0,0.0,1,0,0,1,1.0,0.0,0.0
11098552,6.0,6,6,6,6,22.0,22,22,22.0,1,-1.0,0,0,1,1,0,0,1,0.0,0.0,1,0,0,1,1.0,0.0,0.0
11098557,6.0,6,6,6,6,22.0,22,22,22.0,11,40.272727,11,0,12,12,0,0,1,0.0,0.0,12,0,0,12,1.0,0.0,0.0
11098558,6.0,6,6,6,6,22.0,22,22,22.0,5,73.9,11,0,12,12,0,0,1,0.083333,0.090909,11,1,0,12,0.916667,0.083333,0.0
11098559,6.0,6,6,6,6,22.0,22,22,22.0,2,57.5,2,0,3,3,0,0,1,0.0,0.0,3,0,0,3,1.0,0.0,0.0


In [53]:
test.head()

session,aid,ts,type
i32,i32,i32,u8
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0
11098530,409236,1661119441,0
11098530,409236,1661120165,0


### ITEM FEATURES

In [137]:
one_day = 24 * 60 * 60
seven_days = 7 * one_day

def add_avg_hour_of_day_item_interactions(df):
    result = df.groupby('aid').agg(
        [
            pl.col('hour_of_the_day')
            .mean()
            .alias('avg_hour_of_day_item_interactions')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_avg_day_of_week_item_interactions(df):
    result = df.groupby('aid').agg(
        [
            pl.col('day_of_week')
            .mean()
            .alias('avg_day_of_week_item_interactions')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_first_day_of_week_item_activity(df):
    result = df.groupby('aid').agg(
        [
            pl.col('day_of_week')
            .min()
            .alias('first_day_of_week_item_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_last_day_of_week_item_activity(df):
    result = df.groupby('aid').agg(
        [
            pl.col('day_of_week')
            .max()
            .alias('last_day_of_week_item_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_first_hour_of_item_activity(df):
    result = df.groupby('aid').agg(
        [
            pl.col('datetime')
            .dt.hour()
            .min()
            .alias('first_hour_of_item_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_last_hour_of_item_activity(df):
    result = df.groupby('aid').agg(
        [
            pl.col('datetime')
            .dt.hour()
            .max()
            .alias('last_hour_of_item_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_retype_ratio(df, event_type):
    type2id = {"clicks": 0, "carts": 1, "orders": 2}
    duplicated_counts = df.groupby(['session', 'aid']).agg(
        [
            pl.col('aid')
            .filter(pl.col('type') == type2id[event_type])
            .count()
            .alias(f'{event_type}_count'),
        ]
    )
    # might be a good idea to put no-reclicks to zero
    # adjust all values by -1
    retype_ratio_df = duplicated_counts.groupby('aid').agg(
        [
            (pl.col(f'{event_type}_count')
             .mean())
            .alias(f're{event_type}_count_ratio')
        ]
    ).fill_null(0)
    return df.join(retype_ratio_df, on='aid', how='left')


def add_most_popular_day_of_week_item_activity(df):
    result = df.groupby('aid').agg(
        [
            pl.col('day_of_week')
            .mode().mean()
            .alias('most_popular_day_of_week_item_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_most_popular_hour_of_day_item_activity(df):
    result = df.groupby('aid').agg(
        [
            pl.col('hour_of_the_day')
            .mode().mean()
            .alias('most_popular_hour_of_day_item_activity')
        ]
    ).fill_null(-1)
    return df.join(result, on='aid', how='left')


def add_item_clicks_ratio(df):
    result = df.groupby('aid').agg(
        [
            (pl.col('type')
             .filter(pl.col('type') == 0)
             .count()
             .alias('clicks_count')
             / pl.col('type')
             .count()
             .alias('total_count'))
            .alias('item_clicks_ratio')
        ]
    ).fill_null(0)
    return df.join(result, on='aid', how='left')


def add_item_carts_ratio(df):
    result = df.groupby('aid').agg(
        [
            (pl.col('type')
             .filter(pl.col('type') == 1)
             .count()
             .alias('carts_count')
             / pl.col('type')
             .count()
             .alias('total_count'))
            .alias('item_carts_ratio')
        ]
    ).fill_null(0)
    return df.join(result, on='aid', how='left')



def add_type_ratio_last_7days(df):
    result = df.groupby('aid').agg(
        [
            (pl.col('type')
             .filter(pl.col('type') == 0)
             .count()
             .alias('clicks_count')
             / pl.col('type')
             .filter(pl.col('ts') > pl.col('ts').max() - seven_days)
             .count()
             .alias('total_count'))
            .alias('item_clicks_ratio_last_7days'),
            (pl.col('type')
             .filter(pl.col('type') == 1)
             .count()
             .alias('carts_count')
             / pl.col('type')
             .filter(pl.col('ts') > pl.col('ts').max() - seven_days)
             .count()
             .alias('total_count'))
            .alias('item_carts_ratio_last_7days'),
            (pl.col('type')
             .filter(pl.col('type') == 2)
             .count()
             .alias('orders_count')
             / pl.col('type')
             .filter(pl.col('ts') > pl.col('ts').max() - seven_days)
             .count()
             .alias('total_count'))
            .alias('item_orders_ratio_last_7days'),
        ]
    ).fill_null(0)
    return df.join(result, on='aid', how='left')


# not sure if this is a good idea
def add_type_ratio_last_24hours(df):
    result = df.groupby('aid').agg(
        [
            (pl.col('type')
             .filter(pl.col('type') == 0)
             .count()
             .alias('clicks_count')
             / pl.col('type')
             .filter(pl.col('ts') > pl.col('ts').max() - one_day)
             .count()
             .alias('total_count'))
            .alias('item_clicks_ratio_last_24hours'),
            (pl.col('type')
             .filter(pl.col('type') == 1)
             .count()
             .alias('carts_count')
             / pl.col('type')
             .filter(pl.col('ts') > pl.col('ts').max() - one_day)
             .count()
             .alias('total_count'))
            .alias('item_carts_ratio_last_24hours'),
            (pl.col('type')
             .filter(pl.col('type') == 2)
             .count()
             .alias('orders_count')
             / pl.col('type')
             .filter(pl.col('ts') > pl.col('ts').max() - one_day)
             .count()
             .alias('total_count'))
            .alias('item_orders_ratio_last_24hours'),
        ]
    ).fill_null(0)
    return df.join(result, on='aid', how='left')


In [138]:
pipeline = [
    add_avg_hour_of_day_item_interactions,
    add_avg_day_of_week_item_interactions,
    add_first_day_of_week_item_activity,
    add_last_day_of_week_item_activity,
    add_first_hour_of_item_activity,
    add_last_hour_of_item_activity,
    lambda df: add_retype_ratio(df, 'clicks'),
    lambda df: add_retype_ratio(df, 'carts'),
    lambda df: add_retype_ratio(df, 'orders'),
    add_most_popular_day_of_week_item_activity,
    add_most_popular_hour_of_day_item_activity,
    add_item_clicks_ratio,
    add_item_carts_ratio,
    add_type_ratio_last_7days,
    add_type_ratio_last_24hours,
]

In [139]:
item_features = apply(test, pipeline)
item_features.unique(subset=['aid'])
item_features = item_features.drop(columns=['session', 'ts', 'datetime', 'type', 'day_of_week', 'hour_of_the_day', 'is_day'])
item_features

aid,avg_hour_of_day_item_interactions,avg_day_of_week_item_interactions,first_day_of_week_item_activity,last_day_of_week_item_activity,first_hour_of_item_activity,last_hour_of_item_activity,reclicks_count_ratio,recarts_count_ratio,reorders_count_ratio,most_popular_day_of_week_item_activity,most_popular_hour_of_day_item_activity,item_clicks_ratio,item_carts_ratio,item_clicks_ratio_last_7days,item_carts_ratio_last_7days,item_orders_ratio_last_7days,item_clicks_ratio_last_24hours,item_carts_ratio_last_24hours,item_orders_ratio_last_24hours
i32,f64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
264500,13.772727,1.909091,0,6,3,22,1.222222,0.0,0.0,1.0,7.0,1.0,0.0,1.0,0.0,0.0,11.0,0.0,0.0
264500,13.772727,1.909091,0,6,3,22,1.222222,0.0,0.0,1.0,7.0,1.0,0.0,1.0,0.0,0.0,11.0,0.0,0.0
409236,14.873418,3.075949,0,6,3,23,1.415842,0.108911,0.039604,1.0,18.0,0.905063,0.06962,0.905063,0.06962,0.025316,4.766667,0.366667,0.133333
409236,14.873418,3.075949,0,6,3,23,1.415842,0.108911,0.039604,1.0,18.0,0.905063,0.06962,0.905063,0.06962,0.025316,4.766667,0.366667,0.133333
409236,14.873418,3.075949,0,6,3,23,1.415842,0.108911,0.039604,1.0,18.0,0.905063,0.06962,0.905063,0.06962,0.025316,4.766667,0.366667,0.133333
409236,14.873418,3.075949,0,6,3,23,1.415842,0.108911,0.039604,1.0,18.0,0.905063,0.06962,0.905063,0.06962,0.025316,4.766667,0.366667,0.133333
452188,19.75,5.75,5,6,13,22,1.5,0.0,0.5,6.0,22.0,0.75,0.0,0.75,0.0,0.25,3.0,0.0,1.0
1239060,22.0,6.0,6,6,22,22,1.0,0.0,0.0,6.0,22.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1557766,22.0,6.0,6,6,22,22,2.0,0.0,0.0,6.0,22.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
452188,19.75,5.75,5,6,13,22,1.5,0.0,0.5,6.0,22.0,0.75,0.0,0.75,0.0,0.25,3.0,0.0,1.0
