In [None]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=True)

import polars as pl

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

In [None]:
def reduce_memory(df):
    """Downcast the numeric columns of ``df`` in place and return the same frame.

    Columns whose dtype is ``object`` are left alone. Columns whose dtype
    name starts with ``int`` become ``int32`` when their min/max lie strictly
    inside the int32 range, otherwise ``int64`` when they lie strictly inside
    the int64 range. Every other column is cast to ``float32`` when its
    min/max lie strictly inside the float32 range and to ``float64`` otherwise.
    """
    int32_limits = np.iinfo(np.int32)
    int64_limits = np.iinfo(np.int64)
    float32_limits = np.finfo(np.float32)

    for name in df.columns:
        dtype = df[name].dtypes
        if dtype == object:
            continue

        lowest, highest = df[name].min(), df[name].max()

        if str(dtype).startswith('int'):
            if lowest > int32_limits.min and highest < int32_limits.max:
                df[name] = df[name].astype(np.int32)
            elif lowest > int64_limits.min and highest < int64_limits.max:
                df[name] = df[name].astype(np.int64)
        elif lowest > float32_limits.min and highest < float32_limits.max:
            df[name] = df[name].astype(np.float32)
        else:
            # Covers values outside the float32 range and NaN min/max.
            df[name] = df[name].astype(np.float64)
    return df

# Feature Extraction

In [None]:
# Run mode. The path-selection cell handles "local" and "kaggle"; the value is
# also interpolated into the input/output file names used by this notebook.
GENERATE_FOR = "kaggle" # alternative: "local"

In [None]:
# Maps each event-type name to the integer code compared against the `type` column.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [None]:
# Candidate-count tag that appears in the candidate/covisitation/feature file names.
CANDIDATE_COUNT = 100

In [None]:
# Resolve the train/validation parquet paths for the selected run mode.
if GENERATE_FOR == "local":
    train_path = "./splitted_raw_data/train.parquet"
    val_path = "./splitted_raw_data/val.parquet"

elif GENERATE_FOR == "kaggle":
    train_path = "./splitted_raw_data/all_train.parquet"
    val_path = "./splitted_raw_data/test.parquet"

else:
    # Fail immediately instead of hitting a NameError on train_path/val_path
    # many cells later.
    raise ValueError(
        f"Unsupported GENERATE_FOR value {GENERATE_FOR!r}; expected 'local' or 'kaggle'"
    )

### CANDIDATE COVISIT FEATURES

In [None]:
def get_covisitation_features(input_cand_df,
                              input_user_int_df,
                              input_covisit_df,
                              covisit_name="clicks",
                              score_col="wgt",
                              scoring_name="covisitation",
                              session_history_wanted_types=None):
    """Aggregate pair scores between each candidate and its session history.

    Every (session, candidate aid) row of ``input_cand_df`` is paired with the
    aids found for that session in ``input_user_int_df`` (optionally limited
    to the event types in ``session_history_wanted_types``), the pair score
    ``score_col`` is looked up in ``input_covisit_df`` on (aid_x, aid_y), and
    the scores are summarised per (session, candidate).

    Nulls produced by either left join are replaced with 0 before aggregating.

    Returns a polars frame with ``session`` and ``aid`` (both Int32) plus
    ``{covisit_name}_{scoring_name}_{max,min,std,sum,mean,count}``, sorted by
    session.
    """
    # Optionally keep only the requested interaction types in the history.
    history_df = input_user_int_df
    if session_history_wanted_types:
        history_df = input_user_int_df.filter(pl.col('type').is_in(session_history_wanted_types))

    candidate_side = input_cand_df[["session", "aid"]].rename({"aid": "aid_x"})
    history_side = history_df.rename({"aid": "aid_y"})[["session", "aid_y"]]

    # candidate x history pairs, then attach the pair score.
    scored_pairs = candidate_side.join(history_side, how="left", on="session").fill_null(0)
    scored_pairs = scored_pairs.join(input_covisit_df, how="left", on=["aid_x", "aid_y"])
    scored_pairs = scored_pairs.fill_null(0.)

    feature_prefix = covisit_name + '_' + scoring_name + '_'
    statistics = ["max", "min", "std", "sum", "mean", "count"]
    aggregations = [
        getattr(pl.col(score_col), statistic)().alias(feature_prefix + statistic)
        for statistic in statistics
    ]

    features = (
        scored_pairs
        .groupby(["session", "aid_x"])
        .agg(aggregations)
        .sort("session", reverse=False)
        .rename({"aid_x": "aid"})
    )

    features = features.with_column(pl.col("aid").cast(pl.Int32))
    features = features.with_column(pl.col("session").cast(pl.Int32))
    return features

In [None]:
def generate_candidate_history_pair_score_features(input_val_df_path,
                                                   score_df_tuples_w_names,
                                                   score_col,
                                                   scoring_name="covisitation"):
    """Build and persist candidate-vs-history pair-score features per event type.

    For each key of ``type_labels`` the matching candidate parquet file is
    streamed in batches; for every entry ``[score_df, name, wanted_types]`` of
    ``score_df_tuples_w_names`` the features from ``get_covisitation_features``
    are left-joined onto the batch. The batches are concatenated, every column
    after the first two is cast to Float32, and the result is written to
    ``../raw_data/{GENERATE_FOR}_{scoring_name}_features/``.

    Side effect: element 0 of every entry in ``score_df_tuples_w_names`` is
    replaced by a copy whose ``aid_x``/``aid_y`` columns are Int32.
    Returns None.
    """
    val_df = pl.read_parquet(input_val_df_path)
    rows_per_batch = 10_000_000

    for type_str in tqdm(list(type_labels.keys())):

        candidate_file = ParquetFile(f"./candidate_data/{GENERATE_FOR}_{CANDIDATE_COUNT}candidates_{type_str}.parquet")
        featured_batches = []

        for _batch_index, batch in tqdm(enumerate(candidate_file.iter_batches(batch_size=rows_per_batch))):
            candidate_df = batch.to_pandas()
            del batch
            candidate_df = pl.from_pandas(candidate_df)

            # Join keys are aligned to Int32 on every frame involved.
            for key_col in ("aid", "session"):
                candidate_df = candidate_df.with_column(pl.col(key_col).cast(pl.Int32))

            for key_col in ("aid", "session"):
                val_df = val_df.with_column(pl.col(key_col).cast(pl.Int32))

            for score_entry in score_df_tuples_w_names:
                for pair_col in ("aid_x", "aid_y"):
                    score_entry[0] = score_entry[0].with_column(pl.col(pair_col).cast(pl.Int32))

                candidate_df = candidate_df.join(
                    get_covisitation_features(input_cand_df=candidate_df,
                                              input_user_int_df=val_df,
                                              input_covisit_df=score_entry[0],
                                              covisit_name=score_entry[1],
                                              score_col=score_col,
                                              scoring_name=scoring_name,
                                              session_history_wanted_types=score_entry[2]),
                    how="left",
                    on=["session", "aid"])

            # Keys go back to Int64 before the batch is stored.
            for key_col in ("aid", "session"):
                candidate_df = candidate_df.with_column(pl.col(key_col).cast(pl.Int64))

            featured_batches.append(candidate_df)

            del candidate_df

        total_candidate_df = pl.concat(featured_batches)
        del featured_batches

        total_candidate_df = total_candidate_df.with_columns([pl.col(total_candidate_df.columns[2:]).cast(pl.Float32),])
        total_candidate_df.write_parquet(f'../raw_data/{GENERATE_FOR}_{scoring_name}_features/{scoring_name}_features_{type_str}_{CANDIDATE_COUNT}candidates.pqt')

        del total_candidate_df;gc.collect()

### Covisitation pair 'wgt' features

In [None]:
# Number of parquet pieces read for the clicks and carts-orders matrices;
# the buy2buy matrix is read from piece 0 only.
DISK_PIECES = 4

print("Reading clicks covisitation...")
_clicks_pieces = [
    pd.read_parquet(f'../raw_data/{GENERATE_FOR}_covisitation/{GENERATE_FOR}_top_{CANDIDATE_COUNT}_clicks_v{VER}_{k}.pqt')
    for k in range(0, DISK_PIECES)
]
clicks_cov_df = pl.from_pandas(pd.concat(_clicks_pieces, ignore_index=True))
del _clicks_pieces

print("Reading carts-orders covisitation...")
_carts_orders_pieces = [
    pd.read_parquet(f'../raw_data/{GENERATE_FOR}_covisitation/{GENERATE_FOR}_top_{CANDIDATE_COUNT}_carts_orders_v{VER}_{k}.pqt')
    for k in range(0, DISK_PIECES)
]
carts_orders_cov_df = pl.from_pandas(pd.concat(_carts_orders_pieces, ignore_index=True))
del _carts_orders_pieces

print("Reading buy2buy covisitation...")
_buy2buy_pieces = [
    pd.read_parquet(f'../raw_data/{GENERATE_FOR}_covisitation/{GENERATE_FOR}_top_{CANDIDATE_COUNT}_buy2buy_v{VER}_{k}.pqt')
    for k in range(0, 1)
]
buy2buy_cov_df = pl.from_pandas(pd.concat(_buy2buy_pieces, ignore_index=True))
del _buy2buy_pieces

In [None]:
def all_covisits_features(input_covisit_df,
                         score_col,
                         covisit_name):
    """Write per-item summary statistics of one pair-score table to parquet.

    Groups ``input_covisit_df`` by ``aid_x`` and computes max, min, std, sum,
    mean and count of ``score_col``. Output columns are named
    ``{covisit_name}_{score_col}_{stat}``, the key is renamed to ``aid`` and
    cast to Int64, and rows are sorted by it.

    The result is written to
    ``../raw_data/{GENERATE_FOR}_covisitation_features/{covisit_name}_covisitation_features.pqt``.
    Returns None.
    """
    feature_prefix = covisit_name + '_' + score_col + '_'

    # Fix: aggregate the frame passed in. The previous body read the global
    # `clicks_cov_df`, so every output file held click statistics whatever
    # `input_covisit_df` was.
    aid_cov_feat_df = (
        input_covisit_df
        .groupby(["aid_x"])
        .agg(
            [
                pl.col(score_col).max().alias(feature_prefix + "max"),
                pl.col(score_col).min().alias(feature_prefix + "min"),
                pl.col(score_col).std().alias(feature_prefix + "std"),
                pl.col(score_col).sum().alias(feature_prefix + "sum"),
                pl.col(score_col).mean().alias(feature_prefix + "mean"),
                pl.col(score_col).count().alias(feature_prefix + "count"),
            ]
        )
    ).sort("aid_x", reverse=False).rename({"aid_x":"aid"})

    aid_cov_feat_df = aid_cov_feat_df.with_columns([pl.col(["aid"]).cast(pl.Int64)])

    aid_cov_feat_df.write_parquet(f'../raw_data/{GENERATE_FOR}_covisitation_features/{covisit_name}_covisitation_features.pqt')

In [None]:
# One per-item statistics file is written for each covisitation matrix.
score_df_tuples_w_names = [
    [clicks_cov_df, "all_clicks"],
    [carts_orders_cov_df, "all_carts_orders"],
    [buy2buy_cov_df, "all_buy2buy"],
]

for covisit_frame, covisit_label in tqdm(score_df_tuples_w_names):
    all_covisits_features(input_covisit_df=covisit_frame,
                          score_col="wgt",
                          covisit_name=covisit_label)

In [None]:
# Candidate-vs-history features from the covisitation weights; the third
# element None means the session history is not filtered by event type.
score_df_tuples_w_names = [
    [covisit_frame, covisit_label, None]
    for covisit_frame, covisit_label in (
        (clicks_cov_df, "clicks"),
        (carts_orders_cov_df, "carts_orders"),
        (buy2buy_cov_df, "buy2buy"),
    )
]

generate_candidate_history_pair_score_features(
    input_val_df_path=val_path,
    score_df_tuples_w_names=score_df_tuples_w_names,
    score_col="wgt",
    scoring_name="covisitation",
)

### Word2Vec pair 'similarity' features

In [None]:
# The word2vec similarity tables are scanned lazily and their score column is
# renamed so it matches the `score_col` passed below.
_w2v_rename = {"similarity":"w2v_similarity"}

print("Reading clicks w2v...")
clicks_cov_df = pl.scan_parquet(
    f'./all_features/{GENERATE_FOR}_top_{CANDIDATE_COUNT}_clicks_w2v_similarities.pqt'
).rename(_w2v_rename)

print("Reading carts-orders w2v...")
carts_orders_cov_df = pl.scan_parquet(
    f'./all_features/{GENERATE_FOR}_top_{CANDIDATE_COUNT}_carts_w2v_similarities.pqt'
).rename(_w2v_rename)

print("Reading buy2buy w2v...")
buy2buy_cov_df = pl.scan_parquet(
    f'./all_features/{GENERATE_FOR}_top_{CANDIDATE_COUNT}_buy2buy_w2v_similarities.pqt'
).rename(_w2v_rename)

score_df_tuples_w_names = [
    [clicks_cov_df, "clicks", None],
    [carts_orders_cov_df, "carts_orders", None],
    [buy2buy_cov_df, "buy2buy", None],
]

generate_candidate_history_pair_score_features(
    input_val_df_path=val_path,
    score_df_tuples_w_names=score_df_tuples_w_names,
    score_col="w2v_similarity",
    scoring_name="word2vec",
)

## Generating Feature Data Frames

In [None]:
def generate_datetime_features(input_df):
    """Add calendar columns derived from ``ts`` to ``input_df`` in place.

    ``ts`` is read as seconds since the epoch and shifted forward by two
    hours (presumably a local-time adjustment, to be confirmed against the
    data source) before it is converted. Added columns: ``datetime``,
    ``hour``, ``dayofweek`` (Monday is 0) and ``is_weekend`` (1 when
    ``dayofweek`` is greater than 4). The same frame is returned.
    """
    shift_seconds = 2 * 60 * 60
    shifted_time = pd.to_datetime(input_df.ts + shift_seconds, unit='s')

    input_df["datetime"] = shifted_time
    input_df["hour"] = input_df["datetime"].dt.hour
    input_df["dayofweek"] = input_df["datetime"].dt.dayofweek

    weekend_mask = input_df["dayofweek"] > 4
    input_df["is_weekend"] = weekend_mask.astype(int)
    return input_df

# Load both event logs and attach the calendar columns to each.
train_df = generate_datetime_features(pd.read_parquet(train_path))
val_df = generate_datetime_features(pd.read_parquet(val_path))

# Item statistics use train and validation events together; user and
# user-item statistics use the validation events only (same object as val_df).
item_df = pd.concat([train_df, val_df], ignore_index=True)
user_df = user_item_int_df = val_df

### Extract AID-Type Occurrence Counts & Probabilities

In [None]:
# Per-item event counts for each interaction type (0=click, 1=cart, 2=order).
_counts_by_name = {
    count_name: (
        item_df[item_df.type==type_id]
        .groupby("aid")["ts"].count()
        .rename(count_name)
        .to_frame()
        .reset_index()
    )
    for type_id, count_name in ((0, "click_count"), (1, "cart_count"), (2, "order_count"))
}
aid_count_df = _counts_by_name["click_count"]
aid_cart_df = _counts_by_name["cart_count"]
aid_order_df = _counts_by_name["order_count"]
del _counts_by_name

# Cart and order counts are left-joined onto the click counts, so only items
# present in the click table are kept; missing counts become 0.
aid_count_df = (
    aid_count_df
    .merge(aid_cart_df, how="left", on="aid")
    .merge(aid_order_df, how="left", on="aid")
)
aid_count_df = aid_count_df.fillna(0.)

# Each count divided by the column total.
for _event_name in ("click", "cart", "order"):
    _count_col = f"{_event_name}_count"
    aid_count_df[f"{_event_name}_prob"] = aid_count_df[_count_col] / aid_count_df[_count_col].sum()

aid_count_df.to_parquet(f'./all_features/{GENERATE_FOR}_aid_occurences.pqt')

display(aid_count_df.head())

del aid_count_df, aid_order_df, aid_cart_df; gc.collect()

### Co-Occurrence Features

In [None]:
# Each co-occurrence table is stored as several parquet pieces; the pieces are
# read in sorted file-name order and stacked into one frame.
co_order_v_order = pd.concat(
    [pd.read_parquet(f) for f in sorted(glob.glob(f"../raw_data/{GENERATE_FOR}_cooccurence/{GENERATE_FOR}_(orders)vs(orders)_cooccurences_v10_*.pqt"))],
    ignore_index=True)

co_cart_v_cart = pd.concat(
    [pd.read_parquet(f) for f in sorted(glob.glob(f"../raw_data/{GENERATE_FOR}_cooccurence/{GENERATE_FOR}_(carts)vs(carts)_cooccurences_v10_*.pqt"))],
    ignore_index=True)

co_click_v_order = pd.concat(
    [pd.read_parquet(f) for f in sorted(glob.glob(f"../raw_data/{GENERATE_FOR}_cooccurence/{GENERATE_FOR}_(clicks)vs(orders)_cooccurences_v10_*.pqt"))],
    ignore_index=True)

co_cart_v_order = pd.concat(
    [pd.read_parquet(f) for f in sorted(glob.glob(f"../raw_data/{GENERATE_FOR}_cooccurence/{GENERATE_FOR}_(carts)vs(orders)_cooccurences_v10_*.pqt"))],
    ignore_index=True)

co_click_v_cart = pd.concat(
    [pd.read_parquet(f) for f in sorted(glob.glob(f"../raw_data/{GENERATE_FOR}_cooccurence/{GENERATE_FOR}_(clicks)vs(carts)_cooccurences_v10_*.pqt"))],
    ignore_index=True)

co_order_v_cart = pd.concat(
    [pd.read_parquet(f) for f in sorted(glob.glob(f"../raw_data/{GENERATE_FOR}_cooccurence/{GENERATE_FOR}_(orders)vs(carts)_cooccurences_v10_*.pqt"))],
    ignore_index=True)

In [None]:
def generate_PPMI_features(co_df,
                           count_df,
                           aidx_type="order",
                           aidy_type="order"):
    """Compute positive pointwise mutual information for item pairs.

    Parameters
    ----------
    co_df : pandas.DataFrame
        Pair table with columns ``aid_x``, ``aid_y`` and ``wgt``.
    count_df : pandas.DataFrame
        Per-item table with ``aid`` and the ``{type}_prob`` columns.
    aidx_type, aidy_type : str
        Prefix of the probability column used for ``aid_x`` / ``aid_y``.

    Returns
    -------
    pandas.DataFrame
        Columns ``aid_x``, ``aid_y``, ``PPMI`` where
        ``PPMI = max(log2(pair_prob / (aid_x_prob * aid_y_prob)), 0)`` and
        ``pair_prob`` is ``wgt`` divided by the total ``wgt`` of the pairs that
        survive both merges. Pairs whose items are missing from ``count_df``
        are dropped because the merges are inner joins.
    """
    x_prob_col = f"{aidx_type}_prob"
    y_prob_col = f"{aidy_type}_prob"

    # `drop(columns=...)` replaces the positional-axis form `drop("aid", 1)`,
    # which pandas deprecated in 1.3 and removed in 2.0.
    co_df_w_probs = (
        co_df
        .merge(count_df[["aid", x_prob_col]], left_on="aid_x", right_on="aid")
        .drop(columns="aid")
        .rename(columns={x_prob_col: "aid_x_prob"})
        .merge(count_df[["aid", y_prob_col]], left_on="aid_y", right_on="aid")
        .drop(columns="aid")
        .rename(columns={y_prob_col: "aid_y_prob"})
    )

    co_df_w_probs["pair_prob"] = co_df_w_probs["wgt"] / co_df_w_probs["wgt"].sum()

    independence_prob = co_df_w_probs["aid_x_prob"] * co_df_w_probs["aid_y_prob"]
    # Negative PMI (including -inf from a zero pair probability) is clipped to 0.
    co_df_w_probs["PPMI"] = np.maximum(
        np.log2(co_df_w_probs["pair_prob"] / independence_prob), 0)

    return co_df_w_probs[["aid_x", "aid_y", "PPMI"]]

aid_count_df = pd.read_parquet(f'./all_features/{GENERATE_FOR}_aid_occurences.pqt')

# (co-occurrence frame, probability prefix for aid_x, probability prefix for aid_y)
ppmi_target_combs = [
    [co_order_v_order, "order", "order"],
    [co_cart_v_cart, "cart", "cart"],
    [co_click_v_order, "click", "order"],
    [co_cart_v_order, "cart", "order"],
    [co_click_v_cart, "click", "cart"],
    [co_order_v_cart, "order", "cart"],
]

# One PPMI file is written per combination, with a preview shown for each.
for _co_frame, _x_type, _y_type in ppmi_target_combs:
    ppmi_df = generate_PPMI_features(co_df=_co_frame,
                                     count_df=aid_count_df,
                                     aidx_type=_x_type,
                                     aidy_type=_y_type)

    ppmi_df.to_parquet(f'./all_features/{GENERATE_FOR}_ppmi_{_x_type}_{_y_type}.pqt')
    display(ppmi_df.head())
    del ppmi_df; gc.collect()

del co_order_v_order, co_cart_v_cart, co_click_v_order, co_cart_v_order, co_click_v_cart, co_order_v_cart; gc.collect()

In [None]:
# (PPMI file, feature name, event types kept from the session history)
_ppmi_typed_sources = [
    (f"./all_features/{GENERATE_FOR}_ppmi_order_cart.pqt", "order_vs_cart", [2]),
    (f"./all_features/{GENERATE_FOR}_ppmi_click_cart.pqt", "click_vs_cart", [0]),
    (f"./all_features/{GENERATE_FOR}_ppmi_cart_cart.pqt", "cart_vs_cart", [1]),
    (f"./all_features/{GENERATE_FOR}_ppmi_click_order.pqt", "click_vs_order", [0]),
    (f"./all_features/{GENERATE_FOR}_ppmi_order_order.pqt", "order_vs_order", [2]),
    (f"./all_features/{GENERATE_FOR}_ppmi_cart_order.pqt", "cart_vs_order", [1]),
]

score_df_tuples_w_names = [
    [pl.read_parquet(ppmi_path), feature_name, history_types]
    for ppmi_path, feature_name, history_types in _ppmi_typed_sources
]

generate_candidate_history_pair_score_features(
    input_val_df_path=val_path,
    score_df_tuples_w_names=score_df_tuples_w_names,
    score_col="PPMI",
    scoring_name="ppmi",
)

In [None]:
# Same PPMI tables as the typed run, but the session history is not filtered
# by event type (third element is None).
_ppmi_all_history_sources = [
    (f"./all_features/{GENERATE_FOR}_ppmi_order_cart.pqt", "order_vs_cart"),
    (f"./all_features/{GENERATE_FOR}_ppmi_click_cart.pqt", "click_vs_cart"),
    (f"./all_features/{GENERATE_FOR}_ppmi_cart_cart.pqt", "cart_vs_cart"),
    (f"./all_features/{GENERATE_FOR}_ppmi_click_order.pqt", "click_vs_order"),
    (f"./all_features/{GENERATE_FOR}_ppmi_order_order.pqt", "order_vs_order"),
    (f"./all_features/{GENERATE_FOR}_ppmi_cart_order.pqt", "cart_vs_order"),
]

score_df_tuples_w_names = [
    [pl.read_parquet(ppmi_path), feature_name, None]
    for ppmi_path, feature_name in _ppmi_all_history_sources
]

generate_candidate_history_pair_score_features(
    input_val_df_path=val_path,
    score_df_tuples_w_names=score_df_tuples_w_names,
    score_col="PPMI",
    scoring_name="ppmi_all_history",
)

### Common Features

In [None]:
def datetime_aggregator(input_df,
                        group_cols=[],
                        wanted_cols=[]):
    """Summarise the calendar columns per group.

    Computes mean and std of ``hour`` and ``dayofweek`` and the mean of
    ``is_weekend`` for each ``group_cols`` group. Output columns are named
    ``{'_'.join(group_cols)}_{column}_{stat}``. ``wanted_cols`` is accepted
    for signature compatibility with the other aggregators and is not used.
    """
    stats_per_column = {
        'hour': ['mean', 'std'],
        'dayofweek': ['mean', 'std'],
        'is_weekend': ['mean'],
    }
    summary = input_df.groupby(group_cols).agg(stats_per_column)

    prefix = '_'.join(group_cols) + '_'
    summary.columns = [prefix + '_'.join(column_and_stat) for column_and_stat in summary.columns]
    return summary

def type_distribution_aggregator(input_df,
                                 group_cols=[]):
    """Return the share of each ``type`` value within every group.

    The result has one row per ``group_cols`` group and one column per
    observed type value, named ``{'_'.join(group_cols)}_type{value}_mean``.
    Types that never occur in a group are NaN.
    """
    type_shares = input_df.groupby(group_cols)['type'].value_counts(normalize=True)
    wide_shares = type_shares.unstack('type')

    prefix = '_'.join(group_cols) + '_type'
    wide_shares.columns = [prefix + str(type_value) + "_mean" for type_value in wide_shares.columns]
    return wide_shares

def type_based_aggregator(input_df,
                          group_cols=[],
                          wanted_cols=[],
                          aggregators=[]):
    """Run every aggregator separately on the rows of each event type.

    For ``type_id`` in 0, 1, 2 the rows with ``type == type_id`` are passed to
    each callable in ``aggregators`` as
    ``aggregator(rows, group_cols=..., wanted_cols=...)``. The columns of each
    result are prefixed with ``type{type_id}_`` and all results are
    concatenated side by side.

    Returns
    -------
    pandas.DataFrame
        ``3 * len(aggregators)`` blocks of feature columns, aligned on the
        aggregators' group index.
    """
    type_dfs = []
    for type_id in range(3):
        for aggregator in aggregators:
            aggregator_df = aggregator(input_df[input_df.type==type_id].reset_index(drop=True),
                                       group_cols=group_cols,
                                       wanted_cols=wanted_cols)
            aggregator_df.columns = ["type" + str(type_id) + "_" + col for col in aggregator_df.columns]
            # Fix: collect the result of every aggregator. The append used to
            # sit one level out, so only the last aggregator per type was kept.
            type_dfs.append(aggregator_df)

    return pd.concat(type_dfs, axis=1)

def existence_amount_aggregator(input_df,
                                 group_cols=[],
                                wanted_cols=[],
                                return_counts=False):
    """Derive rank and existence flags from per-group row counts.

    Counts the non-null values of every column in ``wanted_cols`` per group.
    Only the first count column is used afterwards. It yields:

    * ``{group}_cnt_pct_rank``: percentile rank of the count (float32);
    * ``..._existed``: 1 when the count is above 0;
    * ``..._existed_multiple``: 1 when the count is above 1;
    * ``..._existed_times``: the count itself, only when ``return_counts``.

    Every column whose name contains ``"count"`` is dropped from the result.
    """
    counts = input_df.groupby(group_cols).agg({col:["count"] for col in wanted_cols})

    prefix = '_'.join(group_cols) + '_'
    counts.columns = [prefix + '_'.join(column_and_stat) for column_and_stat in counts.columns]

    first_column = counts.columns[0]
    counts[prefix + 'cnt_pct_rank'] = counts[first_column].rank(pct=True).astype(np.float32)

    # Only the leading column (a count column) is turned into flags.
    for count_name in list(counts.columns[:1]):
        counts[count_name.replace("count", "existed")] = (counts[count_name]>0).astype(int)
        counts[count_name.replace("count", "existed_multiple")] = (counts[count_name]>1).astype(int)
        if return_counts:
            counts[count_name.replace("count", "existed_times")] = (counts[count_name]).astype(int)

    kept_columns = [name for name in counts.columns if "count" not in name]
    return counts[kept_columns]

def nunique_aggregator(input_df,
                       group_cols=[],
                       wanted_cols=[]):
    """Count distinct values of ``wanted_cols`` per group.

    Output columns are ``{group}_{column}_nunique`` for every wanted column
    plus ``{group}_nunq_pct_rank``, the float32 percentile rank of the first
    nunique column.
    """
    distinct_counts = input_df.groupby(group_cols).agg({col:["nunique"] for col in wanted_cols})

    prefix = '_'.join(group_cols) + '_'
    distinct_counts.columns = [prefix + '_'.join(column_and_stat) for column_and_stat in distinct_counts.columns]

    first_column = distinct_counts.columns[0]
    distinct_counts[prefix + 'nunq_pct_rank'] = (
        distinct_counts[first_column].rank(pct=True).astype(np.float32)
    )
    return distinct_counts

### Item Features

In [None]:
def item_interacted_by_sessions_perc(input_df,
              group_cols=["aid"],
              wanted_cols=[]
             ):
    """Fraction of all sessions that touched each item, per event type.

    For every key of ``type_labels`` the number of distinct sessions having at
    least one event of that type on the item is divided by the total number
    of distinct sessions in ``input_df``. The per-type columns
    ``session_toitem_{type}_perc`` are outer-merged on ``aid`` and missing
    values are filled with 0. ``group_cols`` and ``wanted_cols`` are not used.
    """
    events = input_df.copy()
    total_sessions = events.session.nunique()

    merged = None

    for type_str in tqdm(type_labels.keys()):
        feature_name = f"session_toitem_{type_str}_perc"

        # One row per (aid, session) that has an event of this type.
        pair_counts = events[events.type==type_labels[type_str]].groupby(["aid", "session"])["ts"].count()
        sessions_per_item = pair_counts.reset_index().groupby("aid")["session"].count()
        type_share = sessions_per_item.rename(feature_name).to_frame() / total_sessions

        if merged is None:
            merged = type_share.copy()
        else:
            merged = merged.merge(type_share, how="outer", on="aid")

    return merged.fillna(0.)


def item_lastweek_features(input_df,
              group_cols=["aid"],
              wanted_cols=[]
             ):
    """Per-item activity in the most recent week relative to earlier weeks.

    ``ts`` is read as seconds since the epoch and mapped to its ISO week
    number. For each (aid, type) and for both the event ``count`` and the
    ``nunique`` session count, the function returns:

    * ``type_{t}_lastweek_{agg}_occ_ratio``: last week's value divided by
      the sum over all weeks (0 when the sum is 0);
    * ``type_{t}_lastweek_{agg}_occ_amount``: last week's value;
    * ``type_{t}_lastweek_{agg}_pct_change``: last non-null week-over-week
      relative change, with NaN and +/-inf replaced by -999.

    Changes from the previous version:

    * the caller's frame is no longer modified (``ts`` used to be overwritten
      with datetimes and a ``week`` column added);
    * ``Series.dt.week`` (removed in pandas 2.0) is replaced by
      ``dt.isocalendar().week``, which gives the same ISO week number;
    * weeks are sorted, so "last week" is the highest week number rather than
      whichever week happened to appear last in the data.

    NOTE(review): ISO week numbers restart every year, so data that spans a
    year boundary would still be ordered incorrectly.

    ``group_cols`` and ``wanted_cols`` are not used. The returned frame is
    indexed by ``aid``.
    """
    events = input_df[["aid", "type", "session"]].copy()
    event_time = pd.to_datetime(input_df["ts"], unit="s")
    events["week"] = event_time.dt.isocalendar().week.astype("int64")

    # Dense (aid, week, type) grid so weeks without activity count as 0.
    # reset_index() below keeps this order: aid, then week ascending, then type.
    all_group_ids = pd.MultiIndex.from_product([events.aid.unique(),
                                                np.sort(events.week.unique()),
                                                [0,1,2]],
                                               names=['aid', 'week', 'type'])

    return_dfs = []

    for aggfunc in ["count", "nunique"]:
        grouped = events.groupby(['aid', 'week', 'type'])["session"].agg(aggfunc).rename(aggfunc)
        grouped = grouped.reindex(all_group_ids, fill_value=0).reset_index()

        per_item_type = grouped.groupby(["aid", "type"])[aggfunc]

        aid_lastweek_occ_ratio = (per_item_type.last() / per_item_type.sum()).fillna(0.).unstack('type')
        aid_lastweek_occ_ratio.columns = [ f"type_{col}_lastweek_{aggfunc}_occ_ratio" for col in aid_lastweek_occ_ratio]
        return_dfs.append(aid_lastweek_occ_ratio)

        aid_lastweek_occ_amount = per_item_type.last().fillna(0.).unstack('type')
        aid_lastweek_occ_amount.columns = [ f"type_{col}_lastweek_{aggfunc}_occ_amount" for col in aid_lastweek_occ_amount]
        return_dfs.append(aid_lastweek_occ_amount)

        grouped["pct_change"] = grouped.groupby(["aid", "type"])[aggfunc].pct_change()

        aid_lastweek_pct_change = grouped.groupby(["aid", "type"])["pct_change"].last().fillna(-999.).\
                    replace([np.inf, -np.inf], -999.).unstack('type')
        aid_lastweek_pct_change.columns = [ f"type_{col}_lastweek_{aggfunc}_pct_change" for col in aid_lastweek_pct_change]
        return_dfs.append(aid_lastweek_pct_change)

    return pd.concat(return_dfs, axis=1)

def item_recurrent_signal(input_df,
                          group_cols=[],
                          wanted_cols=[]):
    """Average number of distinct timestamps per session for each item.

    Counts the distinct ``ts`` values for each (aid, session) pair and
    averages those counts per ``aid``. The single output column is
    ``aid_recurrent_session_acts_per_item_mean``. ``group_cols`` and
    ``wanted_cols`` are not used.
    """
    acts_per_pair = input_df.groupby(['aid','session'])["ts"].nunique()
    acts_per_pair = acts_per_pair.rename("recurrent_session_acts_per_item").reset_index()

    per_item = acts_per_pair.groupby('aid').agg({
        'recurrent_session_acts_per_item': ['mean'],
    })
    per_item.columns = ['aid_' + '_'.join(column_and_stat) for column_and_stat in per_item.columns]
    return per_item

In [None]:
# Assemble the item-level feature table: each helper returns a frame and the
# frames are concatenated column-wise (axis=1).
# NOTE(review): the helpers appear to index their output by `aid`, which is
# what the column-wise concat aligns on; confirm for any new helper added here.
item_features = pd.concat([
    existence_amount_aggregator(item_df,
                                group_cols=["aid"],
                                wanted_cols=["session", "aid"]),
    nunique_aggregator(item_df,
                       group_cols=["aid"],
                       wanted_cols=["session"]),
    datetime_aggregator(item_df,
                        group_cols=["aid"]),
    type_distribution_aggregator(item_df,
                                 group_cols=["aid"]),
    item_interacted_by_sessions_perc(item_df),
    item_lastweek_features(item_df),
    item_recurrent_signal(item_df),
    # The same aggregators, computed separately on the rows of each event type.
    type_based_aggregator(item_df,
                          group_cols=["aid"],
                          wanted_cols=["aid", "session"],
                          aggregators=[datetime_aggregator,
                                       nunique_aggregator,
                                       existence_amount_aggregator,
                                      item_recurrent_signal])
], axis=1)

# Downcast numeric columns before writing.
item_features = reduce_memory(item_features)

item_features.to_parquet(f'./all_features/{GENERATE_FOR}_item_features.pqt')

print("Item features are created!")

# Drop the reference and run the garbage collector.
del item_features; gc.collect()

### User Features

In [None]:
def session_len(input_df,
                group_cols=["session"],
                wanted_cols=[],
                return_min_max=False
               ):
    """Time span between the first and last event of each group.

    Computes the min and max ``ts`` per ``group_cols`` group and their
    difference ``session_len``. With ``return_min_max=True`` the frame also
    carries ``session_start`` and ``session_end``; otherwise only
    ``session_len`` is returned. ``wanted_cols`` is not used.
    """
    timestamps = input_df[group_cols + ["ts"]].copy()
    bounds = timestamps.groupby(group_cols).agg({"ts":["min", "max"]})
    bounds.columns = ["session_start", "session_end"]
    bounds["session_len"] = bounds["session_end"] - bounds["session_start"]

    return bounds if return_min_max else bounds[["session_len"]]
    
def partial_session_features(input_df,
                        group_cols=["session"],
                        wanted_cols=[]
           ):
    """Features describing the "partial sessions" inside each session.

    A session is split into partial sessions wherever the gap between two
    consecutive events exceeds 60*60*6 (six hours if ``ts`` is in seconds;
    TODO confirm the unit). The returned frame is indexed by ``session`` and
    holds:

    * ``max_partial_session_id``: highest partial-session index (0-based);
    * ``partial_mean_nunique_aid`` / ``partial_mean_count_aid``: mean number
      of distinct / all aids per partial session;
    * for every key of ``type_labels``: ``partial_mean_{type}`` (mean number
      of events of that type per partial session, 0 when absent) and
      ``partial_mean_tsdiff_{type}`` (mean gap between consecutive events of
      that type, -999 when absent).

    ``group_cols`` and ``wanted_cols`` are not used. The input is copied, so
    the caller's frame is not modified.
    NOTE(review): the diff/cumsum logic assumes rows are ordered by time
    within each session; verify against the caller.
    """

    return_feature_list = []

    return_df = input_df.copy()
    # 1 on rows that start a new partial session; the first row's NaN diff compares False.
    return_df["ts_diff_gt_thr"] = (return_df.groupby("session")["ts"].diff() > 60*60*6).astype(int)
    # The running sum of those flags is the partial-session index within the session.
    return_df["partial_session_id"] = return_df.groupby("session")["ts_diff_gt_thr"].cumsum()

    max_partial_sessions_per_session = return_df.groupby("session")["partial_session_id"].\
                max().rename("max_partial_session_id")
    return_feature_list.append("max_partial_session_id")

    return_df = return_df.merge(max_partial_sessions_per_session, how="left", on="session")

    #########

    # Mean number of distinct items per partial session.
    mean_nunq_items = return_df.groupby(["session", "partial_session_id"])["aid"].nunique().\
        reset_index().groupby("session")["aid"].mean().rename("partial_mean_nunique_aid")
    return_df = return_df.merge(mean_nunq_items, how="left", on="session")
    return_feature_list.append("partial_mean_nunique_aid")

    #########

    # Mean number of events per partial session.
    mean_items = return_df.groupby(["session", "partial_session_id"])["aid"].count().\
        reset_index().groupby("session")["aid"].mean().rename("partial_mean_count_aid")
    return_df = return_df.merge(mean_items, how="left", on="session")
    return_feature_list.append("partial_mean_count_aid")

    #########

    for type_str in type_labels.keys():
        mean_type_colname = f"partial_mean_{type_str}"
        mean_type = return_df[return_df.type==type_labels[type_str]].groupby(["session", "partial_session_id"])["ts"].count().\
            reset_index().groupby("session")["ts"].mean().rename(mean_type_colname)
        # fillna(0) is applied to the whole frame, not only to the new column.
        return_df = return_df.merge(mean_type, how="left", on="session").fillna(0)
        return_feature_list.append(mean_type_colname)

        #########

        mean_type_colname = f"partial_mean_tsdiff_{type_str}"
        # "ts_diff" is reused on every iteration; only the rows of the current
        # type are overwritten and only those rows feed the mean below.
        return_df.loc[return_df.type==type_labels[type_str], "ts_diff"] =\
            return_df[return_df.type==type_labels[type_str]].groupby(["session", "partial_session_id"])["ts"].\
                    diff().rename("ts_diff")

        ts_meandiff = return_df[return_df.type==type_labels[type_str]].groupby("session")["ts_diff"].mean().rename(mean_type_colname)
        # fillna(-999) also applies to the whole frame, including "ts_diff".
        return_df = return_df.merge(ts_meandiff, how="left", on="session").fillna(-999)
        return_feature_list.append(mean_type_colname)

        #########

    # Every feature is constant within a session, so the first row is enough.
    return return_df[["session"] + return_feature_list].groupby("session").first()

def order_size_stats(input_df,
                        group_cols=["session"],
                        wanted_cols=[]
           ):
    """Min, max and mean order size per session.

    Order events (``type == 2``) sharing the same ``session`` and ``ts`` are
    treated as one order whose size is the number of distinct aids. The
    output columns are
    ``{'_'.join(group_cols)}_aid_in_orders_nunique_{min,max,mean}`` and the
    frame is indexed by ``session``. ``wanted_cols`` is not used.
    """
    order_events = input_df[input_df.type==2].reset_index().copy()

    order_sizes = order_events.groupby(["session", "ts"]).agg({"aid":["nunique"]})
    order_sizes.columns = ["aid_in_orders_nunique"]

    per_session = order_sizes.reset_index().groupby("session").agg({
        "aid_in_orders_nunique": ["min", "max", "mean"]
    })

    prefix = '_'.join(group_cols) + '_'
    per_session.columns = [prefix + '_'.join(column_and_stat) for column_and_stat in per_session.columns]
    return per_session

def user_action_conversion_ratios(input_df,
                                  group_cols=["session"],
                                  wanted_cols=[]
                                 ):
    """Share of a session's items that received two given event types.

    For every (session, aid) pair the function records whether the item was
    clicked (type 0), carted (type 1) and ordered (type 2) in that session.
    Per session it returns the mean over the session's items of:

    * ``session_click_cart_relation``: clicked and carted;
    * ``session_click_order_relation``: clicked and ordered;
    * ``session_cart_order_relation``: carted and ordered.

    ``group_cols`` and ``wanted_cols`` are not used. The input frame is not
    modified.
    """
    # Indicator columns are built directly; the previous version set 1 via
    # .loc and then ran fillna(0.) over the whole frame.
    flags = input_df[["session", "aid"]].copy()
    for type_id in range(3):
        flags[f"type_{type_id}"] = (input_df["type"] == type_id).astype(float)

    per_pair = flags.groupby(["session", "aid"]).agg({"type_0": "max",
                                                      "type_1": "max",
                                                      "type_2": "max"}).reset_index()

    clicked = per_pair["type_0"] == 1.
    carted = per_pair["type_1"] == 1.
    ordered = per_pair["type_2"] == 1.

    # Fix: each comparison is evaluated before `&`. The old expression
    # `(a == 1.) & b == 1.` parsed as `((a == 1.) & b) == 1.` because `&`
    # binds tighter than `==`, and relied on pandas coercing a float column
    # inside a logical operation.
    per_pair["session_click_cart_relation"] = (clicked & carted).astype(int)
    per_pair["session_click_order_relation"] = (clicked & ordered).astype(int)
    per_pair["session_cart_order_relation"] = (carted & ordered).astype(int)

    return per_pair.groupby(["session"]).agg({
        "session_click_cart_relation":"mean",
        "session_click_order_relation":"mean",
        "session_cart_order_relation":"mean",
    })

In [None]:
# Assemble the session-level feature table: each helper returns a frame and
# the frames are concatenated column-wise (axis=1).
# NOTE(review): the helpers appear to index their output by `session`, which
# is what the column-wise concat aligns on; confirm for any new helper added.
user_features = pd.concat([
    existence_amount_aggregator(user_df,
                                group_cols=["session"],
                                wanted_cols=["session", "aid"]),
    session_len(user_df),
    nunique_aggregator(user_df,
                       group_cols=["session"],
                       wanted_cols=["aid"]),
    datetime_aggregator(user_df,
                        group_cols=["session"]),
    type_distribution_aggregator(user_df,
                                 group_cols=["session"]),
    partial_session_features(user_df),
    order_size_stats(user_df),
    user_action_conversion_ratios(user_df),
    # The same aggregators, computed separately on the rows of each event type.
    type_based_aggregator(user_df,
                          group_cols=["session"],
                          wanted_cols=["aid", "session"],
                          aggregators=[datetime_aggregator,
                                       session_len,
                                       nunique_aggregator,
                                       existence_amount_aggregator])
], axis=1)

# Downcast numeric columns before writing.
user_features = reduce_memory(user_features)

user_features.to_parquet(f'./all_features/{GENERATE_FOR}_user_features.pqt')

print("User features are created!")

In [None]:
# The table is already on disk; drop the in-memory reference.
del user_features

### User-Item Interaction Features

In [None]:
def is_last_aid_of_the_session(input_df,
                               group_cols=["session", "aid"],
                               wanted_cols=[]
                              ):
    """Flag the (session, aid) pairs that hold the final row of a session.

    A row is considered last when the next row belongs to a different
    session (or there is no next row), so the result depends on the row order
    of ``input_df``. Returns a frame indexed by ``group_cols`` with one column,
    ``is_aid_interacted_last_in_session`` (1 or 0). ``wanted_cols`` is not used.
    """
    keys = input_df[group_cols].copy()
    keys["is_aid_interacted_last"] = 0

    next_row_is_other_session = keys.session.shift(-1) != keys.session
    keys.loc[next_row_is_other_session, "is_aid_interacted_last"] = 1

    flags = keys.groupby(group_cols).agg({"is_aid_interacted_last":["max"]})
    flags.columns = ["is_aid_interacted_last_in_session"]
    return flags

def aid_session_ts_offsets(input_df,
                group_cols=["session", "aid"],
                wanted_cols=[]):
    """Offsets between a group's last timestamp and its session's start/end.

    For every `group_cols` combination the last `ts` value is taken and
    compared with the `session_start` / `session_end` columns obtained from
    `session_len(input_df, return_min_max=True)`.

    Parameters
    ----------
    input_df : pd.DataFrame
        Event log with the columns in `group_cols` plus "ts".
    group_cols : list of str
        Grouping keys; must contain "session", which is the merge key.
    wanted_cols : list
        Unused here; presumably kept so the signature matches the other
        aggregators in this notebook.

    Returns
    -------
    pd.DataFrame
        Indexed by `group_cols` with two columns:
        `aid_ts_session_end_offset` (session_end - last ts) and
        `aid_ts_session_start_offset` (last ts - session_start).
    """
    offset_cols = ["aid_ts_session_end_offset", "aid_ts_session_start_offset"]

    # Per-session bounds; expected to expose session_start / session_end.
    session_bounds = session_len(input_df,
                                 return_min_max=True).reset_index()

    # Last timestamp observed for each group.
    last_seen = (
        input_df[group_cols + ["ts"]]
        .groupby(group_cols)["ts"]
        .last()
        .rename("session_aid_last_ts")
        .reset_index()
    )

    merged = last_seen.merge(session_bounds, how="left", on="session")
    merged["aid_ts_session_end_offset"] = merged["session_end"] - merged["session_aid_last_ts"]
    merged["aid_ts_session_start_offset"] = merged["session_aid_last_ts"] - merged["session_start"]

    return merged[group_cols + offset_cols].set_index(group_cols)

def reverse_order_of_aid_for_session(input_df,
                               group_cols=["session", "aid"],
                               wanted_cols=[]
                              ):
    """Position of each group's most recent action, counted from the session end.

    Rows are numbered within their session in reverse: the final row of a
    session gets 1, the one before it 2, and so on. The smallest such number
    per `group_cols` combination is returned.

    Parameters
    ----------
    input_df : pd.DataFrame
        Event log with the columns in `group_cols` (must include "session"
        and "aid"), in the row order the numbering should follow.
    group_cols : list of str
        Keys to aggregate over.
    wanted_cols : list
        Unused here; presumably kept so the signature matches the other
        aggregators in this notebook.

    Returns
    -------
    pd.DataFrame
        Indexed by `group_cols` with a single column
        `session_action_last_order`.
    """
    events = input_df[group_cols].copy()

    aids_by_session = events.groupby("session")["aid"]
    forward_position = aids_by_session.cumcount()        # 0-based, from session start
    session_action_count = aids_by_session.transform("count")

    # count - forward position => 1 for the last row of the session.
    events["session_action_order"] = session_action_count - forward_position

    closest_to_end = events.groupby(group_cols)["session_action_order"].min()
    return closest_to_end.rename("session_action_last_order").to_frame()

In [None]:
# Assemble the (session, aid) interaction feature table. Each call below
# produces one frame computed from `user_item_int_df`; the frames are placed
# side by side (axis=1) and the result is down-cast by `reduce_memory`.
# NOTE(review): presumably every frame is indexed by (session, aid) so that the
# column-wise concat aligns rows -- confirm against the aggregator definitions.
# NOTE: nunique_aggregator is currently not applied to this table.
user_item_int_features = reduce_memory(
    pd.concat(
        [
            existence_amount_aggregator(
                user_item_int_df,
                group_cols=["session", "aid"],
                wanted_cols=["aid"],
                return_counts=True,
            ),
            reverse_order_of_aid_for_session(user_item_int_df),
            aid_session_ts_offsets(user_item_int_df),
            datetime_aggregator(user_item_int_df, group_cols=['session', 'aid']),
            type_distribution_aggregator(user_item_int_df, group_cols=['session', 'aid']),
            # The same aggregators again, driven through type_based_aggregator.
            type_based_aggregator(
                user_item_int_df,
                group_cols=['session', 'aid'],
                wanted_cols=["aid"],
                aggregators=[
                    datetime_aggregator,
                    reverse_order_of_aid_for_session,
                    aid_session_ts_offsets,
                    existence_amount_aggregator,
                ],
            ),
        ],
        axis=1,
    )
)

# Persist the table; the merge step later in the notebook reads this file back.
user_item_int_features.to_parquet(f'./all_features/{GENERATE_FOR}_user_item_int_features.pqt')

print("User-Item Interaction features are created!")

In [None]:
# The interaction feature table has been written to parquet; drop the in-memory
# copy and run a garbage-collection pass before the merge step.
del user_item_int_features
gc.collect()

In [None]:
# These frames are not referenced by the merge step below; drop them and run a
# collection pass, matching the other cleanup cells in this notebook.
del item_df, train_df
gc.collect()

## Merging Features w/ Candidates

In [None]:
# Join every feature table onto the candidate lists, one event type at a time,
# streaming each candidate file in batches and writing one parquet part per batch.
#
# The feature tables are opened lazily with scan_parquet; nothing is read here.
# NOTE(review): the lazy frames are later passed as the right-hand side of an
# eager DataFrame.join -- this relies on the installed polars version accepting
# a LazyFrame there; re-check when upgrading polars.
print("Reading item features...")
item_features = pl.scan_parquet(f'./all_features/{GENERATE_FOR}_item_features.pqt')
print("Reading user features...")
user_features = pl.scan_parquet(f'./all_features/{GENERATE_FOR}_user_features.pqt')
print("Reading user-item interaction features...")
user_item_int_features = pl.scan_parquet(f'./all_features/{GENERATE_FOR}_user_item_int_features.pqt')

val_df = pl.scan_parquet(val_path)

# These three covisitation tables do not depend on the event type (their paths
# contain no type_str and they are joined on 'aid' only), so scan them once.
all_clicks_covisit_feature_df = pl.scan_parquet(f'../raw_data/{GENERATE_FOR}_covisitation_features/all_clicks_covisitation_features.pqt')
all_cart_covisit_feature_df = pl.scan_parquet(f'../raw_data/{GENERATE_FOR}_covisitation_features/all_carts_orders_covisitation_features.pqt')
all_buy2buy_covisit_feature_df = pl.scan_parquet(f'../raw_data/{GENERATE_FOR}_covisitation_features/all_buy2buy_covisitation_features.pqt')

for type_str in tqdm(list(type_labels.keys())):

    # Per-type (session, aid) feature tables.
    covisit_feature_df = pl.scan_parquet(f'../raw_data/{GENERATE_FOR}_covisitation_features/covisitation_features_{type_str}_{CANDIDATE_COUNT}candidates.pqt')
    ppmi_all_feature_df = pl.scan_parquet(f'../raw_data/{GENERATE_FOR}_ppmi_all_history_features/ppmi_all_history_features_{type_str}_{CANDIDATE_COUNT}candidates.pqt')

    # Positive labels for this event type: one row per (session, ground-truth aid).
    # They depend only on type_str, so build them once here instead of once per batch.
    # NOTE(review): labels are always read from val_labels.parquet, also when
    # GENERATE_FOR == "kaggle" -- confirm that this is intended.
    label_pdf = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
    label_pdf = label_pdf.loc[ label_pdf['type'] == type_str ]
    label_aids = label_pdf.ground_truth.explode().rename('aid')
    label_pdf = label_pdf[['session']]
    label_pdf = label_pdf.merge(label_aids, left_index=True, right_index=True, how='left')
    label_pdf['label'] = 1
    label_df = pl.from_pandas(label_pdf)
    del label_pdf, label_aids

    pf = ParquetFile(f"./candidate_data/{GENERATE_FOR}_{CANDIDATE_COUNT}candidates_{type_str}.parquet")
    chunk = 10_000_000

    total_candidate_df = 0

    # Number of candidate rows already consumed from this file. The rank of a
    # row is derived from its global position, so batches do not have to be an
    # exact multiple of CANDIDATE_COUNT (batch_size is only an upper bound).
    # Assumes, as before, that every session contributes exactly
    # CANDIDATE_COUNT consecutive rows, ordered by rank.
    rows_seen = 0

    for batch_i, batch in tqdm(enumerate(pf.iter_batches(batch_size = chunk))):
        candidate_df = pl.from_pandas(batch.to_pandas())

        # 1..CANDIDATE_COUNT repeating, continued across batch boundaries.
        n_rows = len(candidate_df)
        rank_repeater = (rows_seen + np.arange(n_rows, dtype=np.int64)) % CANDIDATE_COUNT + 1
        rows_seen += n_rows
        candidate_df = candidate_df.with_column(pl.Series(name="candidate_rank", values=rank_repeater))
        del rank_repeater;gc.collect()

        # Left joins keep every candidate; -1 marks a feature with no match.
        # unique() removes duplicate rows (a join multiplies rows when the
        # right side repeats a key).
        candidate_df = candidate_df.join(covisit_feature_df, on=['session',
                                                                 'aid'], how='left').fill_null(-1)
        candidate_df = candidate_df.unique()

        candidate_df = candidate_df.join(ppmi_all_feature_df, on=['session',
                                                              'aid'], how='left').fill_null(-1)
        candidate_df = candidate_df.unique()

        candidate_df = candidate_df.join(all_clicks_covisit_feature_df, on=['aid'], how='left').fill_null(-1)
        candidate_df = candidate_df.join(all_cart_covisit_feature_df, on=['aid'], how='left').fill_null(-1)
        candidate_df = candidate_df.join(all_buy2buy_covisit_feature_df, on=['aid'], how='left').fill_null(-1)

        candidate_df = candidate_df.unique()

        # Item, user (session) and user-item interaction features.
        candidate_df = candidate_df.join(item_features, on='aid', how='left').fill_null(-1)
        candidate_df = candidate_df.join(user_features, on='session', how='left').fill_null(-1)
        candidate_df = candidate_df.join(user_item_int_features,
                                          on=['session', 'aid'],
                                          how='left').fill_null(-1)

        # Attach labels: candidates absent from the ground truth get label 0.
        candidate_df = candidate_df.join(label_df, on=['session','aid'], how='left').fill_null(0)
        candidate_df = candidate_df.unique()
        candidate_df.write_parquet(f'./candidated_features/{GENERATE_FOR}_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p{batch_i}.pqt')

        del candidate_df;gc.collect()

    del label_df
    del covisit_feature_df;gc.collect()