In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import polars as pl

We will use RAPIDS version 22.10.00a+392.g1558403753
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
def reduce_memory(df):
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:
            cmin = df[col].min()
            cmax = df[col].max()
            if str(col_type)[:3] == 'int':
                if cmin > np.iinfo(np.int32).min and cmax < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif cmin > np.iinfo(np.int64).min and cmax < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if cmin > np.finfo(np.float32).min and cmax < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    return df

# Feature Extraction

In [3]:
GENERATE_FOR = "kaggle" # "kaggle"

In [4]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

## Generating Features

In [5]:
def generate_datetime_features(input_df):
    input_df["datetime"] = pd.to_datetime(input_df.ts + (2 * 60 * 60), unit='s')
    input_df["hour"] = input_df["datetime"].dt.hour
    input_df["dayofweek"] = input_df["datetime"].dt.dayofweek
    input_df["is_weekend"] = (input_df["dayofweek"]>4).astype(int)
    return input_df

def datetime_aggregator(input_df,
                        group_cols=[]):
    return_df = input_df.groupby(group_cols).agg(
        {'hour':['mean', 'std', 'size'],
         'dayofweek':['mean', 'std'],
         'is_weekend':['mean']
        })
    return_df.columns = ['_'.join(group_cols) + '_' +  '_'.join(col) for col in return_df.columns]
    return return_df

def type_distribution_aggregator(input_df, 
                                 group_cols=[]):
    return_df = input_df.groupby(group_cols)['type'].value_counts(normalize=True)
    return_df = return_df.unstack('type')
    return_df.columns = ['_'.join(group_cols) + '_type' + str(col) + "_mean" for col in return_df.columns]
    return return_df

def type_based_aggregator(input_df,
                          group_cols=[],
                          aggregators=[]):
    type_dfs = []
    for type_id in range(3):
        for aggregator in aggregators:
            aggregator_df = aggregator(input_df[input_df.type==type_id].reset_index(drop=True),
                                       group_cols=group_cols)
            aggregator_df.columns = ["type" + str(type_id) + "_" + col for col in aggregator_df.columns]
        type_dfs.append(aggregator_df)
        
    return pd.concat(type_dfs, axis=1)

In [6]:
if GENERATE_FOR == "local":
    train_df = pd.read_parquet(f"./splitted_raw_data/train.parquet")
    val_df = pd.read_parquet(f"./splitted_raw_data/val.parquet")

elif GENERATE_FOR == "kaggle":
    train_df = pd.read_parquet(f"./splitted_raw_data/all_train.parquet")
    val_df = pd.read_parquet(f"./splitted_raw_data/test.parquet")

train_df = generate_datetime_features(train_df)
val_df = generate_datetime_features(val_df)

item_df = pd.concat([train_df,val_df], ignore_index=True)
user_df = val_df
user_item_int_df = val_df    

print("Data is read!")

############

item_features = item_df.groupby('aid').agg({'aid':'count', 'session':'nunique'})
item_features.columns = ['aid_' + "_".join(col) for col in item_features.columns]

item_features = pd.concat([
    item_features,
    datetime_aggregator(item_df, group_cols=["aid"]),
    type_distribution_aggregator(item_df, group_cols=["aid"]),
    type_based_aggregator(item_df, group_cols=["aid"],
                          aggregators=[datetime_aggregator])
], axis=1)

item_features = reduce_memory(item_features)

item_features.to_parquet(f'./all_features/{GENERATE_FOR}_item_features.pqt')

print("Item features are created!")

############

user_features = user_df.groupby('session').agg({'session':'count', 'aid':'nunique'})

user_features.columns = ['session_' + "_".join(col) for col in user_features.columns]

user_features = pd.concat([
    user_features,
    datetime_aggregator(user_df, group_cols=["session"]),
    type_distribution_aggregator(user_df, group_cols=["session"]),
    type_based_aggregator(user_df, group_cols=["session"],
                          aggregators=[datetime_aggregator])
], axis=1)

user_features = reduce_memory(user_features)

user_features.to_parquet(f'./all_features/{GENERATE_FOR}_user_features.pqt')

print("User features are created!")

############

user_item_int_features = user_item_int_df.groupby(['session', 'aid']).agg({'aid':'count'})

user_item_int_features.columns = ['session_aid_' + "_".join(col) for col in user_item_int_features.columns]

user_item_int_features = pd.concat([
    user_item_int_features,
    datetime_aggregator(user_item_int_df, group_cols=['session', 'aid']),
    type_distribution_aggregator(user_item_int_df, group_cols=['session', 'aid']),
    type_based_aggregator(user_item_int_df, group_cols=['session', 'aid'],
                          aggregators=[datetime_aggregator])
], axis=1)

user_item_int_features = reduce_memory(user_item_int_features)

user_item_int_features.to_parquet(f'./all_features/{GENERATE_FOR}_user_item_int_features.pqt')

print("User-Item Interaction features are created!")

Data is read!
Item features are created!
User features are created!
User-Item Interaction features are created!


## Merging Features w/ Candidates

In [5]:
for type_str in tqdm(list(type_labels.keys())):
    
    candidate_df = pl.read_parquet(f"./candidate_data/{GENERATE_FOR}_candidates_{type_str}.parquet").drop("__index_level_0__")
    rank_repeater = np.hstack([list(range(1,51)) for i in range(int(len(candidate_df)/50))])
    candidate_df = candidate_df.with_column(pl.Series(name="candidate_rank", values=rank_repeater))    
    
    item_features = pl.read_parquet(f'./all_features/{GENERATE_FOR}_item_features.pqt')
    candidate_df = candidate_df.join(item_features, on='aid', how='left')
    
    user_features = pl.read_parquet(f'./all_features/{GENERATE_FOR}_user_features.pqt')
    candidate_df = candidate_df.join(user_features, on='session', how='left')

    user_item_int_features = pl.read_parquet(f'./all_features/{GENERATE_FOR}_user_item_int_features.pqt')
    candidate_df = candidate_df.join(user_item_int_features,
                                      on=['session', 'aid'],
                                      how='left').fill_null(-1)

    tar = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
    tar = tar.loc[ tar['type'] == type_str ]
    aids = tar.ground_truth.explode().rename('aid')
    tar = tar[['session']]
    tar = tar.merge(aids, left_index=True, right_index=True, how='left')
    tar['label'] = 1
    
    tar = pl.from_pandas(tar)
    
    candidate_df = candidate_df.join(tar, on=['session','aid'], how='left').fill_null(0)
    
    candidate_df.write_parquet(f'./candidated_features/{GENERATE_FOR}_{type_str}_all_data.pqt')
    
    del candidate_df, item_features, user_features

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# for type_str in tqdm(list(type_labels.keys())):
    
#     candidate_df = pd.read_parquet(f"./candidate_data/{GENERATE_FOR}_candidates_{type_str}.parquet")

    
#     item_features = pd.read_parquet(f'./all_features/{GENERATE_FOR}_item_features.pqt')
#     candidate_df = candidate_df.merge(item_features, left_on='aid', right_index=True, how='left').fillna(-1)
    
#     user_features = pd.read_parquet(f'./all_features/{GENERATE_FOR}_user_features.pqt')
#     candidate_df = candidate_df.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)

#     user_item_int_features = pd.read_parquet(f'./all_features/{GENERATE_FOR}_user_item_int_features.pqt')
#     candidate_df = candidate_df.merge(user_item_int_features,
#                                       left_on=['session',
#                                                'aid'],
#                                       right_index=True,
#                                       how='left').fillna(-1)
    
#     tar = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
#     tar = tar.loc[ tar['type'] == type_str ]
#     aids = tar.ground_truth.explode().astype('int32').rename('aid')
#     tar = tar[['session']].astype('int32')
#     tar = tar.merge(aids, left_index=True, right_index=True, how='left')
#     tar['label'] = 1
    
#     candidate_df = candidate_df.merge(tar,on=['session','aid'],how='left').fillna(0)
#     candidate_df.to_parquet(f'./candidated_features/{GENERATE_FOR}_{type_str}_all_data.pqt')
    
#     del candidate_df, item_features, user_features

In [5]:
dff = pd.read_parquet(f'./candidated_features/{GENERATE_FOR}_clicks_all_data.pqt')

In [8]:
dff.tail()

Unnamed: 0,session,aid,candidate_rank,aid_a_i_d,aid_s_e_s_s_i_o_n,aid_hour_mean,aid_hour_std,aid_hour_size,aid_dayofweek_mean,aid_dayofweek_std,aid_is_weekend_mean,aid_type0_mean,aid_type1_mean,aid_type2_mean,type0_aid_hour_mean,type0_aid_hour_std,type0_aid_hour_size,type0_aid_dayofweek_mean,type0_aid_dayofweek_std,type0_aid_is_weekend_mean,type1_aid_hour_mean,type1_aid_hour_std,type1_aid_hour_size,type1_aid_dayofweek_mean,type1_aid_dayofweek_std,type1_aid_is_weekend_mean,type2_aid_hour_mean,type2_aid_hour_std,type2_aid_hour_size,type2_aid_dayofweek_mean,type2_aid_dayofweek_std,type2_aid_is_weekend_mean,session_s_e_s_s_i_o_n,session_a_i_d,session_hour_mean,session_hour_std,session_hour_size,session_dayofweek_mean,session_dayofweek_std,session_is_weekend_mean,session_type0_mean,session_type1_mean,session_type2_mean,type0_session_hour_mean,type0_session_hour_std,type0_session_hour_size,type0_session_dayofweek_mean,type0_session_dayofweek_std,type0_session_is_weekend_mean,type1_session_hour_mean,type1_session_hour_std,type1_session_hour_size,type1_session_dayofweek_mean,type1_session_dayofweek_std,type1_session_is_weekend_mean,type2_session_hour_mean,type2_session_hour_std,type2_session_hour_size,type2_session_dayofweek_mean,type2_session_dayofweek_std,type2_session_is_weekend_mean,session_aid_a_i_d,session_aid_hour_mean,session_aid_hour_std,session_aid_hour_size,session_aid_dayofweek_mean,session_aid_dayofweek_std,session_aid_is_weekend_mean,session_aid_type0_mean,session_aid_type1_mean,session_aid_type2_mean,type0_session_aid_hour_mean,type0_session_aid_hour_std,type0_session_aid_hour_size,type0_session_aid_dayofweek_mean,type0_session_aid_dayofweek_std,type0_session_aid_is_weekend_mean,type1_session_aid_hour_mean,type1_session_aid_hour_std,type1_session_aid_hour_size,type1_session_aid_dayofweek_mean,type1_session_aid_dayofweek_std,type1_session_aid_is_weekend_mean,type2_session_aid_hour_mean,type2_session_aid_hour_std,type2_session_aid_hour_size,type2_session_aid_dayofweek_mean,type2_session_aid_dayofweek_std,type2_session_aid_is_weekend_mean,label
90062545,12899778,1670370,46,207,131,14.082126,5.444158,207,2.927536,2.192123,0.309179,0.927536,0.067633,0.004831,14.135417,5.519351,192.0,2.963542,2.176438,0.3125,13.714286,4.462296,14.0,2.642857,2.405351,0.285714,9.0,-1.0,1.0,0.0,-1.0,0.0,1,1,23.0,-1.0,1,6.0,-1.0,1.0,1.0,-1.0,-1.0,23.0,-1.0,1.0,6.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
90062546,12899778,148668,47,620,405,14.583871,5.417711,620,2.890323,1.96436,0.259677,0.93871,0.053226,0.008065,14.532646,5.400146,582.0,2.917526,1.971414,0.266323,15.060606,5.841337,33.0,2.636364,1.867789,0.181818,17.4,4.615192,5.0,1.4,1.140175,0.0,1,1,23.0,-1.0,1,6.0,-1.0,1.0,1.0,-1.0,-1.0,23.0,-1.0,1.0,6.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
90062547,12899778,221635,48,283,201,14.636043,5.614543,283,3.568905,2.008322,0.409894,0.908127,0.091873,-1.0,14.634241,5.512634,257.0,3.571985,1.985217,0.404669,14.653846,6.662986,26.0,3.538461,2.266818,0.461538,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,1,23.0,-1.0,1,6.0,-1.0,1.0,1.0,-1.0,-1.0,23.0,-1.0,1.0,6.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
90062548,12899778,422409,49,56,39,14.714286,6.848263,56,3.410714,2.060686,0.428571,0.946429,0.035714,0.017857,14.81132,6.702631,53.0,3.396226,2.022371,0.415094,11.5,14.849242,2.0,5.5,0.707107,1.0,16.0,-1.0,1.0,0.0,-1.0,0.0,1,1,23.0,-1.0,1,6.0,-1.0,1.0,1.0,-1.0,-1.0,23.0,-1.0,1.0,6.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
90062549,12899778,715922,50,699,425,14.143062,6.113503,699,2.778255,2.028673,0.247496,0.865522,0.104435,0.030043,14.11405,6.024664,605.0,2.801653,2.025581,0.247934,14.493151,6.729131,73.0,2.575342,2.074433,0.232877,13.761905,6.67761,21.0,2.809524,2.015417,0.285714,1,1,23.0,-1.0,1,6.0,-1.0,1.0,1.0,-1.0,-1.0,23.0,-1.0,1.0,6.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0
