In [1]:
import os, gc

import pandas as pd
import numpy as np
import polars as pl

import xgboost as xgb
xgb.set_config(verbosity=2)
from sklearn.model_selection import GroupKFold

In [2]:
class config:
    data_path = "../data/"
    validation_path = "../data/local_validation/"
    model_path = "../models/"

In [3]:
candidate_df = pd.read_parquet(config.validation_path + "candidate_df_with_user_item_features_and_target.parquet")
display(candidate_df.head())

Unnamed: 0,session,aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,...,user_n_orders_7d,user_n_unique_items_7d,user_n_unique_items_24h,user_session_length,user_avg_click_hour,user_avg_cart_hour,user_avg_order_hour,user_avg_duration_between_events,user_avg_duration_between_clicks,click
0,11098528,42241,2341.0,601.0,253.0,14.0,6.0,0.0,62.0,20.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
1,11098528,1732105,8049.0,1877.0,526.0,34.0,1.0,0.0,255.0,73.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
2,11098528,166160,1902.0,119.0,42.0,10.0,0.0,0.0,90.0,6.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
3,11098528,884502,28184.0,2408.0,930.0,75.0,5.0,1.0,996.0,105.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
4,11098528,1182614,32360.0,3303.0,1426.0,219.0,26.0,2.0,1727.0,187.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0


In [4]:
positives = candidate_df.loc[candidate_df['click']==1]
negatives = candidate_df.loc[candidate_df['click']==0].sample(frac=0.5)
candidate_df = pd.concat([positives,negatives],axis=0,ignore_index=True)

In [5]:
del positives, negatives

In [6]:
FEATURES = candidate_df.columns.drop(["session", "aid"]).tolist()

print(FEATURES)

['n_clicks', 'n_carts', 'n_orders', 'item_n_clicks_24h', 'item_n_carts_24h', 'item_n_orders_24h', 'item_n_clicks_7d', 'item_n_carts_7d', 'item_n_orders_7d', 'item_avg_time_between_clicks', 'item_avg_time_between_carts', 'item_avg_time_between_orders', 'item_avg_click_hour', 'item_avg_cart_hour', 'item_avg_order_hour', 'item_avg_click_day_of_month', 'item_avg_cart_day_of_month', 'item_avg_order_day_of_month', 'user_n_clicks', 'user_n_carts', 'user_n_orders', 'user_n_clicks_24h', 'user_n_carts_24h', 'user_n_orders_24h', 'user_n_clicks_7d', 'user_n_carts_7d', 'user_n_orders_7d', 'user_n_unique_items_7d', 'user_n_unique_items_24h', 'user_session_length', 'user_avg_click_hour', 'user_avg_cart_hour', 'user_avg_order_hour', 'user_avg_duration_between_events', 'user_avg_duration_between_clicks', 'click']


In [7]:
candidate_df = candidate_df.sort_values('session')
display(candidate_df.head())

Unnamed: 0,session,aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,...,user_n_orders_7d,user_n_unique_items_7d,user_n_unique_items_24h,user_session_length,user_avg_click_hour,user_avg_cart_hour,user_avg_order_hour,user_avg_duration_between_events,user_avg_duration_between_clicks,click
18040513,11098528,205357,7805.0,723.0,0.0,31.0,2.0,0.0,164.0,16.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
10567668,11098528,45494,6634.0,720.0,228.0,17.0,4.0,2.0,191.0,26.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
11440511,11098528,600258,3066.0,138.0,0.0,14.0,0.0,0.0,90.0,8.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
44003133,11098528,1197172,5498.0,673.0,255.0,37.0,9.0,1.0,244.0,39.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
35083861,11098528,307904,7774.0,568.0,239.0,44.0,6.0,0.0,284.0,35.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0


In [8]:
gc.collect()

18

In [10]:
CANDIDATES = 200

skf = GroupKFold(n_splits=5)
for fold, (train_idx, valid_idx) in enumerate(
    skf.split(candidate_df, candidate_df["click"], groups=candidate_df["session"])
):

    X_train = candidate_df.loc[train_idx, FEATURES]
    y_train = candidate_df.loc[train_idx, "click"]
    X_valid = candidate_df.loc[valid_idx, FEATURES]
    y_valid = candidate_df.loc[valid_idx, "click"]

    groups_train = candidate_df.loc[train_idx].groupby('session').aid.agg('count').values
    groups_valid = candidate_df.loc[valid_idx].groupby('session').aid.agg('count').values

    dtrain = xgb.DMatrix(X_train, y_train, group=groups_train)
    dvalid = xgb.DMatrix(X_valid, y_valid, group=groups_valid)

    xgb_parms = {"objective": "rank:pairwise", "tree_method": "gpu_hist"}
    model = xgb.train(
        xgb_parms,
        dtrain=dtrain,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        num_boost_round=1000,
        verbose_eval=100,
    )
    model.save_model(config.model_path + f"XGB_fold{fold}_click.xgb")
    del model, dtrain, dvalid, X_train, X_valid, y_train, y_valid, groups_train, groups_valid
    gc.collect()