In [1]:
import os, gc

import pandas as pd
import numpy as np
import polars as pl

import xgboost as xgb
xgb.set_config(verbosity=2)
from sklearn.model_selection import GroupKFold

In [2]:
class config:
    # Directory the trained boosters are written to.
    model_path = "../models/"
    # Root directory of the project data.
    data_path = "../data/"
    # Directory holding the local-validation parquet files read by this notebook.
    validation_path = "../data/local_validation/"

In [3]:
# Read the candidate table (one row per session/aid pair, with item and user
# feature columns and the `click` target) from the local-validation directory.
candidate_df = pd.read_parquet(
    path=config.validation_path
    + "candidate_df_with_user_item_features_and_target.parquet"
)
# Preview the first rows to sanity-check the schema.
display(candidate_df.head(n=5))

Unnamed: 0,session,aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,...,user_n_orders_7d,user_n_unique_items_7d,user_n_unique_items_24h,user_session_length,user_avg_click_hour,user_avg_cart_hour,user_avg_order_hour,user_avg_duration_between_events,user_avg_duration_between_clicks,click
0,11098528,42241,2341.0,601.0,253.0,14.0,6.0,0.0,62.0,20.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
1,11098528,1732105,8049.0,1877.0,526.0,34.0,1.0,0.0,255.0,73.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
2,11098528,166160,1902.0,119.0,42.0,10.0,0.0,0.0,90.0,6.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
3,11098528,884502,28184.0,2408.0,930.0,75.0,5.0,1.0,996.0,105.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
4,11098528,1182614,32360.0,3303.0,1426.0,219.0,26.0,2.0,1727.0,187.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0


In [4]:
# Down-sample the negative class: keep every row with click == 1 and a random
# half of the rows with click == 0.  The draw is seeded so that
# Restart Kernel -> Run All reproduces the same training set.
# NOTE: this cell overwrites candidate_df, so running it a second time halves
# the negatives again; re-load the parquet before re-running it.
positives = candidate_df.loc[candidate_df['click'] == 1]
negatives = candidate_df.loc[candidate_df['click'] == 0].sample(frac=0.5, random_state=42)
# ignore_index=True gives the combined frame a fresh 0..N-1 index.
candidate_df = pd.concat([positives, negatives], axis=0, ignore_index=True)

In [5]:
# Drop the intermediate frames; their rows now live in candidate_df.
del positives
del negatives

In [6]:
# Model inputs are every column except the two identifiers and the target.
FEATURES = candidate_df.columns.drop(labels=["session", "aid", "click"]).to_list()

print(FEATURES)

['n_clicks', 'n_carts', 'n_orders', 'item_n_clicks_24h', 'item_n_carts_24h', 'item_n_orders_24h', 'item_n_clicks_7d', 'item_n_carts_7d', 'item_n_orders_7d', 'item_avg_time_between_clicks', 'item_avg_time_between_carts', 'item_avg_time_between_orders', 'item_avg_click_hour', 'item_avg_cart_hour', 'item_avg_order_hour', 'item_avg_click_day_of_month', 'item_avg_cart_day_of_month', 'item_avg_order_day_of_month', 'user_n_clicks', 'user_n_carts', 'user_n_orders', 'user_n_clicks_24h', 'user_n_carts_24h', 'user_n_orders_24h', 'user_n_clicks_7d', 'user_n_carts_7d', 'user_n_orders_7d', 'user_n_unique_items_7d', 'user_n_unique_items_24h', 'user_session_length', 'user_avg_click_hour', 'user_avg_cart_hour', 'user_avg_order_hour', 'user_avg_duration_between_events', 'user_avg_duration_between_clicks']


In [7]:
# Order the rows by session so that each session's candidates are contiguous.
# The index is not reset here, so after this cell the index labels no longer
# match the row positions.
candidate_df = candidate_df.sort_values(by='session', ascending=True)
display(candidate_df.head(n=5))

Unnamed: 0,session,aid,n_clicks,n_carts,n_orders,item_n_clicks_24h,item_n_carts_24h,item_n_orders_24h,item_n_clicks_7d,item_n_carts_7d,...,user_n_orders_7d,user_n_unique_items_7d,user_n_unique_items_24h,user_session_length,user_avg_click_hour,user_avg_cart_hour,user_avg_order_hour,user_avg_duration_between_events,user_avg_duration_between_clicks,click
25300263,11098528,45494,6634.0,720.0,228.0,17.0,4.0,2.0,191.0,26.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
40381137,11098528,1732105,8049.0,1877.0,526.0,34.0,1.0,0.0,255.0,73.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
35854945,11098528,166160,1902.0,119.0,42.0,10.0,0.0,0.0,90.0,6.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
7849873,11098528,1182614,32360.0,3303.0,1426.0,219.0,26.0,2.0,1727.0,187.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0
24905131,11098528,490677,6201.0,417.0,173.0,22.0,0.0,0.0,177.0,14.0,...,0,1,1,1,22.0,-1.0,-1.0,-1,-1,0.0


In [1]:
from sklearn.preprocessing import StandardScaler

# Fit a pairwise-ranking XGBoost model for the `click` target on
# session-grouped folds.  The session id is used both as the CV group (so a
# session is never split between train and validation) and as the ranking
# query id.  Only the first fold is trained: see the `break` at the bottom.

# Create the output directory up front so a missing folder fails (or is fixed)
# before training rather than at save time.
os.makedirs(config.model_path, exist_ok=True)

# NOTE(review): "gpu_hist" requires a GPU build of XGBoost and is reported as
# deprecated by newer XGBoost releases -- confirm against the installed version.
xgb_parms = {"objective": "rank:pairwise", "tree_method": "gpu_hist"}

skf = GroupKFold(n_splits=5)  # despite the name, this is a *group* k-fold splitter
for fold, (train_idx, valid_idx) in enumerate(
    skf.split(candidate_df, candidate_df["click"], groups=candidate_df["session"])
):
    # split() yields positional row numbers.  candidate_df was sorted by
    # session without resetting its index, so its labels are a permutation of
    # the positions; label-based .loc would select different rows than the
    # splitter chose and leak sessions across the folds.  Use .iloc.
    train_df = candidate_df.iloc[train_idx]
    # The ranking objective needs rows of the same query id to be contiguous.
    train_df = train_df.sort_values('session', ascending=True)
    # Pass the ids as a numpy array instead of materialising a Python list.
    qid_train = train_df['session'].to_numpy()

    valid_df = candidate_df.iloc[valid_idx]
    valid_df = valid_df.sort_values('session', ascending=True)
    qid_valid = valid_df['session'].to_numpy()

    X_train = train_df[FEATURES]
    y_train = train_df["click"]
    X_valid = valid_df[FEATURES]
    y_valid = valid_df["click"]

    dtrain = xgb.DMatrix(X_train, y_train, qid=qid_train)
    dvalid = xgb.DMatrix(X_valid, y_valid, qid=qid_valid)

    # NOTE(review): no early stopping is configured, so all 5000 rounds always
    # run and the booster saved below is the one from the final round.
    model = xgb.train(
        xgb_parms,
        dtrain=dtrain,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        num_boost_round=5000,
        verbose_eval=100,
    )
    model.save_model(config.model_path + f"XGB_fold{fold}_click.xgb")

    # Release the per-fold objects before any further work.
    del model, dtrain, dvalid, X_train, X_valid, y_train, y_valid, train_df, valid_df
    gc.collect()
    break  # train fold 0 only