In [85]:
import gc

import cudf
import joblib
import lightgbm as lgb
from logzero import logger
from pathlib import Path
import polars as pl
import numpy as np
from sklearn.model_selection import GroupKFold


from src.io import groupby_list, read_data_pl
from src.eval import get_recall

In [86]:
def generate_features(eval_flag=True, root_path=Path("./data")):
    if eval_flag:
        root_path = root_path / "validation"
    
    df_test = pl.read_parquet(str(root_path / "test.parquet"))
    df_train = pl.concat([df_test, pl.read_parquet(str(root_path / "train.parquet"))])
    df_test = df_test.sort(by=["session", "ts"], reverse=True)
    df_train = df_train.sort(by=["session", "ts"], reverse=True)
    logger.info("Read and sorted data.")
    
    df_test["time_since_interacted"] = (df_test.groupby("session")["ts"].transform("max") - df_test["ts"]).astype(np.int32)
    df_test["time_since_carted"] = df_test.loc[df_test.type == 1].groupby(["session", "aid"])["time_since_interacted"].transform("min").astype(np.int32)
    df_test["time_since_first_interaction"] = df_test.groupby("session")["time_since_interacted"].transform("max").astype(np.int32)
    df_test["num_aids_interacted"] = df_test.groupby("session")["aid"].transform("nunique").astype(np.int16)
    df_test["time_spent"] = df_test.groupby("session")["ts"].diff().shift(-1).fillna(-1).astype(np.int32)
    df_test["median_time_spent_session"] = df_test.groupby(["session"])["time_spent"].transform("median").astype(np.int32)
    df_test["type_2"] = (df_test["type"] + 1)**3
    df_test["weighted_interaction_count"] = df_test.groupby(["session", "aid"])["type_2"].transform("sum").astype(np.int16)
    df_test["weighted_time_since_interacted"] = df_test["time_since_interacted"] /df_test["type_2"]
    df_test["weighted_time_since_interacted"] = df_test.groupby(["session", "aid"])["weighted_time_since_interacted"].transform("mean").astype(np.float32)
    df_test["time_since_ordered"] = df_test.loc[df_test.type == 2].groupby(["session", "aid"])["time_since_interacted"].transform("min").astype(np.int32)
    df_test["order_frequency"] = df_test.loc[df_test.type == 2].groupby(["session", "aid"])["time_since_interacted"].diff().mean().astype(np.float32)
    df_test["num_orders"] = df_test[df_test.type == 2].groupby("session").cumcount()
    df_test["num_orders"] = df_test.groupby("session")["num_orders"].transform("max") - df_test["num_orders"].fillna(method="ffill")
    df_test["num_carts"] = df_test[df_test.type == 1].groupby("session").cumcount()
    df_test["num_carts"] = df_test.groupby("session")["num_carts"].transform("max") - df_test["num_carts"].fillna(method="ffill")

    del df_test["type_2"]
    df_test = df_test.drop_duplicates(subset=["session", "aid"], keep="last")
    logger.info("Session, aid features created.")

    num_days = (df_train.ts.max() - df_train.ts.min()) // (24 * 60 * 60)
    df_test["aid_counts"] = df_test.aid.map(df_train.groupby("aid")["session"].size() / num_days).astype(np.float32)
    df_test["aid_touch_rate"] = df_test.aid.map(df_train.groupby("aid")["session"].nunique() / num_days).astype(np.float32)
    df_test["aid_cart_rate"] = df_test.aid.map(df_train.loc[df_train.type == 1].groupby("aid")["session"].nunique() / num_days).astype(np.float32) 
    df_test["aid_order_rate"] = df_test.aid.map(df_train.loc[df_train.type == 2].groupby("aid")["session"].nunique() / num_days).astype(np.float32)
    df_test["aid_reorder_mean"] = df_test.aid.map(df_train.loc[df_train.type == 2].groupby(["session", "aid"])["ts"].size().reset_index(drop=False).groupby("aid")[0].mean()).astype(np.float32)
    df_test["aid_reclick_mean"] = df_test.aid.map(df_train.groupby(["session", "aid"])["ts"].size().reset_index(drop=False).groupby("aid")[0].mean()).astype(np.float32)
    df_test["aid_recart_mean"] = df_test.aid.map(df_train.loc[df_train.type == 1].groupby(["session", "aid"])["ts"].size().reset_index(drop=False).groupby("aid")[0].mean()).astype(np.float32)
    df_test["aid_latest_sold_day"] = (df_test.ts.max() - df_test.aid.map(df_train.groupby("aid")["ts"].max())) // (24*60*60)
    logger.info("Aid features created.")
    del num_days
    gc.collect()
    
    df_train["next_aid"] = df_train.groupby("session")["aid"].shift(-1).fillna(-1).astype(np.int32)
    next_aid_cnt = df_train.groupby(["aid", "next_aid"]).size().astype(np.int32)
    next_aid_cnt.name = "next_aid_cnt"
    next_aid_cnt = next_aid_cnt.reset_index(drop=False)
    next_aid_cnt["aid_cnt"] = next_aid_cnt.groupby("aid")["next_aid"].transform("size").astype(np.int32)
    next_aid_cnt["next_aid_prob"] = (next_aid_cnt["next_aid_cnt"] / next_aid_cnt["aid_cnt"]).astype(np.float32)
    del next_aid_cnt["aid_cnt"]
    del df_train["next_aid"]
    gc.collect()
    df_test["next_aid"] = df_test.groupby("session")["aid"].shift(-1).fillna(-1).astype(np.int32)
    df_test = cudf.merge(df_test, next_aid_cnt, on=["aid", "next_aid"], how="left")
    df_test["next_aid_prob"] = df_test["next_aid_prob"].fillna(0).astype(np.float32)
    del next_aid_cnt
    del df_test["next_aid"]
    gc.collect()
    
    df_train["prev_aid"] = df_train.groupby("session")["aid"].shift(1).fillna(-1).astype(np.int32)
    prev_aid_cnt = df_train.groupby(["aid", "prev_aid"]).size().astype(np.int32)
    prev_aid_cnt.name = "prev_aid_cnt"
    prev_aid_cnt = prev_aid_cnt.reset_index(drop=False)
    prev_aid_cnt["aid_cnt"] = prev_aid_cnt.groupby("aid")["prev_aid"].transform("size")
    prev_aid_cnt["prev_aid_prob"] = (prev_aid_cnt["prev_aid_cnt"] / prev_aid_cnt["aid_cnt"]).astype(np.float32)
    del prev_aid_cnt["aid_cnt"]
    del df_train["prev_aid"]
    df_test["prev_aid"] = df_test.groupby("session")["aid"].shift(1).fillna(-1).astype(np.int32)
    df_test = cudf.merge(df_test, prev_aid_cnt, on=["aid", "prev_aid"], how="left")
    df_test["prev_aid_prob"] = df_test["prev_aid_prob"].astype(np.float32)
    logger.info("Aid session stats created.")
    return df_test


In [87]:
ROOT_PATH = Path("./data")
TEST_LABELS = "validation/test_labels.parquet"

################ Validation ##############

In [88]:
EVAL = True

In [89]:
def get_aid_features(df_test, df_train):
    t1 = df_train.groupby("aid").agg([
        pl.count("session").alias("aid_counts"),
        pl.n_unique("session").alias("aid_sess_counts"),
        pl.col("session").filter(pl.col("type") == 0).unique().count().alias("aid_click_unq_sess"),
        pl.col("session").filter(pl.col("type") == 1).unique().count().alias("aid_cart_unq_sess"),
        pl.col("session").filter(pl.col("type") == 2).unique().count().alias("aid_order_unq_sess"),
        pl.max("ts").alias("latest_sold_ts")
    ])
    t2 = df_train.join(
        df_train.groupby(["session", "aid"]).agg([
            (pl.col("type") == 0).sum().alias("aid_sess_clicks_mean"),
            (pl.col("type") == 1).sum().alias("aid_sess_carts_mean"),
            (pl.col("type") == 2).sum().alias("aid_sess_orders_mean"),
            (pl.col("ts").max() - pl.col("ts").min()).alias("max_ts_diff_mean")
        ]), on=["session", "aid"], how="left"
        ).groupby(["aid"]).agg([
        pl.mean("aid_sess_clicks_mean").cast(pl.Float32),
        pl.mean("aid_sess_carts_mean").cast(pl.Float32),
        pl.mean("aid_sess_orders_mean").cast(pl.Float32),
        pl.mean("max_ts_diff_mean").cast(pl.Float32)
    ])
    t3 = df_test.groupby("aid").agg([
        pl.n_unique("session").alias("aid_sess_counts_last7"),
        pl.col("session").filter(pl.col("type") == 0).unique().count().alias("aid_click_unq_sess_last7"),
        pl.col("session").filter(pl.col("type") == 1).unique().count().alias("aid_cart_unq_sess_last7"),
        pl.col("session").filter(pl.col("type") == 2).unique().count().alias("aid_order_unq_sess_last7"),
    ])
    t1 = t1.join(t2, on=["aid"], how="outer")
    t1 = t1.join(t3, on=["aid"], how="outer")
    return df_test.join(t1, on=["aid"], how="left")


def get_session_features(df_test):
    t1 = df_test.groupby("session").agg([
        pl.count("aid").alias("session_aid_counts"),
        pl.n_unique("aid").alias("session_unq_aid_counts"),
        pl.max("ts").alias("max_ts"),
        pl.min("ts").alias("min_ts"),
        pl.col("aid").filter(pl.col("type") == 1).unique().count().alias("click_counts"),
        pl.col("aid").filter(pl.col("type") == 1).unique().count().alias("cart_counts"),
        pl.col("aid").filter(pl.col("type") == 1).unique().count().alias("order_counts"),
    ])
    t2 = df_test.join(
        df_test.groupby(["session", "aid"]).agg([
            (pl.col("type") == 0).sum().alias("sess_reclicks_mean"),
            (pl.col("type") == 1).sum().alias("sess_recarts_mean"),
            (pl.col("type") == 2).sum().alias("sess_reorders_mean"),
            (pl.col("ts").max() - pl.col("ts").min()).alias("sess_ts_diff_mean")
        ]), on=["session", "aid"], how="left"
        ).groupby(["session"]).agg([
        pl.mean("sess_reclicks_mean").cast(pl.Float32),
        pl.mean("sess_recarts_mean").cast(pl.Float32),
        pl.mean("sess_reorders_mean").cast(pl.Float32),
        pl.mean("sess_ts_diff_mean").cast(pl.Float32)
    ])
    t1 = t1.join(t2, on=["session"], how="outer")
    return df_test.join(t1, on=["session"], how="left")



def get_session_aid_features(df_test):
    t1 = df_test.with_columns(
        (pl.col("max_ts") - pl.col("ts")).alias("time_since_last_interaction"),
    )
    return t1

In [90]:
root_path = ROOT_PATH
if EVAL:
    root_path = root_path / "validation"

df_test = pl.read_parquet(str(root_path / "test.parquet"))
df_train = pl.concat([df_test, pl.read_parquet(str(root_path / "train.parquet"))])
df_test = df_test.sort(by=["session", "ts"], reverse=True)
df_train = df_train.sort(by=["session", "ts"], reverse=True)
logger.info("Read and sorted data.")


df_test = get_aid_features(df_test, df_train)
logger.info("Computed aid features")
df_test = get_session_features(df_test)
logger.info("Computed session features")

df_test = get_session_aid_features(df_test)
logger.info("Computed aid session features")


[I 230126 19:53:41 3454410488:9] Read and sorted data.
[I 230126 19:55:02 3454410488:13] Computed aid features
[I 230126 19:55:07 3454410488:15] Computed session features
[I 230126 19:55:07 3454410488:18] Computed aid session features


In [91]:
def read_targets_pl(root_path="./data", label_file="validation/test_labels.parquet"):
    df = pl.read_parquet(str(Path(root_path) / label_file))
    mapps = pl.DataFrame({"type": ["clicks", "carts", "orders"], "type_int": [0, 1, 2]})
    df = df.explode(
        pl.col('ground_truth')
        ).select(
            [
                pl.col('ground_truth').cast(pl.Int32).alias('aid'),
                pl.col('session').cast(pl.Int32),
                pl.col('type')
            ]
            ).join(mapps, on=["type"], how="left").select(
                [
                    pl.col(["session", "aid"]),
                    pl.col("type_int").cast(pl.Int8).alias("type")
                ]
            )
    return df

In [92]:
df_gt = read_targets_pl()
df_gt = df_gt.select([pl.col('*').exclude("type"), pl.col("type").alias("target")])
df_gt

session,aid,target
i32,i32,i8
11098528,1679529,0
11098528,1199737,1
11098528,990658,2
11098528,950341,2
11098528,1462506,2
11098528,1561739,2
11098528,907564,2
11098528,369774,2
11098528,440367,2
11098528,92401,2


In [93]:
df_test = df_test.join(df_gt, on=["session", "aid"], how="left")
df_test.shape

(7975140, 31)

In [94]:
df_test = df_test.sort(by=["session", "ts"]).unique(subset=["session", "aid", "target"], maintain_order=True, keep="last")

In [95]:
df_test.shape

(5675012, 31)

In [96]:
import numba
import pandas as pd
@numba.njit
def recall_at_k(labels, preds, weights, groups):
    ii = 0
    scores = []
    for p, grp in enumerate(groups):
        j = ii + grp
        l_ = labels[ii:j]
        gt_sum = sum(l_ > 0)
        if gt_sum > 0:        
            p_ = preds[ii:j]
            idx = np.argsort(p_)[::-1][:20]
            score = sum(l_[idx] > 0)/min(gt_sum, 20)
            scores.append(score)
        ii = j
    scores = np.array(scores)
    return 'recall', np.mean(scores), True


def train_cv_model(df_feats, feat_cols, label_path="data/validation", save_path="data/classif_models", version="v0"):
    cvlist = list(GroupKFold(5).split(df_feats, groups=df_feats.session.values))
    task_scores = {}
    Path(save_path).mkdir(exist_ok=True, parents=True)
    task_idx = {"clicks": 0 , "carts": 1, "orders": 2}
    for task in ["clicks", "carts", "orders"]:
        logger.info("Mapped target to data.")
        df_feats[f"{task}_preds"] = np.nan
        for i, (tr_idx, vl_idx) in enumerate(cvlist):
            tr, vl = df_feats.iloc[tr_idx], df_feats.iloc[vl_idx]
            target_sums = tr.groupby("session")["target"].sum()
            target_sums_valid = target_sums[target_sums > 0].index.unique()
            tr = tr.loc[tr.session.isin(target_sums_valid)]
            logger.info(f"Train shape: {tr.shape}, {tr.target.mean()}")
            tr_feats = tr[feat_cols]
            vl_feats = vl[feat_cols]

            model =  lgb.LGBMClassifier(n_estimators=3000, learning_rate=0.05, num_leaves=15,
                                        colsample_bytree=0.5, subsample=0.8, max_bin=255,
                                        random_state=786220225, metric="None")
            model.fit(tr_feats, (tr["target"].values == task_idx[task]),
                        # group=tr.groupby("session")["target"].size().values.get(),
                      eval_set=[(vl_feats, vl["target"].values == task_idx[task])],
                      # eval_group=[vl.groupby("session")["target"].size().values.get()],
                      eval_metric="auc",
                      verbose=100, early_stopping_rounds=200)
            joblib.dump(model, f"{save_path}/model_{task}_fold{i}.pkl")
            df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
            # df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict(vl_feats)

            del model
            gc.collect()
        df_ = df_feats.sort_values(by=["session", f"{task}_preds"], ascending=[True, True])
        logger.info("Evaluating.......")
        # get pred dict
        sess, aids = df_.session.values, df_.aid.values
        pred_dict = groupby_list(sess, aids, last_n=20)
        pred_dict = {k: list(v) for k, v in pred_dict.items()}

        # get gt dict
        df_gt = pd.read_parquet("data/validation/test_labels.parquet")  
        df_gt = df_gt.loc[df_gt.session.isin(df_.session.unique())] 
        gt_dict = df_gt.loc[df_gt.type == task].set_index("session")["ground_truth"].to_dict()
        gt_dict = {k: list(v) for k, v in gt_dict.items()}
        # score
        score = get_recall(gt_dict, pred_dict, k=20)
        del df_, df_gt
        gc.collect()
        task_scores[task] = score
        logger.info(f"Recall for {task}: {score:6.4f}")
    overall_score = task_scores["orders"] * 0.6  + task_scores["carts"] * 0.3 + task_scores["clicks"] * 0.1
    logger.info(f"Overall score {overall_score}")
    return df_feats, task_scores


In [97]:
df_test.columns

['session',
 'aid',
 'ts',
 'type',
 'aid_counts',
 'aid_sess_counts',
 'aid_click_unq_sess',
 'aid_cart_unq_sess',
 'aid_order_unq_sess',
 'latest_sold_ts',
 'aid_sess_clicks_mean',
 'aid_sess_carts_mean',
 'aid_sess_orders_mean',
 'max_ts_diff_mean',
 'aid_sess_counts_last7',
 'aid_click_unq_sess_last7',
 'aid_cart_unq_sess_last7',
 'aid_order_unq_sess_last7',
 'session_aid_counts',
 'session_unq_aid_counts',
 'max_ts',
 'min_ts',
 'click_counts',
 'cart_counts',
 'order_counts',
 'sess_reclicks_mean',
 'sess_recarts_mean',
 'sess_reorders_mean',
 'sess_ts_diff_mean',
 'time_since_last_interaction',
 'target']

In [None]:
feats = [
'type',
 'aid_counts',
 'aid_sess_counts',
 'aid_click_unq_sess',
 'aid_cart_unq_sess',
 'aid_order_unq_sess',
 'latest_sold_ts',
 'aid_sess_clicks_mean',
 'aid_sess_carts_mean',
 'aid_sess_orders_mean',
 'max_ts_diff_mean',
 'aid_sess_counts_last7',
 'aid_click_unq_sess_last7',
 'aid_cart_unq_sess_last7',
 'aid_order_unq_sess_last7',
 'session_aid_counts',
 'session_unq_aid_counts',
 'max_ts',
 'min_ts',
 'click_counts',
 'cart_counts',
 'order_counts',
 'sess_reclicks_mean',
 'sess_recarts_mean',
 'sess_reorders_mean',
 'sess_ts_diff_mean',
 'time_since_last_interaction'
]
df_test, scores = train_cv_model(df_test.to_pandas(), feats, version='v0.1')

[I 230126 19:55:15 3939706813:27] Mapped target to data.
[I 230126 19:55:17 3939706813:34] Train shape: (1092322, 32), 1.1969701862017006


[100]	valid_0's auc: 0.80439
[200]	valid_0's auc: 0.805442
[300]	valid_0's auc: 0.805671
[400]	valid_0's auc: 0.80582
[500]	valid_0's auc: 0.805905
[600]	valid_0's auc: 0.805952
[700]	valid_0's auc: 0.805959
[800]	valid_0's auc: 0.805995
[900]	valid_0's auc: 0.806019
[1000]	valid_0's auc: 0.806017
[1100]	valid_0's auc: 0.806004
[1200]	valid_0's auc: 0.806016
[1300]	valid_0's auc: 0.806011


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 19:57:25 3939706813:34] Train shape: (1093757, 32), 1.1968639898599296


[100]	valid_0's auc: 0.804662
[200]	valid_0's auc: 0.805792
[300]	valid_0's auc: 0.806084
[400]	valid_0's auc: 0.806199
[500]	valid_0's auc: 0.806283
[600]	valid_0's auc: 0.806389
[700]	valid_0's auc: 0.806383


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 19:58:35 3939706813:34] Train shape: (1095360, 32), 1.1946408203671424


[100]	valid_0's auc: 0.805771
[200]	valid_0's auc: 0.806846
[300]	valid_0's auc: 0.807117
[400]	valid_0's auc: 0.80725
[500]	valid_0's auc: 0.80728
[600]	valid_0's auc: 0.807234


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 19:59:34 3939706813:34] Train shape: (1095701, 32), 1.195528702851639


[100]	valid_0's auc: 0.805032
[200]	valid_0's auc: 0.806229
[300]	valid_0's auc: 0.806444
[400]	valid_0's auc: 0.806571
[500]	valid_0's auc: 0.806645
[600]	valid_0's auc: 0.806738
[700]	valid_0's auc: 0.806749
[800]	valid_0's auc: 0.806761
[900]	valid_0's auc: 0.806746
[1000]	valid_0's auc: 0.806701


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:01:13 3939706813:34] Train shape: (1089972, 32), 1.1944440703259052


[100]	valid_0's auc: 0.805175
[200]	valid_0's auc: 0.806303
[300]	valid_0's auc: 0.806555
[400]	valid_0's auc: 0.806609
[500]	valid_0's auc: 0.806728
[600]	valid_0's auc: 0.806784
[700]	valid_0's auc: 0.806871
[800]	valid_0's auc: 0.806906
[900]	valid_0's auc: 0.806969
[1000]	valid_0's auc: 0.806979
[1100]	valid_0's auc: 0.806942


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:03:28 3939706813:54] Evaluating.......


Collecting hits:   0%|          | 0/1755534 [00:00<?, ?it/s]

[I 230126 20:03:54 3939706813:70] Recall for clicks: 0.3235
[I 230126 20:03:54 3939706813:27] Mapped target to data.
[I 230126 20:03:56 3939706813:34] Train shape: (1092322, 33), 1.1969701862017006


[100]	valid_0's auc: 0.693329
[200]	valid_0's auc: 0.696922
[300]	valid_0's auc: 0.697728
[400]	valid_0's auc: 0.698302
[500]	valid_0's auc: 0.698491
[600]	valid_0's auc: 0.698813
[700]	valid_0's auc: 0.699384
[800]	valid_0's auc: 0.699723
[900]	valid_0's auc: 0.699931
[1000]	valid_0's auc: 0.700094
[1100]	valid_0's auc: 0.700103
[1200]	valid_0's auc: 0.700224
[1300]	valid_0's auc: 0.700401
[1400]	valid_0's auc: 0.700412
[1500]	valid_0's auc: 0.700613
[1600]	valid_0's auc: 0.700894
[1700]	valid_0's auc: 0.700936
[1800]	valid_0's auc: 0.701106
[1900]	valid_0's auc: 0.701191
[2000]	valid_0's auc: 0.701224
[2100]	valid_0's auc: 0.701295
[2200]	valid_0's auc: 0.701293
[2300]	valid_0's auc: 0.70128


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:07:12 3939706813:34] Train shape: (1093757, 33), 1.1968639898599296


[100]	valid_0's auc: 0.693781
[200]	valid_0's auc: 0.697714
[300]	valid_0's auc: 0.698135
[400]	valid_0's auc: 0.698472
[500]	valid_0's auc: 0.698628
[600]	valid_0's auc: 0.698987
[700]	valid_0's auc: 0.699605
[800]	valid_0's auc: 0.700092
[900]	valid_0's auc: 0.700498
[1000]	valid_0's auc: 0.700606
[1100]	valid_0's auc: 0.700692
[1200]	valid_0's auc: 0.700929
[1300]	valid_0's auc: 0.700961
[1400]	valid_0's auc: 0.70124
[1500]	valid_0's auc: 0.701295
[1600]	valid_0's auc: 0.701505
[1700]	valid_0's auc: 0.701572
[1800]	valid_0's auc: 0.70171
[1900]	valid_0's auc: 0.701847
[2000]	valid_0's auc: 0.701839
[2100]	valid_0's auc: 0.702011
[2200]	valid_0's auc: 0.702174
[2300]	valid_0's auc: 0.702169
[2400]	valid_0's auc: 0.702224
[2500]	valid_0's auc: 0.702362
[2600]	valid_0's auc: 0.702528
[2700]	valid_0's auc: 0.702576
[2800]	valid_0's auc: 0.702602
[2900]	valid_0's auc: 0.702603
[3000]	valid_0's auc: 0.702588


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:11:29 3939706813:34] Train shape: (1095360, 33), 1.1946408203671424


[100]	valid_0's auc: 0.699137
[200]	valid_0's auc: 0.703068
[300]	valid_0's auc: 0.704544
[400]	valid_0's auc: 0.70524
[500]	valid_0's auc: 0.705228
[600]	valid_0's auc: 0.705404
[700]	valid_0's auc: 0.705762
[800]	valid_0's auc: 0.706247
[900]	valid_0's auc: 0.706616
[1000]	valid_0's auc: 0.70672
[1100]	valid_0's auc: 0.706741
[1200]	valid_0's auc: 0.706774
[1300]	valid_0's auc: 0.70697
[1400]	valid_0's auc: 0.707121
[1500]	valid_0's auc: 0.707327
[1600]	valid_0's auc: 0.707631
[1700]	valid_0's auc: 0.707652
[1800]	valid_0's auc: 0.707727
[1900]	valid_0's auc: 0.707772
[2000]	valid_0's auc: 0.707721
[2100]	valid_0's auc: 0.707836
[2200]	valid_0's auc: 0.708039
[2300]	valid_0's auc: 0.708043
[2400]	valid_0's auc: 0.7081


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:15:09 3939706813:34] Train shape: (1095701, 33), 1.195528702851639


[100]	valid_0's auc: 0.697909
[200]	valid_0's auc: 0.700596
[300]	valid_0's auc: 0.701725
[400]	valid_0's auc: 0.702268
[500]	valid_0's auc: 0.702736
[600]	valid_0's auc: 0.703198
[700]	valid_0's auc: 0.703452
[800]	valid_0's auc: 0.703572
[900]	valid_0's auc: 0.70398
[1000]	valid_0's auc: 0.704139
[1100]	valid_0's auc: 0.704221
[1200]	valid_0's auc: 0.704576
[1300]	valid_0's auc: 0.704688
[1400]	valid_0's auc: 0.704757
[1500]	valid_0's auc: 0.704812
[1600]	valid_0's auc: 0.704778
[1700]	valid_0's auc: 0.705114
[1800]	valid_0's auc: 0.705336
[1900]	valid_0's auc: 0.705325
[2000]	valid_0's auc: 0.705335
[2100]	valid_0's auc: 0.705374
[2200]	valid_0's auc: 0.705547
[2300]	valid_0's auc: 0.705556
[2400]	valid_0's auc: 0.705589
[2500]	valid_0's auc: 0.705728
[2600]	valid_0's auc: 0.705883
[2700]	valid_0's auc: 0.705835
[2800]	valid_0's auc: 0.705902
[2900]	valid_0's auc: 0.706004
[3000]	valid_0's auc: 0.706096


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:19:26 3939706813:34] Train shape: (1089972, 33), 1.1944440703259052


[100]	valid_0's auc: 0.695373
[200]	valid_0's auc: 0.698779
[300]	valid_0's auc: 0.699991
[400]	valid_0's auc: 0.700574
[500]	valid_0's auc: 0.700611
[600]	valid_0's auc: 0.701267
[700]	valid_0's auc: 0.701506
[800]	valid_0's auc: 0.701919
[900]	valid_0's auc: 0.702425
[1000]	valid_0's auc: 0.702724
[1100]	valid_0's auc: 0.702911
[1200]	valid_0's auc: 0.703122
[1300]	valid_0's auc: 0.703407
[1400]	valid_0's auc: 0.703624
[1500]	valid_0's auc: 0.703874
[1600]	valid_0's auc: 0.704011
[1700]	valid_0's auc: 0.704139
[1800]	valid_0's auc: 0.704157
[1900]	valid_0's auc: 0.704211
[2000]	valid_0's auc: 0.704325
[2100]	valid_0's auc: 0.704541
[2200]	valid_0's auc: 0.704639
[2300]	valid_0's auc: 0.70481
[2400]	valid_0's auc: 0.704886
[2500]	valid_0's auc: 0.704915
[2600]	valid_0's auc: 0.704962
[2700]	valid_0's auc: 0.705041
[2800]	valid_0's auc: 0.705139
[2900]	valid_0's auc: 0.705127
[3000]	valid_0's auc: 0.705275


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:23:48 3939706813:54] Evaluating.......


Collecting hits:   0%|          | 0/306341 [00:00<?, ?it/s]

[I 230126 20:23:59 3939706813:70] Recall for carts: 0.3113
[I 230126 20:23:59 3939706813:27] Mapped target to data.
[I 230126 20:24:00 3939706813:34] Train shape: (1092322, 34), 1.1969701862017006


[100]	valid_0's auc: 0.869083
[200]	valid_0's auc: 0.87315
[300]	valid_0's auc: 0.874576
[400]	valid_0's auc: 0.875512
[500]	valid_0's auc: 0.875874
[600]	valid_0's auc: 0.876261
[700]	valid_0's auc: 0.876659
[800]	valid_0's auc: 0.876992
[900]	valid_0's auc: 0.877184
[1000]	valid_0's auc: 0.877292
[1100]	valid_0's auc: 0.877465
[1200]	valid_0's auc: 0.877502
[1300]	valid_0's auc: 0.877614
[1400]	valid_0's auc: 0.877719
[1500]	valid_0's auc: 0.877828
[1600]	valid_0's auc: 0.877771
[1700]	valid_0's auc: 0.877825
[1800]	valid_0's auc: 0.87791
[1900]	valid_0's auc: 0.877946
[2000]	valid_0's auc: 0.878059
[2100]	valid_0's auc: 0.878076
[2200]	valid_0's auc: 0.878153
[2300]	valid_0's auc: 0.8781
[2400]	valid_0's auc: 0.878125


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feats[f"{task}_preds"].iloc[vl_idx] = model.predict_proba(vl_feats)[:, 1]
[I 230126 20:27:39 3939706813:34] Train shape: (1093757, 34), 1.1968639898599296


[100]	valid_0's auc: 0.868344
[200]	valid_0's auc: 0.872281
[300]	valid_0's auc: 0.873709
[400]	valid_0's auc: 0.874607
[500]	valid_0's auc: 0.875155
[600]	valid_0's auc: 0.875596
[700]	valid_0's auc: 0.875948
[800]	valid_0's auc: 0.875957
[900]	valid_0's auc: 0.876226
[1000]	valid_0's auc: 0.876388
[1100]	valid_0's auc: 0.876498
[1200]	valid_0's auc: 0.876558


In [9]:
import json
df_feats.to_pandas().reset_index(drop=True).to_parquet("data/B4.1/val_classif_df.pq")
with open("data/B4.1/scores.json", "w") as f:
    json.dump(scores, f)

In [None]:
del df_feats
gc.collect()

In [None]:
from src.io import prepare_sub_from_dict

def predict_on_test(df, feat_cols, model_path="data/classif_models", version="v0"):
    for task in ["orders", "carts", "clicks"]:
        preds = []
        for i in range(5):
            model = joblib.load(str(Path(model_path) / f"model_{task}_fold{i}.pkl"))
            preds_ = model.predict_proba(df[feat_cols].to_pandas())[:, 1]
            preds.append(preds_)
        preds = np.mean(preds, 0)
        df[f"{task}_preds"] = preds
    return df


df_feats = generate_features(eval_flag=False)


In [6]:
feats = [
       'type',
       'time_since_interacted', 'time_since_carted', 'time_since_first_interaction',
       'time_spent', 'median_time_spent_session',
       'weighted_interaction_count', 'weighted_time_since_interacted',
       'time_since_ordered', 'order_frequency', 'aid_counts', 'aid_touch_rate',
       'aid_cart_rate', 'aid_order_rate', 'aid_reorder_mean',
       'aid_reclick_mean', 'aid_recart_mean',
       'next_aid_prob', 'prev_aid_prob', 'aid_latest_sold_day',
       'num_orders'
]
df_feats = predict_on_test(df_feats, feats, version="v0")

In [7]:
Path("data/B4").mkdir(exist_ok=True)

df_feats.to_pandas().reset_index(drop=True).to_parquet("data/B4/test_classif_df.pq")

In [8]:
def prepare_submission(df):
    task_predictions = {}
    for task in ["orders", "carts", "clicks"]:
        df_ = df_feats.sort_values(by=["session", f"{task}_preds"], ascending=[True, True])
        logger.info("Evaluating.......")
        # get pred dict
        sess, aids = df_.session.values.get(), df_.aid.values.get()
        pred_dict = groupby_list(sess, aids, last_n=20)
        pred_dict = {k: list(v) for k, v in pred_dict.items()}
        task_predictions[task] = pred_dict
    return prepare_sub_from_dict(task_predictions["clicks"], task_predictions["carts"], task_predictions["orders"])


sub = prepare_submission(df_feats)
sub.to_csv("data/B4/sub_classif.csv", index=False)

[I 230113 15:46:32 2126847256:5] Evaluating.......
  return d[key]
[I 230113 15:46:43 2126847256:5] Evaluating.......
[I 230113 15:46:51 2126847256:5] Evaluating.......
