# item–item knn cf using jaccard; temporal split; thresholded scoring; evaluated by leave-one-out basket completion (hit@k, coverage).

In [None]:

import pandas as pd
import numpy as np
import cornac
from cornac.data import Dataset
from cornac.eval_methods import RatioSplit
from cornac.metrics import Recall, NDCG, Precision, MAP, AUC
from cornac.models.ease import EASE
from tqdm.auto import trange

print("cornac:", cornac.__version__)

cornac: 2.3.5


In [29]:
import pandas as pd

def load_filtered_transactions(
    trans_path="/workspace/data/processed/transactions_clean.parquet",
    avail_path="/workspace/data/processed/articles_for_recs.parquet",
    bad_ids={"12025DK", "12025FI", "12025NO", "12025SE", "970300", "459978"},
    cols=("shopUserId", "orderId", "groupId"),
):
    global df  # ensure the output is assigned to the name `df`
    df = pd.read_parquet(trans_path, columns=list(cols))
    avail_df = pd.read_parquet(avail_path)

    gid = df["groupId"].astype(str).str.strip()
    avail_ids = set(avail_df["groupId"].astype(str).str.strip().unique())

    df = df.loc[gid.isin(avail_ids) & ~gid.isin(bad_ids)].reset_index(drop=True)
    return df

df = load_filtered_transactions()

In [30]:
df

Unnamed: 0,shopUserId,orderId,groupId
0,934415,893249,210770
1,934415,893249,210781
2,934415,893249,260976
3,934415,893249,270794
4,934415,893249,270794
...,...,...,...
260794,110507,166445,240012
260795,252853,166428,260345
260796,252853,166428,239301
260797,252844,166420,263855


Aggregate to user–item level;

In [31]:
import pandas as pd

def make_user_item_pairs(
    df: pd.DataFrame,
    user_col: str = "shopUserId",
    order_col: str = "orderId",
    item_col: str = "groupId",
    pref_value: float = 1.0,
) -> pd.DataFrame:
    """
    Deduplicate within the same order, then reduce to unique (user, item) pairs.
    Returns a DataFrame with [user_col, item_col, "pref"].
    """
    pairs = (
        df.drop_duplicates(subset=[user_col, order_col, item_col])
          .drop_duplicates(subset=[user_col, item_col])
          [[user_col, item_col]]
          .copy()
    )
    pairs["pref"] = float(pref_value)  # binary preference
    return pairs

pairs = make_user_item_pairs(df)

In [32]:
pairs

Unnamed: 0,shopUserId,groupId,pref
0,934415,210770,1.0
1,934415,210781,1.0
2,934415,260976,1.0
3,934415,270794,1.0
5,934415,261661,1.0
...,...,...,...
260792,252879,430037,1.0
260793,252874,261706,1.0
260794,110507,240012,1.0
260795,252853,260345,1.0


In [None]:
def filter_pairs_by_item_frequency(pairs, item_col="groupId", q_low=0.5, q_high=0.96, inclusive="both"): #quantile filter
    gid = pairs[item_col].astype(str).str.strip()
    counts = gid.value_counts()
    low, high = counts.quantile([q_low, q_high])
    mask = gid.map(counts).between(low, high, inclusive=inclusive)
    return pairs[mask].reset_index(drop=True)

pairs = filter_pairs_by_item_frequency(pairs)

print(
    f"Groups kept: {(gid_counts >= low).sum() - (gid_counts > high).sum()} (of {gid_counts.size})\n"
    f"Rows kept: {len(df)} (of {len(df)})\n"
    f"Count thresholds: low={low:.0f}, high={high:.0f}"
)


Groups kept: 678 (of 1457)
Rows kept: 260799 (of 260799)
Count thresholds: low=16, high=856


In [35]:
# Cornac triplets
def to_user_item_ratings(pairs, user_col="shopUserId", item_col="groupId", pref_col="pref"):
    return list(zip(pairs[user_col].astype(str),
                    pairs[item_col].astype(str),
                    pairs[pref_col].astype(float)))

uir = to_user_item_ratings(pairs)
len(uir)

91499

In [44]:
def make_splits(uir, test_size=0.10, val_size=0.10, seed=42, verbose=True, exclude_unknowns=True, print_stats=True):
    rs = RatioSplit(
        data=uir,
        test_size=test_size,
        val_size=val_size,
        exclude_unknowns=exclude_unknowns,
        seed=seed,
        verbose=verbose,
    )
    train_set, val_set, test_set = rs.train_set, rs.val_set, rs.test_set
    if print_stats:
        print("Users:", train_set.num_users, "Items:", train_set.num_items)
    return train_set, val_set, test_set

train_set, val_set, test_set = make_splits(uir)


rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 36217
Number of items = 312
Number of ratings = 73199
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 36217
Number of items = 312
Number of ratings = 6736
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 36217
Number of items = 312
Number of ratings = 6795
---
Total users = 36217
Total items = 312
Users: 36217 Items: 312


In [54]:
def tune_ease(rs, lamb_list=(1000, 2000), k=20):
    space = [cornac.hyperopt.Discrete("lamb", list(lamb_list))]
    gs = cornac.hyperopt.GridSearch(
        model=EASE(name="EASE"),
        space=space,
        metric=Recall(k=k),
        eval_method=rs,
    )
    gs.fit(rs.train_set, rs.val_set)  # <- pass splits explicitly
    return gs, gs.best_model, gs.best_params


gs, best_model, best_params = tune_ease(rs)
print(best_params)

Evaluating: {'lamb': 1000}
Evaluating: {'lamb': 2000}
Best parameter settings: {'lamb': 1000}
Recall@20 = 0.2806
{'lamb': 1000}


In [55]:
def run_experiment(rs, best_params, k=20, verbose=False):
    metrics = [Recall(k=k), Precision(k=k), NDCG(k=k), MAP()]
    ease_best = EASE(**best_params, name="EASE-best", verbose=False)
    exp = cornac.Experiment(
        eval_method=rs,
        models=[
            cornac.models.MostPop(),
            cornac.models.ItemKNN(k=100),
            cornac.models.BPR(learning_rate=0.01, lambda_reg=0.001, max_iter=200, name="BPR-default"),
            ease_best,
        ],
        metrics=metrics,
        verbose=verbose,
    )
    exp.run()
    return exp

exp = run_experiment(rs, best_params)



[MostPop] Training started!

[MostPop] Evaluation started!


Ranking:   0%|          | 0/8377 [00:00<?, ?it/s]

Ranking:   0%|          | 0/8401 [00:00<?, ?it/s]


[ItemKNN] Training started!


  0%|          | 0/678 [00:00<?, ?it/s]


[ItemKNN] Evaluation started!


Ranking:   0%|          | 0/8377 [00:00<?, ?it/s]

Ranking:   0%|          | 0/8401 [00:00<?, ?it/s]


[BPR-default] Training started!

[BPR-default] Evaluation started!


Ranking:   0%|          | 0/8377 [00:00<?, ?it/s]

Ranking:   0%|          | 0/8401 [00:00<?, ?it/s]


[EASE-best] Training started!

[EASE-best] Evaluation started!


Ranking:   0%|          | 0/8377 [00:00<?, ?it/s]

Ranking:   0%|          | 0/8401 [00:00<?, ?it/s]


VALIDATION:
...
            |    MAP | NDCG@20 | Precision@20 | Recall@20 | Time (s)
----------- + ------ + ------- + ------------ + --------- + --------
MostPop     | 0.0333 |  0.0453 |       0.0073 |    0.1207 |   1.3246
ItemKNN     | 0.0068 |  0.0334 |       0.0052 |    0.0853 |   6.5079
BPR-default | 0.0439 |  0.0602 |       0.0087 |    0.1466 |   4.3697
EASE-best   | 0.1112 |  0.1453 |       0.0166 |    0.2806 |   1.7859

TEST:
...
            |    MAP | NDCG@20 | Precision@20 | Recall@20 | Train (s) | Test (s)
----------- + ------ + ------- + ------------ + --------- + --------- + --------
MostPop     | 0.0332 |  0.0461 |       0.0074 |    0.1236 |    0.0006 |   1.4355
ItemKNN     | 0.0067 |  0.0319 |       0.0052 |    0.0848 |    0.0280 |   6.5442
BPR-default | 0.0435 |  0.0588 |       0.0085 |    0.1423 |    0.5207 |   4.3555
EASE-best   | 0.1112 |  0.1462 |       0.0169 |    0.2831 |    0.0518 |   1.8593



In [None]:
def finalize_ease(uir, best_params, test_size=0.10, seed=42, exclude_unknowns=True, verbose=True):
    rs_no_val = RatioSplit(
        data=uir,
        test_size=test_size,
        val_size=0.0,
        exclude_unknowns=exclude_unknowns,
        seed=seed,
        verbose=verbose,
    )
    final_ease = EASE(**best_params, name="EASE-final", verbose=True)
    final_ease.fit(rs_no_val.train_set)
    return final_ease, rs_no_val

final_ease, rs_no_val = finalize_ease(
    uir,
    best_params=best_params,
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 38445
Number of items = 312
Number of ratings = 82349
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 38445
Number of items = 312
Number of ratings = 7003
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 38445
Total items = 312


In [68]:
def evaluate_final_model(rs_no_val, final_ease, k=20, verbose=False, save_dir=None):
    metrics = [Recall(k=k), Precision(k=k), NDCG(k=k), MAP()]
    exp = cornac.Experiment(
        eval_method=rs_no_val,
        models=[final_ease],
        metrics=metrics,
        verbose=verbose,
        save_dir=save_dir,
    )
    exp.run()
    return exp

exp = evaluate_final_model(rs_no_val, final_ease, k=20, verbose=False)


[EASE-final] Training started!

[EASE-final] Evaluation started!


Ranking:   0%|          | 0/5958 [00:00<?, ?it/s]


TEST:
...
           |    MAP | NDCG@20 | Precision@20 | Recall@20 | Train (s) | Test (s)
---------- + ------ + ------- + ------------ + --------- + --------- + --------
EASE-final | 0.1267 |  0.1675 |       0.0192 |    0.3283 |    0.1192 |   2.1650



keeps a pair (i, j) only if at least 12 distinct users interacted with both items.

In [77]:
import numpy as np
import pandas as pd
from scipy import sparse as sp

# ---------- helpers ----------
def _prepare_item_data(final_ease):
    B = np.asarray(final_ease.get_item_vectors(), dtype=float)
    ts = final_ease.train_set
    raw_item_ids = np.asarray(ts.item_ids, dtype=str)
    X = ts.X if hasattr(ts, "X") else ts.matrix
    X = X.tocsr() if sp.issparse(X) else sp.csr_matrix(X)
    return B, raw_item_ids, X

def _cooccurrence_support(X):
    Xb = X.copy(); Xb.data[:] = 1
    S = (Xb.T @ Xb).tocsr()
    S.setdiag(0); S.eliminate_zeros()
    return S

def _top_supported_for_item(i, support, B, raw_item_ids, min_support=12, topk=10):
    row = support.getrow(i)
    if row.nnz == 0:
        return []
    mask = row.data >= min_support
    if not np.any(mask):
        return []
    cand = row.indices[mask]
    if cand.size < topk:
        return []
    w = B[i, cand]
    order = np.argsort(w)[::-1][:topk]
    return list(raw_item_ids[cand[order]])

def build_basket_completion(final_ease, min_support=12, topk=10):
    """Return a DataFrame with per-item Top-K complements using EASE embeddings and co-occurrence."""
    B, raw_item_ids, X = _prepare_item_data(final_ease)
    support = _cooccurrence_support(X)

    rows = []
    for i, pid in enumerate(raw_item_ids):
        recs = _top_supported_for_item(i, support, B, raw_item_ids, min_support=min_support, topk=topk)
        if len(recs) < topk:
            continue
        rows.append({"Product ID": pid, **{f"Top {k}": recs[k-1] for k in range(1, topk + 1)}})

    cols = ["Product ID"] + [f"Top {k}" for k in range(1, topk + 1)]
    return pd.DataFrame(rows, columns=cols)

def save_basket_completion(df, out_path):
    df.to_parquet(out_path, engine="pyarrow", index=False)

MIN_SUPPORT = 12
TOPK = 10
OUT_PATH = "/workspace/data/processed/basket_completion.parquet"
df_bc = build_basket_completion(final_ease, min_support=MIN_SUPPORT, topk=TOPK)
save_basket_completion(df_bc, OUT_PATH)


In [78]:
df_bc.sample(1)

Unnamed: 0,Product ID,Top 1,Top 2,Top 3,Top 4,Top 5,Top 6,Top 7,Top 8,Top 9,Top 10
40,242198,241208,242214,240108,240279,250124,240181,210712,210676,210761,210789
