# Item–item collaborative filtering with EASE (Cornac) — https://cornac.readthedocs.io/en/stable/api_ref/models.html#module-cornac.models.ease.recom_ease

In [115]:

import pandas as pd
import numpy as np
import cornac
from cornac.data import Dataset
from cornac.eval_methods import RatioSplit
from cornac.hyperopt import GridSearch, Discrete
from cornac.metrics import Recall, NDCG, Precision, MAP, AUC
from cornac.models.ease import EASE
from tqdm.auto import trange

print("cornac:", cornac.__version__)

cornac: 2.3.5


In [116]:
import pandas as pd

def load_filtered_transactions(
    trans_path="/workspace/data/processed/transactions_clean.parquet",
    avail_path="/workspace/data/processed/articles_for_recs.parquet",
    bad_ids={"12025DK", "12025FI", "12025NO", "12025SE", "970300", "459978"},
    cols=("shopUserId", "orderId", "groupId"),
):
    """Load transaction lines and keep only rows whose groupId is recommendable.

    A row is kept when its ``groupId`` (compared as a whitespace-stripped
    string) appears in the availability file and is not listed in ``bad_ids``.

    Parameters
    ----------
    trans_path : str
        Parquet file holding the transaction lines.
    avail_path : str
        Parquet file with a ``groupId`` column listing the allowed groups.
    bad_ids : collection of str, or None
        Group ids to exclude even if they are available. ``None`` excludes
        nothing. The default set is only read, never mutated.
    cols : sequence of str
        Columns to read from ``trans_path``; must include ``"groupId"``.

    Returns
    -------
    pandas.DataFrame
        The filtered transactions with a fresh 0..n-1 index. The stored
        ``groupId`` values are returned as read (not stripped or cast).
    """
    # No `global` here: the result is returned, and the caller decides which
    # name to bind it to. Rebinding a notebook-level `df` behind the caller's
    # back is hidden state.
    transactions = pd.read_parquet(trans_path, columns=list(cols))
    # Only the id column is needed from the availability file.
    avail_df = pd.read_parquet(avail_path, columns=["groupId"])

    gid = transactions["groupId"].astype(str).str.strip()
    avail_ids = set(avail_df["groupId"].astype(str).str.strip().unique())
    excluded = set() if bad_ids is None else bad_ids

    keep = gid.isin(avail_ids) & ~gid.isin(excluded)
    return transactions.loc[keep].reset_index(drop=True)

# Load the transactions with the default paths, keeping only available, non-excluded groupIds.
df = load_filtered_transactions()

In [117]:
df

Unnamed: 0,shopUserId,orderId,groupId
0,943483,902721,260257
1,943480,902718,280034
2,943480,902718,290150
3,943480,902718,291294
4,943480,902718,292359
...,...,...,...
262693,110507,166445,240012
262694,252853,166428,260345
262695,252853,166428,239301
262696,252844,166420,263855


Aggregate the transactions to the user–item level.

In [118]:
import pandas as pd

def make_user_item_pairs(
    df: pd.DataFrame,
    user_col: str = "shopUserId",
    order_col: str = "orderId",
    item_col: str = "groupId",
    pref_value: float = 1.0,
) -> pd.DataFrame:
    """
    Collapse transaction lines into one row per distinct (user, item) pair.

    Duplicates are removed first per (user, order, item) and then per
    (user, item); the first occurrence is the one kept, so the returned frame
    carries the original index labels of those rows.

    Returns a new DataFrame with columns [user_col, item_col, "pref"], where
    "pref" is the constant ``float(pref_value)`` for every row.
    """
    # Stage 1: the same item listed more than once within one order.
    once_per_order = df.drop_duplicates(subset=[user_col, order_col, item_col])
    # Stage 2: the same item bought by the same user in several orders.
    once_per_user = once_per_order.drop_duplicates(subset=[user_col, item_col])

    result = once_per_user[[user_col, item_col]].copy()
    result["pref"] = float(pref_value)  # binary (implicit) preference
    return result

# Reduce the transaction lines to unique (user, item) pairs with a constant preference of 1.0.
pairs = make_user_item_pairs(df)

In [119]:
pairs

Unnamed: 0,shopUserId,groupId,pref
0,943483,260257,1.0
1,943480,280034,1.0
2,943480,290150,1.0
3,943480,291294,1.0
4,943480,292359,1.0
...,...,...,...
262691,252879,430037,1.0
262692,252874,261706,1.0
262693,110507,240012,1.0
262694,252853,260345,1.0


In [120]:
def filter_pairs_by_item_frequency(
    pairs, item_col="groupId", q_low=0.5, q_high=0.96, inclusive="both", return_stats=False
):
    """Keep only rows whose item frequency lies inside a quantile band.

    Item ids are compared as whitespace-stripped strings. The number of rows
    per item is counted, the ``q_low`` and ``q_high`` quantiles of those counts
    give the band limits, and a row is kept when its item's count falls in the
    band (``inclusive`` is forwarded to ``Series.between``).

    Returns the filtered frame with a fresh index. With ``return_stats=True``
    it returns ``(filtered, stats)``, where ``stats`` holds the two limits, the
    group and row totals before and after filtering, and the per-item counts
    under ``"gid_counts"``.
    """
    item_keys = pairs[item_col].astype(str).str.strip()
    freq = item_keys.value_counts()
    lower, upper = freq.quantile([q_low, q_high])

    # Look up each row's item frequency and test it against the band.
    row_in_band = item_keys.map(freq).between(lower, upper, inclusive=inclusive)
    kept = pairs[row_in_band].reset_index(drop=True)

    if return_stats:
        group_in_band = freq.between(lower, upper, inclusive=inclusive)
        summary = {
            "low": float(lower),
            "high": float(upper),
            "groups_total": int(freq.size),
            "groups_kept": int(group_in_band.sum()),
            "rows_total": int(len(pairs)),
            "rows_kept": int(row_in_band.sum()),
            "gid_counts": freq,  # raw per-item counts, for callers that want to report on them
        }
        return kept, summary

    return kept


In [121]:
# Rebuild the unfiltered pairs from `df` before filtering, so that re-running this cell
# does not filter (and re-quantile) the already-filtered `pairs`.
pairs, s = filter_pairs_by_item_frequency(make_user_item_pairs(df), return_stats=True)

gid_counts = s["gid_counts"]
low, high = s["low"], s["high"]

# The kept/total group counts come straight from the returned stats instead of being re-derived.
print(
    f"Groups kept: {s['groups_kept']} (of {s['groups_total']})\n"
    f"Rows kept: {s['rows_kept']} (of {s['rows_total']})\n"
    f"Count thresholds: low={low:.0f}, high={high:.0f}"
)


Groups kept: 679 (of 1463)
Rows kept: 127755 (of 215106)
Count thresholds: low=16, high=852


In [122]:
# Cornac triplets
def to_user_item_ratings(pairs, user_col="shopUserId", item_col="groupId", pref_col="pref"):
    """Turn a pairs frame into a list of ``(user, item, rating)`` tuples.

    User and item ids are cast to ``str`` and the preference to ``float``;
    the tuples follow the row order of ``pairs``.
    """
    users = pairs[user_col].astype(str)
    items = pairs[item_col].astype(str)
    ratings = pairs[pref_col].astype(float)
    return [(user, item, rating) for user, item, rating in zip(users, items, ratings)]

# Convert the filtered pairs into (user, item, rating) tuples and show how many there are.
uir = to_user_item_ratings(pairs)
len(uir)

127755

In [123]:
def make_splits(uir, test_size=0.10, val_size=0.10, seed=42, verbose=True, exclude_unknowns=True, print_stats=True):
    """Build a Cornac ``RatioSplit`` over the (user, item, rating) triples.

    All split-related arguments are forwarded unchanged to ``RatioSplit``.
    When ``print_stats`` is true, the number of users and items in the
    resulting training set is printed. Returns the ``RatioSplit`` object.
    """
    split_options = dict(
        data=uir,
        test_size=test_size,
        val_size=val_size,
        exclude_unknowns=exclude_unknowns,
        seed=seed,
        verbose=verbose,
    )
    ratio_split = RatioSplit(**split_options)

    if not print_stats:
        return ratio_split

    print("Users:", ratio_split.train_set.num_users, "Items:", ratio_split.train_set.num_items)
    return ratio_split

# Split the triples with the defaults: 10% test, 10% validation, seed 42.
rs = make_splits(uir)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 43854
Number of items = 679
Number of ratings = 102203
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 43854
Number of items = 679
Number of ratings = 10281
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 43854
Number of items = 679
Number of ratings = 10304
---
Total users = 43854
Total items = 679
Users: 43854 Items: 679


In [124]:
def tune_ease(rs, lamb_list=(100, 500, 1000, 2000), k=20):
    """Grid-search the EASE ``lamb`` parameter, scored by Recall@k.

    ``rs`` is the evaluation method (the RatioSplit object). Its ``train_set``
    and ``val_set`` are passed to the search's ``fit``. Returns the fitted
    search object together with its ``best_model`` and ``best_params``.
    """
    search = GridSearch(
        model=EASE(name="EASE"),
        space=[Discrete("lamb", list(lamb_list))],
        metric=Recall(k=k),
        eval_method=rs,  # the RatioSplit object
    )
    search.fit(train_set=rs.train_set, val_set=rs.val_set)

    winner, winning_params = search.best_model, search.best_params
    return search, winner, winning_params


# Tune `lamb` over the default grid and show the selected setting.
gs, best_model, best_params = tune_ease(rs)
print(best_params)

Evaluating: {'lamb': 100}
Evaluating: {'lamb': 500}
Evaluating: {'lamb': 1000}
Evaluating: {'lamb': 2000}
Best parameter settings: {'lamb': 500}
Recall@20 = 0.2927
{'lamb': 500}


In [125]:
import tempfile, shutil

def run_experiment(rs, best_params, k=20, verbose=False, seed=42):
    """Compare the tuned EASE model against MostPop, ItemKNN and BPR baselines.

    Parameters
    ----------
    rs :
        Evaluation method handed to ``cornac.Experiment`` (the RatioSplit).
    best_params : dict
        Keyword arguments for ``EASE`` (e.g. the tuned ``lamb``).
    k : int
        Cut-off used for Recall, Precision and NDCG.
    verbose : bool
        Forwarded to ``cornac.Experiment``.
    seed : int or None
        Random seed for the BPR baseline. BPR is randomly initialised, so
        without a seed its row in the comparison changes from run to run.

    Returns
    -------
    cornac.Experiment
        The experiment object after ``run()``. Logs and saved models are
        written to a temporary directory that is removed before returning.
    """
    metrics = [Recall(k=k), Precision(k=k), NDCG(k=k), MAP()]
    ease_best = EASE(**best_params, name="EASE-best", verbose=False)
    models = [
        cornac.models.MostPop(),
        cornac.models.ItemKNN(k=100),
        cornac.models.BPR(
            learning_rate=0.01,
            lambda_reg=0.001,
            max_iter=200,
            seed=seed,  # makes the baseline reproducible
            name="BPR-default",
        ),
        ease_best,
    ]

    tmpdir = tempfile.mkdtemp()
    try:
        exp = cornac.Experiment(
            eval_method=rs,
            models=models,
            metrics=metrics,
            verbose=verbose,
            show_validation=False,
            save_dir=tmpdir,  # logs go here
        )
        exp.run()
        return exp
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)  # cleans logs/models


# Compare the tuned EASE model with the MostPop, ItemKNN and BPR baselines.
exp = run_experiment(rs, best_params)



[MostPop] Training started!

[MostPop] Evaluation started!


Ranking:   0%|          | 0/8440 [00:00<?, ?it/s]


[ItemKNN] Training started!


  0%|          | 0/679 [00:00<?, ?it/s]


[ItemKNN] Evaluation started!


Ranking:   0%|          | 0/8440 [00:00<?, ?it/s]

ItemKNN model is saved to /tmp/tmpqzdo31a7/ItemKNN/2025-10-27_10-31-51-393886.pkl

[BPR-default] Training started!

[BPR-default] Evaluation started!


Ranking:   0%|          | 0/8440 [00:00<?, ?it/s]


[EASE-best] Training started!

[EASE-best] Evaluation started!


Ranking:   0%|          | 0/8440 [00:00<?, ?it/s]


TEST:
...
            |    MAP | NDCG@20 | Precision@20 | Recall@20 | Train (s) | Test (s)
----------- + ------ + ------- + ------------ + --------- + --------- + --------
MostPop     | 0.0336 |  0.0456 |       0.0074 |    0.1219 |    0.0035 |   1.4432
ItemKNN     | 0.0067 |  0.0308 |       0.0049 |    0.0786 |    0.0210 |   6.5823
BPR-default | 0.0461 |  0.0618 |       0.0087 |    0.1458 |    0.5218 |   4.1921
EASE-best   | 0.1199 |  0.1552 |       0.0174 |    0.2905 |    0.0371 |   1.8888



In [126]:
def finalize_ease(uir, best_params, test_size=0.10, seed=42, exclude_unknowns=True, verbose=True):
    """Fit EASE with the chosen parameters on a split without a validation part.

    A new ``RatioSplit`` is built with ``val_size=0.0``. An ``EASE`` model named
    "EASE-final" is then fitted on that split's training set. ``verbose`` is
    forwarded to the split only; the model is always created with
    ``verbose=True``.

    Returns ``(fitted_model, split)``.
    """
    split_options = dict(
        data=uir,
        test_size=test_size,
        val_size=0.0,
        exclude_unknowns=exclude_unknowns,
        seed=seed,
        verbose=verbose,
    )
    split = RatioSplit(**split_options)

    model = EASE(**best_params, name="EASE-final", verbose=True)
    model.fit(split.train_set)
    return model, split

# Fit the final EASE model with the tuned parameters on a train/test split that has no validation part.
final_ease, rs_no_val = finalize_ease(
    uir,
    best_params=best_params,
)

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 46162
Number of items = 679
Number of ratings = 114979
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 46162
Number of items = 679
Number of ratings = 10576
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 46162
Total items = 679


In [127]:
import tempfile, shutil

def evaluate_final_model(rs_no_val, final_ease, k=20, verbose=False, save_dir=None):
    """Run a Cornac experiment containing only ``final_ease``.

    The metrics are Recall@k, Precision@k, NDCG@k and MAP. If ``save_dir`` is
    ``None``, a temporary directory (prefix ``cornac_tmp_``) is created for the
    experiment's output and removed afterwards. A directory supplied by the
    caller is left in place.

    Returns the ``cornac.Experiment`` after ``run()``.
    """
    metrics = [Recall(k=k), Precision(k=k), NDCG(k=k), MAP()]

    # Only clean up a directory this function created itself.
    owns_dir = save_dir is None
    if owns_dir:
        save_dir = tempfile.mkdtemp(prefix="cornac_tmp_")

    try:
        experiment = cornac.Experiment(
            eval_method=rs_no_val,
            models=[final_ease],
            metrics=metrics,
            verbose=verbose,
            save_dir=save_dir,
        )
        experiment.run()
        return experiment
    finally:
        if owns_dir:
            shutil.rmtree(save_dir, ignore_errors=True)


# Evaluate the final model on the test part of the no-validation split (this rebinds `exp`).
exp = evaluate_final_model(rs_no_val, final_ease, k=20, verbose=False)


[EASE-final] Training started!

[EASE-final] Evaluation started!




Ranking:   0%|          | 0/8708 [00:00<?, ?it/s]

EASE-final model is saved to /tmp/cornac_tmp_6bqvirh5/EASE-final/2025-10-27_10-32-00-423863.pkl

TEST:
...
           |    MAP | NDCG@20 | Precision@20 | Recall@20 | Train (s) | Test (s)
---------- + ------ + ------- + ------------ + --------- + --------- + --------
EASE-final | 0.1251 |  0.1629 |       0.0183 |    0.3064 |    0.0310 |   1.8543



Basket completion: keep an item pair (i, j) only if at least 12 distinct users interacted with both items.

In [None]:
import numpy as np
import pandas as pd
from scipy import sparse as sp

# ---------- helpers ----------
def _prepare_item_data(final_ease):
    """Extract the arrays needed to build per-item recommendations.

    Returns a tuple ``(B, raw_item_ids, X)``:
      - ``B``: ``final_ease.get_item_vectors()`` as a float ndarray,
      - ``raw_item_ids``: the train set's ``item_ids`` as a string ndarray,
      - ``X``: the train set's ``X`` attribute if it has one, otherwise its
        ``matrix``, converted to a SciPy CSR matrix.
    """
    train = final_ease.train_set

    item_weights = np.asarray(final_ease.get_item_vectors(), dtype=float)
    item_labels = np.asarray(train.item_ids, dtype=str)

    if hasattr(train, "X"):
        interactions = train.X
    else:
        interactions = train.matrix

    # Normalise to CSR whether the source is sparse (any format) or dense.
    if sp.issparse(interactions):
        interactions = interactions.tocsr()
    else:
        interactions = sp.csr_matrix(interactions)

    return item_weights, item_labels, interactions

def _cooccurrence_support(X):
    """Return the item-by-item co-occurrence counts of a sparse interaction matrix.

    Works on a copy of ``X`` whose stored values are all set to 1, computes
    ``Xb.T @ Xb`` as CSR, then zeroes the diagonal and drops the resulting
    explicit zeros. ``X`` itself is not modified.
    """
    binary = X.copy()
    binary.data.fill(1)  # only presence matters, not the stored value

    cooc = (binary.T @ binary).tocsr()
    cooc.setdiag(0)  # an item is not its own neighbour
    cooc.eliminate_zeros()
    return cooc

def build_basket_completion(final_ease, min_support=12, min_recs=4, topk_cap=10):
    """
    Build a table with one row per item and its top recommended items.

    For each item:
      - candidates are the items whose co-occurrence count with it is at least
        `min_support`;
      - candidates are ordered by the model weight B[item, candidate], highest
        first, and the list is cut to at most `topk_cap` entries;
      - the item is skipped if it has no candidates, or if the resulting list
        has fewer than `min_recs` entries.

    Returns a DataFrame with columns "Product ID", "Top 1" .. "Top `topk_cap`"
    (shorter lists are padded with None) and prints how many items were
    skipped for each of the two reasons.
    """
    weights, item_ids, interactions = _prepare_item_data(final_ease)
    co_users = _cooccurrence_support(interactions)

    top_cols = [f"Top {rank}" for rank in range(1, topk_cap + 1)]
    records = []
    n_too_few_recs = 0
    n_no_support = 0

    for idx, product_id in enumerate(item_ids):
        neighbours = co_users.getrow(idx)

        # An item with no co-occurring items at all yields an empty selection
        # here as well, so both "no support" cases share one check.
        supported = neighbours.indices[neighbours.data >= min_support]
        if supported.size == 0:
            n_no_support += 1
            continue

        # Rank the supported candidates by model weight, highest first, then cap.
        ranking = np.argsort(weights[idx, supported])[::-1]
        chosen = supported[ranking][:topk_cap]
        recs = list(item_ids[chosen])

        if len(recs) < min_recs:
            n_too_few_recs += 1
            continue

        # Pad to the fixed column count so every row has Top 1..Top topk_cap.
        padded = recs + [None] * (len(top_cols) - len(recs))
        records.append({"Product ID": product_id, **dict(zip(top_cols, padded))})

    table = pd.DataFrame(records, columns=["Product ID"] + top_cols)

    print(f"Dropped groupids due to having < {min_recs} recs: {n_too_few_recs}")
    print(f"Dropped groupids due to having < {min_support} co-users (no pairs meeting threshold): {n_no_support}")

    return table

def save_basket_completion(df, out_path):
    """Write ``df`` to ``out_path`` as Parquet using the pyarrow engine, without the index."""
    df.to_parquet(out_path, index=False, engine="pyarrow")

# ---- params & run ----
MIN_SUPPORT = 12      # co-users threshold
MIN_RECS = 4          # keep only items with at least 4 recs
TOPK_CAP = 10         # cap lists at max 10 recs
# Must be defined here: the save call below reads it, and on a fresh kernel
# nothing else in the notebook sets it.
OUT_PATH = "/workspace/data/processed/basket_completion.parquet"

df_bc = build_basket_completion(final_ease, min_support=MIN_SUPPORT, min_recs=MIN_RECS, topk_cap=TOPK_CAP)
save_basket_completion(df_bc, OUT_PATH)


Dropped groupids due to having < 4 recs: 121
Dropped groupids due to having < 12 co-users (no pairs meeting threshold): 365


In [129]:
df_bc.sample(1)

Unnamed: 0,Product ID,Top 1,Top 2,Top 3,Top 4,Top 5,Top 6,Top 7,Top 8,Top 9,Top 10
75,266494,260182,260257,260930,260232,292250,264242,261192,260922,261608,261933
