# Personalized recommendation

After buying X, people next buy Y

In [1]:
# %% [markdown]
# Penultimate→Last item–item CF (next-basket prediction)
import pandas as pd
from collections import Counter, defaultdict
from math import sqrt


DATA_PATH = "/workspace/data/processed/transactions_clean.parquet"
OUTPUT_PATH = "/workspace/data/processed/personalized_cf_recs.parquet"

BAD_GROUP_IDS = {"12025DK","12025FI","12025NO","12025SE","970300","459978"}

LAM_SEQ = 50
POP_DISCOUNT = True
K_NEIGHBORS = 100

TOPK = 10
MIN_SCORE = 0.0
ITEM_SUPPORT_THR = 15

ALLOW_SELF_TRANSITION = False   # do not learn i->i edges
EVAL_ALLOW_REPURCHASE = False   # eval new-item next-order
EXPORT_ALLOW_REPURCHASE = False  # allow repurchases in production export

USER_MIN_ORDERS = 2


In [2]:
# %%
cols = ["shopUserId", "orderId", "groupId", "created"]
tx = pd.read_parquet(DATA_PATH, columns=cols).copy()

tx["created"] = pd.to_datetime(tx["created"], errors="coerce")
tx["groupId"] = tx["groupId"].astype(str).str.strip()

# drop empties / NaNs / banned groups
tx = tx[tx["groupId"].ne("") & ~tx["groupId"].str.lower().eq("nan")].copy()
tx = tx[~tx["groupId"].isin(BAD_GROUP_IDS)].reset_index(drop=True)

print(f"Rows: {len(tx):,} | Users: {tx['shopUserId'].nunique():,} | Orders: {tx['orderId'].nunique():,}")


Rows: 298,972 | Users: 61,452 | Orders: 109,808


Orders & split (leave‑last‑order‑out)

In [3]:
# %%
# One row per (user, order) with unique item set and order timestamp
o0 = (
    tx.groupby(["shopUserId", "orderId"], as_index=False)
      .agg(items=("groupId", lambda s: list(set(map(str, s)))),
           t=("created", "max"))
)

# Keep users with at least USER_MIN_ORDERS orders
user_order_counts = o0.groupby("shopUserId")["orderId"].transform("nunique")
o = o0[user_order_counts >= USER_MIN_ORDERS].copy().reset_index(drop=True)

# Identify the last order timestamp per user
lt = o.groupby("shopUserId")["t"].transform("max")
test_orders = o[o["t"] == lt].copy()  # last order per qualified user (evaluation target)

print(
    "Qualified users:", o["shopUserId"].nunique(),
    "| Qualified orders:", len(o),
    "| Test (last) orders:", len(test_orders)
)


Qualified users: 24632 | Qualified orders: 72988 | Test (last) orders: 24637


Item supports & candidates

In [4]:
# %%
# Take the last two orders per user (penultimate, last)
last_two = (
    o.sort_values("t")
     .groupby("shopUserId", as_index=False, sort=False)
     .tail(2)
     .sort_values(["shopUserId", "t"])
)

# For each user, prev = penultimate items, cur = last items
pairs = (
    last_two.groupby("shopUserId", sort=False)
            .agg(prev=("items", lambda s: list(map(str, s.iloc[0]))),
                 cur =("items", lambda s: list(map(str, s.iloc[1]))))
)

# Seeds for evaluation = penultimate basket
last_seed_train = dict(zip(pairs.index, pairs["prev"]))

print("Train pairs (prev→cur):", len(pairs))
pairs.head()


Train pairs (prev→cur): 24632


Unnamed: 0_level_0,prev,cur
shopUserId,Unnamed: 1_level_1,Unnamed: 2_level_1
100140,[264549],[264549]
100157,[240279],[210789]
100208,[210707],"[241562, 210695, 210686]"
100844,"[250122, 260513]","[260513, 270696, 270544, 541419]"
100948,"[261890, 200187, 266882, 270599, 260205]","[293647, 291088, 291054, 292045]"


Sequential transitions (i → j)

In [5]:
# %%
# Support counted only from prev (sources of transitions)
isup = Counter()
for items in pairs["prev"]:
    isup.update(set(items))

# Items seen anywhere in last-two (prev ∪ cur)
seen_items = set(isup.keys()) | {x for xs in pairs["cur"] for x in xs}

# Candidate prediction targets must have at least ITEM_SUPPORT_THR occurrences as prev
candidates = {i for i, c in isup.items() if c >= ITEM_SUPPORT_THR}

print(
    "Seen items (prev∪cur):", len(seen_items),
    "| Candidate items (as targets):", len(candidates)
)


Seen items (prev∪cur): 1770 | Candidate items (as targets): 636


In [6]:
# %%
psup_seq = Counter()
isup_in = Counter()
isup_out = Counter()

for _, r in pairs.iterrows():
    prev = set(r["prev"])
    cur  = set(r["cur"])
    for i in prev:
        for j in cur:
            if (i == j) and (not ALLOW_SELF_TRANSITION):
                continue
            psup_seq[(i, j)] += 1
            isup_out[i] += 1
            isup_in[j]  += 1

print("Transitions learned (i→j):", len(psup_seq))


Transitions learned (i→j): 105437


In [7]:
def build_neighbors_seq(psup_seq, isup_in, isup_out, K=None, lam=None, pop_discount=None):
    if K is None: K = K_NEIGHBORS
    if lam is None: lam = LAM_SEQ
    if pop_discount is None: pop_discount = POP_DISCOUNT

    nei = defaultdict(list)
    for (i, j), n_ij in psup_seq.items():
        p = n_ij / (isup_out[i] + lam)
        if pop_discount:
            p /= sqrt(max(1, isup_in[j]))
        nei[i].append((j, p))
    for i in list(nei):
        nei[i] = sorted(nei[i], key=lambda x: x[1], reverse=True)[:K]
    return nei

# cleaner call
neighbors = build_neighbors_seq(psup_seq, isup_in, isup_out)


In [8]:
# %%
from collections import defaultdict as ddict

def rec_seq(seed, k=TOPK, allow_repurchase=EVAL_ALLOW_REPURCHASE,
            thr=MIN_SCORE, use_candidates=True):
    S = set(seed)
    scores = ddict(float)
    for i in S:
        for j, w in neighbors.get(i, []):
            if (not allow_repurchase) and (j in S):
                continue
            if w < thr:
                continue
            if use_candidates and (j not in candidates):
                continue
            scores[j] += w
    ranked = [j for j, _ in sorted(scores.items(), key=lambda x: x[1], reverse=True)]
    return ranked[:k]


In [9]:
# %%
hits = tot = users = users_hit = 0

# Build a (user -> last basket) map from test_orders
test_last = dict(zip(test_orders["shopUserId"], test_orders["items"]))

for u, seed in last_seed_train.items():
    tgt = set(test_last.get(u, []))
    if not seed or not tgt:
        continue
    users += 1
    R = set(rec_seq(seed, k=TOPK))
    users_hit += int(len(tgt & R) > 0)
    hits += sum(1 for x in tgt if x in R)
    tot  += len(tgt)

print(
    "Hit@10:", round(hits / tot if tot else 0.0, 4),
    "| Users:", users,
    "| UserHitRate:", round(users_hit / users if users else 0.0, 4)
)

# Cold-start share among test labels
tot_items = sum(len(x) for x in test_orders["items"])
cold_items = sum(1 for xs in test_orders["items"] for x in xs if x not in seen_items)
print("Cold share (test):", round(cold_items / tot_items if tot_items else 0.0, 4))


Hit@10: 0.2482 | Users: 24632 | UserHitRate: 0.4046
Cold share (test): 0.0


In [10]:
# %%
# In production, recommend for the user's current most recent basket (last order)
last_order_all = (
    o.sort_values("t")
     .groupby("shopUserId", as_index=False)
     .tail(1)[["shopUserId","items"]]
)
seed_map_export = dict(zip(last_order_all["shopUserId"], last_order_all["items"]))

# No popularity backfill — only neighbor-based (similar) items
def rec_seq_export(seed, k=TOPK, allow_repurchase=EXPORT_ALLOW_REPURCHASE, thr=MIN_SCORE):
    S = set(seed)
    scores = ddict(float)
    for i in S:
        for j, w in neighbors.get(i, []):
            if w < thr:
                continue
            if (not allow_repurchase) and (j in S):
                continue
            scores[j] += w
    ranked = [j for j, _ in sorted(scores.items(), key=lambda x: x[1], reverse=True)]
    return ranked[:k]  # may be shorter than k if not enough similar items


In [11]:
# Generate export
rows = []
short_lists = 0
for u, seed in seed_map_export.items():
    recs = rec_seq_export(seed, k=TOPK)
    if len(recs) < TOPK:
        short_lists += 1
    rows.append({"shopUserId": u, "Recent Purchase": seed, "Recommended Items": recs})

df_recs = pd.DataFrame(rows, columns=["shopUserId","Recent Purchase","Recommended Items"])
df_recs.to_parquet(OUTPUT_PATH, engine="pyarrow", index=False)
print(f"Wrote {len(df_recs):,} users → {OUTPUT_PATH} | Users with <{TOPK} recs: {short_lists}")
df_recs.head()

Wrote 24,632 users → /workspace/data/processed/personalized_cf_recs.parquet | Users with <10 recs: 327


Unnamed: 0,shopUserId,Recent Purchase,Recommended Items
0,252934,"[261610, 291294]","[291612, 270054, 265546, 290150, 290278, 29207..."
1,218595,[265823],"[270247, 271010, 270103, 270573, 266072, 26493..."
2,253943,"[280053, 261193]","[261371, 280008, 290179, 261688, 261737, 21066..."
3,246488,[260513],"[265843, 265231, 290176, 260313, 260887, 26334..."
4,255340,[291839],"[280109, 290290, 352787, 291690, 262010, 26056..."
