# Personalized recommendation

After buying X, people next buy Y

In [318]:
# %% [markdown]
# penultimate→last item–item cf (next-basket prediction)

# %%
import pandas as pd
from collections import Counter, defaultdict
from math import sqrt

# paths
data_path = "/workspace/data/processed/order_items.parquet"
output_path = "/workspace/data/processed/personalized_cf_recs.parquet"

bad_group_ids = {"12025DK","12025FI","12025NO","12025SE","970300","459978"}


occ_support_thr = 15  # raw co-occurrence threshold (tune 5–15)


Orders & split (leave‑last‑order‑out)

In [304]:
import pandas as pd

tx = pd.read_parquet(data_path)

tx = tx.rename(columns={
    "customer_id": "shopUserId",
    "order_id": "orderId",
})

tx["created"] = pd.to_datetime(tx["created"], errors="coerce")
tx["groupId"] = tx["groupId"].astype(str)

tx = tx[~tx["groupId"].isin(bad_group_ids)]



orders = (
    tx.groupby("orderId", as_index=False)
      .agg(
          shopUserId=("shopUserId", "first"),
          items=("groupId", lambda s: sorted(set(s))),  # unique item groups per order
          t=("created", "max"),                          # order timestamp
      )
)


In [305]:
# --- Keep qualified users ---
qualified = (
    orders
    .assign(order_count=lambda d: d.groupby("shopUserId")["orderId"].transform("nunique"))
    .query("order_count >= 2")
    .drop(columns="order_count")
    .reset_index(drop=True)

)

In [306]:
pairs = (
    qualified.sort_values(["shopUserId","t"]) # sort by user, then by timestamp
    .groupby("shopUserId", as_index=False, sort=False).tail(2)
    .sort_values(["shopUserId","t"])
    .groupby("shopUserId", sort=False)
    .agg(
        source_orderId=("orderId",  lambda s: s.iloc[0]),
        source_items=("items",    lambda s: list(map(str, s.iloc[0]))),
        target_orderId=("orderId",   lambda s: s.iloc[1]),
        target_items=("items",    lambda s: list(map(str, s.iloc[1]))),
    )
    .reset_index()
)

In [307]:
pairs.head()

Unnamed: 0,shopUserId,source_orderId,source_items,target_orderId,target_items
0,100140,616637,[264549],864906,[264549]
1,100157,691070,[240279],767958,[210789]
2,100208,326906,[210707],378144,"[210686, 210695, 241562]"
3,100844,308029,"[250122, 260513]",582227,"[260513, 270544, 270696, 541419]"
4,100948,843741,"[200187, 260205, 261890, 266882, 270599]",872905,"[291054, 291088, 292045, 293647]"


In [308]:
import pandas as pd
from sklearn.model_selection import train_test_split
pairs_train, pairs_test = train_test_split(
    pairs, test_size=0.20, random_state=42, shuffle=True
)


In [309]:
edges = (
    pairs
        .explode("source_items").rename(columns={"source_items": "source_item"})
        .explode("target_items").rename(columns={"target_items": "target_item"})
        .query("source_item != target_item")
        .groupby(["source_item", "target_item"], as_index=False)
        .size()
        .rename(columns={"size": "count_ij"})
)

In [None]:
src_txns = (
    pairs.explode("source_items")["source_items"]
    .value_counts()
    .rename_axis("source_item")
    .reset_index(name="source_txns")
)

tgt_txns = (
    pairs_train.explode("target_items")["target_items"]
    .value_counts()
    .rename_axis("target_item")
    .reset_index(name="target_txns")
)

edges = (
    edges.merge(src_txns, on="source_item", how="left")
         .merge(tgt_txns, on="target_item", how="left")
)


In [311]:
edges.sort_values("count_ij", ascending=False)

Unnamed: 0,source_item,target_item,count_ij,source_txns,target_txns
4622,210338,210695,73,943,565.0
29523,240187,261637,65,818,712.0
4592,210338,210186,51,943,516.0
32349,241562,210338,49,678,596.0
60539,261637,240187,45,858,594.0
...,...,...,...,...,...
42960,260345,250122,1,13,80.0
42959,260345,242131,1,13,213.0
42958,260345,240276,1,13,363.0
42957,260345,239301,1,13,20.0


In [312]:
# Remove the most popular 5% of sources and targets (upper 0.95 quantile)

src_thr = edges["source_txns"].quantile(0.96)
tgt_thr = edges["target_txns"].quantile(0.96)

before = len(edges)
edges = edges[(edges["source_txns"] <= src_thr) & (edges["target_txns"] <= tgt_thr)].copy()
after = len(edges)

print(
    f"Applied popularity caps at 95th percentile:"
    f"\n  source_txns ≤ {src_thr:.0f}"
    f"\n  target_txns ≤ {tgt_thr:.0f}"
    f"\nRows: {before:,} → {after:,} (removed {before - after:,})"
)


Applied popularity caps at 95th percentile:
  source_txns ≤ 496
  target_txns ≤ 469
Rows: 105,437 → 97,498 (removed 7,939)


In [313]:
edges = edges[edges["count_ij"] >= occ_support_thr]

In [314]:
# Popularity-adjusted scoring using per-basket (presence) probabilities
# Assumes `edges` has: source_item, target_item, count_ij, source_txns, target_txns
# and `pairs_train` is your TRAIN prev→cur pairs dataframe.

import numpy as np

# 1) Baseline presence per basket for target j: P(j) = target_txns / (# of TRAIN pairs)
n_pairs = len(pairs_train)
edges["P_j"] = (edges["target_txns"] / n_pairs).astype(float)

# 2) Conditional presence: P(j | i) = count_ij / source_txns
edges["P_j_given_i"] = (edges["count_ij"] / edges["source_txns"].replace(0, np.nan)).astype(float)

# 3) Lift (popularity-adjusted); clip tiny baselines to avoid divide-by-zero
edges["lift"] = edges["P_j_given_i"] / edges["P_j"].clip(lower=1e-12)


In [320]:
edges = edges[edges["lift"] > 1.5]

In [321]:
edges.sort_values("lift", ascending=False)

Unnamed: 0,source_item,target_item,count_ij,source_txns,target_txns,P_j,P_j_given_i,lift
15071,210718,210785,18,162,101.0,0.005126,0.111111,21.677668
66230,261933,261968,16,200,96.0,0.004872,0.080000,16.420833
80973,270597,270599,15,132,138.0,0.007003,0.113636,16.226120
78964,270542,270599,15,156,138.0,0.007003,0.096154,13.729794
44569,260455,261998,21,355,88.0,0.004466,0.059155,13.245999
...,...,...,...,...,...,...,...,...
23421,218982,241562,16,345,469.0,0.023801,0.046377,1.948518
46922,260596,260513,16,496,364.0,0.018472,0.032258,1.746278
35254,242305,241562,16,388,469.0,0.023801,0.041237,1.732574
43514,260406,200400,15,428,406.0,0.020604,0.035047,1.700975
