# Item–item kNN collaborative filtering using Jaccard similarity; temporal train/test split; thresholded scoring; evaluated by leave-one-out basket completion (hit@k, coverage).

In [12]:
import pandas as pd
from datetime import timedelta

# Input parquet file with the cleaned transaction data.
p="/workspace/data/processed/transactions_clean.parquet"
# Read only the three columns this notebook uses.
tx=pd.read_parquet(p,columns=["orderId","groupId","created"])


In [13]:
# groupId values that are removed before any basket is built.
# NOTE(review): the reason these ids are excluded is not visible here — confirm with the data owner.
bad={"12025DK","12025FI","12025NO","12025SE","970300","459978"}
# Compare on the stringified, whitespace-stripped groupId; the stored column values themselves are left unchanged.
tx=tx[~tx["groupId"].astype(str).str.strip().isin(bad)].reset_index(drop=True)


In [14]:
def mk_baskets(x):
    """Collapse transaction rows into one basket per order.

    Parameters
    ----------
    x : DataFrame with ``orderId``, ``groupId`` and ``created`` columns.
        The input frame is not modified.

    Returns
    -------
    DataFrame with one row per ``orderId`` and the columns
    ``orderId``, ``items`` (distinct ``groupId`` values in first-seen order)
    and ``t`` (latest ``created`` timestamp of the order; values that cannot
    be parsed as datetimes become NaT before the maximum is taken).
    """
    def _distinct_in_order(values):
        # dict keys keep insertion order, so this de-duplicates without sorting.
        return list(dict.fromkeys(values))

    stamped = x.assign(created=pd.to_datetime(x["created"], errors="coerce"))
    per_order = stamped.groupby("orderId", as_index=False)
    return per_order.agg(items=("groupId", _distinct_in_order), t=("created", "max"))
# One row per order: distinct items plus the order's latest timestamp.
b=mk_baskets(tx)


In [15]:
from datetime import timedelta

def split_by_time(df, weeks=6):
    """Split baskets into a training part and a recent evaluation part.

    Parameters
    ----------
    df : DataFrame with a timestamp column ``t`` and a list-like column ``items``.
    weeks : length of the evaluation window, counted back from the latest ``t``.

    Returns
    -------
    (train_df, test_eval_df, last_time, split_time)
        ``train_df`` holds rows with ``t`` strictly before ``split_time``.
        ``test_eval_df`` holds rows with ``t`` at or after ``split_time`` that
        contain at least two distinct items. Both have a fresh 0..n-1 index.
        ``last_time`` is the latest ``t``; ``split_time`` is the cut-off.
    """
    timestamps = df["t"]
    last_time = timestamps.max()
    split_time = last_time - timedelta(days=weeks * 7)

    def _has_two_distinct(items):
        return len(set(items)) >= 2

    # Both comparisons are False for NaT, so rows without a timestamp land in neither part.
    before_cut = timestamps < split_time
    in_window = timestamps >= split_time
    multi_item = df["items"].apply(_has_two_distinct)

    train_df = df.loc[before_cut].reset_index(drop=True)
    test_eval_df = df.loc[in_window & multi_item].reset_index(drop=True)
    return train_df, test_eval_df, last_time, split_time

# Second name for the basket table; later cells use `baskets_df`.
baskets_df = b
# Hold out the last 6 weeks for evaluation; eval_time is the latest basket timestamp, split_time the cut-off.
train_df, test_df, eval_time, split_time = split_by_time(baskets_df, weeks=6)

In [16]:
from collections import Counter
from itertools import combinations

def compute_supports(df):
    """Count in how many baskets each item and each unordered item pair occurs.

    Parameters
    ----------
    df : mapping/DataFrame whose ``"items"`` entry iterates over baskets
        (each basket an iterable of item ids).

    Returns
    -------
    (item_counts, pair_counts) : two ``Counter`` objects. Pair keys are
    2-tuples with the two item ids in sorted order. An item repeated inside
    a basket is counted once for that basket.
    """
    singles = Counter()
    pairs = Counter()
    for raw_basket in df["items"]:
        distinct = tuple(dict.fromkeys(raw_basket))
        for item in distinct:
            singles[item] += 1
        for combo in combinations(distinct, 2):
            # Sorting makes (a, b) and (b, a) the same key.
            pairs[tuple(sorted(combo))] += 1
    return singles, pairs

# Supports are computed on the training baskets only.
item_counts, pair_counts = compute_supports(train_df)

In [17]:
from collections import defaultdict

# Default pruning settings for the neighbour graph.
min_item_support = 10   # an item needs at least this many baskets to take part
min_pair_count = 5      # a pair needs at least this many co-occurrences
max_neighbors = 100     # neighbour list length kept per item


def build_neighbors(item_counts, pair_counts, *, item_support=None, pair_support=None, top_k=None):
    """Build per-item neighbour lists ranked by Jaccard similarity.

    Parameters
    ----------
    item_counts : mapping item -> number of baskets containing it.
    pair_counts : mapping (item_i, item_j) -> number of baskets containing both.
    item_support : optional override for the module-level ``min_item_support``.
    pair_support : optional override for the module-level ``min_pair_count``.
    top_k : optional override for the module-level ``max_neighbors``.

    Returns
    -------
    defaultdict(list) mapping each item to a list of ``(neighbor, jaccard)``
    tuples, sorted by descending similarity and cut to ``top_k`` entries.
    Every surviving pair is entered for both of its items.
    """
    # Resolve defaults at call time so re-assigning the module-level
    # constants keeps working exactly as before.
    item_support = min_item_support if item_support is None else item_support
    pair_support = min_pair_count if pair_support is None else pair_support
    top_k = max_neighbors if top_k is None else top_k

    neighbors = defaultdict(list)
    for (i, j), co_count in pair_counts.items():
        if co_count < pair_support:
            continue
        ni, nj = item_counts[i], item_counts[j]
        if ni < item_support or nj < item_support:
            continue
        # |A ∪ B| = |A| + |B| - |A ∩ B|
        union = ni + nj - co_count
        if union <= 0:
            # Only possible when the two count tables disagree; skip rather
            # than divide by zero or emit a negative similarity.
            continue
        jaccard = co_count / union
        neighbors[i].append((j, jaccard))
        neighbors[j].append((i, jaccard))
    for item in list(neighbors):
        neighbors[item] = sorted(neighbors[item], key=lambda x: x[1], reverse=True)[:top_k]
    return neighbors

# Neighbour graph built from the training-split supports; used for evaluation.
neighbors = build_neighbors(item_counts, pair_counts)


Tune the similarity cut-off threshold based on the trade-off between coverage and the drop in hit rate.

In [18]:
from collections import defaultdict

# Neighbour links with a similarity below this value are ignored when scoring.
default_similarity_threshold = 0.02


def recommend_items(neighbors, basket_items, topk=10, candidate_set=None, threshold=default_similarity_threshold):
    """Recommend items that complete a basket.

    Each basket item contributes the similarity of its neighbours; a
    candidate's score is the sum of those contributions.

    Parameters
    ----------
    neighbors : mapping item -> list of ``(neighbor, similarity)`` tuples.
        Basket items are looked up by their ``str()`` form, so the keys are
        expected to be strings.
    basket_items : iterable of item ids currently in the basket.
    topk : maximum number of recommendations returned.
    candidate_set : optional container; when given, only neighbours found in
        it may be recommended.
    threshold : neighbour links with a similarity below this are skipped.

    Returns
    -------
    list of at most ``topk`` item ids, highest score first. Items with equal
    scores keep the order in which they were first reached, which follows
    the order of ``basket_items`` and of each neighbour list.
    """
    # De-duplicate while keeping basket order. A plain set of strings is
    # iterated in hash order, which differs between interpreter runs, and
    # that made tie-breaking (and therefore the top-k) irreproducible.
    seed = dict.fromkeys(map(str, basket_items))
    scores = defaultdict(float)
    for i in seed:
        for j, w in neighbors.get(i, []):
            if j in seed:
                continue
            if candidate_set is not None and j not in candidate_set:
                continue
            if w < threshold:
                continue
            scores[j] += w
    # sorted() is stable, so equal scores stay in first-reached order.
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [item for item, _ in ranked[:topk]]

def evaluate_hit_coverage(test_df, neighbors, topk=10, candidate_set=None, threshold=default_similarity_threshold):
    """Score the recommender by leave-one-out basket completion.

    For every test basket with at least two distinct items, each item is
    held out in turn and the remaining items are used as the seed; a hit is
    counted when the held-out item (as a string) appears in the
    recommendations. A basket counts as covered when the full basket yields
    at least one recommendation.

    Parameters
    ----------
    test_df : mapping/DataFrame whose ``"items"`` entry iterates over baskets.
    neighbors, topk, candidate_set, threshold : passed through to
        ``recommend_items``.

    Returns
    -------
    (hit_rate, coverage) : hits per hold-out trial and covered baskets per
    evaluated basket; each is 0.0 when its denominator is zero.
    """
    n_hits = 0
    n_holdouts = 0
    n_covered = 0
    n_baskets = 0

    for raw_items in test_df["items"]:
        unique_items = list(dict.fromkeys(raw_items))
        if len(unique_items) < 2:
            # Nothing left to seed with once an item is held out.
            continue

        n_baskets += 1
        if recommend_items(neighbors, unique_items, topk, candidate_set, threshold):
            n_covered += 1

        for held_out in unique_items:
            remaining = [other for other in unique_items if other != held_out]
            suggestions = recommend_items(neighbors, remaining, topk, candidate_set, threshold)
            if str(held_out) in suggestions:
                n_hits += 1
            n_holdouts += 1

    hit_rate = n_hits / n_holdouts if n_holdouts else 0.0
    coverage = n_covered / n_baskets if n_baskets else 0.0
    return hit_rate, coverage



In [19]:
# Article table; only its groupId column is used in this notebook.
# NOTE(review): presumably the catalogue of articles eligible for recommendation — confirm.
art = pd.read_parquet("/workspace/data/processed/articles_for_recs.parquet")
# Candidates: items found in at least `min_item_support` training baskets that also appear in art["groupId"].
candidate_set = {str(i) for i, cnt in item_counts.items() if cnt >= min_item_support} & set(art["groupId"].astype(str))
# Leave-one-out hit rate and basket coverage at the default top-k and similarity threshold.
hit, coverage = evaluate_hit_coverage(test_df, neighbors, candidate_set=candidate_set)
print(hit, coverage)

0.16558569959661473 0.9755901287553648


In [20]:
# Reference run with threshold=0.0, so no neighbour link with a non-negative similarity is cut;
# compare against the thresholded hit rate to see what the cut-off costs. Coverage is discarded here.
baseline_hit, _ = evaluate_hit_coverage(test_df, neighbors, candidate_set=candidate_set, threshold=0.0)
print(baseline_hit)

0.2053705607846239


In [21]:
# Recompute supports and neighbours on all baskets (baskets_df, not just train_df);
# these feed the exported table, not the evaluation.
item_counts_all, pair_counts_all = compute_supports(baskets_df)
neighbors_all = build_neighbors(item_counts_all, pair_counts_all)


In [22]:
def neighbors_to_df(neighbors):
    """Flatten a neighbour mapping into a long-format DataFrame.

    Parameters
    ----------
    neighbors : mapping item -> iterable of ``(neighbor, score)`` pairs.

    Returns
    -------
    DataFrame with columns ``item_id`` (str), ``neighbor_id`` (str) and
    ``score`` (float), one row per (item, neighbour) link, in mapping order.
    """
    records = [
        (str(item), str(other), float(sim))
        for item, ranked in neighbors.items()
        for other, sim in ranked
    ]
    return pd.DataFrame(records, columns=["item_id", "neighbor_id", "score"])

# Long-format (item_id, neighbor_id, score) table from the all-data neighbour graph.
neighbors_df = neighbors_to_df(neighbors_all)

# Keep a link only when both of its ends appear in art["groupId"].
valid_ids = set(art["groupId"].astype(str).unique())
both_valid = neighbors_df["neighbor_id"].isin(valid_ids) & neighbors_df["item_id"].isin(valid_ids)

similarity_threshold = 0.02  # minimum similarity for an exported link
top_n = 10                   # number of "Top k" columns in the export

# One combined filter followed by an explicit copy: adding the "rank" column
# to a frame produced by chained boolean indexing triggers
# SettingWithCopyWarning, because pandas cannot tell whether it is a view.
neighbors_df = neighbors_df[both_valid & (neighbors_df["score"] >= similarity_threshold)].copy()

# Rank neighbours within each item by descending score; method="first"
# breaks ties by row order so that ranks are unique per item.
neighbors_df["rank"] = neighbors_df.groupby("item_id")["score"].rank(method="first", ascending=False)

# Wide layout: one row per item, one column per rank.
neighbors_df = (
    neighbors_df[neighbors_df["rank"] <= top_n]
    .assign(rank=lambda x: x["rank"].astype(int))
    .sort_values(["item_id", "rank"])
    .pivot(index="item_id", columns="rank", values="neighbor_id")
)
# reindex guarantees all top_n columns exist even when no item has that many neighbours.
neighbors_df = neighbors_df.rename(columns=lambda r: f"Top {r}").reindex(columns=[f"Top {i}" for i in range(1, top_n + 1)])
neighbors_df.index.name = "Product ID"
# Missing ranks are written as an em dash placeholder.
neighbors_df = neighbors_df.reset_index().fillna("—")
neighbors_df.to_parquet("/workspace/data/processed/basket_completion.parquet", index=False)