# Item–item kNN collaborative filtering using Jaccard similarity; temporal train/test split; similarity-thresholded scoring; evaluated by leave-one-out basket completion (hit@k, coverage).

In [10]:
import pandas as pd
from datetime import timedelta

# Load the cleaned transactions, reading only the three columns used below
# (order key, item key, timestamp).
p="/workspace/data/processed/transactions_clean.parquet"
tx=pd.read_parquet(p,columns=["orderId","groupId","created"])


In [11]:
# Group IDs to exclude from the analysis.
# NOTE(review): the reason these IDs are excluded is not recorded here — TODO document.
bad={"12025DK","12025FI","12025NO","12025SE","970300","459978"}
# Membership is tested on the stripped string form of groupId; the column
# itself is left as loaded (it is not replaced by the normalised value).
tx=tx[~tx["groupId"].astype(str).str.strip().isin(bad)].reset_index(drop=True)


In [12]:
def mk_baskets(x):
    """Collapse transaction rows into one basket row per order.

    Parameters
    ----------
    x : DataFrame with columns "orderId", "groupId" and "created".
        The input frame is not modified.

    Returns
    -------
    DataFrame with columns "orderId", "items" (the order's group IDs with
    duplicates removed, first-seen order kept) and "t" (the latest
    "created" timestamp of the order; unparseable timestamps become NaT).
    """
    def _ordered_unique(values):
        # dict keys keep insertion order, so this de-duplicates stably.
        return list(dict.fromkeys(values))

    parsed = pd.to_datetime(x["created"], errors="coerce")
    stamped = x.assign(created=parsed)  # assign() returns a new frame
    per_order = stamped.groupby("orderId", as_index=False)
    return per_order.agg(items=("groupId", _ordered_unique), t=("created", "max"))
# One row per order: de-duplicated item list plus the order's latest timestamp.
b=mk_baskets(tx)


In [13]:
def split(x,w=6):
    """Split baskets by time into a training set and a held-out test set.

    Parameters
    ----------
    x : DataFrame with a timestamp column "t" and a list column "items".
    w : length of the test window in weeks, counted back from the latest
        timestamp in ``x``.

    Returns
    -------
    (tr, te_o, te, t) where
      tr   -- baskets strictly before the cutoff,
      te_o -- baskets at or after the cutoff that hold at least two
              distinct items,
      te   -- the latest timestamp in ``x``,
      t    -- the cutoff timestamp (``te`` minus ``w`` weeks).
    """
    te = x["t"].max()
    t = te - timedelta(weeks=w)

    stamps = x["t"]
    before_cutoff = stamps < t
    in_window = stamps >= t
    # Single-item baskets cannot be used for leave-one-out evaluation.
    has_pair = x["items"].apply(lambda s: len(set(s)) >= 2)

    tr = x[before_cutoff].reset_index(drop=True)
    te_o = x[in_window & has_pair].reset_index(drop=True)
    return tr, te_o, te, t
tr,teo,te,ts=split(b,6) # tr: training baskets; teo: test baskets (>=2 distinct items) in the last 6 weeks; te: latest basket timestamp; ts: split cutoff timestamp


In [14]:
from collections import Counter
from itertools import combinations

def supports(x):
    """Count basket-level support for single items and for item pairs.

    Parameters
    ----------
    x : mapping/DataFrame whose "items" entry is an iterable of baskets.

    Returns
    -------
    (item_counts, pair_counts) : two Counters. ``item_counts[i]`` is the
    number of baskets containing ``i``; ``pair_counts[(i, j)]`` (key stored
    with the smaller ID first) is the number of baskets containing both.
    Duplicates inside a basket are counted once.
    """
    item_counts = Counter()
    pair_counts = Counter()
    for basket in x["items"]:
        distinct = list(dict.fromkeys(basket))
        for item in distinct:
            item_counts[item] += 1
        for left, right in combinations(distinct, 2):
            # Canonical key: smaller ID first, so (i, j) and (j, i) coincide.
            key = (right, left) if right < left else (left, right)
            pair_counts[key] += 1
    return item_counts, pair_counts
# Item and pair basket counts computed from the training baskets only.
a,p2=supports(tr)


In [15]:
from collections import defaultdict

# Tunables: mi = minimum number of baskets an item must appear in,
# mp = minimum number of baskets a pair must co-occur in,
# k = maximum number of neighbours kept per item.
mi,mp,k=10,5,100
def nbrs(a,p2,min_item=None,min_pair=None,top=None):
    """Build per-item neighbour lists ranked by Jaccard similarity.

    Parameters
    ----------
    a : mapping item -> number of baskets containing the item.
    p2 : mapping (item_i, item_j) -> number of baskets containing both.
    min_item : minimum item support; defaults to the module-level ``mi``.
    min_pair : minimum pair co-occurrence; defaults to the module-level ``mp``.
    top : neighbours kept per item; defaults to the module-level ``k``.

    Returns
    -------
    defaultdict(list) mapping each item to a list of ``(neighbour, score)``
    tuples sorted by descending score and truncated to ``top`` entries.
    Equal scores keep the order in which the pairs were read from ``p2``.
    """
    # Fall back to the notebook-level settings at call time, so changing
    # mi/mp/k and re-calling nbrs(a, p2) behaves as before.
    min_item = mi if min_item is None else min_item
    min_pair = mp if min_pair is None else min_pair
    top = k if top is None else top

    d=defaultdict(list)
    for (i,j),n in p2.items():
        if n<min_pair: continue
        ni,nj=a[i],a[j]
        if ni<min_item or nj<min_item: continue
        s=n/(ni+nj-n) # jaccard: |A∩B| / |A∪B| on basket sets
        d[i].append((j,s)); d[j].append((i,s))
    for i,lst in d.items():
        # Only values are replaced, so iterating the dict here is safe.
        d[i]=sorted(lst,key=lambda x:x[1],reverse=True)[:top]
    return d
# Neighbour lists built from the training-period counts.
nb=nbrs(a,p2)


Tune the similarity cut-off threshold based on the trade-off between coverage and the drop in hit rate.

In [16]:
from collections import defaultdict

t=0.02  # default similarity cut-off: neighbours scoring below this are ignored
def rec(nb,seed,topk=10,cand=None,thr=t):
    """Recommend items for a seed basket by summing neighbour similarities.

    Parameters
    ----------
    nb : mapping item -> list of ``(neighbour, score)`` tuples.
    seed : iterable of item IDs already in the basket; each ID is converted
        with ``str`` before lookup, and duplicates are ignored.
    topk : maximum number of recommendations returned.
    cand : optional container of allowed neighbour IDs; ``None`` allows all.
    thr : neighbours with a score below this value do not contribute.

    Returns
    -------
    List of at most ``topk`` neighbour IDs ordered by descending summed
    score. Equal scores keep the order in which candidates were first
    reached, walking the seeds in the order given.

    Notes
    -----
    Neighbour IDs are compared as stored in ``nb``, so ``nb`` and ``cand``
    are presumably keyed by string IDs — verify against the caller.
    """
    # Ordered de-duplication instead of a set: iterating a set of strings
    # follows per-process hash randomisation, which made tie-breaking and
    # float summation order differ from run to run.
    seeds=dict.fromkeys(map(str,seed))
    sc=defaultdict(float)
    for i in seeds:
        for j,w in nb.get(i,()):
            if j in seeds: continue  # never recommend what is already in the basket
            if cand is not None and j not in cand: continue
            if w<thr: continue
            sc[j]+=w
    # sorted() is stable, so ties stay in first-reached order.
    ranked=sorted(sc.items(),key=lambda x:x[1],reverse=True)
    return [item for item,_ in ranked[:topk]]

def hit_cov(teo,nb,topk=10,cand=None,thr=t):
    """Evaluate recommendations by leave-one-out basket completion.

    For every test basket with at least two distinct items, each item is
    held out in turn, recommendations are generated from the remaining
    items, and a hit is recorded when the held-out item (as a string) is
    among them.

    Parameters
    ----------
    teo : mapping/DataFrame whose "items" entry iterates over test baskets.
    nb, topk, cand, thr : passed through unchanged to ``rec``.

    Returns
    -------
    (hit_rate, coverage): hit_rate is hits divided by held-out trials;
    coverage is the share of evaluated baskets for which the full basket
    yields at least one recommendation. Both are 0.0 when nothing was
    evaluated.
    """
    hits = 0
    trials = 0
    covered = 0
    baskets = 0
    for raw in teo["items"]:
        basket = list(dict.fromkeys(raw))
        if len(basket) < 2:
            continue
        # Coverage uses the whole basket as the seed.
        if rec(nb, basket, topk, cand, thr):
            covered += 1
        baskets += 1
        for held_out in basket:
            rest = [y for y in basket if y != held_out]
            suggestions = rec(nb, rest, topk, cand, thr)
            if str(held_out) in suggestions:
                hits += 1
            trials += 1
    hit_rate = hits / trials if trials else 0.0
    coverage = covered / baskets if baskets else 0.0
    return (hit_rate, coverage)


In [18]:
# Article table; its groupId column defines which items may be recommended.
art=pd.read_parquet("/workspace/data/processed/articles_for_recs.parquet")
# Candidate pool: items seen in at least `mi` training baskets that also
# appear in the article table (both sides compared as strings).
cand={str(i) for i,cnt in a.items() if cnt>=mi}&set(art["groupId"].astype(str))
# Hit rate and coverage on the held-out baskets with the default cut-off.
hit,cov=hit_cov(teo,nb,cand=cand)
print(hit,cov)


0.16923382294561334 0.9718614718614719


In [19]:
# Same evaluation with the similarity cut-off disabled (thr=0.0); only the hit rate is kept.
h0,_=hit_cov(teo,nb,cand=cand,thr=0.0); print(h0)


0.20857483128225487


In [20]:
# Recompute counts and neighbour lists on every basket (not just the
# training split) for the table that is exported below.
a_all,p2_all=supports(b)
nb_all=nbrs(a_all,p2_all)

def nb_df(nb):
    """Flatten neighbour lists into a long-format DataFrame.

    Parameters
    ----------
    nb : mapping item -> list of ``(neighbour, score)`` tuples.

    Returns
    -------
    DataFrame with one row per (item, neighbour) pair and columns
    "item_id" (str), "neighbor_id" (str) and "score" (float).
    """
    rows = [
        (str(item), str(other), float(score))
        for item, pairs in nb.items()
        for other, score in pairs
    ]
    return pd.DataFrame(rows, columns=["item_id", "neighbor_id", "score"])

# Long table of (item, neighbour, score) from the all-data neighbour lists.
df=nb_df(nb_all)
# Only items present in the article table may appear, as product or as neighbour.
v=set(art["groupId"].astype(str).unique())
thr=0.02; top=10  # similarity cut-off and number of "Top N" columns in the export
# Apply all row filters in one pass and take an explicit copy, so the "rank"
# column below is added to an independent frame rather than to a slice of
# the unfiltered one (avoids pandas' chained-assignment warning).
df=df[df["neighbor_id"].isin(v)&df["item_id"].isin(v)&(df["score"]>=thr)].copy()
# Rank neighbours per item by descending score; method="first" breaks ties
# by row order so every rank is unique, which pivot() requires.
df["rank"]=df.groupby("item_id")["score"].rank(method="first",ascending=False)
df=(df[df["rank"]<=top]
      .assign(rank=lambda x: x["rank"].astype(int))
      .sort_values(["item_id","rank"])
      .pivot(index="item_id",columns="rank",values="neighbor_id"))
# Wide layout: one row per product, columns "Top 1" .. "Top <top>"; reindex
# guarantees every column exists even if no item has that many neighbours.
df=df.rename(columns=lambda r: f"Top {r}").reindex(columns=[f"Top {i}" for i in range(1,top+1)])
df.index.name="Product ID"
# Missing neighbours are shown as an em dash in the exported table.
df=df.reset_index().fillna("—")
df.to_parquet("/workspace/data/processed/basket_completion.parquet",index=False)