when in the basket

In [1]:
import pandas as pd
from functools import reduce

# ---------- config ----------
# Single place to repoint the notebook at another data directory; every
# input/output path below is derived from it.
PROCESSED_DIR = "/workspace/data/processed"

# Wide-format recommendation tables, one per scoring source.
PATH_BASKET   = f"{PROCESSED_DIR}/basket_completion.parquet"
PATH_PAIR     = f"{PROCESSED_DIR}/pair_complements.parquet"
PATH_SEMANTIC = f"{PROCESSED_DIR}/semantic_similarity_recs.parquet"
# Destination for the merged (product_id, rec_id) table.
OUT_PATH      = f"{PROCESSED_DIR}/hybrid_pairs.parquet"

# Column-name prefixes of the wide layout: "Top 1", "Score 1", "Top 2", ...
TOP_PREFIX, SCORE_PREFIX = "Top ", "Score "

def _ids_to_str(ids):
    """Cast an id Series to strings, rendering whole-number floats without '.0'."""
    as_text = ids.astype(str)
    if pd.api.types.is_float_dtype(ids):
        # An integer id column that contains NaN is stored as float64, so 210734
        # would become "210734.0" and fail to join against "210734" elsewhere.
        as_text = as_text.str.replace(r"\.0$", "", regex=True)
    return as_text


def wide_to_long(path, score_col_name):
    """Reshape a wide recommendations parquet file into long format.

    The file is read with ``pd.read_parquet`` and is expected to contain a
    'Product ID' column plus paired '<TOP_PREFIX><i>' / '<SCORE_PREFIX><i>'
    columns. Ranks are discovered from the column names; a rank is used only
    when both its Top and Score columns are present.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file.
    score_col_name : str
        Name given to the score column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id``, ``rec_id`` and ``<score_col_name>``. Ids are
        strings, rows with a null recommendation are dropped, and when a
        (product_id, rec_id) pair occurs under several ranks only the row with
        the highest score is kept. The frame is empty (same three columns) when
        no usable rank columns or recommendations are found.
    """
    df = pd.read_parquet(path)
    # Ensure string ids so the later joins across sources compare like with like
    df["Product ID"] = _ids_to_str(df["Product ID"])

    # Detect available ranks dynamically (Top 1..Top N). Columns that merely
    # share the prefix (e.g. "Top Category") are ignored instead of crashing int().
    suffixes = (c[len(TOP_PREFIX):].strip() for c in df.columns if c.startswith(TOP_PREFIX))
    ranks = sorted({int(s) for s in suffixes if s.isdecimal()})

    long_parts = []
    for r in ranks:
        top_col = f"{TOP_PREFIX}{r}"
        score_col = f"{SCORE_PREFIX}{r}"
        if top_col not in df.columns or score_col not in df.columns:
            continue
        part = (
            df[["Product ID", top_col, score_col]]
            .rename(
                columns={
                    "Product ID": "product_id",
                    top_col: "rec_id",
                    score_col: score_col_name,
                }
            )
            # keep only non-null recs
            .dropna(subset=["rec_id"])
        )
        if part.empty:
            # Empty frames contribute no rows; passing them to pd.concat is what
            # pandas' empty/all-NA concatenation FutureWarning complains about.
            continue
        # cast ids to str for consistent joins
        long_parts.append(part.assign(rec_id=_ids_to_str(part["rec_id"])))

    if not long_parts:
        # Nothing to stack: return an empty frame with the expected columns
        return pd.DataFrame(columns=["product_id", "rec_id", score_col_name])

    long_df = pd.concat(long_parts, ignore_index=True)
    # If duplicates per (product_id, rec_id), keep the best (highest score);
    # sort_values places NaN scores last, so a real score always wins.
    long_df = long_df.sort_values(score_col_name, ascending=False).drop_duplicates(
        subset=["product_id", "rec_id"], keep="first"
    )
    return long_df

# Reshape every recommendation source into (product_id, rec_id, <score>) rows
basket_long = wide_to_long(PATH_BASKET, "score_basket")
pair_long = wide_to_long(PATH_PAIR, "score_pair")
semantic_long = wide_to_long(PATH_SEMANTIC, "score_semantic")


def _outer_join_on_pair(left, right):
    """Outer-join two long frames on the (product_id, rec_id) key."""
    return left.merge(right, how="outer", on=["product_id", "rec_id"])


# Fold the sources left to right; a pair missing from a source gets NaN in
# that source's score column.
dfs = [basket_long, pair_long, semantic_long]
hybrid = reduce(_outer_join_on_pair, dfs)

# Optional: sort for readability
#hybrid = hybrid.sort_values(["product_id", "rec_id"]).reset_index(drop=True)

# Write out (currently disabled)
#hybrid.to_parquet(OUT_PATH, index=False)
#print(hybrid.head())
#print(f"Saved: {OUT_PATH}  (rows={len(hybrid)})")


  long_df = pd.concat(long_parts, ignore_index=True)


In [6]:
# Preview the first 20 pairs that carry a pair-complement score
hybrid.dropna(subset=["score_pair"]).head(20)

Unnamed: 0,product_id,rec_id,score_basket,score_pair,score_semantic
425,210676,210734,0.050521,0.115578,
533,210726,242289,0.044211,0.123457,
597,210734,210676,0.052169,0.118252,0.757563
600,210734,210756,,0.105398,
684,210749,240280,0.11355,0.315789,
725,210752,240995,0.130778,0.279869,
752,210755,241091,0.040963,0.12012,
765,210758,210746,0.046128,0.115108,
913,210770,210773,0.047273,0.13913,0.767856
914,210770,210789,0.039414,0.115942,0.805429
