In [None]:
import pandas as pd
from functools import reduce

# Column-name prefixes of the wide recommendation tables ("Top 1", "Score 1", ...).
TOP_PREFIX, SCORE_PREFIX = "Top ", "Score "

def wide_to_long(path: str, score_col_name: str) -> pd.DataFrame:
    """
    Read a wide recommendation parquet and reshape it to long format.

    The file is expected to hold a 'Product ID' column plus any number of
    'Top i' / 'Score i' column pairs. Each pair contributes one row per
    input row to the result.

    Parameters
    ----------
    path : str
        Location of the parquet file, passed to ``pd.read_parquet``.
    score_col_name : str
        Name given to the score column in the returned frame.

    Returns
    -------
    pd.DataFrame
        Columns ``product_id`` (string), ``rec_id`` (string) and
        ``<score_col_name>``. Rows whose product or recommendation id is
        missing are dropped. If no complete 'Top i'/'Score i' pair exists,
        an empty frame with those three columns is returned.

    Raises
    ------
    KeyError
        If at least one 'Top i'/'Score i' pair exists but 'Product ID' does not.
    """
    df = pd.read_parquet(path, dtype_backend="numpy_nullable")

    # Parse the rank from the text after the prefix instead of split()[1]:
    # a column named exactly "Top " no longer raises IndexError, and names
    # such as "Top 1 extra" no longer produce a second rank 1. The set
    # guarantees each rank is processed once; isdecimal() only admits
    # strings that int() can parse.
    ranks = sorted({
        int(suffix)
        for suffix in (
            col[len(TOP_PREFIX):].strip()
            for col in df.columns
            if isinstance(col, str) and col.startswith(TOP_PREFIX)
        )
        if suffix.isdecimal()
    })

    key_cols = ["product_id", "rec_id"]
    long_parts: list[pd.DataFrame] = []
    for r in ranks:
        top_col = f"{TOP_PREFIX}{r}"
        score_col = f"{SCORE_PREFIX}{r}"
        # A rank only counts when both halves of the pair are present.
        if top_col not in df.columns or score_col not in df.columns:
            continue

        part = (
            df[["Product ID", top_col, score_col]]
            .rename(
                columns={
                    "Product ID": "product_id",
                    top_col: "rec_id",
                    score_col: score_col_name,
                }
            )
            .astype({"product_id": "string", "rec_id": "string"})
            .dropna(subset=key_cols)
        )
        long_parts.append(part)

    if not long_parts:
        # Keep the keys as strings but give the score a numeric nullable
        # dtype so the empty result is typed like a populated one.
        return pd.DataFrame(
            {
                "product_id": pd.Series(dtype="string"),
                "rec_id": pd.Series(dtype="string"),
                score_col_name: pd.Series(dtype="Float64"),
            }
        )

    return pd.concat(long_parts, ignore_index=True)

# Wide-format inputs (one file per recommendation source) and the output file.
PATH_BASKET   = "/workspace/data/processed/basket_completion.parquet"
PATH_PAIR     = "/workspace/data/processed/pair_complements.parquet"
PATH_SEMANTIC = "/workspace/data/processed/semantic_similarity_recs.parquet"
OUT_PATH      = "/workspace/data/processed/hybrid_pairs.parquet"

# Reshape each source to (product_id, rec_id, <score column>).
basket_long   = wide_to_long(PATH_BASKET,   "score_basket")
pair_long     = wide_to_long(PATH_PAIR,     "score_pair")
semantic_long = wide_to_long(PATH_SEMANTIC, "score_semantic")

# Outer-merge on the pair key so a pair found by any source is kept;
# sources that did not produce the pair leave their score missing.
dfs = [basket_long, pair_long, semantic_long]
hybrid = reduce(
    lambda left, right: pd.merge(left, right, on=["product_id", "rec_id"], how="outer"),
    dfs
)

# Defensive re-normalisation of the key columns after the merge.
hybrid["product_id"] = hybrid["product_id"].astype("string")
hybrid["rec_id"]     = hybrid["rec_id"].astype("string")
hybrid = hybrid.dropna(subset=["product_id", "rec_id"])


In [None]:
import pandas as pd

# Multiplier applied to each source's score in the weighted sum below.
weights = {"score_basket": 10, "score_pair": 1, "score_semantic": 0.1}

# Only score columns that actually exist in `hybrid` can contribute. The
# previous `hybrid.get(c, 0).fillna(0)` raised AttributeError for an absent
# column because the fallback 0 is an int without `.fillna`.
available = [c for c in weights if c in hybrid.columns]

for c in available:
    hybrid[c] = pd.to_numeric(hybrid[c], errors="coerce")

# Missing values → 0, then compute the weighted sum. The zero Series as the
# start value keeps the result a column even when no score column exists.
hybrid["hybrid_score"] = sum(
    (hybrid[c].fillna(0) * weights[c] for c in available),
    pd.Series(0.0, index=hybrid.index),
)

In [13]:
# Show the merged frame, one row per (product_id, rec_id) pair with its scores.
hybrid

Unnamed: 0,product_id,rec_id,score_basket,score_pair,score_semantic,hybrid_score
0,055522,210676,,,0.688272,0.068827
1,055522,210782,,,0.690731,0.069073
2,055522,260163,,,0.707638,0.070764
3,055522,260287,,,0.71139,0.071139
4,055522,260646,,,0.697282,0.069728
...,...,...,...,...,...,...
17490,790196,490530,,,0.730972,0.073097
17491,790196,490531,,,0.691934,0.069193
17492,790196,517185,,,0.649574,0.064957
17493,790196,530341,,,0.628614,0.062861


In [14]:
# Preview up to 20 rows that carry a pair score.
hybrid.dropna(subset=["score_pair"]).head(20)

Unnamed: 0,product_id,rec_id,score_basket,score_pair,score_semantic,hybrid_score
425,210676,210734,0.050521,0.115578,,0.620786
533,210726,242289,0.044211,0.123457,,0.56557
597,210734,210676,0.052169,0.118252,0.757563,0.715701
600,210734,210756,,0.105398,,0.105398
684,210749,240280,0.11355,0.315789,,1.451289
725,210752,240995,0.130778,0.279869,,1.587644
752,210755,241091,0.040963,0.12012,,0.529753
765,210758,210746,0.046128,0.115108,,0.576384
913,210770,210773,0.047273,0.13913,0.767856,0.688643
914,210770,210789,0.039414,0.115942,0.805429,0.590629


In [None]:
def make_topk_hybrid_parquet(df, k=10, out_path=OUT_PATH):
    """
    Write the top-k recommendations per product to a wide-format parquet file.

    For each ``product_id`` the rows are ordered by ``hybrid_score``
    (highest first) and the first ``k`` are kept. The output has one row per
    product with the columns 'Product ID', 'Top 1'..'Top k' (recommended ids
    as strings) and 'Score 1'..'Score k' (floats). Products with fewer than
    ``k`` recommendations get nulls in the remaining slots.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``product_id``, ``rec_id`` and
        ``hybrid_score``. Rows with a missing ``product_id`` are ignored.
    k : int, default 10
        Number of recommendations kept per product; must be at least 1.
    out_path : str
        Destination passed to ``DataFrame.to_parquet``.

    Returns
    -------
    pd.DataFrame
        The wide frame that was written.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k!r}")

    rank_range = list(range(1, k + 1))
    top_cols = [f"Top {i}" for i in rank_range]
    score_cols = [f"Score {i}" for i in rank_range]

    ranked = (
        df.dropna(subset=["product_id"])
        .sort_values(["product_id", "hybrid_score"], ascending=[True, False])
    )
    ranked = ranked.assign(
        # Cast before reshaping so ids are never routed through a float column.
        rec_id=ranked["rec_id"].astype("string"),
        # 1-based position of each row inside its product, in sorted order.
        _rank=ranked.groupby("product_id").cumcount() + 1,
    )
    ranked = ranked.loc[ranked["_rank"] <= k]

    if ranked.empty:
        # Nothing to reshape: still emit the full schema so the file has the
        # same columns as a populated result.
        topk_df = pd.DataFrame(
            {
                "Product ID": pd.Series(dtype="string"),
                **{c: pd.Series(dtype="string") for c in top_cols},
                **{c: pd.Series(dtype="float64") for c in score_cols},
            }
        )
    else:
        # One column per rank; reindex adds the ranks no product reached.
        recs = ranked.pivot(
            index="product_id", columns="_rank", values="rec_id"
        ).reindex(columns=rank_range)
        scores = ranked.pivot(
            index="product_id", columns="_rank", values="hybrid_score"
        ).reindex(columns=rank_range)
        recs.columns = top_cols
        scores.columns = score_cols

        # Fixed dtypes keep the parquet schema independent of the data.
        topk_df = (
            pd.concat([recs.astype("string"), scores.astype("float64")], axis=1)
            .rename_axis("Product ID")
            .reset_index()
        )

    topk_df.to_parquet(out_path, index=False)
    print(f"Saved to {out_path}")
    return topk_df