In [1]:
import pandas as pd

def load_filtered_transactions(
    trans_path="/workspace/data/processed/transactions_clean.parquet",
    avail_path="/workspace/data/processed/articles_for_recs.parquet",
    bad_ids={"12025DK", "12025FI", "12025NO", "12025SE", "970300", "459978"},
    cols=("shopUserId", "orderId", "groupId", "category", "brand", "audience"),
):
    """
    Load transactions and keep only the rows whose groupId may be recommended.

    A row is kept when its groupId, compared as a whitespace-stripped string,
    is present in the availability file and is not listed in ``bad_ids``.

    Parameters
    ----------
    trans_path : path of the transactions parquet file.
    avail_path : path of the availability parquet file; it must contain a
        ``groupId`` column (the only column read from it).
    bad_ids : collection of groupId strings to exclude. It is only read,
        never modified.
    cols : columns to load from the transactions file; must include
        ``groupId``.

    Returns
    -------
    pd.DataFrame
        The filtered transactions with a fresh 0..n-1 index. The ``groupId``
        column keeps its original values; the string normalisation is used
        for the comparison only.

    The result is only returned. No notebook-level variable is written, so
    the caller has to bind the return value itself.
    """
    transactions = pd.read_parquet(trans_path, columns=list(cols))
    # Only the id column of the availability file takes part in the filter.
    available = pd.read_parquet(avail_path, columns=["groupId"])

    # Compare ids as stripped strings so dtype and padding differences between
    # the two files do not cause false mismatches.
    gid = transactions["groupId"].astype(str).str.strip()
    avail_ids = set(available["groupId"].astype(str).str.strip())

    keep = gid.isin(avail_ids) & ~gid.isin(bad_ids)
    return transactions.loc[keep].reset_index(drop=True)

# Load the filtered transactions using the default paths, exclusion list and columns.
df = load_filtered_transactions()

In [2]:
import pandas as pd

def aggregate_by_groupid(
    df: pd.DataFrame,
    item_col: str = "groupId",
    brand_col: str = "brand",
    category_col: str = "category",
    audience_col: str = "audience",
) -> pd.DataFrame:
    """
    Count transaction rows per (item, brand, category, audience) combination.

    The four key columns are kept under their original names and the row
    count is stored in a new ``transactions`` column, i.e. with the default
    arguments the result has the columns
    [groupId, brand, category, audience, transactions].

    Note: with pandas' default groupby settings, rows that have a missing
    value in any of the key columns are not counted.
    """
    group_keys = [item_col, brand_col, category_col, audience_col]
    counts = df.groupby(group_keys).size()
    # Turn the key levels back into ordinary columns next to the counts.
    return counts.reset_index(name="transactions")

# One row per (groupId, brand, category, audience) combination with its transaction count.
pairs = aggregate_by_groupid(df)

In [3]:
# Display the aggregated combinations ordered from most to fewest transactions.
pairs.sort_values(by="transactions", ascending=False)

Unnamed: 0,groupId,brand,category,audience,transactions
454,261637,Locköstrumpan,"Stödstrumpor,Strumpor,Underkläder",dam,6246
278,260695,Louise,"Bh-toppar,Bh,Bh utan kupstorlek,Underkläder",dam,5725
121,240187,Åshild,"Mjukisbyxor,Mysplagg,Byxor",dam,4570
30,210338,Åshild,"Toppar,Överdelar,T-shirts",dam,4069
35,210695,Åshild,"Toppar,Överdelar",dam,3962
...,...,...,...,...,...
1154,432036,Cobble Hill,"Hobbyhörnan,Pussel,Pussel 2000 bitar",generic,1
1156,432043,Cobble Hill,"Hobbyhörnan,Pussel,Pussel 1000 bitar",generic,1
1159,432050,Cobble Hill,"Hobbyhörnan,Pussel,Pussel 1000 bitar",generic,1
1160,432051,Cobble Hill,"Hobbyhörnan,Pussel,Pussel 1000 bitar",generic,1


In [4]:
# Output all unique values of audience present in the pairs DataFrame
#unique_audiences = sorted(pairs["audience"].unique())
#unique_audiences


In [5]:
# Find all fully unique category values present in the pairs DataFrame (no splitting)
#unique_categories = sorted(pairs['category'].dropna().unique())
#unique_categories


In [6]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

# Category tokens that categories_match() ignores when looking for a shared category.
GENDER_TOKENS = {"dam", "herr"} 

def cat_to_set(s):
    """
    Convert a comma-separated category string into a set of normalised tokens.

    Each token is stripped and lower-cased; empty tokens and the placeholder
    "unknown" are dropped. A missing value yields an empty set. Only commas
    split tokens, so a value such as "Mössor & hattar" stays one token.
    """
    if pd.isna(s):
        return set()

    tokens = set()
    for piece in str(s).split(","):
        token = piece.strip().lower()
        if token and token != "unknown":
            tokens.add(token)
    return tokens

def preprocess(pairs: pd.DataFrame):
    """
    Normalise brand/audience, drop unknown brands and tokenise categories.

    Works on a copy, so ``pairs`` itself is left untouched. Adds
    ``brand_norm`` and ``aud_norm`` (stripped, lower-cased string versions of
    ``brand`` and ``audience``), removes the rows whose normalised brand is
    "unknown", and adds ``cat_set`` (the result of ``cat_to_set`` on
    ``category``).

    Returns a tuple ``(prepared_frame, skipped)`` where ``skipped`` is the
    number of rows removed because of an unknown brand.
    """
    normalized = pairs.assign(
        brand_norm=pairs["brand"].astype(str).str.strip().str.lower(),
        aud_norm=pairs["audience"].astype(str).str.strip().str.lower(),
    )

    unknown_brand = normalized["brand_norm"].eq("unknown")
    kept = normalized.loc[~unknown_brand].copy()
    kept["cat_set"] = kept["category"].map(cat_to_set)

    return kept, int(unknown_brand.sum())

def categories_match(a: set[str], b: set[str]) -> bool:
    """
    Decide whether category set ``b`` is an acceptable match for ``a``.

    Two conditions must both hold:
      - ``a`` and ``b`` share at least one token that is not a gender token;
      - every gender marker ('dam', 'herr') present in ``a`` is also in ``b``.

    The gender rule is one-directional: when ``a`` carries no gender marker,
    ``b`` may carry any (or none).
    """
    # A gender token alone never counts as common ground.
    shared_core = (a - GENDER_TOKENS) & (b - GENDER_TOKENS)
    if not shared_core:
        return False

    # Gender markers of A must be mirrored in B; B's extra markers are ignored.
    return all(marker in b for marker in ("dam", "herr") if marker in a)

def build_recs(filtered_pairs: pd.DataFrame, min_recs=4, max_recs=10, bestseller_quantile=0.96):
    """
    Build same-brand recommendations for every product.

    For each row, the candidates are the other rows of the same ``brand_norm``
    that
      - belong to a different groupId,
      - are not bestsellers of that brand (see ``bestseller_quantile``),
      - have exactly the same ``aud_norm``, and
      - satisfy ``categories_match(row_categories, candidate_categories)``.
    Candidates are ranked by descending ``transactions``; the first
    ``max_recs`` distinct groupIds become "Top 1" .. "Top N".

    Parameters
    ----------
    filtered_pairs : frame with the columns ``groupId``, ``transactions``,
        ``brand_norm``, ``aud_norm`` and ``cat_set``. It is not modified.
    min_recs : minimum number of recommendations a product needs in order to
        be exported.
    max_recs : maximum number of recommendations per product.
    bestseller_quantile : per-brand quantile of ``transactions`` at or above
        which an item is treated as a bestseller and never recommended.
        The default 0.96 reproduces the previous hard-coded value.

    Returns
    -------
    (pd.DataFrame, int)
        The export frame ("Product ID" followed by the "Top i" columns that
        occur) and the number of rows that were skipped because they had no
        category tokens or fewer than ``min_recs`` recommendations.

    Notes
    -----
    - Non-numeric ``transactions`` values are counted as 0.
    - The bestseller test is ``>=``: every item tied with the cutoff is
      excluded, so a brand whose items all have the same count yields no
      candidates at all.
    - When a groupId occurs on several rows, the recommendations of the row
      processed last replace the earlier ones.
    """
    recs, insufficient = {}, 0
    fp = filtered_pairs.copy()
    fp["transactions"] = pd.to_numeric(fp["transactions"], errors="coerce").fillna(0)

    for _, g in fp.groupby("brand_norm", sort=False):
        # Most popular first, so head(max_recs) below picks the top sellers.
        g = g.sort_values("transactions", ascending=False)

        # Loop-invariant per brand: computed once instead of once per row.
        gid_str = g["groupId"].astype(str)
        cutoff = g["transactions"].quantile(bestseller_quantile)
        not_bestseller = ~(g["transactions"] >= cutoff)

        for idx, gid, my_cats, my_aud in zip(g.index, gid_str, g["cat_set"], g["aud_norm"]):
            if not my_cats:
                insufficient += 1
                continue

            cat_ok = pd.Series(
                [categories_match(my_cats, other) for other in g["cat_set"]],
                index=g.index,
                dtype=bool,
            )
            mask = (
                (gid_str != gid)
                & (g.index != idx)
                & not_bestseller                    # exclude items at/above the quantile cutoff
                & (g["aud_norm"] == my_aud)         # exact audience match
                & cat_ok
            )

            # A groupId can sit on several rows of the brand; list it only once.
            tops = gid_str.loc[mask].drop_duplicates().head(max_recs).tolist()
            if len(tops) >= min_recs:
                recs[gid] = {f"Top {i+1}": t for i, t in enumerate(tops)}
            else:
                insufficient += 1

    rows = [{"Product ID": k, **v} for k, v in recs.items()]
    df = pd.DataFrame(rows)
    if not df.empty:
        # Fixed column order: id first, then Top 1..N for the ranks that exist.
        top_cols = [f"Top {i}" for i in range(1, max_recs + 1)]
        ordered = ["Product ID"] + [c for c in top_cols if c in df.columns]
        df = df[ordered]
    return df, insufficient

def save_parquet(df: pd.DataFrame, path: str):
    """Convert ``df`` to a pyarrow Table and write it to ``path`` as parquet."""
    pq.write_table(pa.Table.from_pandas(df), path)

def run(pairs: pd.DataFrame, min_recs=4, max_recs=10, out_path="/workspace/data/processed/top_same_brand.parquet"):
    """
    Run the whole recommendation pipeline and write the result to parquet.

    Steps: ``preprocess`` the aggregated pairs, ``build_recs`` with the given
    ``min_recs`` / ``max_recs``, save the export frame to ``out_path`` and
    print a three-line summary (rows skipped for unknown brand, groupIds
    without enough recommendations, rows saved).

    Returns the export frame that was written.
    """
    known_brand_pairs, n_unknown_brand = preprocess(pairs)
    recs_df, n_too_few = build_recs(known_brand_pairs, min_recs=min_recs, max_recs=max_recs)
    save_parquet(recs_df, out_path)

    summary = (
        f"Skipped {n_unknown_brand} rows due to Unknown brand",
        f"Removed {n_too_few} groupIds due to having fewer than {min_recs} recommendations",
        f"Saved {len(recs_df)} rows to {out_path}",
    )
    for line in summary:
        print(line)
    return recs_df

# Run the pipeline on the aggregated pairs. min_recs=1 (instead of the default 4)
# keeps every product that has at least one recommendation.
export_df = run(pairs, min_recs=1, max_recs=10)


Skipped 33 rows due to Unknown brand
Removed 76 groupIds due to having fewer than 1 recommendations
Saved 1297 rows to /workspace/data/processed/top_same_brand.parquet
