In [5]:
import pandas as pd

def load_filtered_transactions(
    trans_path="/workspace/data/processed/transactions_clean.parquet",
    avail_path="/workspace/data/processed/articles_for_recs.parquet",
    bad_ids={"12025DK", "12025FI", "12025NO", "12025SE", "970300", "459978"},
    cols=("shopUserId", "orderId", "groupId", "category", "brand"),
):
    """Load the transactions and keep only rows whose groupId is usable.

    A row is kept when its groupId (compared as a whitespace-stripped string)
    appears in the availability file and is not listed in ``bad_ids``.

    Parameters
    ----------
    trans_path : str
        Parquet file holding the transactions.
    avail_path : str
        Parquet file whose ``groupId`` column lists the available articles.
    bad_ids : collection of str
        groupIds to exclude even when they are available. The default set is
        only read, never mutated.
    cols : sequence of str
        Transaction columns to load; must include ``"groupId"``.

    Returns
    -------
    pandas.DataFrame
        The filtered transactions with a fresh 0..n-1 index. The groupId
        values themselves are returned as stored (not stripped).

    Notes
    -----
    The function no longer assigns the notebook-level ``df`` itself: the
    result is only returned, so a failure half-way cannot leave an unfiltered
    frame behind in the notebook namespace.
    """
    transactions = pd.read_parquet(trans_path, columns=list(cols))
    # Only the id column of the availability file is used, so skip the rest.
    available = pd.read_parquet(avail_path, columns=["groupId"])

    gid = transactions["groupId"].astype(str).str.strip()
    avail_ids = set(available["groupId"].astype(str).str.strip().unique())

    keep = gid.isin(avail_ids) & ~gid.isin(bad_ids)
    return transactions.loc[keep].reset_index(drop=True)

# Load the filtered transactions once; later cells read them through the notebook-level name `df`.
df = load_filtered_transactions()

In [13]:
import pandas as pd

def aggregate_by_groupid(
    df: pd.DataFrame,
    item_col: str = "groupId",
    brand_col: str = "brand",
    category_col: str = "category",
) -> pd.DataFrame:
    """
    Count how many rows of ``df`` share each (item, brand, category) combination.

    The grouping key is the full triple, so an item that occurs with several
    brand or category values yields one output row per combination.
    Returns a DataFrame with the three key columns (named as passed in) plus
    a ``transactions`` column holding the row count.
    Note: pandas' default groupby leaves out rows with a missing key value.
    """
    key_columns = [item_col, brand_col, category_col]
    counts = df.groupby(key_columns).size()
    return counts.reset_index(name="transactions")

# One row per (groupId, brand, category) combination, with its transaction count.
pairs = aggregate_by_groupid(df)

In [19]:
# Display the aggregated rows from most to least transactions (a sorted copy; `pairs` itself is not reordered).
pairs.sort_values(by="transactions", ascending=False)

Unnamed: 0,groupId,brand,category,transactions
466,261637,Locköstrumpan,"Stödstrumpor,Strumpor,Underkläder",6223
288,260695,Louise,"Bh-toppar,Bh,Bh utan kupstorlek,Underkläder",5716
128,240187,Åshild,"Mjukisbyxor,Mysplagg,Byxor",4562
35,210338,Åshild,"Toppar,Överdelar,T-shirts",4063
41,210695,Åshild,"Toppar,Överdelar",3952
...,...,...,...,...
794,270619,Abecita by swegmark,"Badkläder,Baddräkter",1
1083,340459,Daylight,"Synhjälpmedel,Vardagshjälpmedel,Belysning",1
1086,340572,Singer,"Sytillbehör,Symaskiner och tillbehör",1
197,260183,Trofé,"Underkläder,Trosor",1


In [27]:
# Distinct raw category strings present in `pairs`, sorted alphabetically.
# Each comma-separated string is kept whole (no splitting into single tags); missing values are dropped.
unique_categories = sorted(pairs['category'].dropna().unique())
unique_categories


['Accessoarer',
 'Accessoarer,Bh,Underkläder,Bh-tillbehör',
 'Accessoarer,Handskar & vantar',
 'Accessoarer,Herr,Kepsar & mössor',
 'Accessoarer,Kepsar & mössor',
 'Accessoarer,Mössor & hattar',
 'Accessoarer,Väskor',
 'Bad,Duschdraperier',
 'Bad,Frottéhanddukar & badlakan',
 'Baddräkter,Badkläder',
 'Baddräkter,Badkläder,Dam',
 'Badkläder,Baddräkter',
 'Badkläder,Baddräkter,Kjolbaddräkter',
 'Badkläder,Bikini',
 'Badkläder,Dam',
 'Badkläder,Dam,Tankini',
 'Badkläder,Sarong',
 'Badkläder,Tankini',
 'Badrum/WC',
 'Badrum/WC,Ansiktsvård',
 'Badrum/WC,Vardagshjälpmedel,Stödartiklar',
 'Badrumsmattor,Bad',
 'Badskor,Badkläder',
 'Batteridrivna ljus',
 'Belysning,Barnrummet,Natt & garderobslampor',
 'Belysning,Batteridrivna ljus',
 'Belysning,Vardagshjälpmedel,Batterier',
 'Bh utan bygel,Baddräkter,Badkläder',
 'Bh utan bygel,Bh,Sport-bh,Bygel-bh,Underkläder',
 'Bh utan bygel,Bh,Underkläder',
 'Bh utan bygel,Bh,Underkläder,Framknäppt bh',
 'Bh utan bygel,Bh-toppar,Bh,Bh utan kupstorlek,Unde

In [18]:
def normalize_brand(brand):
    """Return ``brand`` in sentence case with tidy whitespace.

    Surrounding whitespace is removed, inner whitespace runs collapse to a
    single space, the first character is upper-cased and the rest lower-cased.
    Values that are not strings (e.g. NaN/None) are returned unchanged.
    """
    if not isinstance(brand, str):
        return brand
    # split() without arguments already ignores leading/trailing whitespace.
    words = brand.split()
    return ' '.join(words).capitalize()

# Overwrite the brand column of `pairs` in place with its normalised spelling.
# NOTE(review): this runs after the aggregation, so brand spellings that differed only in
# case/whitespace were counted as separate rows and may remain as separate rows here — confirm.
pairs['brand'] = pairs['brand'].apply(normalize_brand)


In [None]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

def cat_to_set(s):
    """Turn a comma-separated category string into a set of lower-cased tags.

    Missing values give an empty set. Each tag is stripped of surrounding
    whitespace; empty tags and the placeholder tag "unknown" are dropped.
    """
    if pd.isna(s):
        return set()
    tags = set()
    for piece in str(s).split(","):
        tag = piece.strip().lower()
        if tag and tag != "unknown":
            tags.add(tag)
    return tags

def preprocess(pairs: pd.DataFrame):
    """Prepare the aggregated pairs for recommendation building.

    Works on a copy of ``pairs`` (the input frame is not modified) and

    * adds ``brand_norm``: the brand as a stripped, lower-cased string,
    * drops rows without a usable brand: the literal "unknown", a blank
      string, or a missing value,
    * adds ``cat_set``: the category string converted by ``cat_to_set``.

    Returns
    -------
    (pandas.DataFrame, int)
        The remaining rows and the number of rows dropped for lacking a brand.
    """
    df = pairs.copy()
    brand = df["brand"]
    df["brand_norm"] = brand.astype(str).str.strip().str.lower()
    # astype(str) turns a missing brand into text such as "nan" or "none"; without the
    # isna() check all brand-less products would be treated as one shared brand.
    is_unknown = df["brand_norm"].isin(["unknown", ""]) | brand.isna()
    skipped = int(is_unknown.sum())
    df = df.loc[~is_unknown].copy()
    df["cat_set"] = df["category"].map(cat_to_set)
    return df, skipped

def build_recs(filtered_pairs: pd.DataFrame, min_recs=4, max_recs=10):
    """Build same-brand recommendations that share at least one category tag.

    For every row, the other products of the same ``brand_norm`` are walked
    in descending ``transactions`` order, and the first ``max_recs`` distinct
    groupIds whose ``cat_set`` overlaps the row's own are recommended.

    Parameters
    ----------
    filtered_pairs : pandas.DataFrame
        Needs the columns ``brand_norm``, ``groupId``, ``transactions`` and
        ``cat_set`` (a set of category tags per row).
    min_recs : int
        Rows that end up with fewer recommendations than this are left out.
    max_recs : int
        Upper bound on recommendations per product.

    Returns
    -------
    (pandas.DataFrame, int)
        A frame with ``Product ID`` followed by ``Top 1`` .. ``Top k``
        (only the columns that are actually filled), and the number of rows
        skipped for having no category tags or too few recommendations.

    Notes
    -----
    A groupId may occur on several rows (one per category string). Such a
    product is offered at most once per recommendation list. It is still
    evaluated once per row, and the last evaluated row that reaches
    ``min_recs`` determines its entry.
    """
    recs, insufficient = {}, 0
    for _, g in filtered_pairs.groupby("brand_norm", sort=False):
        # Stable sort: products with equal transaction counts keep a reproducible order.
        g = g.sort_values("transactions", ascending=False, kind="mergesort")
        # Materialise the brand's (groupId, tags) list once instead of per row.
        ranked = list(zip(g["groupId"].astype(str), g["cat_set"]))
        for gid, my_cats in ranked:
            if not my_cats:
                insufficient += 1
                continue
            tops = []
            for other_gid, other_cats in ranked:
                if len(tops) >= max_recs:
                    break
                # Skip the product itself, products already picked, and products without a shared tag.
                if other_gid == gid or other_gid in tops or my_cats.isdisjoint(other_cats):
                    continue
                tops.append(other_gid)
            if len(tops) >= min_recs:
                recs[gid] = {f"Top {i+1}": t for i, t in enumerate(tops)}
            else:
                insufficient += 1
    rows = [{"Product ID": k, **v} for k, v in recs.items()]
    out = pd.DataFrame(rows)
    if not out.empty:
        top_cols = [f"Top {i}" for i in range(1, max_recs + 1)]
        out = out[["Product ID"] + [c for c in top_cols if c in out.columns]]
    return out, insufficient

def save_parquet(df: pd.DataFrame, path: str):
    """Convert ``df`` to a pyarrow Table and write it to ``path`` as Parquet."""
    pq.write_table(pa.Table.from_pandas(df), path)

def run(pairs: pd.DataFrame, min_recs=4, max_recs=10, out_path="/workspace/data/processed/top_same_brand.parquet"):
    """Run the full pipeline: preprocess, build recommendations, save, report.

    ``pairs`` is passed through ``preprocess``, the surviving rows through
    ``build_recs`` (with ``min_recs``/``max_recs``), and the resulting table
    is written to ``out_path`` via ``save_parquet``. Three summary lines are
    printed and the recommendation table is returned.
    """
    kept_pairs, n_unknown_brand = preprocess(pairs)
    recs_df, n_insufficient = build_recs(kept_pairs, min_recs=min_recs, max_recs=max_recs)
    save_parquet(recs_df, out_path)

    summary = (
        f"Skipped {n_unknown_brand} rows due to Unknown brand",
        f"Removed {n_insufficient} groupIds due to having fewer than {min_recs} recommendations",
        f"Saved {len(recs_df)} rows to {out_path}",
    )
    for message in summary:
        print(message)
    return recs_df

# Example: build the recommendations from `pairs` and write them to the default out_path of run().
# NOTE(review): the recorded output below names top_same_brand_and_category.parquet, which is not
# the current default out_path (top_same_brand.parquet) — the output looks stale; re-run to refresh.
export_df = run(pairs, min_recs=4, max_recs=10)


Skipped 40 rows due to Unknown brand
Removed 161 groupIds due to having fewer than 4 recommendations
Saved 1262 rows to /workspace/data/processed/top_same_brand_and_category.parquet
