In [24]:
import pandas as pd, numpy as np

In [25]:
# Load the pre-processed article groups that the rest of this section works on.
groups = pd.read_parquet(path="/workspace/data/processed/articles_for_recs.parquet")

In [26]:
# Parse prices as numbers; values that cannot be parsed become NaN.
groups['priceSEK'] = pd.to_numeric(groups['priceSEK'], errors='coerce')

# Keep rows with priceSEK >= 1. NaN never satisfies the comparison, so rows
# whose price is missing or unparseable are removed (and counted) as well.
before_count = len(groups)
groups = groups.loc[groups['priceSEK'].ge(1)]
after_count = len(groups)
print(f"Dropped {before_count - after_count} rows with priceSEK < 1")

# Remove the audienceId column when it exists.
if 'audienceId' in groups:
    groups = groups.drop(columns='audienceId')


Dropped 18 rows with priceSEK < 1


In [27]:
import re
import unicodedata
import pandas as pd
import numpy as np

# Tokens treated as "no value". The string entries are lowercase, so lowercase a
# candidate before testing membership; None is included so the object itself matches.
MISSING = {"", "unknown", "nan", "none", None}

def canon(s: str) -> str:
    """Return a canonical single-line form of ``s`` for text matching.

    Steps, in order: Unicode NFKC normalisation, NBSP -> space, any run of
    hyphen-like characters (U+2010..U+2015, U+2212, ASCII "-") -> one "-",
    whitespace runs -> one space, then trim.

    Missing values (``None``, ``pd.NA``, float NaN) give "" rather than the
    literal text "None" / "<NA>" / "nan" that ``str()`` would produce. Any
    other non-string input is converted with ``str()``.

    Quotation marks are not rewritten beyond what NFKC itself does.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or s is pd.NA or (isinstance(s, float) and s != s):
        return ""
    s = unicodedata.normalize("NFKC", str(s))
    s = s.replace("\u00A0", " ")                      # NBSP -> space
    s = re.sub(r"[\u2010-\u2015\u2212\-]+", "-", s)   # hyphen family -> "-"
    return re.sub(r"\s+", " ", s).strip()

def norm_categories(x):
    """Split a comma-separated category string into a clean list of labels.

    Each piece is canonicalised with ``canon``. Pieces that are empty or listed
    in ``MISSING`` are dropped; the comparison is case-insensitive, so "None",
    "Unknown" and "NaN" are removed too. Duplicates are removed
    case-insensitively, keeping the first spelling and the original order.

    Non-string input is converted with ``str()`` first, so ``None`` or NaN
    produces an empty list.
    """
    out, seen = [], set()
    for piece in str(x).split(","):
        label = canon(piece)
        key = label.lower()
        # Lowercase before the MISSING test: the set only holds lowercase tokens.
        if not label or key in MISSING or key in seen:
            continue
        seen.add(key)
        out.append(label)
    return out

def short_desc(desc, max_words: int = 30):
    """Return the leading sentence of ``desc``, truncated to ``max_words`` words.

    The sentence ends at the first '.', '!' or '?' that is followed by
    whitespace; without such a boundary the whole text is used.
    Falsy input (``None``, "") gives "".
    """
    if desc:
        boundary = re.search(r"(?<=[.!?])\s+", desc)
        head = desc if boundary is None else desc[:boundary.start()]
        words = head.split()
        return " ".join(words[:max_words])
    return ""

def _split_multi(s: str, seps=",/|;"):
    """Split on any of the provided separators (regex-friendly), trimming whitespace."""
    # Build a character class from separators like ",/|;" -> "[,/|;]"
    pattern = r"\s*[" + re.escape(seps) + r"]\s*"
    return re.split(pattern, s) if any(ch in s for ch in seps) else [s]

def _format_multi(value, seps: str) -> str:
    """Flatten a multi-valued cell into a 'A, B, C' display string.

    Shared implementation behind ``format_colors`` and ``format_sizes``.

    Args:
        value: A list/tuple/Series/ndarray of values, or a single value whose
            ``str()`` form is either a stringified list with quoted items
            (e.g. "['Svart' 'Grå']") or separator-delimited text.
        seps: Characters that separate several values inside one token.

    Returns:
        Canonicalised tokens joined by ", ". Tokens that are empty or listed in
        ``MISSING`` are dropped, and duplicates are removed case-insensitively
        while preserving first-seen order. "" when nothing remains.
    """
    if isinstance(value, (list, tuple, pd.Series, np.ndarray)):
        raw_items = [str(v) for v in value]
    else:
        text = str(value).strip()
        # Quoted items mean a stringified list; otherwise treat the text as one item.
        quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", text)
        raw_items = [a or b for a, b in quoted] if quoted else [text]

    out, seen = [], set()
    for item in raw_items:
        item = item.strip()
        if not item or item.lower() in MISSING:
            continue
        # Every item is split the same way, whether it came from a real
        # sequence or from a stringified one.
        for part in _split_multi(item, seps=seps):
            token = canon(part)
            key = token.lower()
            if token and key not in seen and key not in MISSING:
                seen.add(key)
                out.append(token)
    return ", ".join(out)


def format_colors(col) -> str:
    """
    Render colors as 'Svart, Grå' (no brackets). Accepts list/tuple/Series/ndarray
    or strings like: "['Svart' 'Grå']", "Grå,Svart", "Svart/Grå".
    Values are split on ',', '/', '|' and ';'.
    """
    return _format_multi(col, seps=",/|;")


def format_sizes(sz) -> str:
    """
    Render sizes as '36/38, 40/42' (no brackets).
    Accepts list/tuple/Series/ndarray or strings like:
    "['36/38' '40/42']", "36/38,40/42", "36/38 | 40/42".
    NOTE: '/' is deliberately not a separator because '36/38' is a single size token.
    """
    return _format_multi(sz, seps=",|;")

def build_texts_for_ir(r):
    """
    Build the two retrieval texts for one row.

    Args:
        r: A mapping-like row (dict or pandas Series) read via ``.get`` for the
           keys name, description, brand, audience, categories, colors_str
           and sizes_str.

    Returns:
        (text_embed, text_rerank):
          - text_rerank: "<name>. <first sentence of description> Brand: ...
            Audience: ... Categories: ... Colors: ... Sizes: ..." with every
            empty segment omitted.
          - text_embed: the same text prefixed with "passage: ".

    Missing values (None / NaN / pd.NA, or text listed in ``MISSING``) are
    treated as absent, so no "Brand: nan." style segments are emitted.
    """
    def _field(key, normalize=True):
        """Return the row value for ``key`` as clean text, or "" when missing."""
        value = r.get(key, "")
        # NaN is the only float that is not equal to itself.
        if value is None or value is pd.NA or (isinstance(value, float) and value != value):
            return ""
        text = canon(value) if normalize else str(value).strip()
        return "" if text.lower() in MISSING else text

    name  = _field("name")
    desc  = short_desc(_field("description"), 30)
    brand = _field("brand")
    aud   = _field("audience")
    # colors_str / sizes_str are expected to be pre-formatted; only guard against missing.
    cols  = _field("colors_str", normalize=False)
    sizes = _field("sizes_str", normalize=False)

    # Categories may arrive as a list, an ndarray, a bare string or a missing
    # scalar; avoid truth-testing the value, which raises for arrays.
    raw_cats = r.get("categories", None)
    if isinstance(raw_cats, str):
        raw_cats = [raw_cats]
    elif not isinstance(raw_cats, (list, tuple, set, pd.Series, np.ndarray)):
        raw_cats = []
    cat_str = ", ".join(
        str(c).strip() for c in raw_cats
        if str(c).strip().lower() not in MISSING
    )

    # Both texts share the same labeled segments; only the prefix differs.
    parts = []
    if name:
        parts.append(f"{name}.")
    if desc:
        parts.append(desc)
    labeled = (
        ("Brand", brand),
        ("Audience", aud),
        ("Categories", cat_str),
        ("Colors", cols),
        ("Sizes", sizes),
    )
    parts.extend(f"{label}: {value}." for label, value in labeled if value)

    text_rerank = " ".join(parts)
    text_embed = f"passage: {text_rerank}".rstrip()
    return text_embed, text_rerank

def prepare_group_corpus(groups: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """Derive the retrieval table and embedding corpus from the groups frame.

    Works on a copy of ``groups``. Adds:
      - categories: list produced by ``norm_categories`` from ``category``
      - colors_str / sizes_str: display strings from ``color`` / ``size``
      - text / text_rerank: the two strings from ``build_texts_for_ir``

    The source columns ``category``, ``color`` and ``size`` are created empty
    when absent, so all three are optional.

    Returns:
        (group_df, corpus): ``group_df`` holds whichever of groupId, text,
        text_rerank, audience, color, colors_str, size, sizes_str, categories,
        brand and name exist, with a fresh 0..n-1 index; ``corpus`` is the
        ``text`` column as a list, aligned with ``group_df`` rows.
    """
    g = groups.copy()
    for source_col in ("category", "color", "size"):
        if source_col not in g.columns:
            g[source_col] = ""

    g["categories"] = g["category"].apply(norm_categories)
    g["colors_str"] = g["color"].apply(format_colors)
    g["sizes_str"]  = g["size"].apply(format_sizes)

    # Iterate rows as plain dicts rather than via DataFrame.apply(axis=1), whose
    # return shape is not a Series of tuples when the frame has no rows.
    pairs = [build_texts_for_ir(row) for row in g.to_dict("records")]
    g["text"]        = [embed for embed, _ in pairs]    # for embeddings (passage: ...)
    g["text_rerank"] = [rerank for _, rerank in pairs]  # for cross-encoder

    cols = ["groupId","text","text_rerank","audience","color","colors_str","size",
            "sizes_str","categories","brand","name"]
    present = [c for c in cols if c in g.columns]
    group_df = g[present].reset_index(drop=True)
    corpus = group_df["text"].tolist()  # this is what you embed/index
    return group_df, corpus


# Build the per-group table and the list of embedding texts from `groups`.
group_df, corpus = prepare_group_corpus(groups)
