## Assemble the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed article groups that feed the recommender.
ARTICLES_PATH = "/workspace/data/processed/articles_for_recs.parquet"
groups = pd.read_parquet(ARTICLES_PATH)

In [3]:
# Coerce prices to numbers; values that cannot be parsed become NaN and are
# removed by the filter below together with prices under 1 SEK.
groups['priceSEK'] = pd.to_numeric(groups['priceSEK'], errors='coerce')
before_count = len(groups)
# .copy() detaches the filtered rows from the original frame so that the column
# assignments in later cells do not raise SettingWithCopyWarning.
groups = groups[groups['priceSEK'] >= 1].copy()
after_count = len(groups)
print(f"Dropped {before_count - after_count} rows with priceSEK < 1")

Dropped 18 rows with priceSEK < 1


In [4]:
# Assign each product to one of six price bands based on priceSEK.
# price_bins holds the band edges in SEK; price_labels names the bands in the
# same order, so there is exactly one label per consecutive pair of edges.
price_labels = ['Budget', 'Value', 'Popular', 'Premium', 'Luxury', 'Exclusive']
price_bins = [0, 100, 300, 600, 1000, 2000, float('inf')]
groups['priceband'] = pd.cut(
    groups['priceSEK'],
    bins=price_bins,
    labels=price_labels,
    include_lowest=True,
)

In [5]:
# Remove the audienceId column when it is present; a missing column is not an error.
groups = groups.drop(columns=['audienceId'], errors='ignore')


In [6]:
# Show the cleaned frame (the notebook display elides the middle rows).
groups

Unnamed: 0,groupId,name,brand,audience,category,priceSEK,description,color,priceband
0,055522,Beskrivning Tröja,Gjestal Garn,dam,Tröjor,29,Sticka en färgglad och trendig tröja i garnet ...,[],Budget
1,055573,Luva Hygge,Novita,dam,"Mössor & hattar,Mönster",29,Sticka en trendig huva i garnet Halaus från No...,[],Budget
2,055575,Beskrivning Vantar,Novita,dam,Vantar,29,Sticka ett par vantar med blomstermotiv i garn...,[],Budget
3,055576,Beskrivning Benvärmare,Novita,dam,Sockor & strumpor,29,Sticka ett par trendiga benvärmare i garnet Ha...,[],Budget
4,095302,Garn Drops Nepal,Drops Design,,unknown,33,Garn Drops Nepal är ett underbart tjockt luxuö...,"[Beige, Blush, Blå, Blå/grön, Cerise, Grå, Grå...",Budget
...,...,...,...,...,...,...,...,...,...
1714,590838,Madrasskydd Plastad frotté,Linea,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",359,Hygienisk och vattenavvisande yta. OEKO-TEX.Pl...,[Vit],Popular
1715,590841,5-meterklipp Plastad frotté,Linea,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",398,Hygienisk och vattenavvisande yta. OEKO-TEX. P...,[Vit],Popular
1716,598005,Gardinkappa Sanna enfärgad med spets,Fondaco,hemmet,Kanalkappa,249,Underbar gardinkappa med vacker spetsdetalj.Ko...,"[Grön, Linne, Röd]",Value
1717,790196,Fingerborg 17 mm,Ateljé Margaretha,generic,"Sytillbehör,Vardagshjälpmedel",19,"Fingerborg, storlek 17 mm.",[],Budget


## 1. Build a clean text field for vectorization

In [7]:
import pandas as pd, unicodedata, re, numpy as np

# Placeholder tokens that mean "no value". The helpers in this cell compare
# stringified values against this set, so the bare None member is only hit by
# direct membership tests on raw (non-stringified) values.
MISSING = {"", "unknown", "nan", "none", None}

def canon(s: str) -> str:
    """Return ``s`` as canonical single-line text.

    The value is stringified, NFKC-normalised, has non-breaking spaces replaced
    by plain spaces, has every run of dash-like characters (U+2010..U+2015,
    U+2212, ASCII hyphen) collapsed to one ``-``, and has whitespace runs
    squeezed to a single space with the ends trimmed.

    Non-string input is passed through ``str()``, so ``None`` becomes "None".
    """
    text = unicodedata.normalize("NFKC", str(s))
    # Applied in order: NBSP -> space, dash runs -> "-", whitespace runs -> " "
    substitutions = (
        (r"\u00A0", " "),
        (r"[\u2010-\u2015\u2212\-]+", "-"),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def norm_categories(x):
    """Split a comma-separated category string into a clean, de-duplicated list.

    Each piece is canonicalised with ``canon``. Pieces that are empty or that
    match a missing-value placeholder (compared case-insensitively against
    ``MISSING``, plus pandas' "<NA>" rendering) are dropped, so ``None``,
    ``NaN`` and "Unknown" all yield an empty list instead of a bogus category.
    Duplicates are removed case-insensitively, keeping the first spelling and
    the original order.

    Parameters
    ----------
    x : any
        Raw category value; it is stringified before splitting on ",".

    Returns
    -------
    list[str]
        Canonical category names in first-seen order.
    """
    out, seen = [], set()
    for piece in str(x).split(","):
        cat = canon(piece)
        key = cat.lower()
        # Check the canonical, lower-cased form so that " Unknown " and "None"
        # are recognised as placeholders as well.
        if not cat or key in MISSING or key == "<na>":
            continue
        if key in seen:
            continue
        seen.add(key)
        out.append(cat)
    return out

def short_desc(desc, max_words=30):
    """Return the first sentence of ``desc``, capped at ``max_words`` words.

    A sentence ends at the first whitespace that follows ".", "!" or "?".
    Falsy input (empty string, None) yields an empty string. Words in the
    result are joined by single spaces.
    """
    if not desc:
        return ""
    # Only the leading sentence is needed, so one split is enough.
    leading_sentence = re.split(r"(?<=[.!?])\s+", desc, maxsplit=1)[0]
    words = leading_sentence.split()
    return " ".join(words[:max_words])

def format_colors(col) -> str:
    """Render colors as a comma-separated string such as 'Svart, Grå' (no brackets).

    Accepted inputs
    ---------------
    * list / tuple / pandas Series / numpy array: every element is stringified
      and additionally split on ",", "/", "|" or ";" (e.g. 'Grå,Svart').
    * any other value is stringified and parsed as:
        - quoted items, e.g. "['Svart' 'Grå']" -> each quoted item is one color;
        - otherwise a delimited string, e.g. "Grå,Svart" or "Svart/Grå";
          enclosing brackets/parentheses are stripped first, so "[]" yields ""
          and "[Vit]" yields "Vit".

    Tokens that are empty or that match a missing-value placeholder
    (case-insensitive match against ``MISSING``, plus pandas' "<NA>") are
    dropped, including placeholders embedded in a delimited value such as
    'Svart, unknown'. Duplicates are removed case-insensitively while keeping
    the first spelling and the original order.

    Returns
    -------
    str
        Colors joined by ", ", or "" when nothing usable remains.
    """
    sep_pattern = r"\s*[,/|;]\s*"

    tokens = []
    if isinstance(col, (list, tuple, pd.Series, np.ndarray)):
        for item in list(col):
            # re.split returns [text] unchanged when no separator is present
            tokens.extend(re.split(sep_pattern, str(item).strip()))
    else:
        text = str(col).strip()
        quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", text)
        if quoted:
            # Each match is a (single-quoted, double-quoted) pair with one side empty
            tokens = [single or double for single, double in quoted]
        else:
            # Strip list/tuple punctuation left over from a stringified container
            tokens = re.split(sep_pattern, text.strip("[]() "))

    # Order-preserving, case-insensitive dedupe that also drops placeholders
    out, seen = [], set()
    for token in tokens:
        name = token.strip()
        key = name.lower()
        if not name or key in MISSING or key == "<na>" or key in seen:
            continue
        seen.add(key)
        out.append(name)
    return ", ".join(out)

# ---- build ----
# Work on a private copy so the derived columns do not touch the upstream frame.
groups = groups.copy()

# Guarantee a color column so the rendering step below always has input.
if "color" not in groups:
    groups["color"] = ""

# Derived columns:
#   categories - cleaned list of category names per row
#   colors_str - human-readable color string for text/metadata
groups["categories"] = groups["category"].map(norm_categories)
groups["colors_str"] = groups["color"].map(format_colors)

def _text_or_blank(value) -> str:
    """Canonicalise ``value``; return "" when it is a missing-value placeholder."""
    text = canon(value)
    # canon() stringifies its input, so None/NaN/pd.NA arrive here as
    # "None"/"nan"/"<NA>" and must be recognised by their text form.
    if text.lower() in MISSING or text == "<NA>":
        return ""
    return text


def build_text_embed_clean(r):
    """Build the text that is embedded for one product row.

    Layout: ``AUDIENCE: <aud>. <name>. <first sentence of description>
    <brand> <categories> <colors>.`` with absent parts skipped.

    Missing name, description or brand values (None, NaN, "unknown", ...) are
    skipped rather than written into the text as literal "None"/"nan" tokens.
    The audience is deliberately appended as is, with no missing-value check.

    Parameters
    ----------
    r : mapping-like row (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Read keys: name, description, brand, categories, colors_str, audience.

    Returns
    -------
    str
        Single-line text with whitespace runs collapsed.
    """
    name  = _text_or_blank(r.get("name", ""))
    desc  = short_desc(_text_or_blank(r.get("description", "")), 30)
    brand = _text_or_blank(r.get("brand", ""))
    cats  = r.get("categories", []) or []
    cols  = r.get("colors_str", "")

    # append audience as is (no missing checks, no mapping)
    aud   = canon(r.get("audience", ""))  # "<NA>", "nan", "" etc. will pass through

    parts, attrs = [], []

    # put audience first with a stable label
    parts.append(f"AUDIENCE: {aud}.")

    if name: parts.append(f"{name}.")
    if desc: parts.append(desc)
    if brand: attrs.append(brand)
    if cats:  attrs.append(", ".join(cats))
    if cols:  attrs.append(cols)
    if attrs: parts.append(" ".join(attrs) + ".")
    return re.sub(r"\s+", " ", " ".join(parts)).strip()

# One embedding text per product row.
groups["text"] = groups.apply(build_text_embed_clean, axis=1)

# Slim frame used downstream: the id, the text to embed and the metadata kept
# alongside it ("audience" is the raw audience only). The index is reset so
# that row position i is also index label i.
group_df = groups.loc[
    :,
    [
        "groupId",
        "text",
        "audience",
        "color",
        "colors_str",
        "categories",
        "brand",
        "priceband",
    ],
].reset_index(drop=True)

corpus = list(group_df["text"])

In [8]:
import sys, torch, transformers, sentence_transformers
# Report the interpreter path and the versions of the ML libraries in use.
for fields in (
    ("py exe:", sys.executable),
    ("torch:", torch.__version__, "at", torch.__file__),
    ("transformers:", transformers.__version__),
    ("sentence-transformers:", sentence_transformers.__version__),
):
    print(*fields)


py exe: /usr/local/bin/python
torch: 2.8.0+cpu at /usr/local/lib/python3.10/site-packages/torch/__init__.py
transformers: 4.57.1
sentence-transformers: 5.1.2


In [9]:

#!python -m pip install --upgrade --index-url https://download.pytorch.org/whl/cpu torch==2.8.0

#pip install --no-deps sentence-transformers transformers tokenizers huggingface_hub safetensors
#pip install --no-cache-dir "sentencepiece==0.1.99"
#pip install faiss-cpu scikit-learn

#pip install "huggingface_hub[hf_xet]"  # or: pip install hf_xet

#pip install regex


In [10]:
# Load the sentence encoder used to embed the product texts.
MODEL_ID = "Alibaba-NLP/gte-multilingual-base"

import os, torch
from sentence_transformers import SentenceTransformer

# Set before the model is loaded; presumably silences the tokenizers
# parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Cap the number of CPU threads torch uses.
torch.set_num_threads(4)

# NOTE(review): trust_remote_code=True lets the checkpoint repository supply
# Python code that runs locally; consider pinning a reviewed revision.
enc = SentenceTransformer(MODEL_ID, device="cpu", trust_remote_code=True)
# Use at most 4096 tokens per text, or the tokenizer's own limit if smaller.
enc.max_seq_length = min(4096, enc.tokenizer.model_max_length)
print(enc)


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 4096, 'do_lower_case': False, 'architecture': 'NewModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [11]:
# Embed every product text; E is expected to hold one row per row of group_df,
# in the same order (relied on by the index and neighbor lookup below).

# Missing texts are embedded as the empty string.
texts = group_df["text"].fillna("").tolist()

# normalize_embeddings=True requests unit-length vectors, which is what lets an
# inner-product index act as cosine similarity later on.
E = enc.encode(
    texts,
    batch_size=512,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True
).astype("float32")

# N = number of embedded texts, d = embedding dimension
N, d = E.shape
N, d


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

(1702, 768)

In [12]:
# Exact inner-product FAISS index over E, plus a groupId -> row-position map.

import faiss

# Inner product equals cosine similarity here because E is normalized.
index = faiss.IndexFlatIP(d)
index.add(E)

# Key: groupId as a string; value: row position in group_df / E.
gid2i = {
    str(gid): row
    for row, gid in enumerate(group_df["groupId"].astype(str).tolist())
}
N, index.is_trained



(1702, True)

In [None]:
import numpy as np
import pandas as pd

def compute_neighbors_with_audience(
    index,
    E: np.ndarray,
    groups: pd.DataFrame,
    audience_col: str = "audience",
    K: int = 10,
    cos_min: float = 0.6,
    min_keep: int = 1,
    k_cand: int | None = None,
    aud_ok_map: dict | None = None,
):
    N = E.shape[0]
    if k_cand is None:
        k_cand = min(100, max(1, N - 1))

    if aud_ok_map is None:
        aud_ok_map = {
            "herr": {"herr", "generic"},
            "dam": {"dam", "generic"},
            "barn & ungdom": {"barn & ungdom", "generic"},
            "hemmet": {"hemmet", "generic"},
            "generic": {"generic", "herr", "dam", "barn & ungdom", "hemmet"},
        }

    aud = groups[audience_col].astype(str).to_numpy()
    S_all, I_all = index.search(E, k_cand + 1)

    neighbors, sims_list, valid = [], [], []

    for i in range(N):
        js, ss = I_all[i], S_all[i]
        qa = aud[i]
        allowed = aud_ok_map.get(qa, {qa, "generic"})

        m = (js != i) & (ss >= cos_min)
        m &= np.isin(aud[js], list(allowed))

        js, ss = js[m], ss[m]
        if js.size:
            order = np.argsort(-ss)
            js, ss = js[order], ss[order]

        take = min(K, js.size)
        ok = take >= min_keep

        if ok:
            neighbors.append(js[:take].tolist())
            sims_list.append(ss[:take].astype(float).tolist())
        else:
            neighbors.append([])
            sims_list.append([])
        valid.append(ok)

    valid = np.array(valid, dtype=bool)
    kept = int(valid.sum())
    skipped = int(N - kept)
    stats = {
        "total": int(N), "kept": kept, "skipped_lt_min": skipped,
        "K": K, "min_keep": min_keep, "cos_min": cos_min, "k_cand": k_cand
    }
    return neighbors, sims_list, valid, stats


def build_topk_rows(
    neighbors, sims_list, groups: pd.DataFrame,
    K: int = 10, id_key: str = "Product ID", id_col: str | None = "groupId",
    skip_if_empty: bool = True  # NEW
):
    ids = groups[id_col].astype(str).to_numpy() if id_col else groups.index.astype(str).to_numpy()
    rows = []
    for i, (js, ss) in enumerate(zip(neighbors, sims_list)):
        if skip_if_empty and len(js) == 0:
            continue
        row = {id_key: ids[i]}
        for k in range(1, K + 1):
            if k <= len(js):
                row[f"Top {k}"]   = ids[js[k - 1]]
                row[f"Score {k}"] = float(ss[k - 1])
            else:
                row[f"Top {k}"]   = None
                row[f"Score {k}"] = None
        rows.append(row)
    return rows


# E and the FAISS index were built from group_df, so neighbor positions are
# resolved against that same frame rather than the wider `groups` frame.
neighbors, sims, valid, stats = compute_neighbors_with_audience(index, E, group_df, K=10)
rows = build_topk_rows(neighbors, sims, group_df, K=10, id_col="groupId")

print(stats)

{'total': 1702, 'kept': 1699, 'skipped_lt_min': 3, 'K': 10, 'min_keep': 1, 'cos_min': 0.6, 'k_cand': 100}
{'Product ID': '055522', 'Top 1': '261998', 'Score 1': 0.7245949506759644, 'Top 2': '260287', 'Score 2': 0.7113903164863586, 'Top 3': '260163', 'Score 3': 0.7076380252838135, 'Top 4': '267113', 'Score 4': 0.7033674716949463, 'Top 5': '261478', 'Score 5': 0.6997042894363403, 'Top 6': '272097', 'Score 6': 0.6988092660903931, 'Top 7': '260646', 'Score 7': 0.6972818374633789, 'Top 8': '210782', 'Score 8': 0.6907309889793396, 'Top 9': '210676', 'Score 9': 0.68827223777771, 'Top 10': '261476', 'Score 10': 0.6865171790122986}


In [19]:
def save_wide_topk_parquet(rows, path="/workspace/data/processed/semantic_similarity_recs.parquet", K=10):
    """Write the wide top-K recommendation table to a parquet file.

    Parameters
    ----------
    rows : list[dict]
        Records keyed by "Product ID", "Top k" and "Score k".
    path : str
        Destination parquet file.
    K : int
        Number of "Top k"/"Score k" column pairs in the output.

    Returns
    -------
    pd.DataFrame
        The frame that was written, with columns ordered
        "Product ID", "Top 1", "Score 1", ..., "Top K", "Score K".
    """
    cols = ["Product ID"]
    for k in range(1, K + 1):
        cols += [f"Top {k}", f"Score {k}"]

    wide = pd.DataFrame(rows)
    # Add any expected column that no record supplied (e.g. higher ranks never filled)
    absent = [c for c in cols if c not in wide.columns]
    for c in absent:
        wide[c] = None

    wide = wide[cols]
    wide.to_parquet(path, index=False)
    return wide

# usage
# group_df is the frame E and the index were built from, so it is the frame
# used to resolve neighbor positions and ids.
neighbors, sims, valid, stats = compute_neighbors_with_audience(index, E, group_df, K=10)
rows = build_topk_rows(neighbors, sims, group_df, K=10)
wide = save_wide_topk_parquet(rows, K=10)