

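Hybrid with collaborative filtering (CF): $\text{final score} = 0.7 \cdot \text{embed\_cos} + 0.3 \cdot \text{CF\_sim}$, where `embed_cos` is the embedding cosine similarity and `CF_sim` is the collaborative-filtering similarity.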

## Assemble the dataset

In [50]:
import pandas as pd
# Parquet file holding the product groups that recommendations are built for.
GROUPS_PARQUET_PATH = "/workspace/data/processed/groups_for_recs.parquet"
groups = pd.read_parquet(GROUPS_PARQUET_PATH)

In [51]:
# Groups priced below this many SEK are excluded.
MIN_PRICE_SEK = 50

# Non-numeric prices are coerced to NaN. NaN fails the >= comparison below,
# so those rows are dropped together with the cheap ones and are included in
# the reported count.
groups['priceSEK'] = pd.to_numeric(groups['priceSEK'], errors='coerce')
before_count = len(groups)
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignments made later in the notebook do not raise
# SettingWithCopyWarning.
groups = groups[groups['priceSEK'] >= MIN_PRICE_SEK].copy()
after_count = len(groups)
print(f"Dropped {before_count - after_count} rows with priceSEK < {MIN_PRICE_SEK}")

Dropped 82 rows with priceSEK < 50


In [52]:
# Six price bands over priceSEK, listed as (label, upper edge). pd.cut builds
# right-closed intervals by default, so each band is (lower, upper];
# include_lowest=True additionally admits the very first edge (0).
_PRICE_BANDS = [
    ('Budget', 100),              # 0-100
    ('Value', 300),               # 100-300
    ('Popular', 600),             # 300-600
    ('Premium', 1000),            # 600-1000
    ('Luxury', 2000),             # 1000-2000
    ('Exclusive', float('inf')),  # 2000+
]
price_labels = [label for label, _ in _PRICE_BANDS]
price_bins = [0] + [upper for _, upper in _PRICE_BANDS]
groups['priceband'] = pd.cut(
    groups['priceSEK'],
    bins=price_bins,
    labels=price_labels,
    include_lowest=True,
)

## 1. Build a clean text field for vectorization

In [53]:
import pandas as pd, unicodedata, re, numpy as np

# Sentinel values treated as "no value". The string entries are lower-case.
MISSING = {None, "", "nan", "none", "unknown"}

def canon(s: str) -> str:
    """Return ``s`` as canonical text: NFKC form, plain hyphens, single spaces.

    The argument is passed through ``str()`` first, so non-string values are
    accepted (``None`` becomes ``"None"``).

    Steps, in order:
      1. Unicode NFKC normalisation.
      2. No-break spaces become ordinary spaces.
      3. Any run of hyphen/dash characters (U+2010..U+2015, U+2212, ``-``)
         collapses to a single ``-``.
      4. Any run of whitespace collapses to one space; the ends are trimmed.
    """
    text = unicodedata.normalize("NFKC", str(s))
    substitutions = (
        (r"\u00A0", " "),
        (r"[\u2010-\u2015\u2212\-]+", "-"),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def norm_categories(x):
    """Split a comma-separated category value into a clean, de-duplicated list.

    Args:
        x: Raw category value. It is converted with ``str()`` and split on
            commas, so any scalar is accepted.

    Returns:
        List of canonicalised category names (see ``canon``) in first-seen
        order. Placeholder entries listed in ``MISSING`` are dropped, and
        duplicates are removed case-insensitively (the first spelling wins).
    """
    out, seen = [], set()
    for raw in str(x).split(","):
        # Compare case-insensitively: str(None) is "None" and data may carry
        # "Unknown"/"NaN", none of which match the lower-case MISSING entries
        # verbatim.
        if raw.strip().lower() in MISSING:
            continue
        cat = canon(raw)
        key = cat.lower()
        if cat and key not in seen:
            seen.add(key)
            out.append(cat)
    return out

def short_desc(desc, max_words=30):
    """Return the first sentence of ``desc``, capped at ``max_words`` words.

    A sentence ends at ``.``, ``!`` or ``?`` followed by whitespace. Words are
    whitespace-separated and re-joined with single spaces. A falsy ``desc``
    (``""``, ``None``) yields ``""``.
    """
    if not desc:
        return ""
    sentences = re.split(r"(?<=[.!?])\s+", desc)
    words = sentences[0].split()
    return " ".join(words[:max_words])

def format_colors(col) -> str:
    """
    Render colors as 'Svart, Grå' (no brackets).

    Accepts list/tuple/Series/ndarray or strings like:
    "['Svart' 'Grå']", "Grå,Svart", "Svart/Grå".

    Every value is split on ``,``, ``/``, ``|`` and ``;``. This also applies
    to multi-values embedded in a single list element or inside a quoted
    string item. Placeholder tokens listed in ``MISSING`` (compared
    case-insensitively) and empty tokens are dropped wherever they occur,
    and duplicates are removed case-insensitively while keeping first-seen
    order.

    Returns:
        The surviving color names joined with ", "; "" when there are none.
    """
    def _split(value: str):
        # re.split returns [value] unchanged when no separator is present.
        return re.split(r"\s*[,/|;]\s*", value)

    vals = []
    if isinstance(col, (list, tuple, pd.Series, np.ndarray)):
        for v in col:
            vals.extend(_split(str(v).strip()))
    else:
        s = str(col).strip()
        # A stringified array/list such as "['Svart' 'Grå']": pull out the
        # quoted items instead of splitting the raw repr.
        quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", s)
        if quoted:
            for single, double in quoted:
                vals.extend(_split(single or double))
        else:
            vals = _split(s)

    # Order-preserving dedupe. Filtering placeholders here (rather than only
    # on the whole input) keeps e.g. "['unknown']" or "Svart, unknown" from
    # emitting a literal "unknown" color.
    out, seen = [], set()
    for v in vals:
        t = v.strip()
        key = t.lower()
        if not t or key in MISSING or key in seen:
            continue
        seen.add(key)
        out.append(t)
    return ", ".join(out)

# ---- build ----
# Detach from whatever frame `groups` was derived from before adding columns.
groups = groups.copy()

# Guarantee a "color" column so the color formatting below always has input.
if "color" not in groups:
    groups["color"] = ""

# categories: normalized list of category names per row.
# colors_str: human-readable color string for text/metadata.
groups = groups.assign(
    categories=groups["category"].apply(norm_categories),
    colors_str=groups["color"].apply(format_colors),
)

def build_text_embed_clean(r):
    """Compose the text that gets embedded for one product group.

    Layout: ``"<name>. <first sentence of description> <brand> <categories> <colors>."``
    with every empty piece left out.

    Args:
        r: Row-like object supporting ``.get(key, default)`` (e.g. the Series
            passed by ``DataFrame.apply(axis=1)``). Reads the keys "name",
            "description", "brand", "categories" and "colors_str".

    Returns:
        A single-spaced string; "" when no field carries a value.
    """
    def _text_field(key):
        # canon() stringifies its input, so a missing value (None/NaN) or a
        # placeholder such as "unknown" would otherwise end up in the
        # embedded text as a literal token. Treat those as empty instead.
        raw = r.get(key, "")
        token = str(raw).strip()
        if not token or token.lower() in MISSING:
            return ""
        return canon(raw)

    name  = _text_field("name")
    desc  = short_desc(_text_field("description"), 30)
    brand = _text_field("brand")
    cats  = r.get("categories", []) or []
    cols  = r.get("colors_str", "")

    parts, attrs = [], []
    if name: parts.append(f"{name}.")
    if desc: parts.append(desc)
    if brand: attrs.append(brand)
    if cats:  attrs.append(", ".join(cats))
    if cols:  attrs.append(cols)
    # Attributes form one trailing pseudo-sentence.
    if attrs: parts.append(" ".join(attrs) + ".")
    return re.sub(r"\s+", " ", " ".join(parts)).strip()

# One embedding text per product group, built row by row.
groups["text"] = groups.apply(build_text_embed_clean, axis=1)

# Columns carried into the recommendation table.
_GROUP_DF_COLUMNS = [
    "groupId",
    "text",
    "color",        # original raw colors (list/string as-is)
    "colors_str",
    "categories",
    "brand",
    "priceband",
]
# Fresh 0..N-1 index so row positions can be used as embedding row ids.
group_df = groups[_GROUP_DF_COLUMNS].reset_index(drop=True)

# Corpus for embedding
corpus = group_df["text"].to_list()


In [54]:
import sys, torch, transformers, sentence_transformers
# Record the interpreter and library versions this notebook ran with.
print(f"py exe: {sys.executable}")
print(f"torch: {torch.__version__} at {torch.__file__}")
print(f"transformers: {transformers.__version__}")
print(f"sentence-transformers: {sentence_transformers.__version__}")


py exe: /usr/local/bin/python
torch: 2.8.0+cpu at /home/vscode/.local/lib/python3.10/site-packages/torch/__init__.py
transformers: 4.57.0
sentence-transformers: 5.1.1


In [55]:

#!python -m pip install --upgrade --index-url https://download.pytorch.org/whl/cpu torch==2.8.0

#pip install --no-deps sentence-transformers transformers tokenizers huggingface_hub safetensors
#pip install --no-cache-dir "sentencepiece==0.1.99"
#pip install faiss-cpu scikit-learn

#pip install "huggingface_hub[hf_xet]"  # or: pip install hf_xet

#pip install regex


In [56]:
# Load the sentence encoder used to embed the product texts.
import os, torch
from sentence_transformers import SentenceTransformer

MODEL_ID = "Alibaba-NLP/gte-multilingual-base"
MAX_SEQ_LENGTH_CAP = 4096   # upper bound on tokens per text
TORCH_NUM_THREADS = 4       # CPU threads torch may use

os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(TORCH_NUM_THREADS)

# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository; consider pinning a reviewed revision.
enc = SentenceTransformer(MODEL_ID, device="cpu", trust_remote_code=True)
# Never exceed what the tokenizer itself reports as its maximum length.
enc.max_seq_length = min(MAX_SEQ_LENGTH_CAP, enc.tokenizer.model_max_length)
print(enc)


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 4096, 'do_lower_case': False, 'architecture': 'NewModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [57]:
#embed texts

import numpy as np
import pandas as pd

# Missing texts are embedded as the empty string.
texts = group_df["text"].fillna("").tolist()

_ENCODE_OPTIONS = dict(
    batch_size=512,
    normalize_embeddings=True,   # normalized rows let inner product act as cosine
    convert_to_numpy=True,
    show_progress_bar=False,
)
E = enc.encode(texts, **_ENCODE_OPTIONS).astype("float32")

# N = number of embedded texts, d = embedding dimension.
N, d = E.shape
(N, d)


(1648, 768)

In [58]:
#FAISS index (cosine via inner product) + gid map

import faiss

# Flat inner-product index; E is normalized, so scores are cosine similarities.
index = faiss.IndexFlatIP(d)
index.add(E)

# groupId (as str) -> row position in group_df / E. With duplicate ids the
# last occurrence wins.
_gid_keys = [str(g) for g in group_df["groupId"].astype(str)]
gid2i = dict(zip(_gid_keys, range(len(_gid_keys))))
(N, index.is_trained)



(1648, True)

In [59]:
# Precompute, for every row, up to K neighbours whose cosine is >= COS_MIN.
K = 10
COS_MIN = 0.60

# Single batched query; K + 1 hits are requested because a row's own vector
# is among its results and gets filtered out below.
S_all, I_all = index.search(E, K + 1)

neighbors, sims_list = [], []
for row, (hit_ids, hit_sims) in enumerate(zip(I_all, S_all)):
    # Keep hits that are not the query row itself and clear the cosine floor.
    keep = (hit_ids != row) & (hit_sims >= COS_MIN)
    neighbors.append(hit_ids[keep][:K].tolist())
    sims_list.append(hit_sims[keep][:K].astype(float).tolist())

In [60]:
def search_by_gid(target_gid, k=None):
    """Return the precomputed neighbours of one product group.

    Args:
        target_gid: groupId to look up; converted with ``str()`` before the
            lookup in ``gid2i``.
        k: Optional cap on the number of rows returned. ``None`` (default)
            returns every precomputed neighbour; negative values are treated
            as 0.

    Returns:
        DataFrame with the ``group_df`` rows of the neighbours, in the order
        stored in ``neighbors``, plus a float ``similarity`` column taken from
        ``sims_list``. Empty (same columns) when there are no neighbours.

    Raises:
        KeyError: if ``target_gid`` is not a known groupId.
    """
    key = str(target_gid)
    if key not in gid2i:
        raise KeyError(f"Unknown groupId: {key!r}")
    i = gid2i[key]
    js, ss = neighbors[i], sims_list[i]
    if k is not None:
        limit = max(int(k), 0)
        js, ss = js[:limit], ss[:limit]
    out = group_df.iloc[js].copy()
    # Explicit float dtype keeps the column numeric even when it is empty.
    out["similarity"] = np.asarray(ss, dtype=float)
    return out.reset_index(drop=True)


In [61]:
# Example: display the precomputed neighbours of one product group.
search_by_gid("106065")

Unnamed: 0,groupId,text,color,colors_str,categories,brand,priceband,similarity
0,527002,Fyndpaket Heminredning. Nu erbjuder vi dig som...,[],,[REA],unknown,Premium,0.749368
1,206532,Fyndpaket stor. Fyndpaket. Åshild REA.,[],,[REA],Åshild,Popular,0.684989
2,206524,Fyndpaket liten. Fyndpaket. Åshild REA.,[],,[REA],Åshild,Value,0.672018
3,536763,Fyndpaket Jul Hemtextil/juldekorationer. Fyndp...,[],,[REA],unknown,Premium,0.649287
4,350600,Fyndpaket Hobbyhörnan. Överraskningspaket med ...,[],,"[Hobbyhörnan, Pysselset]",Ateljé Margaretha,Premium,0.601266


In [62]:
def save_wide_topk_parquet(path="semantic_similarity_recs.parquet"):
    """Write the precomputed neighbours as a wide table, one row per product.

    Columns are "Product ID" followed by "Top 1" .. "Top K"; rows with fewer
    than K neighbours are padded with ``None``. All ids are strings.

    Args:
        path: Destination parquet file.

    Returns:
        The DataFrame that was written.
    """
    gids = group_df["groupId"].astype(str).tolist()
    top_columns = [f"Top {r}" for r in range(1, K + 1)]
    rows = []
    for gid, nbrs in zip(gids, neighbors):
        # Neighbour entries are row positions in group_df, so the id list can
        # be indexed directly. [:K] keeps rows consistent with the header even
        # if K was lowered after `neighbors` was computed.
        rec_ids = [gids[j] for j in nbrs][:K]
        rows.append([gid] + rec_ids + [None] * (K - len(rec_ids)))
    wide = pd.DataFrame(rows, columns=["Product ID"] + top_columns)
    wide.to_parquet(path, index=False)
    return wide

wide = save_wide_topk_parquet()
