In [24]:
import pandas as pd, numpy as np

In [25]:
groups = pd.read_parquet(
    "/workspace/data/processed/articles_for_recs.parquet")

In [26]:
groups['priceSEK'] = pd.to_numeric(groups['priceSEK'], errors='coerce')
before_count = len(groups)
groups = groups[groups['priceSEK'] >= 1]
after_count = len(groups)
print(f"Dropped {before_count - after_count} rows with priceSEK < 1")

if 'audienceId' in groups.columns:
    groups = groups.drop(columns=['audienceId'])


Dropped 18 rows with priceSEK < 1


In [27]:
import re
import unicodedata
import pandas as pd
import numpy as np

MISSING = {"", "unknown", "nan", "none", None}

def canon(s: str) -> str:
    """Normalize quotes, dashes, NBSP and whitespace; trim."""
    s = unicodedata.normalize("NFKC", str(s))
    s = re.sub(r"\u00A0", " ", s)                     # NBSP -> space
    s = re.sub(r"[\u2010-\u2015\u2212\-]+", "-", s)   # hyphen family -> "-"
    return re.sub(r"\s+", " ", s).strip()

def norm_categories(x):
    """Normalize, lowercase-dedupe (order-preserving), drop MISSING."""
    cats = [canon(c) for c in str(x).split(",") if str(c).strip() not in MISSING]
    seen, out = set(), []
    for c in cats:
        key = c.lower()
        if c and key not in seen:
            seen.add(key)
            out.append(c)
    return out

def short_desc(desc, max_words: int = 30):
    """Take the first sentence (if any) and cap at max_words."""
    if not desc:
        return ""
    first = re.split(r"(?<=[.!?])\s+", desc)[0]
    return " ".join(first.split()[:max_words])

def _split_multi(s: str, seps=",/|;"):
    """Split on any of the provided separators (regex-friendly), trimming whitespace."""
    # Build a character class from separators like ",/|;" -> "[,/|;]"
    pattern = r"\s*[" + re.escape(seps) + r"]\s*"
    return re.split(pattern, s) if any(ch in s for ch in seps) else [s]

def format_colors(col) -> str:
    """
    Render colors as 'Svart, Grå' (no brackets). Accepts list/tuple/Series/ndarray
    or strings like: "['Svart' 'Grå']", "Grå,Svart", "Svart/Grå".
    """
    vals = []
    if isinstance(col, (list, tuple, pd.Series, np.ndarray)):
        seq = list(col)
        for v in seq:
            s = str(v).strip()
            if not s or s.lower() in MISSING:
                continue
            parts = _split_multi(s, seps=",/|;")
            vals.extend(parts)
    else:
        s = str(col).strip()
        if s and s.lower() not in MISSING:
            # handle quoted lists like "['Svart' 'Grå']" or '["Svart","Grå"]'
            quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", s)
            if quoted:
                vals = [a or b for a, b in quoted]
            else:
                vals = _split_multi(s, seps=",/|;")

    # order-preserving dedupe with normalization
    out, seen = [], set()
    for v in vals:
        t = canon(v)
        key = t.lower()
        if t and key not in seen and key not in MISSING:
            seen.add(key)
            out.append(t)
    return ", ".join(out)

def format_sizes(sz) -> str:
    """
    Render sizes as '36/38, 40/42' (no brackets).
    Accepts list/tuple/Series/ndarray or strings like:
    "['36/38' '40/42']", "36/38,40/42", "36/38 | 40/42".
    NOTE: Do NOT split on '/' because '36/38' is a single size token.
    """
    vals = []
    if isinstance(sz, (list, tuple, pd.Series, np.ndarray)):
        for v in list(sz):
            s = str(v).strip()
            if not s or s.lower() in MISSING:
                continue
            parts = _split_multi(s, seps=",|;")  # no '/' here
            vals.extend(parts)
    else:
        s = str(sz).strip()
        if s and s.lower() not in MISSING:
            quoted = re.findall(r"'([^']+)'|\"([^\"]+)\"", s)
            if quoted:
                vals = [a or b for a, b in quoted]
            else:
                vals = _split_multi(s, seps=",|;")  # no '/'

    out, seen = [], set()
    for v in vals:
        t = canon(v)
        key = t.lower()
        if t and key not in seen and key not in MISSING:
            seen.add(key)
            out.append(t)
    return ", ".join(out)

def build_texts_for_ir(r):
    """
    Create two strings:
      - text_embed: compact, instruction-prefixed for embedding ("passage: ...")
      - text_rerank: richer, for cross-encoder reranking
    """
    name  = canon(r.get("name", ""))
    desc  = short_desc(canon(r.get("description", "")), 30)
    brand = canon(r.get("brand", ""))
    cats  = r.get("categories", []) or []
    cols  = r.get("colors_str", "")
    sizes = r.get("sizes_str", "")
    aud   = canon(r.get("audience", ""))

    # Make categories explicit tokens so queries like "bh bygel" hit
    # Add common Swedish synonyms if you use them in your data.
    cat_str = ", ".join(cats)

    # EMBEDDING TEXT (compact, labeled fields)
    embed_parts = [
        f"passage: {name}."
    ]
    if desc:
        embed_parts.append(desc)
    if brand:
        embed_parts.append(f"Brand: {brand}.")
    if aud:
        embed_parts.append(f"Audience: {aud}.")
    if cat_str:
        embed_parts.append(f"Categories: {cat_str}.")
    if cols:
        embed_parts.append(f"Colors: {cols}.")
    if sizes:
        embed_parts.append(f"Sizes: {sizes}.")
    text_embed = " ".join(embed_parts)

    # RERANK TEXT (richer; CE can use longer input)
    rerank_parts = [
        f"{name}.",
        desc if desc else "",
        f"Brand: {brand}." if brand else "",
        f"Audience: {aud}." if aud else "",
        f"Categories: {cat_str}." if cat_str else "",
        f"Colors: {cols}." if cols else "",
        f"Sizes: {sizes}." if sizes else "",
    ]
    text_rerank = " ".join(p for p in rerank_parts if p).strip()
    return text_embed, text_rerank

def prepare_group_corpus(groups: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    g = groups.copy()
    if "color" not in g.columns:
        g["color"] = ""
    if "size" not in g.columns:
        g["size"] = ""

    g["categories"] = g["category"].apply(norm_categories)
    g["colors_str"] = g["color"].apply(format_colors)
    g["sizes_str"]  = g["size"].apply(format_sizes)

    texts = g.apply(lambda r: build_texts_for_ir(r), axis=1)
    g["text"]        = [t[0] for t in texts]  # for embeddings (passage: ...)
    g["text_rerank"] = [t[1] for t in texts]  # for cross-encoder

    cols = ["groupId","text","text_rerank","audience","color","colors_str","size",
            "sizes_str","categories","brand","name"]
    present = [c for c in cols if c in g.columns]
    group_df = g[present].reset_index(drop=True)
    corpus = group_df["text"].tolist()  # this is what you embed/index
    return group_df, corpus


# Example usage:
group_df, corpus = prepare_group_corpus(groups)


In [28]:
MODEL_ID = "Alibaba-NLP/gte-multilingual-base"

import os, torch
from sentence_transformers import SentenceTransformer

os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(4)

enc = SentenceTransformer(MODEL_ID, device="cpu", trust_remote_code=True)
enc.max_seq_length = min(256, enc.tokenizer.model_max_length)
print(enc)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'NewModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [29]:
#embed texts

texts = group_df["text"].fillna("").tolist()

E = enc.encode(
    texts,
    batch_size=256,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True
).astype("float32")

N, d = E.shape
N, d


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

(1700, 768)

Here the change from semantic search begins: to reuse the index for searching by product ID later
replace IndexFlatIP with an ID-mapped index so you can look up items by their real IDs.

In [30]:
import faiss

index = faiss.IndexIDMap2(faiss.IndexFlatIP(d))  # cosine since E is normalized
ids = group_df["groupId"].astype(np.int64).to_numpy()
index.add_with_ids(E, ids)

N, index.is_trained


(1700, True)

In [36]:
# Save FAISS index to disk
faiss.write_index(index, "products.faiss")

# Save metadata for quick lookup
group_df[["groupId", "name", "brand", "categories", "audience", "color", "size", "text", "text_rerank"]].to_parquet(
    "product_meta.parquet", index=False
)


In [47]:
# At query time
query = "Herrlinne"
qv = enc.encode(["query: " + query], normalize_embeddings=True,
                convert_to_numpy=True).astype("float32")
scores, ids = index.search(qv, 400)  # larger K to allow later gating

meta = pd.read_parquet("product_meta.parquet")


meta = meta.copy()
meta["groupId"] = meta["groupId"].astype(np.int64)

# Build ranking dataframe from FAISS results and join names
rank = pd.DataFrame({"groupId": ids[0].astype(np.int64), "score": scores[0]})
rank = rank[rank["groupId"] != -1]

out = (rank
       .merge(meta[["groupId", "name", "brand", "color", "size"]], on="groupId", how="left")
       .sort_values("score", ascending=False)
       .reset_index(drop=True))

out[["groupId", "name", "brand", "score", "color", "size"]]

Unnamed: 0,groupId,name,brand,score,color,size
0,261195,Herrlinne,Åshild,0.752234,[Vit],"[2XL, 3XL, L, M, XL]"
1,260192,Linne,Trofé,0.612880,"[Svart, Vit]","[2XL, 3XL, L, M, S, XL]"
2,280100,Herrsandal brun mocka Airbed,Embla of Sweden,0.595481,[Brun],"[40, 41, 42, 43, 44, 45, 46]"
3,217467,Linne,Åshild,0.581021,[Svart],"[36/38, 40/42, 44/46, 56/58]"
4,261756,Bygel-bh Laila,Trofé,0.580530,[Grön],"[A70, A75, A80, A85, B70, B75, B80, B85, B90, ..."
...,...,...,...,...,...,...
395,270431,Amnings-bh Leonarda,Abecita by Swegmark,0.506509,[Svart],"[F70, J80]"
396,260903,Bygel-bh Iris,Abecita by Swegmark,0.506270,"[Svart, Vit]","[C80, F85]"
397,291757,MoliCare Premium Lady pants 5 droppar,Hartmann,0.506127,[],"[7 droppar, Large]"
398,261064,Hipstertrosa Essentials,Anita,0.506097,"[Antracit, Marin, Röd, Sand, Svart]","[L/XL, S/M]"


In [48]:
# --- Rerank FAISS candidates on CPU using GTE multilingual cross-encoder ---

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Inputs assumed available:
#   - index: FAISS index already loaded
#   - query: user query string, e.g. "bh bygel åshild"
#   - NOTE: your embeddings were built from `text` with the "passage: " prefix

# 1) Encode query for FAISS (GTE/E5-style prefix)
qv = enc.encode(["query: " + query], normalize_embeddings=True,
                convert_to_numpy=True).astype("float32")

# 2) Retrieve a larger candidate pool from FAISS
K = 400
scores, ids = index.search(qv, K)
cand_ids = ids[0].astype(np.int64)
cand_ids = cand_ids[cand_ids != -1]

# 3) Load metadata and join candidates
meta = pd.read_parquet("product_meta.parquet").copy()
meta["groupId"] = meta["groupId"].astype(np.int64)

need_cols = ["groupId", "name", "brand", "color", "size", "text_rerank"]
cands = (
    pd.DataFrame({"groupId": cand_ids})
    .drop_duplicates(subset=["groupId"])
    .merge(meta[need_cols], on="groupId", how="left")
    .dropna(subset=["text_rerank"])
)

# Early exit: no candidates
if len(cands) == 0:
    final = cands.copy()
else:
    # 4) Cross-encode (query, doc) pairs with proper instruction prefix
    tok = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-reranker-base")
    rerank = AutoModelForSequenceClassification.from_pretrained(
        "Alibaba-NLP/gte-multilingual-reranker-base", trust_remote_code=True
    ).eval()

    pairs = [("query: " + query, t) for t in cands["text_rerank"].tolist()]

    # Batch to control memory
    bs = 64
    ce_scores = []
    with torch.no_grad():
        for i in range(0, len(pairs), bs):
            batch = tok(pairs[i:i+bs], padding=True, truncation=True,
                        max_length=256, return_tensors="pt")
            logits = rerank(**batch).logits.view(-1).cpu().numpy()
            ce_scores.append(logits)

    cands["rerank_score"] = np.concatenate(ce_scores) if ce_scores else np.array([])

    # 5) Sort by reranker score (highest = best) and select output columns
    final = (cands.sort_values("rerank_score", ascending=False)
                  .loc[:, ["groupId", "name", "brand", "color", "size", "rerank_score"]]
                  .reset_index(drop=True))



In [49]:
# Show top results
final.head(20)

Unnamed: 0,groupId,name,brand,color,size,rerank_score
0,261195,Herrlinne,Åshild,[Vit],"[2XL, 3XL, L, M, XL]",1.574539
1,393017,Måttband på rulle,Hemline,[],[],0.414236
2,470161,Fingerborg i läder,Hemline,[],[],0.094913
3,260192,Linne,Trofé,"[Svart, Vit]","[2XL, 3XL, L, M, S, XL]",0.052729
4,270054,Baddräkt Shirley,Damella,"[Marin, Svart]","[36, 38, 40, 42, 44, 46, 48]",0.039965
5,261846,Linne,Damella,"[Off-white, Svart]","[36, 38, 40, 42]",0.029686
6,230097,Klänning Henrietta,Åshild,[Svart],"[38, 40, 42, 44, 46, 48, 50, 52, 54]",0.021018
7,393066,Klädvårdsrulle refill 2-pack,Hemline,[],[],0.018764
8,393058,Klädvårdsrulle,Hemline,[],[],-0.000581
9,261866,Ull-linne,Damella,"[Svart, Vanilj]","[38, 40, 42, 44, 46, 48]",-0.108966


In [23]:
# Brands
# Brands
unique_brands = pd.Series(group_df["brand"].dropna().unique())
unique_brands_lower = unique_brands.str.lower().unique()
pd.Series(unique_brands_lower).to_frame(name="brand").to_parquet("brands.parquet", index=False)

# Categories (same pattern as brands)
all_categories = group_df["categories"].dropna().explode()
unique_categories = pd.Series(all_categories.dropna().unique())
unique_categories_lower = unique_categories.str.lower().unique()
pd.Series(unique_categories_lower).to_frame(name="category").to_parquet("categories.parquet", index=False)

# Colors
# Flatten the color lists, remove NAs, and get unique (case-insensitive) color names
all_colors = group_df["color"].dropna().explode()
unique_colors = pd.Series(all_colors.dropna().unique())
unique_colors_lower = unique_colors.str.lower().unique()
pd.Series(unique_colors_lower).to_frame(name="color").to_parquet("colors.parquet", index=False)

# Sizes
# Flatten the size lists, remove NAs, and get unique (case-insensitive) size values
all_sizes = group_df["size"].dropna().explode()
unique_sizes = pd.Series(all_sizes.dropna().unique())
unique_sizes_lower = unique_sizes.str.lower().unique()
pd.Series(unique_sizes_lower).to_frame(name="size").to_parquet("sizes.parquet", index=False)


In [None]:
# Load index
index = faiss.read_index("products.faiss")

# Encode a query (e.g., user's search text)
query = "bh bygel åshild"
qv = enc.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype("float32")

# Search top 10 similar items
scores, ids = index.search(qv, 100)


meta = pd.read_parquet("product_meta.parquet")


meta = meta.copy()
meta["groupId"] = meta["groupId"].astype(np.int64)

# Build ranking dataframe from FAISS results and join names
rank = pd.DataFrame({"groupId": ids[0].astype(np.int64), "score": scores[0]})
rank = rank[rank["groupId"] != -1]

out = (rank
       .merge(meta[["groupId", "name", "brand", "color", "size"]], on="groupId", how="left")
       .sort_values("score", ascending=False)
       .reset_index(drop=True))

out[["groupId", "name", "brand", "score", "color", "size"]]


Unnamed: 0,groupId,name,brand,score,color,size
0,261736,Bygel-bh Essence,Miss Mary,0.772645,"[Svart, Vit]","[B100, B105, B75, B80, B85, B90, B95, C100, C1..."
1,240176,Byxa med resårmidja,Åshild,0.770817,"[Beige, Marin, Svart]","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 38, 4..."
2,267111,Bygel-bh Embroidery Dreams,Miss Mary,0.768182,"[Rosa, Svart]","[A100, A70, A75, A80, A85, A90, A95, B100, B70..."
3,261695,Bygel-bh Julia,Trofé,0.765987,"[Off-white, Svart]","[A70, A75, A80, A85, B70, B75, B80, B85, B90, ..."
4,260097,Bygel-bh Asta,Trofé,0.765903,"[Off-white, Svart]","[B75, B80, B85, B90, B95, C75, C80, C85, C90, ..."
5,242131,Byxa,Åshild,0.763808,"[Grå, Svart, Vinröd]","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 38, 4..."
6,261697,Bygel-bh Julia Lace,Trofé,0.757938,"[Off-white, Svart]","[A70, A75, A80, A85, B70, B75, B80, B85, B90, ..."
7,261907,Bygel-bh Bobette,Anita,0.756458,"[Röd, Svart]","[A75, A80, A85, A90, A95, B75, B80, B85, B90, ..."
8,242108,Populär byxa med resårmidja Ellen,Åshild,0.755024,"[Marin, Svart]","[38, 40, 42, 44, 46, 48, 50, 52, 54]"
9,261718,Bygel-bh Ella,Trofé,0.753005,[Svart],"[B70, B75, B80, B85, B90, C70, C75, C80, C85, ..."


In [11]:
# --- Rerank FAISS candidates on CPU (minimal) ---
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch, pandas as pd, numpy as np

tok = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-reranker-base")
rerank = AutoModelForSequenceClassification.from_pretrained(
    "Alibaba-NLP/gte-multilingual-reranker-base", trust_remote_code=True
).eval()  # CPU fp32


In [14]:
# 2) Retrieve with FAISS
K = 100
scores, ids = index.search(qv, K)
cand_ids = ids[0].astype(np.int64)
cand_ids = cand_ids[cand_ids != -1]

# 3) Prepare texts for reranking
meta = pd.read_parquet("product_meta.parquet").copy()
meta["groupId"] = meta["groupId"].astype(np.int64)


cands = (pd.DataFrame({"groupId": cand_ids})
         .merge(meta[["groupId","name","brand","color","size","text"]], on="groupId", how="left")
         .dropna(subset=["text"]))

# 4) Cross-encode (query, doc) pairs and rerank
pairs = [(query, t) for t in cands["text"].tolist()]
with torch.no_grad():
    batch = tok(pairs, padding=True, truncation=True, max_length=256, return_tensors="pt")
    ce = rerank(**batch).logits.view(-1).numpy()

cands["rerank_score"] = ce
final = (cands.sort_values("rerank_score", ascending=False)
         .loc[:, ["groupId","name","brand","color","size","rerank_score", "text"]]
         .reset_index(drop=True))

In [15]:
final.head(10)

Unnamed: 0,groupId,name,brand,color,size,rerank_score,text
0,241687,Bekväma caprileggings i stretchig kvalitet,Åshild,"[Svart, Vit]","[2XL, 3XL, 4XL, L, M, S, XL, XS]",1.192068,AUDIENCE: dam. Bekväma caprileggings i stretch...
1,262023,Bygel-bh Organic Cotton t-shirt bh,Miss Mary,"[Beige, Svart]","[B100, B105, B65, B75, B80, B85, B90, B95, C10...",1.056902,AUDIENCE: dam. Bygel-bh Organic Cotton t-shirt...
2,267115,Bygel-bh Organic Cotton t-shirt bh,Miss Mary,"[Beige, Svart]","[B100, B105, B75, B80, B85, B90, B95, C100, C1...",0.918124,AUDIENCE: dam. Bygel-bh Organic Cotton t-shirt...
3,261873,Bygel-bh Clean Curves,Swegmark,"[Beige, Svart, Vit]","[B100, B75, B80, B85, B90, B95, C100, C75, C80...",0.901874,AUDIENCE: dam. Bygel-bh Clean Curves. Denna el...
4,565301,Bygel-bh Fleur,Rosa Faia,"[Antracit, Off-white, Rosa, Svart]","[B100, B105, B70, B75, B80, B85, B90, B95, C10...",0.819396,AUDIENCE: dam. Bygel-bh Fleur. Elegant bygel- ...
5,261737,Bygel-bh Tenderly,Miss Mary,"[Svart, Vit]","[B100, B105, B75, B80, B85, B90, B95, C100, C1...",0.809103,AUDIENCE: dam. Bygel-bh Tenderly. En bh från M...
6,260090,Bygel-bh Sanna,Trofé,"[Beige, Svart, Vit]","[B100, B70, B75, B80, B85, B90, B95, C100, C11...",0.769203,AUDIENCE: dam. Bygel-bh Sanna. Sanna bygel-bh ...
7,260897,T-shirt-bh med bygel Smooth Lacy,Miss Mary,"[Beige, Mörkblå, Svart, Vit]","[B100, B105, B75, B80, B85, B90, B95, C100, C1...",0.761805,AUDIENCE: dam. T-shirt-bh med bygel Smooth Lac...
8,261736,Bygel-bh Essence,Miss Mary,"[Svart, Vit]","[B100, B105, B75, B80, B85, B90, B95, C100, C1...",0.757004,AUDIENCE: dam. Bygel-bh Essence. En elegant bo...
9,267111,Bygel-bh Embroidery Dreams,Miss Mary,"[Rosa, Svart]","[A100, A70, A75, A80, A85, A90, A95, B100, B70...",0.750061,AUDIENCE: dam. Bygel-bh Embroidery Dreams. Ele...
