In [55]:
import pandas as pd, numpy as np

In [56]:
# Load the article table from the processed-data directory.
groups = pd.read_parquet(path="/workspace/data/processed/articles_for_recs.parquet")

In [57]:
# Parse prices as numbers; values that cannot be parsed become NaN.
groups['priceSEK'] = pd.to_numeric(groups['priceSEK'], errors='coerce')

# Keep only rows whose priceSEK is at least 1. NaN fails the comparison, so
# rows with unparseable prices are removed here too and are included in the
# count printed below.
before_count = len(groups)
groups = groups.loc[groups['priceSEK'].ge(1)]
after_count = len(groups)
print(f"Dropped {before_count - after_count} rows with priceSEK < 1")

# Discard the audienceId column when the file provides one.
groups = groups.drop(columns=['audienceId'], errors='ignore')


Dropped 18 rows with priceSEK < 1


In [58]:
import re
import unicodedata
import pandas as pd
import numpy as np   # <-- add this if you use np.ndarray

# Placeholder values that mean "no data"; callers compare lower-cased strings
# against this set.
MISSING = {"", "unknown", "nan", "none", None}

def canon(s: str) -> str:
    """
    Return a canonical, single-line form of ``s``.

    Steps: NFKC-normalize, turn non-breaking spaces into plain spaces,
    collapse any run of Unicode dashes/hyphens/minus signs into one ASCII
    '-', collapse whitespace runs into one space, and strip both ends.

    Non-string input is converted with ``str()``. ``None`` and float NaN
    return '' so that a missing cell does not leak the literal words
    "None" / "nan" into downstream text.
    """
    # `s != s` is only true for NaN; np.float64 is a float subclass, so
    # np.nan is covered as well.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = unicodedata.normalize("NFKC", str(s))
    # NFKC already maps U+00A0 to a plain space; kept as an explicit safeguard.
    s = re.sub(r"\u00A0", " ", s)
    # U+2010..U+2015 (hyphens/dashes), U+2212 (minus) and ASCII '-' -> single '-'
    s = re.sub(r"[\u2010-\u2015\u2212\-]+", "-", s)
    return re.sub(r"\s+", " ", s).strip()

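# NOTE: norm_categories, short_desc and format_colors are used below but are not defined in this cell;
# they must already exist in the kernel (e.g. from an earlier cell) before this cell runs.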

def format_sizes(sz) -> str:
    """
    Turn a size value into a flat, comma-separated string such as
    '36/38, 40/42'.

    ``sz`` may be a list, tuple, pandas Series or numpy array of size
    tokens, or any other value, which is handled through its string form
    (for example "['36/38' '40/42']", "36/38,40/42" or "36/38 | 40/42").

    Tokens are separated on ',', '|' and ';' only. '/' is never a
    separator, because a value like '36/38' is a single size.
    Placeholder values listed in MISSING are skipped, and duplicates are
    removed case-insensitively while keeping first-seen order.
    """
    separators = r"\s*[,|;]\s*"
    raw_tokens = []

    if not isinstance(sz, (list, tuple, pd.Series, np.ndarray)):
        # Scalar input: work on its string representation.
        text = str(sz).strip()
        if text and text.lower() not in MISSING:
            # A stringified list keeps each token inside quotes,
            # e.g. "['36/38' '40/42']" or '["36/38","40/42"]'.
            matches = re.findall(r"'([^']+)'|\"([^\"]+)\"", text)
            if matches:
                raw_tokens = [single or double for single, double in matches]
            else:
                # re.split returns [text] unchanged when no separator occurs.
                raw_tokens = re.split(separators, text)
    else:
        # Sequence input: every element may itself hold several sizes.
        for item in sz:
            text = str(item).strip()
            if text and text.lower() not in MISSING:
                raw_tokens.extend(re.split(separators, text))

    # dict keys keep insertion order: lower-cased key -> first spelling seen
    first_seen = {}
    for token in raw_tokens:
        cleaned = canon(token)
        key = cleaned.lower()
        if cleaned and key not in MISSING and key not in first_seen:
            first_seen[key] = cleaned
    return ", ".join(first_seen.values())

# ---- build ----
# Take a copy so the column assignments below do not write into a shared frame.
groups = groups.copy()

# "color" and "size" are optional in the input; default them to empty strings.
for optional_col in ("color", "size"):
    if optional_col not in groups.columns:
        groups[optional_col] = ""

# Derived columns: normalized categories plus display strings for colors and sizes.
# norm_categories and format_colors come from outside this cell.
groups["categories"] = groups["category"].apply(norm_categories)
groups["colors_str"] = groups["color"].apply(format_colors)
groups["sizes_str"] = groups["size"].apply(format_sizes)

def build_text_embed_clean(r):
    """
    Compose the text that is embedded for one product row.

    ``r`` is a row-like object with ``.get`` (here: a row of ``groups``).
    Layout of the result:

        AUDIENCE: <audience>. <name>. <short description> <brand> <categories> <colors> <sizes>.

    The audience prefix is always emitted; every other piece is left out
    when it is empty. Categories are joined with ', '. The attribute
    pieces (brand, categories, colors, sizes) are joined with single
    spaces and closed with one '.'. Whitespace runs in the final string
    are collapsed to single spaces.
    """
    title = canon(r.get("name", ""))
    # NOTE(review): 30 is presumably a length limit for the description;
    # confirm against short_desc, which is defined outside this cell.
    blurb = short_desc(canon(r.get("description", "")), 30)
    maker = canon(r.get("brand", ""))
    categories = r.get("categories", []) or []
    colors = r.get("colors_str", "")
    size_text = r.get("sizes_str", "")
    audience = canon(r.get("audience", ""))

    segments = [f"AUDIENCE: {audience}."]
    if title:
        segments.append(f"{title}.")
    if blurb:
        segments.append(blurb)

    # (presence flag, rendered text) pairs, in output order
    attribute_sources = [
        (maker, maker),
        (categories, ", ".join(categories)),
        (colors, colors),
        (size_text, size_text),
    ]
    attributes = [rendered for present, rendered in attribute_sources if present]
    if attributes:
        segments.append(" ".join(attributes) + ".")

    return re.sub(r"\s+", " ", " ".join(segments)).strip()

# One embedding document per row.
groups["text"] = groups.apply(build_text_embed_clean, axis=1)

# Retrieval table: the id, the embedding text, and the raw and formatted
# attributes, re-indexed from 0.
group_df = groups.loc[
    :,
    [
        "groupId", "text", "audience",
        "color", "colors_str",
        "size", "sizes_str",
        "categories", "brand", "name",
    ],
].reset_index(drop=True)

corpus = group_df["text"].to_list()


In [None]:
import os
import torch
from sentence_transformers import SentenceTransformer

MODEL_ID = "Alibaba-NLP/gte-multilingual-base"

# Turn off the tokenizers library's own parallelism and cap torch at 4 CPU threads.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_num_threads(4)

# CPU-only encoder; trust_remote_code is required because the checkpoint ships
# its own model class (reported as 'NewModel' in the output below).
enc = SentenceTransformer(MODEL_ID, trust_remote_code=True, device="cpu")

# Never exceed the tokenizer's own limit, and cap at 4096 tokens.
enc.max_seq_length = min(enc.tokenizer.model_max_length, 4096)
print(enc)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 4096, 'do_lower_case': False, 'architecture': 'NewModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


: 

In [None]:
# Embed every product text; a missing text is encoded as the empty string.
texts = group_df["text"].fillna("").tolist()

# Unit-length float32 vectors, one row per text.
E = enc.encode(
    texts,
    batch_size=512,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype("float32")

# N = number of embedded texts, d = embedding dimension
N, d = E.shape
(N, d)


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

From here on the notebook departs from plain semantic search. So that the index can later be queried by product ID,
`IndexFlatIP` is wrapped in an ID-mapped index, which lets you look up items by their real IDs.

In [None]:
import faiss

# FAISS ids are int64; use the real groupId of each row as its id.
ids = group_df["groupId"].astype(np.int64).to_numpy()

# Inner product on the normalized rows of E is cosine similarity.
# The IndexIDMap2 wrapper makes the index return groupIds instead of row positions.
index = faiss.IndexIDMap2(faiss.IndexFlatIP(d))
index.add_with_ids(E, ids)

(N, index.is_trained)


(1702, True)

In [None]:
# Persist the vector index.
faiss.write_index(index, "products.faiss")

# Persist the per-product metadata that search results are joined against.
group_df.loc[
    :, ["groupId", "name", "brand", "categories", "audience", "color", "size", "text"]
].to_parquet("product_meta.parquet", index=False)


In [None]:
# Brand vocabulary: distinct non-null brands, lower-cased and de-duplicated again
# so that spellings differing only in case collapse to one entry.
unique_brands = pd.Series(pd.unique(group_df["brand"].dropna()))
unique_brands_lower = unique_brands.str.lower().unique()
pd.DataFrame({"brand": unique_brands_lower}).to_parquet("brands.parquet", index=False)

# Color vocabulary: explode gives one row per color in each product's list; the
# second dropna removes the NaN rows produced by empty lists. Then the same
# lower-case de-duplication as for brands.
all_colors = group_df["color"].dropna().explode()
unique_colors = pd.Series(pd.unique(all_colors.dropna()))
unique_colors_lower = unique_colors.str.lower().unique()
pd.DataFrame({"color": unique_colors_lower}).to_parquet("colors.parquet", index=False)


In [None]:
# Reload the persisted index from disk.
index = faiss.read_index("products.faiss")

# Embed the free-text query with the same encoder and normalization as the catalogue.
query = "blå M"
qv = enc.encode(
    [query],
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype("float32")

# Ten nearest neighbours by inner product.
scores, ids = index.search(qv, 10)

# Metadata, with groupId cast to int64 so it matches the ids returned by the index.
meta = pd.read_parquet("product_meta.parquet").assign(
    groupId=lambda frame: frame["groupId"].astype(np.int64)
)

# One row per hit; FAISS reports id -1 for result slots it could not fill.
rank = pd.DataFrame({"groupId": ids[0].astype(np.int64), "score": scores[0]})
rank = rank.loc[rank["groupId"].ne(-1)]

# Attach the product attributes and order by similarity, best first.
out = rank.merge(
    meta[["groupId", "name", "brand", "color", "size"]],
    on="groupId",
    how="left",
)
out = out.sort_values("score", ascending=False).reset_index(drop=True)

out[["groupId", "name", "brand", "score", "color", "size"]]


Unnamed: 0,groupId,name,brand,score,color
0,210730,Rutig blus,Åshild,0.717682,[Blå]
1,219477,Herrskjorta,unknown,0.690796,"[Blå, Röd]"
2,210247,Kortärmad bomullsblus,Åshild,0.68846,[Vit]
3,270312,Herrpyjamas blå/vit,Åshild,0.671869,"[Blå, Marin]"
4,507863,Bomullslakan,Borganäs of Sweden,0.664199,"[Blå, Brun, Grön, Linne, Ljusblå, Ljusgrå, Mör..."
5,210748,Blus,Åshild,0.65618,[Svart]
6,290039,G-punktsvibrator Bodil blå Belladot,Belladot,0.654369,[]
7,270095,Tankini,Damella,0.651817,[Blå]
8,242511,Bomullsbyxa Julia,Åshild,0.649073,"[Beige, Ljusblå, Marin, Vit]"
9,531400,Bågkappa Fanny med broderade blommor,Linea,0.648997,"[Blå, Vit]"


In [None]:
# Rerank the FAISS candidates with a cross-encoder on CPU.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch, pandas as pd, numpy as np

# Tokenizer and sequence-classification model of the reranker, in eval mode
# with default (fp32) weights on CPU.
tok = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-reranker-base")
rerank = AutoModelForSequenceClassification.from_pretrained(
    "Alibaba-NLP/gte-multilingual-reranker-base",
    trust_remote_code=True,
)
rerank = rerank.eval()

# Candidate retrieval: top-K ids for the query vector, without the -1 fillers.
K = 100
scores, ids = index.search(qv, K)
cand_ids = ids[0].astype(np.int64)
cand_ids = cand_ids[cand_ids != -1]

# Candidate texts: join the ids to the stored metadata and drop candidates
# that have no text.
meta = pd.read_parquet("product_meta.parquet").assign(
    groupId=lambda frame: frame["groupId"].astype(np.int64)
)
cands = pd.DataFrame({"groupId": cand_ids}).merge(
    meta[["groupId", "name", "brand", "color", "size", "text"]],
    on="groupId",
    how="left",
)
cands = cands.dropna(subset=["text"])

# Score every (query, document) pair in one forward pass; inputs longer than
# 512 tokens are truncated.
pairs = [(query, doc_text) for doc_text in cands["text"].tolist()]
with torch.no_grad():
    batch = tok(pairs, padding=True, truncation=True, max_length=512, return_tensors="pt")
    ce = rerank(**batch).logits.view(-1).numpy()

# Highest cross-encoder score first.
cands["rerank_score"] = ce
final = cands.sort_values(by="rerank_score", ascending=False)
final = final[["groupId", "name", "brand", "color", "size", "rerank_score"]].reset_index(drop=True)

print(final.head(10))
