## Assemble a dataset

In [187]:
import pandas as pd

# Load only the article attributes used in this notebook. Every column is read as
# text so identifiers such as sku/groupId keep their exact spelling (e.g. leading zeros).
art = pd.read_csv(
    "../data/processed/articles_clean.csv",
    dtype=str,
    usecols=["sku", "groupId", "name", "name.1", "status", "audience", "category", "priceSEK"],
)

In [188]:
# Share of rows in art whose priceSEK is missing (NaN)
art['priceSEK'].isna().mean()

np.float64(0.14845806867347877)

In [190]:
# Back-fill article prices that are missing or unusable with the median price
# observed for the same sku in transactions_clean.csv.

# Load transactions with only sku and price_sek (as text; parsed explicitly below)
trans = pd.read_csv(
    "../data/processed/transactions_clean.csv",
    usecols=["sku", "price_sek"],
    dtype=str
)

# Parse transaction prices; unparsable values become NaN and prices below 1 are
# treated as missing so they cannot drag the median down.
trans['price_sek'] = pd.to_numeric(trans['price_sek'], errors='coerce')
trans.loc[trans['price_sek'] < 1, 'price_sek'] = float('nan')

# Median price per sku. Transactions without a sku are left out (groupby default):
# a NaN-keyed group would otherwise be matched by articles whose own sku is missing
# and hand them an unrelated price.
sku_price = trans.groupby('sku')['price_sek'].median()

# Apply the same validity rule to the article prices BEFORE building the mask, so
# that unparsable or below-1 article prices also receive the transaction fallback
# instead of being discarded later without a second chance.
art['priceSEK'] = pd.to_numeric(art['priceSEK'], errors='coerce')
art.loc[art['priceSEK'] < 1, 'priceSEK'] = float('nan')

# Fill only the rows that still lack a usable price; skus unseen in the
# transactions map to NaN and therefore stay missing.
mask = art['priceSEK'].isna()
art.loc[mask, 'priceSEK'] = art.loc[mask, 'sku'].map(sku_price)


In [191]:
# Make priceSEK numeric; anything that cannot be parsed turns into NaN
art['priceSEK'] = pd.to_numeric(art['priceSEK'], errors='coerce')
# Prices under 1 are considered invalid and are blanked out as missing
below_one = art['priceSEK'] < 1
art.loc[below_one, 'priceSEK'] = pd.NA


In [192]:
# Share of rows in art whose priceSEK is still missing (NaN) at this point
art['priceSEK'].isna().mean()

np.float64(0.0684065175365921)

In [193]:
# Summary statistics of priceSEK (NaN values are not counted by describe())
art['priceSEK'].describe()

count    101199.000000
mean        467.805005
std         350.656167
min           1.000000
25%         318.000000
50%         438.000000
75%         569.000000
max       34998.000000
Name: priceSEK, dtype: float64

In [194]:
# Assign every article to one of six price bands with hard-coded edges.
# pd.cut uses right-closed intervals, so together with include_lowest=True the
# bands are [0, 100], (100, 300], (300, 600], (600, 1000], (1000, 2000], (2000, inf).
price_labels = ['Budget', 'Value', 'Popular', 'Premium', 'Luxury', 'Exclusive']
price_bins = [0, 100, 300, 600, 1000, 2000, float('inf')]

banded = pd.cut(art['priceSEK'], bins=price_bins, labels=price_labels, include_lowest=True)
# Categorical -> object so a value outside the category list ('unknown') can be stored
art['priceband'] = banded.astype(object)
# Rows without a price get an explicit placeholder band
art.loc[art['priceSEK'].isna(), 'priceband'] = 'unknown'

In [195]:
# Keep only rows with a known price band, then discard the raw priceSEK and status columns.
# NOTE: not re-runnable on its own - a second run fails because the columns are already gone.
has_band = art["priceband"].ne("unknown")
art = art.loc[has_band].drop(columns=["priceSEK", "status"])

In [196]:
# Inspect the article table (one row per sku at this stage)
art

Unnamed: 0,sku,groupId,name,name.1,audience,category,priceband
1,052743,052743,Lakan örngott,unknown,unknown,unknown,Value
2,055522,055522,Tröja,Gjestal Garn,dam,Tröjor,Budget
3,055573,055573,Luva,Novita,dam,"Mössor & hattar,Mönster",Budget
4,055575,055575,Vantar,Novita,dam,Vantar,Budget
5,055576,055576,Benvärmare,Novita,dam,Sockor & strumpor,Budget
...,...,...,...,...,...,...,...
108523,AH3021-4244,AH3021,Stödstrumpa Herr,Funq Wear,generic,"Stödstrumpor,Stödartiklar",Value
108524,AH3021-4547,AH3021,Stödstrumpa Herr,Funq Wear,generic,"Stödstrumpor,Stödartiklar",Value
108620,SOFIND,SOFIND,Bruksanvisng Sofia N DK,unknown,unknown,unknown,Budget
108622,TEST01,TEST01,guldarmband,Disney,unknown,Smycken,Popular


## Keep only one row per groupId

In [197]:
# Deduplicate so only one row per groupId remains, keeping the one with the most information (fewest NA/unknown), and drop sku
def info_score(row, exclude=("sku", "groupId")):
    """Count how many fields of one article row carry real information.

    The fields are taken from the row itself (not from a global DataFrame), so the
    function works on any row-like object that offers ``.items()`` - a pandas
    Series or a plain dict.

    Parameters
    ----------
    row : pandas.Series or dict
        One article record, mapping column name -> value.
    exclude : iterable of str, optional
        Column names that never count towards the score (identifier columns by
        default).

    Returns
    -------
    int
        Number of non-excluded fields whose value is neither missing (NaN/None)
        nor one of the placeholder spellings "", "unknown", "nan", "none"
        (compared after stripping whitespace and lower-casing).
    """
    skipped = set(exclude)
    score = 0
    for col, val in row.items():
        if col in skipped:
            continue
        if pd.isna(val):
            continue
        # Placeholder text is as uninformative as a real missing value
        if str(val).strip().lower() in {"", "unknown", "nan", "none"}:
            continue
        score += 1
    return score

# Score every row, then keep the highest-scoring row of each groupId.
# kind="stable" matters: the default sort algorithm does not preserve the order of
# equal keys, so without it the row kept among equally informative candidates
# depends on the sort implementation rather than on the original row order.
art = (
    art.assign(_info_score=art.apply(info_score, axis=1))
       .sort_values("_info_score", ascending=False, kind="stable")
       .drop_duplicates(subset=["groupId"], keep="first")
       .drop(columns=["_info_score", "sku"])   # helper column and per-variant id are no longer needed
       .reset_index(drop=True)
)


In [198]:
# Inspect the article table (one row per groupId at this stage)
art

Unnamed: 0,groupId,name,name.1,audience,category,priceband
0,261270,Boxertrosa Basic,Miss Mary,dam,"Underkläder,Trosor",Value
1,261464,Bh utan bygel Meadow Cotton,Swegmark,dam,"Bh utan bygel,Bh,Underkläder",Popular
2,261467,Body Adorable,Swegmark,dam,"Body,Underkläder",Premium
3,261468,Bh utan bygel Sensible,Swegmark,dam,"Bh utan bygel,Bh,Underkläder",Popular
4,261591,Bygel bh Smooth Line,Swegmark,dam,"Bygel-bh,Bh,Underkläder",Popular
...,...,...,...,...,...,...
8460,250035,,unknown,unknown,unknown,Popular
8461,270025,,unknown,unknown,unknown,Popular
8462,210138,,unknown,unknown,unknown,Value
8463,210146,,unknown,unknown,unknown,Value


## 1. Build a clean text field for vectorization

In [199]:
import pandas as pd, unicodedata, re

# Lower-cased spellings that are treated as "no value"
MISSING = {"", "unknown", "nan", "none"}

def clean(x):
    """Return ``x`` as a whitespace-stripped string, or "" when it carries no value.

    A value carries no value when it is NaN/None or when its stripped, lower-cased
    text is one of the MISSING placeholders.
    """
    if pd.isna(x):
        return ""
    text = str(x).strip()
    if text.lower() in MISSING:
        return ""
    return text

def canon(s):
    """Canonicalise free text: NFKC-normalise, turn separators into spaces, squeeze whitespace.

    '&', ',', non-breaking spaces and runs of hyphen/dash characters are each
    replaced by a single space; whitespace runs are then collapsed and the result
    is stripped at both ends.
    """
    out = unicodedata.normalize("NFKC", s)
    for separator in ("&", ","):
        out = out.replace(separator, " ")
    # Order matters: dashes become spaces first, whitespace is squeezed last
    for pattern in (r"\u00A0", r"[\u2010-\u2015\u2212\-]+", r"\s+"):
        out = re.sub(pattern, " ", out)
    return out.strip()

def to_token(s):
    """Turn a phrase into a single token by joining its words with underscores.

    Leading/trailing whitespace is removed first; every inner whitespace run
    becomes one "_".
    """
    trimmed = s.strip()
    return re.compile(r"\s+").sub("_", trimmed)

def tidy_underscores(s):
    """Normalise underscore joints: drop whitespace around "_" and merge repeated "_"."""
    # First pass glues "a _ b" into "a_b"; second pass squeezes "a__b" into "a_b"
    for pattern in (r"\s*_\s*", r"_+"):
        s = re.sub(pattern, "_", s)
    return s

def norm_category(cat):
    """Convert a comma-separated category string into space-separated tokens.

    Each comma-separated part is cleaned; parts that clean to "" are dropped and
    the remaining ones are turned into single underscore-joined tokens.
    Returns "" for a missing (NaN/None) input.
    """
    if pd.isna(cat):
        return ""
    tokens = []
    for part in str(cat).split(","):
        cleaned = clean(part)
        if cleaned:
            tokens.append(to_token(cleaned))
    return " ".join(tokens)

def build_text(row):
    """Compose the text document used for vectorisation from one article row.

    The tokens of "name" and "name.1" are repeated three times to give them more
    weight, followed by color, audience, the category tokens and the price band
    (empty pieces are skipped). The joined string is passed through canon() and
    tidy_underscores().

    NOTE(review): "color" is looked up with a default of "", so a frame without
    that column simply contributes nothing for it.
    NOTE(review): canon() runs after tokenisation and replaces hyphens with
    spaces, so a hyphenated word ends up as separate tokens.
    """
    name_tokens = []
    for col in ("name", "name.1"):
        value = clean(row.get(col, ""))
        if value:
            name_tokens.append(to_token(value))

    extras = (
        clean(row.get("color", "")),
        clean(row.get("audience", "")),
        norm_category(row.get("category", "")),
        clean(row.get("priceband", "")),
    )
    pieces = name_tokens * 3 + [extra for extra in extras if extra]
    return tidy_underscores(canon(" ".join(pieces)))

# One text document per row; rows whose document came out empty are left out of group_df
art["text"] = art.apply(build_text, axis=1)
has_text = art["text"].ne("")
group_df = art.loc[has_text, ["groupId", "text"]].reset_index(drop=True)

In [200]:
# Lift the column-width limit so the long text field is shown in full.
# Note: set_option changes the setting for the rest of the session, not just this cell.
pd.set_option('display.max_colwidth', None)
group_df.head()

Unnamed: 0,groupId,text
0,261270,Boxertrosa_Basic Miss_Mary Boxertrosa_Basic Miss_Mary Boxertrosa_Basic Miss_Mary dam Underkläder Trosor Value
1,261464,Bh_utan_bygel_Meadow_Cotton Swegmark Bh_utan_bygel_Meadow_Cotton Swegmark Bh_utan_bygel_Meadow_Cotton Swegmark dam Bh_utan_bygel Bh Underkläder Popular
2,261467,Body_Adorable Swegmark Body_Adorable Swegmark Body_Adorable Swegmark dam Body Underkläder Premium
3,261468,Bh_utan_bygel_Sensible Swegmark Bh_utan_bygel_Sensible Swegmark Bh_utan_bygel_Sensible Swegmark dam Bh_utan_bygel Bh Underkläder Popular
4,261591,Bygel_bh_Smooth_Line Swegmark Bygel_bh_Smooth_Line Swegmark Bygel_bh_Smooth_Line Swegmark dam Bygel bh Bh Underkläder Popular


## 2. Vectorize with TF-IDF (Term Frequency × Inverse Document Frequency)

In [201]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the per-group text documents.
# Optional tweaks that are NOT enabled here: sublinear_tf=True, dtype=np.float32 (memory saver).
tfidf = TfidfVectorizer(
    token_pattern=r'(?u)\b\w+\b',  # one or more word characters; "_" is a word character, so joined tokens stay whole
    ngram_range=(1, 2),            # unigrams + bigrams
    strip_accents=None,            # keep å/ä/ö
    lowercase=True,
    min_df=1,                      # keep terms that occur in a single document
)
X_tfidf = tfidf.fit_transform(group_df["text"])
X_tfidf.shape

(8465, 14773)

## 3. Singular Value Decomposition + L2 normalize

TF-IDF gives precise but sparse signals; SVD compresses & generalizes them.
L2-norm makes nearest-neighbor search stable and comparable across items.

In [202]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# Aim for 128 latent dimensions; with a smaller vocabulary use (n_features - 1), but never fewer than 2
n_features = X_tfidf.shape[1]
n_components = min(128, max(2, n_features - 1))

svd = TruncatedSVD(n_components=n_components, random_state=0)  # fixed seed for repeatable embeddings
X_svd = svd.fit_transform(X_tfidf)
# Row-wise normalisation so that dot products between rows equal cosine similarity
X_emb = normalize(X_svd)
X_emb.shape


(8465, 128)

## 4. Build 10-nearest neighbors (cosine) and return a small recs table


* **Cosine similarity** is the **cosine of the angle** between vectors $a$ and $b$:

  $$
  \text{cosine\_sim}(a,b)=\frac{a\cdot b}{\|a\|\;\|b\|}
  $$

  * $=1$ → same direction (very similar)
  * $=0$ → orthogonal (unrelated)
  * $=-1$ → opposite (rare with TF-IDF since values are ≥0; it can occur here because the SVD embedding has signed components)

* After we **L2-normalize** vectors, cosine similarity becomes just the **dot product**.

* In scikit-learn, `metric="cosine"` actually computes **cosine distance**:

  $$
  \text{cosine\_dist} = 1 - \text{cosine\_sim}
  $$

  That’s why in the code we convert back with `similarity = 1 - d`.

Why we use it: it’s **scale-invariant** (ignores vector length), so two items (groupIds) with similar wording but different text lengths still match well.


In [203]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Request 11 neighbours per item (10 recommendations plus, normally, the item itself);
# fewer when the catalogue has fewer than 11 items.
POOL = min(11, len(group_df))
nn = NearestNeighbors(metric="cosine", n_neighbors=POOL).fit(X_emb)
dists, idxs = nn.kneighbors(X_emb)

groupid_arr = group_df["groupId"].values

# Flatten the (n_items, POOL) neighbour matrices into one long source/recommendation table
src_groupids = np.repeat(groupid_arr, POOL)
rec_groupids = groupid_arr[idxs.ravel()]
similarities = 1 - dists.ravel()   # cosine distance -> cosine similarity

recs = pd.DataFrame({
    "src_groupId": src_groupids,
    "rec_groupId": rec_groupids,
    "similarity": similarities
})

# An item must not recommend itself
is_self = recs["src_groupId"].eq(recs["rec_groupId"])
recs = recs.loc[~is_self].copy()

# Rank within each source item by descending similarity; method="first" breaks ties
# by row order. An item with many identical twins may be missing from its own
# neighbour list, leaving POOL rows after the self-filter, hence the cap at POOL-1.
per_source_rank = recs.groupby("src_groupId")["similarity"].rank(method="first", ascending=False)
recs["rec_rank"] = per_source_rank.astype(int)
recs = recs[recs["rec_rank"] <= POOL-1]

recs.head(10)

Unnamed: 0,src_groupId,rec_groupId,similarity,rec_rank
1,261270,261253,0.999867,1
2,261270,260922,0.999815,2
3,261270,261745,0.999815,3
4,261270,261740,0.999815,4
5,261270,261938,0.999815,5
6,261270,261010,0.999815,6
7,261270,261379,0.999815,7
8,261270,261656,0.999815,8
9,261270,261280,0.999815,9
10,261270,262038,0.999815,10


In [204]:
# Spot-check 15 random recommendation rows; the fixed random_state makes the sample repeatable
recs.sample(15, random_state=42)

Unnamed: 0,src_groupId,rec_groupId,similarity,rec_rank
69805,210718,210565,0.559198,10
73280,106065,294902,0.631359,9
32870,210127,210143,1.0,2
28602,270458,270361,1.0,2
17923,271017,271008,0.988752,4
54954,280768,280018,0.847903,9
23198,310542,400288,0.943035,10
42950,260008,260625,0.999892,6
54496,292318,294103,0.900685,2
60278,215103,210296,0.924482,9


In [205]:
target = "190041"  # example groupId

# Descriptive columns to show next to each recommendation (only those present in art),
# taken from a one-row-per-groupId copy of art
wanted = ["groupId","name","name.1","color","audience","category","priceband"]
keep = [c for c in wanted if c in art.columns]
details = art.drop_duplicates("groupId")[keep].copy()

# Recommendations of the target, enriched with the attributes of each recommended item
attr_cols = [c for c in keep if c != "groupId"]
target_recs = recs.loc[recs["src_groupId"].eq(target), ["rec_groupId","rec_rank","similarity"]]
view = (
    target_recs
        .merge(details, left_on="rec_groupId", right_on="groupId", how="left")
        .drop(columns=["groupId"])  # duplicate of rec_groupId after the join
        [["rec_rank","similarity","rec_groupId"] + attr_cols]
        .sort_values("rec_rank")
        .reset_index(drop=True)
)

view


Unnamed: 0,rec_rank,similarity,rec_groupId,name,name.1,audience,category,priceband
0,1,0.999998,521879,Innerkudde Rund,Linea,hemmet,"Innerkuddar,Bädd (linea)",Value
1,2,0.999998,576223,Sovkudde Låg,Linea,hemmet,"Innerkuddar,Bädd (linea)",Value
2,3,0.999998,588970,Sovkudde Duni,Linea,hemmet,"Innerkuddar,Bädd (linea)",Value
3,4,0.999979,576231,Sovkudde Medium,Linea,hemmet,"Innerkuddar,Bädd (linea),Kuddar",Value
4,5,0.999979,576249,Sovkudde Hög,Linea,hemmet,"Innerkuddar,Bädd (linea),Kuddar",Value
5,6,0.999865,500389,Sovkudde Lyx,Linea,hemmet,"Bäddtillbehör,Bädd (linea)",Value
6,7,0.998899,538002,Multibandskappa Shivaun,Linea,hemmet,"Multibandskappa,Gardiner (linea)",Value
7,8,0.998899,546024,Multibandskappa enfärgad sammet,Linea,hemmet,"Multibandskappa,Gardiner (linea)",Value
8,9,0.998686,541012,Bågkappa med brodyr Lavendel,Linea,hemmet,"Gardinbåge,Gardiner (linea)",Value
9,10,0.998686,552028,Gardinbåge Bird jacquardvävd spets,Linea,hemmet,"Gardinbåge,Gardiner (linea)",Value


In [206]:
# Show the target item's own attributes in the same column layout as the
# recommendation table (rank 1 / similarity 1.0 are placeholders), for side-by-side comparison
target_cols = ["rec_rank","similarity","rec_groupId"] + [c for c in keep if c != "groupId"]
view_target = (
    details.loc[details["groupId"].eq(target)]
        .assign(rec_rank=1, similarity=1.0, rec_groupId=target)
        .loc[:, target_cols]
        .reset_index(drop=True)
)
view_target


Unnamed: 0,rec_rank,similarity,rec_groupId,name,name.1,audience,category,priceband
0,1,1.0,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Value


In [207]:
# Collapse the long table into one row per source item with its recommendations
# ordered by rank, then persist it.
ranked = recs.sort_values(['src_groupId','rec_rank'])
out = (
    ranked.groupby('src_groupId')['rec_groupId']
          .apply(list)
          .reset_index(name='recs')
)

# NOTE(review): the 'recs' column holds Python lists, so the CSV stores their
# text representation - readers have to parse it back; confirm the consumer expects this.
out.to_csv("../data/predictions/vector_similarity_recommendations.csv", index=False)


In [208]:
# Inspect the exported table (one row per source groupId with its recommendation list)
out

Unnamed: 0,src_groupId,recs
0,0044,"[870023, 261262, 263012, 261172, 261024, 263202, 262097, 262089, 261271, 263095]"
1,052743,"[520768, 528417, 582111, 522391, 562744, 581173, 582109, 569694, 569744, 522383]"
2,055522,"[170005, 261883, 261909, 261891, 273763, 261917, 890030, 273755, 210277, 200287]"
3,055573,"[055576, 055575, 319525, 200260, 587964, 200097, 200014, 200048, 200063, 200030]"
4,055575,"[055576, 055573, 319525, 200260, 587964, 200014, 260083, 200105, 200048, 200097]"
...,...,...
8460,AH2021,"[AH2031, AH1021, AH3021, 267005, 267013, 266965, 266940, 266973, 266981, 266361]"
8461,AH2031,"[AH1021, AH3021, AH2021, 267005, 267013, 266965, 266940, 266973, 266981, 266361]"
8462,AH3021,"[AH2031, AH1021, AH2021, 267005, 267013, 266965, 266940, 266973, 266981, 266361]"
8463,SOFIND,"[500080, 394545, 530000, 270140, 201202, 395700, 394040, 484850, 205773, 205443]"
