## Assemble a DS

In [1]:
import pandas as pd

art = pd.read_csv(
    "../data/processed/articles_clean.csv",
    usecols=["sku", "name", "color", "name.1", "status", "audience", "category"],
    dtype=str
)
prices = pd.read_csv(
    "../data/processed/transactions_clean.csv",
    usecols=["sku", "price_sek"],
    dtype={"sku": str}
)
art = art.query("status in ['active']").merge(prices, on="sku", how="left")

In [2]:
# Save mapping between each individual SKU and its sku_family for later use
import os, json

import re
def extract_sku_family(sku):
    # Remove trailing SE, NO, FI, DK (case-insensitive) if present
    sku_base = re.sub(r'(SE|NO|FI|DK)$', '', str(sku), flags=re.IGNORECASE)
    # Take part before '-' if present
    return sku_base.split('-')[0]
art.insert(art.columns.get_loc('sku') + 1, 'sku_family', art['sku'].apply(extract_sku_family))

# Build mapping df (one row per unique sku)
sku_family_map = art.loc[:, ["sku", "sku_family"]].drop_duplicates().reset_index(drop=True)


In [3]:
#proportion of price in art?
art['price_sek'].isna().sum() / len(art)

np.float64(0.2486204820788081)

In [4]:
#show a few rows where price in not na
art[art['price_sek'].notna()].head()

Unnamed: 0,sku,sku_family,status,name,name.1,color,audience,category,price_sek
38,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",83.555425
39,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",83.555425
40,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
41,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
42,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0


In [5]:
# Convert 'price' to numeric, coerce errors to NaN
art['price_sek'] = pd.to_numeric(art['price_sek'], errors='coerce')
# Set all prices below 1 to NaN
art.loc[art['price_sek'] < 1, 'price_sek'] = pd.NA


In [6]:
#proportion of price in art?
art['price_sek'].isna().sum() / len(art)

np.float64(0.24896536155910606)

In [7]:
# For all rows where all columns besides price are identical, take the median price and keep only that row, in place
cols_except_price = [col for col in art.columns if col != "price_sek"]
art = (
    art.groupby(cols_except_price, dropna=False, as_index=False)
       .agg({"price_sek": "median"})
)

# Output how many SKUs that match more than one row we still have after cleaning
sku_counts_cleaned = art.groupby("sku").size()
skus_with_multiple_rows_cleaned = sku_counts_cleaned[sku_counts_cleaned > 1].index
print(f"Number of SKUs with more than one row after cleaning: {len(skus_with_multiple_rows_cleaned)}")
#art[art["sku"].isin(skus_with_multiple_rows_cleaned)]


Number of SKUs with more than one row after cleaning: 13


In [8]:
# Remove SKUs with more than one row after cleaning
sku_counts = art.groupby("sku").size()
skus_to_remove = sku_counts[sku_counts > 1].index
art = art[~art["sku"].isin(skus_to_remove)].reset_index(drop=True)


In [9]:
art['price_sek'].describe()

count    11650.000000
mean       492.954012
std        240.368846
min          3.587508
25%        374.591175
50%        493.507847
75%        596.423205
max       5241.558600
Name: price_sek, dtype: float64

In [10]:
# Create more granular price buckets, especially in the low range, using marketing terms
price_bins = [0, 100, 200, 300, 400, 500, 600, float('inf')]
price_labels = [
    'Budget',        # 0-100
    'Value',         # 100-200
    'Smart Choice',  # 200-300
    'Popular',       # 300-400
    'Premium',       # 400-500
    'Luxury',        # 500-600
    'Exclusive'      # 600+
]
art['priceband'] = pd.cut(art['price_sek'], bins=price_bins, labels=price_labels, include_lowest=True)
art['priceband'] = art['priceband'].astype(object)
art.loc[art['price_sek'].isna(), 'priceband'] = 'unknown'

In [11]:
# Drop rows where priceband is 'unknown', then drop the price_sek and status columns
art = art[art["priceband"] != "unknown"].drop(columns=["price_sek", "status"])

In [12]:
art

Unnamed: 0,sku,sku_family,name,name.1,color,audience,category,priceband
38,101702,101702,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
39,106065,106065,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Popular
41,112877,112877,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Exclusive
107,144001-3100,144001,Sport bh Extreme Movement,Swegmark,vit,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Luxury
115,144001-4090,144001,Sport bh Extreme Movement,Swegmark,vit,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Luxury
...,...,...,...,...,...,...,...,...
32819,590838,590838,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular
32820,590839,590839,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Luxury
32821,590840,590840,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Exclusive
32822,590841,590841,5 meterklipp Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular


## only one sku_family should remain

In [13]:
fam2skus = (sku_family_map.groupby('sku_family')['sku']
            .apply(list).to_dict())

In [14]:
# Deduplicate so only one row per sku_family remains, keeping the first occurrence
art = art.drop_duplicates(subset=["sku_family"], keep="first").reset_index(drop=True)

In [15]:
art

Unnamed: 0,sku,sku_family,name,name.1,color,audience,category,priceband
0,101702,101702,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
1,106065,106065,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Popular
2,112877,112877,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Exclusive
3,144001-3100,144001,Sport bh Extreme Movement,Swegmark,vit,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Luxury
4,144002-2075,144002,Sport bh Extreme Movement,Swegmark,beige,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Exclusive
...,...,...,...,...,...,...,...,...
2016,590838,590838,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular
2017,590839,590839,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Luxury
2018,590840,590840,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Exclusive
2019,590841,590841,5 meterklipp Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular


## 1. Build a clean text field for vectorization

In [17]:
import pandas as pd, unicodedata, re

art["sku_family"] = art["sku_family"].astype(str)
MISSING = {"", "unknown", "nan", "none"}

def clean(x):
    if pd.isna(x): return ""
    s = str(x).strip()
    return "" if s.lower() in MISSING else s

def canon(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("&", " ").replace(",", " ")
    s = re.sub(r"\u00A0", " ", s)                       # NBSP → space
    s = re.sub(r"[\u2010-\u2015\u2212\-]+", " ", s)     # dash variants → space
    s = re.sub(r"\s+", " ", s).strip()                  # collapse spaces
    return s

def to_token(s: str) -> str:
    s = re.sub(r"\s+", " ", s).strip()                  # collapse spaces
    s = s.replace(" ", "_")                             # then space → underscore
    return s

def tidy_underscores(s: str) -> str:
    s = re.sub(r"\s*_\s*", "_", s)                      # strip spaces around _
    s = re.sub(r"_+", "_", s)                           # collapse multiple _
    return s

def norm_category(cat: str) -> str:
    if pd.isna(cat): return ""
    parts = []
    for p in str(cat).split(","):
        p = clean(p)
        if p:
            parts.append(to_token(p))
    return " ".join(parts)

def build_text(row):
    names = [to_token(clean(row.get(c, ""))) for c in ("name","name.1")]
    names = [n for n in names if n]
    others = [
        clean(row.get("color","")),
        clean(row.get("audience","")),
        norm_category(row.get("category","")),
        clean(row.get("priceband","")),
    ]
    s = " ".join(names*3 + [o for o in others if o])
    return tidy_underscores(canon(s))

art["text"] = art.apply(build_text, axis=1)
sku_df = art.loc[art["text"].ne(""), ["sku_family","text"]].reset_index(drop=True)



In [18]:
pd.set_option('display.max_colwidth', None)
sku_df.head(60)

Unnamed: 0,sku_family,text
0,101702,Innerkudde Linea Innerkudde Linea Innerkudde Linea vit hemmet Kuddar Innerkuddar Bädd_(linea) Budget
1,106065,Fyndpaket_Stickgarn Knittingroom Fyndpaket_Stickgarn Knittingroom Fyndpaket_Stickgarn Knittingroom Popular
2,112877,Fyndpaket_Stickgarn Knittingroom Fyndpaket_Stickgarn Knittingroom Fyndpaket_Stickgarn Knittingroom Exclusive
3,144001,Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark vit dam Sport bh Bh_utan_bygel Bh Underkläder Luxury
4,144002,Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark beige dam Sport bh Bh_utan_bygel Bh Underkläder Exclusive
5,144009,Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark turkos dam Sport bh Bh_utan_bygel Bh Underkläder Exclusive
6,144012,Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark Sport_bh_Extreme_Movement Swegmark röd dam Sport bh Bh_utan_bygel Bh Underkläder Luxury
7,146606,Sport_bh_Courage Swegmark Sport_bh_Courage Swegmark Sport_bh_Courage Swegmark svart dam Sport bh Bh Underkläder Luxury
8,147106,Sport_bh_INCREDIBLE Swegmark Sport_bh_INCREDIBLE Swegmark Sport_bh_INCREDIBLE Swegmark svart vit dam Sport bh Bh_utan_bygel Bh Underkläder Exclusive
9,175701,Bygel_bh_Adamo_basic Swegmark Bygel_bh_Adamo_basic Swegmark Bygel_bh_Adamo_basic Swegmark vanilj dam Bygel bh Bh Underkläder Luxury


## 2. Vectorize with TF-IDF (Term Frequency × Inverse Document Frequency)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2), #use unigrams + bigrams
    min_df=1,
    strip_accents=None,   # keep å/ä/ö
    # sublinear_tf=True,  # optional
    # dtype=np.float32,   # optional memory saver
    token_pattern=r'(?u)\b\w+\b'
)
X_tfidf = tfidf.fit_transform(sku_df["text"])
X_tfidf.shape

(2021, 4466)

## 3. Singular Value Decomposition + L2 normalize

TF-IDF gives precise but sparse signals; SVD compresses & generalizes them.
L2-norm makes nearest-neighbor search stable and comparable across items.

In [20]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

n_components = min(128, max(2, X_tfidf.shape[1]-1)) 
svd = TruncatedSVD(n_components=n_components, random_state=0)
X_svd = svd.fit_transform(X_tfidf)
X_emb = normalize(X_svd)
X_emb.shape


(2021, 128)

## 4. Build 10-nearest neighbors (cosine) and return a small recs table


* It’s the **cosine of the angle** between vectors $a$ and $b$:

  $$
  \text{cosine\_sim}(a,b)=\frac{a\cdot b}{\|a\|\;\|b\|}
  $$

  * $=1$ → same direction (very similar)
  * $=0$ → orthogonal (unrelated)
  * $=-1$ → opposite (rare with TF-IDF since values are ≥0)

* After we **L2-normalize** vectors, cosine similarity becomes just the **dot product**.

* In scikit-learn, `metric="cosine"` actually computes **cosine distance**:

  $$
  \text{cosine\_dist} = 1 - \text{cosine\_sim}
  $$

  That’s why in the code we convert back with `similarity = 1 - d`.

Why we use it: it’s **scale-invariant** (ignores length), so two SKUs with similar wording but different text lengths still match well.


In [29]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

POOL = min(11, len(sku_df))
nn = NearestNeighbors(metric="cosine", n_neighbors=POOL).fit(X_emb)
dists, idxs = nn.kneighbors(X_emb)

sku_arr = sku_df["sku_family"].values

# Build all recommendations, then filter out self-recommendations
src_skus = np.repeat(sku_arr, POOL)
rec_skus = sku_arr[idxs.ravel()]
similarities = 1 - dists.ravel()

recs = pd.DataFrame({
    "src_sku": src_skus,
    "rec_sku": rec_skus,
    "similarity": similarities
})

# Remove self-recommendations
recs = recs[recs["src_sku"] != recs["rec_sku"]].copy()

# For each src_sku, keep only the top POOL-1 recommendations (in case of ties or accidental duplicates)
recs["rec_rank"] = (
    recs.groupby("src_sku")["similarity"]
    .rank(method="first", ascending=False)
    .astype(int)
)
recs = recs[recs["rec_rank"] <= POOL-1]

recs.head(10)

Unnamed: 0,src_sku,rec_sku,similarity,rec_rank
0,101702,190041,1.0,1
2,101702,340091,0.997211,2
3,101702,309989,0.997211,3
4,101702,190033,0.997211,4
5,101702,439125,0.997211,5
6,101702,522710,0.623439,6
7,101702,521879,0.60251,7
8,101702,585056,0.480252,8
9,101702,576223,0.480252,9
10,101702,576231,0.475823,10


In [30]:
recs.sample(15, random_state=42)

Unnamed: 0,src_sku,rec_sku,similarity,rec_rank
13812,290149,291260,0.974432,7
10986,267094,260365,0.77182,8
13071,270648,270561,0.928771,3
1997,260098,261696,0.90802,6
3733,260695,261890,0.725856,4
14030,290232,290241,0.998772,5
11643,270100,270104,0.84332,5
554,210698,210785,0.477867,4
5316,261317,263285,0.911884,3
6682,261606,264911,0.99161,5


In [31]:
target = "266890"

# pick columns and collapse art to one row per family
keep = [c for c in ["sku_family","sku","name","name.1","color","audience","category","priceband"] if c in art.columns]
details = art.drop_duplicates("sku_family")[keep].copy()

view = (
    recs.loc[recs["src_sku"].eq(target), ["rec_sku","rec_rank","similarity"]]
        .merge(details, left_on="rec_sku", right_on="sku_family", how="left")
        .drop(columns=["sku_family"])  # joined key
        [["rec_rank","similarity","rec_sku"] + [c for c in keep if c not in ("sku_family","sku")]]
        .sort_values("rec_rank")
        .reset_index(drop=True)
)

view


Unnamed: 0,rec_rank,similarity,rec_sku,name,name.1,color,audience,category,priceband
0,1,0.990135,266891,Bygel bh Jacquard&Lace,Miss Mary,unknown,dam,"Bygel-bh,Bh,Underkläder",Exclusive
1,2,0.98967,260896,Bygel bh Jacquard&Lace,Miss Mary,mörkgrå,dam,"Bygel-bh,Bh,Underkläder",Exclusive
2,3,0.988576,261729,Bygel bh Jacquard&Lace,Miss Mary,röd,dam,"Bygel-bh,Bh,Underkläder",Exclusive
3,4,0.983168,266882,Bygel bh Jacquard&Lace,Miss Mary,vit,dam,"Bygel-bh,Bh,Underkläder",Exclusive
4,5,0.853981,260346,Trosgördel Jaquard&Lace,Miss Mary,beige,dam,"Underkläder,Gördlar",Popular
5,6,0.844951,261752,Trosgördel Jaquard&Lace,Miss Mary,röd,dam,"Underkläder,Gördlar",Popular
6,7,0.840971,260911,Trosgördel Jaquard&Lace,Miss Mary,mörkgrå,dam,"Underkläder,Gördlar",Smart Choice
7,8,0.8385,262027,Trosgördel Jaquard&Lace,Miss Mary,lila,dam,"Underkläder,Gördlar",Smart Choice
8,9,0.836918,260345,Trosgördel Jaquard&Lace,Miss Mary,vit,dam,"Underkläder,Gördlar",Popular
9,10,0.407326,262899,Bygel bh Rose,Miss Mary,mörkgrå,dam,"Bygel-bh,Bh,Underkläder",Luxury


In [32]:
# Visualize the recommendations for the target itself (i.e., show the row for the target)
view_target = (
    details.loc[details["sku_family"] == target]
        .assign(rec_rank=1, similarity=1.0, rec_sku=target)
        [["rec_rank","similarity","rec_sku"] + [c for c in keep if c not in ("sku_family","sku")]]
        .reset_index(drop=True)
)
view_target


Unnamed: 0,rec_rank,similarity,rec_sku,name,name.1,color,audience,category,priceband
0,1,1.0,266890,Bygel bh Jacquard&Lace,Miss Mary,beige,dam,"Bygel-bh,Bh,Underkläder",Luxury


In [33]:
out = (recs
       .sort_values(['src_sku','rec_rank'])
       .groupby('src_sku')['rec_sku']
       .apply(list)
       .reset_index(name='rec_sku_families'))


In [34]:
out

Unnamed: 0,src_sku,rec_sku_families
0,101702,"[190041, 340091, 309989, 190033, 439125, 522710, 521879, 585056, 576223, 576231]"
1,106065,"[112877, 445897, 261639, 261215, 261213, 261216, 280053, 345907, 270130, 345906]"
2,112877,"[106065, 261215, 261216, 345907, 261639, 293456, 345906, 261213, 345901, 292292]"
3,144001,"[144012, 261631, 260367, 144009, 261630, 144002, 261573, 260577, 261589, 146606]"
4,144002,"[261630, 144009, 144012, 260367, 261631, 144001, 261589, 260577, 261573, 260304]"
...,...,...
2016,590838,"[590839, 590840, 590833, 530202, 590834, 590835, 530205, 530203, 530206, 535338]"
2017,590839,"[590840, 590838, 530202, 590834, 590835, 590833, 530205, 530206, 530203, 535338]"
2018,590840,"[590839, 590838, 530205, 590835, 530203, 530206, 530202, 590834, 590833, 535338]"
2019,590841,"[535338, 530202, 530205, 530203, 530206, 590838, 590833, 590835, 590840, 590834]"


In [41]:
# Build (src_sku family -> sku variant) table from fam2skus
map_long = (pd.DataFrame([(fam, sku) for fam, skus in fam2skus.items() for sku in skus],
                         columns=['src_sku','sku'])
            .astype({'src_sku':'string','sku':'string'}))

# Join so each SKU gets the family's rec_sku_families
per_sku_recs = (map_long.merge(out.astype({'src_sku':'string'}), on='src_sku', how='inner')
                         [['sku','src_sku','rec_sku_families']]
                         .reset_index(drop=True))

per_sku_recs.sample(10)

Unnamed: 0,sku,src_sku,rec_sku_families
16994,261950-3640,261950,"[261953, 261436, 261437, 261439, 261954, 260986, 260987, 260984, 261400, 261401]"
8641,261238-C095,261238,"[261375, 236506, 261937, 261734, 261738, 261967, 261650, 261645, 261666, 261376]"
10083,261444-B095,261444,"[571201, 570607, 571212, 261143, 570601, 267526, 261142, 261151, 261148, 261152]"
3115,260342-B085,260342,"[261248, 260965, 261123, 261131, 260966, 261739, 261668, 260867, 261884, 260584]"
14747,261736-E085,261736,"[236501, 261936, 261296, 261667, 261643, 261737, 261376, 266221, 261666, 261645]"
23305,267302-G085,267302,"[267002, 261358, 267310, 267328, 260697, 261557, 260713, 261113, 261114, 260730]"
19679,264457-3130,264457,"[261607, 260313, 260513, 264499, 261412, 265819, 260411, 261543, 260314, 264481]"
26395,350225,350225,"[350227, 400540, 420309, 300503, 300502, 410328, 320354, 400292, 400584, 310388]"
816,219485-0012,219485,"[210550, 218982, 219006, 210051, 210025, 218990, 210745, 210731, 210338, 250033]"
26097,290281,290281,"[290282, 415065, 338152, 294785, 393058, 393066, 393116, 470161, 338160, 291088]"


In [42]:
# Save the per_sku_recs DataFrame to a CSV file
per_sku_recs.to_csv("../data/predictions/vector_similarity_recommendations.csv", index=False)
