## Assemble a dataset

In [1]:
import pandas as pd

# Article data: every requested column is read as text (dtype=str).
article_columns = ["sku", "name", "color", "name.1", "status", "audience", "category"]
art = pd.read_csv(
    "../data/processed/articles_clean.csv",
    usecols=article_columns,
    dtype=str,
)

# Price rows: only the sku key is forced to text; price_sek keeps the dtype
# pandas infers.
prices = pd.read_csv(
    "../data/processed/transactions_clean.csv",
    usecols=["sku", "price_sek"],
    dtype={"sku": str},
)

# Keep active articles only, then attach every matching price row. The left
# join keeps articles without any price (price_sek is NaN for them) and
# repeats an article once per matching price row.
is_active = art["status"].isin(["active"])
art = art.loc[is_active].merge(prices, on="sku", how="left")

In [2]:
# Save mapping between each individual SKU and its sku_family for later use
import os, json

import re
def extract_sku_family(sku):
    """Return the family part of a SKU code.

    The value is converted to ``str``, a trailing SE / NO / FI / DK marker
    (any letter case; presumably a country code) is removed, and everything
    from the first ``-`` onwards is discarded.

    Example: ``"144001-3100"`` -> ``"144001"``.
    """
    without_suffix = re.sub(r'(SE|NO|FI|DK)$', '', str(sku), flags=re.IGNORECASE)
    family, _, _ = without_suffix.partition('-')
    return family
# Place the derived family code directly to the right of the 'sku' column.
family_position = art.columns.get_loc('sku') + 1
family_codes = art['sku'].map(extract_sku_family)
art.insert(family_position, 'sku_family', family_codes)

# Lookup table with one row per distinct (sku, sku_family) pair.
sku_family_map = (
    art[["sku", "sku_family"]]
    .drop_duplicates()
    .reset_index(drop=True)
)


In [3]:
# Share of rows in art that have no price (price_sek is NaN)
art['price_sek'].isna().mean()

np.float64(0.2486204820788081)

In [4]:
# Preview the first rows that do have a price
art.dropna(subset=['price_sek']).head()

Unnamed: 0,sku,sku_family,status,name,name.1,color,audience,category,price_sek
38,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",83.555425
39,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",83.555425
40,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
41,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
42,101702,101702,active,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0


In [5]:
# Force price_sek to a numeric dtype; values that cannot be parsed become NaN.
art['price_sek'] = pd.to_numeric(art['price_sek'], errors='coerce')
# Blank out prices below 1: the rows stay, only the price is replaced by NaN.
art['price_sek'] = art['price_sek'].mask(art['price_sek'] < 1)


In [6]:
# Share of rows without a price, now that prices below 1 have been blanked out
art['price_sek'].isna().mean()

np.float64(0.24896536155910606)

In [7]:
# Collapse rows that agree on every column except the price into a single
# row whose price is the median of the group. dropna=False keeps groups
# whose key columns contain NaN.
cols_except_price = art.columns.drop("price_sek").tolist()
art = art.groupby(cols_except_price, dropna=False, as_index=False).agg(
    {"price_sek": "median"}
)

# Report how many SKUs still map to more than one row after the collapse.
sku_counts_cleaned = art.groupby("sku").size()
skus_with_multiple_rows_cleaned = sku_counts_cleaned.loc[sku_counts_cleaned.gt(1)].index
print(f"Number of SKUs with more than one row after cleaning: {len(skus_with_multiple_rows_cleaned)}")


Number of SKUs with more than one row after cleaning: 13


In [8]:
# Drop every SKU that still has more than one row, then renumber the index.
sku_counts = art.groupby("sku").size()
skus_to_remove = sku_counts.loc[sku_counts.gt(1)].index
keep_row = ~art["sku"].isin(skus_to_remove)
art = art.loc[keep_row].reset_index(drop=True)


In [9]:
# Summary statistics of the remaining prices
art.loc[:, 'price_sek'].describe()

count    11650.000000
mean       492.954012
std        240.368846
min          3.587508
25%        374.591175
50%        493.507847
75%        596.423205
max       5241.558600
Name: price_sek, dtype: float64

In [10]:
# Named price bands as (upper edge, label) pairs. With pd.cut's default
# right-closed intervals a price of exactly 100 is 'Budget', and
# include_lowest=True also puts a price of exactly 0 into the first band.
band_edges = [
    (100, 'Budget'),
    (200, 'Value'),
    (300, 'Smart Choice'),
    (400, 'Popular'),
    (500, 'Premium'),
    (600, 'Luxury'),
    (float('inf'), 'Exclusive'),
]
price_bins = [0] + [upper for upper, _ in band_edges]
price_labels = [label for _, label in band_edges]

banded = pd.cut(art['price_sek'], bins=price_bins, labels=price_labels, include_lowest=True)
# Convert from categorical to plain objects so that a value outside the label
# set can be stored, then mark rows without a price as 'unknown'.
art['priceband'] = banded.astype(object).where(art['price_sek'].notna(), 'unknown')

In [11]:
# Discard rows whose price band is 'unknown', then remove the price_sek and
# status columns.
has_band = art["priceband"].ne("unknown")
art = art.loc[has_band].drop(columns=["price_sek", "status"])

In [12]:
# Inspect the frame after price banding; price_sek and status were dropped in the previous step.
art

Unnamed: 0,sku,sku_family,name,name.1,color,audience,category,priceband
38,101702,101702,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
39,106065,106065,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Popular
41,112877,112877,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Exclusive
107,144001-3100,144001,Sport bh Extreme Movement,Swegmark,vit,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Luxury
115,144001-4090,144001,Sport bh Extreme Movement,Swegmark,vit,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Luxury
...,...,...,...,...,...,...,...,...
32819,590838,590838,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular
32820,590839,590839,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Luxury
32821,590840,590840,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Exclusive
32822,590841,590841,5 meterklipp Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular


## Keep only one row per sku_family

In [13]:
# family code -> list of the SKUs recorded for that family in sku_family_map
fam2skus = {
    family: variants.tolist()
    for family, variants in sku_family_map.groupby('sku_family')['sku']
}

In [14]:
# Keep a single representative row per sku_family: the first one in the
# current row order. Later rows of the same family are discarded.
first_in_family = ~art.duplicated(subset=["sku_family"], keep="first")
art = art.loc[first_in_family].reset_index(drop=True)

In [15]:
# Inspect the frame after reducing it to one row per sku_family.
art

Unnamed: 0,sku,sku_family,name,name.1,color,audience,category,priceband
0,101702,101702,Innerkudde,Linea,vit,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
1,106065,106065,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Popular
2,112877,112877,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,unknown,Exclusive
3,144001-3100,144001,Sport bh Extreme Movement,Swegmark,vit,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Luxury
4,144002-2075,144002,Sport bh Extreme Movement,Swegmark,beige,dam,"Sport-bh,Bh utan bygel,Bh,Underkläder",Exclusive
...,...,...,...,...,...,...,...,...
2016,590838,590838,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular
2017,590839,590839,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Luxury
2018,590840,590840,Madrasskydd Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Exclusive
2019,590841,590841,5 meterklipp Plastad frotté,Linea,vit,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular


## 1. Build a clean text field for vectorization

In [16]:
import pandas as pd
import unicodedata, re

# Make sure the family key is stored as str before it is used as an identifier.
art = art.assign(sku_family=art["sku_family"].astype(str))

# Placeholder strings (compared in lower case) that count as "no value".
MISSING = {"", "unknown", "nan", "none"}

def clean(x):
    """Return ``x`` as a stripped string, or ``""`` when it holds no value.

    A value holds no value when ``pd.isna(x)`` is true or when its stripped,
    lower-cased text is one of the ``MISSING`` placeholders.
    """
    if pd.isna(x):
        return ""
    text = str(x).strip()
    if text.lower() in MISSING:
        return ""
    return text

def canon(s: str) -> str:
    """Normalise free text into words separated by single spaces.

    Steps: Unicode NFKC normalisation; ``&``, ``,``, no-break spaces and all
    dash variants (U+2010..U+2015, U+2212 and ASCII ``-``) are turned into
    spaces; runs of whitespace collapse to one space; leading and trailing
    whitespace is removed.
    """
    s = unicodedata.normalize("NFKC", s)
    # One pass for every separator-like character; repeated separators leave
    # several spaces behind, which the whitespace collapse below removes.
    s = re.sub(r"[&,\u00A0\u2010-\u2015\u2212\-]", " ", s)
    return " ".join(s.split())

# Columns whose values are concatenated, in this order, into the text field.
cols = ["name","name.1","color","audience","category","priceband"]

# Clean each cell, join the cells of a row with single spaces, canonicalize.
cleaned_cells = art[cols].map(clean)
art["text"] = cleaned_cells.agg(" ".join, axis=1).map(canon)

# Only rows with a non-empty text are kept for vectorization.
has_text = art["text"].ne("")
sku_df = art.loc[has_text, ["sku_family","text"]].reset_index(drop=True)

In [17]:
# Lift the column-width limit so the text field is shown in full.
pd.set_option("display.max_colwidth", None)
sku_df.head()

Unnamed: 0,sku_family,text
0,101702,Innerkudde Linea vit hemmet Kuddar Innerkuddar Bädd (linea) Budget
1,106065,Fyndpaket Stickgarn Knittingroom Popular
2,112877,Fyndpaket Stickgarn Knittingroom Exclusive
3,144001,Sport bh Extreme Movement Swegmark vit dam Sport bh Bh utan bygel Bh Underkläder Luxury
4,144002,Sport bh Extreme Movement Swegmark beige dam Sport bh Bh utan bygel Bh Underkläder Exclusive


## 2. Vectorize with TF-IDF (Term Frequency × Inverse Document Frequency)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the text field.
# Optional switches that are currently left off: sublinear_tf=True, and
# dtype=np.float32 as a memory saver.
tfidf = TfidfVectorizer(
    token_pattern=r'(?u)\b\w+\b',  # tokens are runs of one or more word characters
    ngram_range=(1, 2),            # unigrams + bigrams
    lowercase=True,
    strip_accents=None,            # keep å/ä/ö
    min_df=1,
)
X_tfidf = tfidf.fit_transform(sku_df["text"])
X_tfidf.shape

(2021, 4644)

## 3. Singular Value Decomposition + L2 normalize

TF-IDF gives precise but sparse signals; SVD compresses & generalizes them.
L2-norm makes nearest-neighbor search stable and comparable across items.

In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# Embedding size: 128 at most, otherwise one less than the number of TF-IDF
# features, and never below 2.
vocab_size = X_tfidf.shape[1]
n_components = min(128, max(2, vocab_size - 1))

svd = TruncatedSVD(n_components=n_components, random_state=0)
X_svd = svd.fit_transform(X_tfidf)

# Scale every row to unit L2 length (the default of normalize()).
X_emb = normalize(X_svd, norm="l2", axis=1)
X_emb.shape


(2021, 128)

## 4. Build 10-nearest neighbors (cosine) and return a small recs table


* **Cosine similarity** is the cosine of the angle between vectors $a$ and $b$:

  $$
  \text{cosine\_sim}(a,b)=\frac{a\cdot b}{\|a\|\;\|b\|}
  $$

  * $=1$ → same direction (very similar)
  * $=0$ → orthogonal (unrelated)
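  * $=-1$ → opposite (cannot happen with raw TF-IDF vectors, whose values are ≥0; the SVD embeddings used here can have negative components, so negative similarities are possible)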

* After we **L2-normalize** vectors, cosine similarity becomes just the **dot product**.

* In scikit-learn, `metric="cosine"` actually computes **cosine distance**:

  $$
  \text{cosine\_dist} = 1 - \text{cosine\_sim}
  $$

  That’s why in the code we convert back with `similarity = 1 - d`.

Why we use it: it’s **scale-invariant** (ignores length), so two SKUs with similar wording but different text lengths still match well.


In [20]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# POOL counts the item itself plus up to 10 neighbours; n_recs is the number
# of recommendations actually produced per family.
POOL = min(11, len(sku_df))
n_recs = POOL - 1

sku_arr = sku_df["sku_family"].values
nn = NearestNeighbors(metric="cosine", n_neighbors=POOL).fit(X_emb)

if n_recs > 0:
    # Query the fitted points themselves by calling kneighbors() WITHOUT X:
    # scikit-learn then never returns a point as its own neighbour.
    # The previous approach (kneighbors(X_emb) followed by dropping column 0)
    # assumed the item itself always comes first. That does not hold when
    # several families have identical embeddings (distance 0): the order among
    # ties is arbitrary, so an item could be recommended to itself while a
    # genuine duplicate was dropped.
    # dists / idxs therefore have shape (n_items, n_recs) and contain
    # neighbours only.
    dists, idxs = nn.kneighbors(n_neighbors=n_recs)
else:
    # A single item has no neighbours; keep the arrays well-formed but empty.
    dists = np.empty((len(sku_df), 0), dtype=float)
    idxs = np.empty((len(sku_df), 0), dtype=int)

# Long table: one row per (source family, recommended family) pair.
recs = pd.DataFrame({
    "src_sku": np.repeat(sku_arr, n_recs),
    "rec_sku": sku_arr[idxs].ravel(),
    "similarity": 1 - dists.ravel(),   # cosine distance -> cosine similarity
})

# Add ranking: for each src_sku, rank rec_sku by similarity descending.
# method="first" breaks ties by row order, i.e. by the order kneighbors returned.
recs["rec_rank"] = (
    recs.groupby("src_sku")["similarity"]
    .rank(method="first", ascending=False)
    .astype(int)
)

recs.head(10)

Unnamed: 0,src_sku,rec_sku,similarity,rec_rank
0,101702,190041,1.0,1
1,101702,309989,0.966158,2
2,101702,439125,0.966158,3
3,101702,190033,0.966158,4
4,101702,340091,0.966158,5
5,101702,521879,0.900361,6
6,101702,585056,0.84463,7
7,101702,576223,0.84463,8
8,101702,522710,0.842505,9
9,101702,576231,0.796932,10


In [21]:
# Reproducible random spot check of 15 recommendation rows
recs.sample(n=15, random_state=42)

Unnamed: 0,src_sku,rec_sku,similarity,rec_rank
12556,290149,293878,0.794558,7
9987,267094,260170,0.628108,8
11882,270648,270572,0.791129,3
1815,260098,261697,0.830341,6
3393,260695,261725,0.854313,4
12754,290232,290240,0.943397,5
10584,270100,270105,0.806638,5
503,210698,210338,0.849286,4
4832,261317,266643,0.9353,3
6074,261606,264911,0.923825,5


In [22]:
target = "266890"

# One detail row per family, limited to the descriptive columns that exist in art.
wanted = ["sku_family","sku","name","name.1","color","audience","category","priceband"]
keep = [c for c in wanted if c in art.columns]
details = art.drop_duplicates("sku_family")[keep].copy()

# Detail columns shown next to the recommendation fields; the join key and
# the sku column are left out.
detail_cols = [c for c in keep if c not in ("sku_family","sku")]

# Recommendations of the target family, joined with the details of each
# recommended family and ordered by rank.
target_recs = recs.loc[recs["src_sku"].eq(target), ["rec_sku","rec_rank","similarity"]]
view = (
    target_recs
    .merge(details, left_on="rec_sku", right_on="sku_family", how="left")
    .loc[:, ["rec_rank","similarity","rec_sku"] + detail_cols]
    .sort_values("rec_rank")
    .reset_index(drop=True)
)

view


Unnamed: 0,rec_rank,similarity,rec_sku,name,name.1,color,audience,category,priceband
0,1,0.895444,266891,Bygel bh Jacquard&Lace,Miss Mary,unknown,dam,"Bygel-bh,Bh,Underkläder",Exclusive
1,2,0.850882,266882,Bygel bh Jacquard&Lace,Miss Mary,vit,dam,"Bygel-bh,Bh,Underkläder",Exclusive
2,3,0.850723,260896,Bygel bh Jacquard&Lace,Miss Mary,mörkgrå,dam,"Bygel-bh,Bh,Underkläder",Exclusive
3,4,0.828497,261729,Bygel bh Jacquard&Lace,Miss Mary,röd,dam,"Bygel-bh,Bh,Underkläder",Exclusive
4,5,0.826186,260113,Bygel bh Confident,Miss Mary,beige,dam,"Bygel-bh,Bh,Underkläder",Luxury
5,6,0.749343,262899,Bygel bh Rose,Miss Mary,mörkgrå,dam,"Bygel-bh,Bh,Underkläder",Luxury
6,7,0.706263,266221,Bygel bh Rose,Miss Mary,vit,dam,"Bygel-bh,Bh,Underkläder",Luxury
7,8,0.705065,260514,Winter Dew bygel bh,Miss Mary,vit-beige,dam,"Bygel-bh,Bh,Underkläder",Exclusive
8,9,0.683362,236506,Bygel bh Amsterdam,Miss Mary,svart,dam,"Bygel-bh,Bh,Underkläder",Luxury
9,10,0.678728,261738,Bygel bh Tenderly,Miss Mary,svart,dam,"Bygel-bh,Bh,Underkläder",Luxury


In [23]:
# Show the target family's own attributes in the same column layout as `view`.
# rec_rank=1 and similarity=1.0 are fixed placeholder values for the target
# itself, not values taken from recs.
shown_cols = ["rec_rank","similarity","rec_sku"] + [c for c in keep if c not in ("sku_family","sku")]
target_row = details.loc[details["sku_family"].eq(target)]
view_target = (
    target_row
    .assign(rec_rank=1, similarity=1.0, rec_sku=target)
    .loc[:, shown_cols]
    .reset_index(drop=True)
)
view_target


Unnamed: 0,rec_rank,similarity,rec_sku,name,name.1,color,audience,category,priceband
0,1,1.0,266890,Bygel bh Jacquard&Lace,Miss Mary,beige,dam,"Bygel-bh,Bh,Underkläder",Luxury


In [24]:
# One row per source family, with its recommended families collected into a
# list ordered by rec_rank.
ranked = recs.sort_values(['src_sku','rec_rank'])
out = (
    ranked.groupby('src_sku')['rec_sku']
    .apply(list)
    .rename('rec_sku_families')
    .reset_index()
)


In [25]:
# Inspect the family-level table: one row per src_sku with its list of recommended families.
out

Unnamed: 0,src_sku,rec_sku_families
0,101702,"[190041, 309989, 439125, 190033, 340091, 521879, 585056, 576223, 522710, 576231]"
1,106065,"[445897, 206524, 112877, 536106, 549003, 549000, 350600, 270127, 219477, 270130]"
2,112877,"[106065, 206524, 270086, 261844, 261860, 260272, 270059, 270607, 270087, 261874]"
3,144001,"[261573, 260367, 144012, 261631, 261630, 260479, 260478, 261632, 261570, 260577]"
4,144002,"[261630, 260708, 144009, 552953, 260477, 261633, 144001, 261589, 261912, 554408]"
...,...,...
2016,590838,"[590841, 590839, 590840, 590833, 530203, 530206, 535338, 590834, 530202, 530205]"
2017,590839,"[590840, 590838, 590841, 530202, 530203, 530206, 590834, 535338, 530205, 590835]"
2018,590840,"[590839, 590838, 590841, 530203, 530206, 535338, 530205, 590835, 530202, 590834]"
2019,590841,"[590838, 590840, 590839, 590833, 530206, 530203, 535338, 590834, 530202, 590835]"


In [26]:
# Long-format table of (family, SKU) pairs taken from fam2skus.
family_variant_pairs = [
    (family, variant)
    for family, variants in fam2skus.items()
    for variant in variants
]
map_long = pd.DataFrame(family_variant_pairs, columns=['src_sku','sku']).astype('string')

# Give every SKU the recommendation list of its family. The inner join keeps
# only SKUs whose family has an entry in `out`.
out_keyed = out.astype({'src_sku':'string'})
per_sku_recs = (
    map_long.merge(out_keyed, on='src_sku', how='inner')
    .loc[:, ['sku','src_sku','rec_sku_families']]
    .reset_index(drop=True)
)

per_sku_recs.head()

Unnamed: 0,sku,src_sku,rec_sku_families
0,101702,101702,"[190041, 309989, 439125, 190033, 340091, 521879, 585056, 576223, 522710, 576231]"
1,106065,106065,"[445897, 206524, 112877, 536106, 549003, 549000, 350600, 270127, 219477, 270130]"
2,112877,112877,"[106065, 206524, 270086, 261844, 261860, 260272, 270059, 270607, 270087, 261874]"
3,144001-2070,144001,"[261573, 260367, 144012, 261631, 261630, 260479, 260478, 261632, 261570, 260577]"
4,144001-2075,144001,"[261573, 260367, 144012, 261631, 261630, 260479, 260478, 261632, 261570, 260577]"


In [28]:
# Save the per_sku_recs DataFrame to a CSV file.
# to_csv does not create missing folders, so make sure the target folder
# exists first (a fresh checkout may not have ../data/predictions yet).
# NOTE: rec_sku_families holds Python lists; in the CSV each list is stored as
# its text representation and must be parsed again when the file is read back.
predictions_path = "../data/predictions/vector_similarity_recommendations.csv"
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
per_sku_recs.to_csv(predictions_path, index=False)
