## Assemble the dataset

In [78]:
import pandas as pd

# Article master data. Every column is read as text, so SKU values are kept
# exactly as written in the file.
art = pd.read_csv(
    "../data/processed/articles_clean.csv",
    dtype=str,
    usecols=["sku", "name", "color", "name.1", "size", "status", "audience", "category"],
)

# Transaction-level prices. Only the join key is forced to text; the dtype of
# the price column is left to pandas' inference.
prices = pd.read_csv(
    "../data/external/transactions_train.csv",
    dtype={"sku": str},
    usecols=["sku", "price"],
)

# Keep active articles and attach every matching transaction price.
# The left join produces one row per (article, transaction) pair and keeps
# articles without any transaction (their price is missing).
art = art.loc[art["status"].eq("active")].merge(prices, on="sku", how="left")

In [79]:
# Share of rows in `art` whose price is missing after the merge.
art["price"].isna().mean()

np.float64(0.13087120858318857)

In [80]:
# Preview a few rows that do have a price.
art.dropna(subset=["price"]).head()



Unnamed: 0,sku,status,name,name.1,color,size,audience,category,price
38,101702,active,Innerkudde,Linea,vit,30x30 cm,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",89.0
39,101702,active,Innerkudde,Linea,vit,30x30 cm,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",89.0
40,101702,active,Innerkudde,Linea,vit,30x30 cm,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
41,101702,active,Innerkudde,Linea,vit,30x30 cm,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
42,101702,active,Innerkudde,Linea,vit,30x30 cm,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0


In [81]:
# Force price to a numeric dtype; anything unparseable becomes NaN.
art["price"] = pd.to_numeric(art["price"], errors="coerce")
# Blank out prices below 1 so they count as missing in later steps.
art["price"] = art["price"].mask(art["price"] < 1)


In [82]:
# Missing-price share again, now that prices below 1 have been blanked out.
art["price"].isna().mean()

np.float64(0.13124482387152617)

In [83]:
# Collapse rows that agree on every column except price into a single row
# carrying the median of their prices. dropna=False keeps groups whose key
# columns contain missing values instead of silently dropping them.
cols_except_price = art.columns[art.columns != "price"].tolist()
art = art.groupby(cols_except_price, dropna=False, as_index=False)["price"].median()

# Sanity check: which SKUs still map to more than one row after collapsing?
sku_counts_cleaned = art.groupby("sku").size()
skus_with_multiple_rows_cleaned = sku_counts_cleaned.loc[lambda counts: counts > 1].index
print(f"Number of SKUs with more than one row after cleaning: {len(skus_with_multiple_rows_cleaned)}")
art.loc[art["sku"].isin(skus_with_multiple_rows_cleaned)]


Number of SKUs with more than one row after cleaning: 0


Unnamed: 0,sku,status,name,name.1,color,size,audience,category,price


## 1. Build a clean text field for vectorization

In [84]:
import pandas as pd

# Make sure SKUs are strings and statuses are lower-case before building text.
art = art.assign(
    sku=lambda frame: frame["sku"].astype(str),
    status=lambda frame: frame["status"].str.lower(),
)

# Lower-cased tokens that mean "no value" and must not end up in the text field.
MISSING = {"", "unknown", "nan", "none"}


def clean(x):
    """Return ``x`` as a stripped string, or ``""`` when it carries no value.

    A value carries no value when ``pd.isna`` reports it as missing or when
    its stripped, lower-cased string form is one of the ``MISSING`` tokens.
    Any other input is converted with ``str`` and returned with surrounding
    whitespace removed (original casing is preserved).
    """
    if pd.isna(x):
        return ""
    text = str(x).strip()
    if text.lower() in MISSING:
        return ""
    return text

# Replace category separators with spaces so each category word becomes its
# own token. astype(str) turns missing categories into the literal "nan",
# which clean() later maps to an empty string via MISSING.
category_text = art["category"].astype(str)
for separator in ("&", ","):
    category_text = category_text.str.replace(separator, " ", regex=False)
art["category"] = category_text

# Concatenate the cleaned descriptive fields into one whitespace-normalised
# text column per article.
cols = ["name","name.1","color","size","audience","category"]
cleaned_fields = art[cols].map(clean)
joined_fields = cleaned_fields.agg(" ".join, axis=1)
art["text"] = joined_fields.str.replace(r"\s+"," ", regex=True).str.strip()

# Articles that end up with no text at all cannot be vectorised; drop them and
# renumber the rows 0..n-1 so positions line up with the embedding matrix rows.
has_text = art["text"].ne("")
sku_df = art.loc[has_text, ["sku","status","text"]].reset_index(drop=True)

In [94]:
# Show full cell contents (no truncation) so the whole text field is readable.
pd.set_option("display.max_colwidth", None)
sku_df.iloc[:50]

Unnamed: 0,sku,status,text
0,055522,active,Tröja Gjestal Garn dam Tröjor
1,055573,active,Luva Novita dam Mössor hattar Mönster
2,055575,active,Vantar Novita dam Vantar
3,055576,active,Benvärmare Novita dam Sockor strumpor
4,090962,active,Garn Drops Nepal Drops Design gul
5,090963,active,Garn Drops Nepal Drops Design rosa
6,090964,active,Garn Drops Nepal Drops Design turkos
7,095302,active,Garn Drops Nepal Drops Design beige
8,095304,active,Garn Drops Nepal Drops Design grå
9,095306,active,Garn Drops Nepal Drops Design mörkgrå


## 2. Vectorize with TF-IDF (Term Frequency × Inverse Document Frequency)

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over the assembled text field.
tfidf = TfidfVectorizer(
    token_pattern=r'(?u)\b\w+\b',  # also keeps single-character tokens
    ngram_range=(1, 2),            # unigrams + bigrams
    lowercase=True,
    strip_accents=None,            # keep å/ä/ö distinct from a/o
    min_df=1,                      # keep terms that occur in a single document
    # Optional knobs, left disabled:
    # sublinear_tf=True,
    # dtype=np.float32,  # memory saver
)
X_tfidf = tfidf.fit_transform(sku_df["text"])
X_tfidf.shape

(32945, 9066)

## 3. Singular Value Decomposition + L2 normalize

TF-IDF gives precise but sparse signals; SVD compresses & generalizes them.
L2-norm makes nearest-neighbor search stable and comparable across items.

In [87]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# Use up to 128 latent dimensions: at least 2, and otherwise no more than one
# fewer than the number of TF-IDF features.
n_features = X_tfidf.shape[1]
n_components = min(128, max(2, n_features - 1))

# Fixed random_state keeps the decomposition reproducible between runs.
svd = TruncatedSVD(n_components=n_components, random_state=0)
X_svd = svd.fit_transform(X_tfidf)

# Row-wise L2 normalisation (the default norm of `normalize`).
X_emb = normalize(X_svd)
X_emb.shape


(32945, 128)

## 4. Build 5-nearest neighbors (cosine) and return a small recs table


* **Cosine similarity** is the **cosine of the angle** between vectors $a$ and $b$:

  $$
  \text{cosine\_sim}(a,b)=\frac{a\cdot b}{\|a\|\;\|b\|}
  $$

  * $=1$ → same direction (very similar)
  * $=0$ → orthogonal (unrelated)
  * $=-1$ → opposite (impossible for raw TF-IDF vectors, whose values are ≥0, but possible for the SVD embeddings used here, whose components can be negative)

* After we **L2-normalize** vectors, cosine similarity becomes just the **dot product**.

* In scikit-learn, `metric="cosine"` actually computes **cosine distance**:

  $$
  \text{cosine\_dist} = 1 - \text{cosine\_sim}
  $$

  That’s why in the code we convert back with `similarity = 1 - d`.

Why we use it: it’s **scale-invariant** (ignores length), so two SKUs with similar wording but different text lengths still match well.


In [88]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Ask for one neighbor more than the 5 we keep, so the query item itself can
# be dropped from its own neighbor list.
nn = NearestNeighbors(metric="cosine", n_neighbors=min(6, len(sku_df)))
nn.fit(X_emb)

# dists holds cosine distances and idxs the row positions of the neighbors.
dists, idxs = nn.kneighbors(X_emb)

# sku_df has a fresh 0..n-1 index, so row positions map straight into this
# array; this avoids a label-based .loc lookup for every neighbor.
sku_values = sku_df["sku"].to_numpy()

rows = []
for i, (row_idx, row_dist) in enumerate(zip(idxs, dists)):
    # Drop the query item by index, not by position. Articles with identical
    # text have identical embeddings (distance 0), so the item itself is not
    # guaranteed to come back first; slicing [1:6] could then recommend a SKU
    # to itself and discard a genuine neighbor.
    not_self = row_idx != i
    top_idx = row_idx[not_self][:5]
    top_dist = row_dist[not_self][:5]
    for rank, (j, d) in enumerate(zip(top_idx, top_dist), start=1):
        rows.append({"sku": sku_values[i], "rec_rank": rank,
                     "rec_sku": sku_values[j],
                     "similarity": 1.0 - float(d)})  # cosine distance -> similarity

recs = pd.DataFrame(rows).sort_values(["sku","rec_rank"]).reset_index(drop=True)

In [89]:
# First 15 rows of the recommendations table (three source SKUs' worth).
recs.iloc[:15]

Unnamed: 0,sku,rec_rank,rec_sku,similarity
0,55522,1,210768-5254,0.887874
1,55522,2,210767-5254,0.8874
2,55522,3,210824-5254,0.884958
3,55522,4,210781-5254,0.883789
4,55522,5,170005,0.88036
5,55573,1,055575,0.983489
6,55573,2,202788,0.976443
7,55573,3,200022,0.971654
8,55573,4,290299,0.851977
9,55573,5,546922,0.829337


In [90]:
import numpy as np
import pandas as pd

target = "055522"  # <- change if needed

# Find the row for the target SKU. Fail with a readable message instead of a
# bare IndexError when the SKU is mistyped or was filtered out earlier.
target_rows = sku_df.index[sku_df["sku"] == target]
if len(target_rows) == 0:
    raise KeyError(f"SKU {target!r} not found in sku_df")
i = target_rows[0]

# neighbors (ask for 6 so that 5 remain once the target itself is dropped)
n = min(6, len(sku_df))
dists, idxs = nn.kneighbors(X_emb[i].reshape(1, -1), n_neighbors=n)
idxs, dists = idxs[0], dists[0]

# Drop the target by index (it is not guaranteed to be returned first when
# other articles share its exact text), then keep the top 5.
mask = idxs != i
nbr_idx = idxs[mask][:5]
nbr_sim = (1.0 - dists[mask])[:5]  # cosine distance -> similarity

# product fields to show next to each recommendation
cols = ["name","name.1","color","size","audience","category","price","status"]
details = art.drop_duplicates("sku")[["sku"] + [c for c in cols if c in art.columns]]

target_recs = (
    pd.DataFrame({
        "src_sku": target,
        "rec_rank": range(1, len(nbr_idx)+1),
        "rec_sku": sku_df.iloc[nbr_idx]["sku"].to_list(),
        "text_similarity": nbr_sim
    })
    .merge(details, left_on="rec_sku", right_on="sku", how="left")
    .drop(columns=["sku"])  # duplicate of rec_sku after the merge
)

# Backward-compatible alias: the table used to be bound to this name, which
# mentions a different SKU than `target` and is kept only for existing references.
recs_055573 = target_recs

target_recs


Unnamed: 0,src_sku,rec_rank,rec_sku,text_similarity,name,name.1,color,size,audience,category,price,status
0,55522,1,210768-5254,0.887874,Kabelstickad jumper,Åshild,gammalrosa,52/54,dam,Överdelar Tröjor,229.95,active
1,55522,2,210767-5254,0.8874,Stickad kofta,Åshild,gråmelerad,52/54,dam,Överdelar Koftor,,active
2,55522,3,210824-5254,0.884958,Kofta Ines,Åshild,hallon,52/54,dam,Överdelar Koftor,,active
3,55522,4,210781-5254,0.883789,Topp,Åshild,lila,52/54,dam,Överdelar Tunikor,,active
4,55522,5,170005,0.88036,Tröja,Permin,unknown,unknown,barn & ungdom,Tröjor Mönster,,active
