## Assemble a DS

In [140]:
import pandas as pd

art = pd.read_csv(
    "../data/processed/articles_clean.csv",
    usecols=["sku", "groupId", "name", "name.1", "status", "audience", "category"],
    dtype=str
)
prices = pd.read_csv(
    "../data/processed/transactions_clean.csv",
    usecols=["sku", "price_sek"],
    dtype={"sku": str}
)
art = art.query("status in ['active']").merge(prices, on="sku", how="left")

In [141]:
#proportion of price in art?
art['price_sek'].isna().mean()

np.float64(0.1322079811382277)

In [142]:
#show a few rows where price in not na
art[art['price_sek'].notna()].head()

Unnamed: 0,sku,groupId,status,name,name.1,audience,category,price_sek
38,101702,190041,active,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",83.555425
39,101702,190041,active,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",83.555425
40,101702,190041,active,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
41,101702,190041,active,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0
42,101702,190041,active,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",79.0


In [143]:
# Convert 'price' to numeric, coerce errors to NaN
art['price_sek'] = pd.to_numeric(art['price_sek'], errors='coerce')
# Set all prices below 1 to NaN
art.loc[art['price_sek'] < 1, 'price_sek'] = pd.NA


In [144]:
#proportion of price in art?
art['price_sek'].isna().mean()

np.float64(0.13256540169053652)

In [145]:
art['price_sek'].describe()

count    138335.000000
mean        310.056316
std         275.890730
min           3.000000
25%         139.884925
50%         224.379175
75%         449.000000
max        5556.186000
Name: price_sek, dtype: float64

In [146]:
# Create price buckets based on the distribution of price_sek, using 6 buckets
# (min: ~3.6, 25%: ~375, 50%: ~496, 75%: ~596, max: ~5242)
price_bins = [0, 100, 200, 400, 1000, 2000, float('inf')]
price_labels = [
    'Budget',        # 0-100
    'Value',         # 100-200
    'Popular',       # 200-400
    'Premium',       # 400-1000
    'Luxury',        # 1000-2000
    'Exclusive'      # 2000+
]
art['priceband'] = pd.cut(art['price_sek'], bins=price_bins, labels=price_labels, include_lowest=True)
art['priceband'] = art['priceband'].astype(object)
art.loc[art['price_sek'].isna(), 'priceband'] = 'unknown'

In [147]:
# Drop rows where priceband is 'unknown', then drop the price_sek and status columns
art = art[art["priceband"] != "unknown"].drop(columns=["price_sek", "status"])

In [148]:
art

Unnamed: 0,sku,groupId,name,name.1,audience,category,priceband
38,101702,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
39,101702,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
40,101702,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
41,101702,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
42,101702,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
...,...,...,...,...,...,...,...
159351,590841,590841,5 meterklipp Plastad frotté,Linea,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular
159352,590841,590841,5 meterklipp Plastad frotté,Linea,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular
159353,590841,590841,5 meterklipp Plastad frotté,Linea,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Premium
159354,590841,590841,5 meterklipp Plastad frotté,Linea,hemmet,"Bäddtillbehör,Bädd,Inkontinens,Bädd (linea)",Popular


## only one group_id should remain

In [150]:
# Deduplicate so only one row per groupId remains, keeping the one with the most information (fewest NA/unknown), and drop sku
def info_score(row):
    # Count number of non-missing, non-unknown fields (excluding sku and groupId)
    fields = [col for col in art.columns if col not in ("sku", "groupId")]
    score = 0
    for col in fields:
        val = row[col]
        if pd.isna(val):
            continue
        sval = str(val).strip().lower()
        if sval in {"", "unknown", "nan", "none"}:
            continue
        score += 1
    return score

art = art.assign(_info_score=art.apply(info_score, axis=1))
art = art.sort_values("_info_score", ascending=False)
art = art.drop_duplicates(subset=["groupId"], keep="first").drop(columns=["_info_score", "sku"]).reset_index(drop=True)


In [151]:
art

Unnamed: 0,groupId,name,name.1,audience,category,priceband
0,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget
1,263855,Framknäppt bh utan bygel med Magic Lift funktion,Glamorise,dam,"Bh utan bygel,Bh,Underkläder",Premium
2,260596,Bh utan bygel Stars,Swegmark,dam,"Bh utan bygel,Bh,Underkläder",Premium
3,263988,Stödknästrumpor i nylon med spets,Funq Wear,dam,"Stödstrumpor,Underkläder",Value
4,262287,Knäskydd,Good Living,generic,Stödartiklar,Value
...,...,...,...,...,...,...
1117,290207,Väska till Lets Go Out,Trust Care,unknown,unknown,Popular
1118,106065,Fyndpaket Stickgarn,Knittingroom,unknown,unknown,Premium
1119,260551,Maxitrosa,Walking,unknown,unknown,Popular
1120,261069,Trosa Essentials Brief,Anita,unknown,unknown,Popular


## 1. Build a clean text field for vectorization

In [152]:
import pandas as pd, unicodedata, re

MISSING = {"", "unknown", "nan", "none"}

def clean(x):
    if pd.isna(x): return ""
    s = str(x).strip()
    return "" if s.lower() in MISSING else s

def canon(s):
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("&", " ").replace(",", " ")
    s = re.sub(r"\u00A0", " ", s)
    s = re.sub(r"[\u2010-\u2015\u2212\-]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def to_token(s):
    return re.sub(r"\s+", "_", s.strip())

def tidy_underscores(s):
    s = re.sub(r"\s*_\s*", "_", s)
    return re.sub(r"_+", "_", s)

def norm_category(cat):
    if pd.isna(cat): return ""
    return " ".join(to_token(clean(p)) for p in str(cat).split(",") if clean(p))

def build_text(row):
    names = [to_token(clean(row.get(c, ""))) for c in ("name", "name.1") if clean(row.get(c, ""))]
    others = [
        clean(row.get("color", "")),
        clean(row.get("audience", "")),
        norm_category(row.get("category", "")),
        clean(row.get("priceband", "")),
    ]
    s = " ".join(names*3 + [o for o in others if o])
    return tidy_underscores(canon(s))

art["text"] = art.apply(build_text, axis=1)
group_df = art.loc[art["text"].ne(""), ["groupId", "text"]].reset_index(drop=True)

In [153]:
pd.set_option('display.max_colwidth', None)
group_df.head()

Unnamed: 0,groupId,text
0,190041,Innerkudde Linea Innerkudde Linea Innerkudde Linea hemmet Kuddar Innerkuddar Bädd_(linea) Budget
1,263855,Framknäppt_bh_utan_bygel_med_Magic_Lift_funktion Glamorise Framknäppt_bh_utan_bygel_med_Magic_Lift_funktion Glamorise Framknäppt_bh_utan_bygel_med_Magic_Lift_funktion Glamorise dam Bh_utan_bygel Bh Underkläder Premium
2,260596,Bh_utan_bygel_Stars Swegmark Bh_utan_bygel_Stars Swegmark Bh_utan_bygel_Stars Swegmark dam Bh_utan_bygel Bh Underkläder Premium
3,263988,Stödknästrumpor_i_nylon_med_spets Funq_Wear Stödknästrumpor_i_nylon_med_spets Funq_Wear Stödknästrumpor_i_nylon_med_spets Funq_Wear dam Stödstrumpor Underkläder Value
4,262287,Knäskydd Good_Living Knäskydd Good_Living Knäskydd Good_Living generic Stödartiklar Value


## 2. Vectorize with TF-IDF (Term Frequency × Inverse Document Frequency)

In [154]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2), #use unigrams + bigrams
    min_df=1,
    strip_accents=None,   # keep å/ä/ö
    # sublinear_tf=True,  # optional
    # dtype=np.float32,   # optional memory saver
    token_pattern=r'(?u)\b\w+\b'
)
X_tfidf = tfidf.fit_transform(group_df["text"])
X_tfidf.shape

(1122, 3980)

## 3. Singular Value Decomposition + L2 normalize

TF-IDF gives precise but sparse signals; SVD compresses & generalizes them.
L2-norm makes nearest-neighbor search stable and comparable across items.

In [155]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

n_components = min(128, max(2, X_tfidf.shape[1]-1)) 
svd = TruncatedSVD(n_components=n_components, random_state=0)
X_svd = svd.fit_transform(X_tfidf)
X_emb = normalize(X_svd)
X_emb.shape


(1122, 128)

## 4. Build 10-nearest neighbors (cosine) and return a small recs table


* It’s the **cosine of the angle** between vectors $a$ and $b$:

  $$
  \text{cosine\_sim}(a,b)=\frac{a\cdot b}{\|a\|\;\|b\|}
  $$

  * $=1$ → same direction (very similar)
  * $=0$ → orthogonal (unrelated)
  * $=-1$ → opposite (rare with TF-IDF since values are ≥0)

* After we **L2-normalize** vectors, cosine similarity becomes just the **dot product**.

* In scikit-learn, `metric="cosine"` actually computes **cosine distance**:

  $$
  \text{cosine\_dist} = 1 - \text{cosine\_sim}
  $$

  That’s why in the code we convert back with `similarity = 1 - d`.

Why we use it: it’s **scale-invariant** (ignores length), so two SKUs with similar wording but different text lengths still match well.


In [156]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

POOL = min(11, len(group_df))
nn = NearestNeighbors(metric="cosine", n_neighbors=POOL).fit(X_emb)
dists, idxs = nn.kneighbors(X_emb)

groupid_arr = group_df["groupId"].values

# Build all recommendations, then filter out self-recommendations
src_groupids = np.repeat(groupid_arr, POOL)
rec_groupids = groupid_arr[idxs.ravel()]
similarities = 1 - dists.ravel()

recs = pd.DataFrame({
    "src_groupId": src_groupids,
    "rec_groupId": rec_groupids,
    "similarity": similarities
})

# Remove self-recommendations
recs = recs[recs["src_groupId"] != recs["rec_groupId"]].copy()

# For each src_groupId, keep only the top POOL-1 recommendations (in case of ties or accidental duplicates)
recs["rec_rank"] = (
    recs.groupby("src_groupId")["similarity"]
    .rank(method="first", ascending=False)
    .astype(int)
)
recs = recs[recs["rec_rank"] <= POOL-1]

recs.head(10)

Unnamed: 0,src_groupId,rec_groupId,similarity,rec_rank
1,190041,521879,0.965562,1
2,190041,576249,0.964736,2
3,190041,576231,0.964429,3
4,190041,576223,0.963842,4
5,190041,522710,0.957764,5
6,190041,579009,0.929578,6
7,190041,503386,0.912374,7
8,190041,525937,0.911209,8
9,190041,503380,0.908524,9
10,190041,507707,0.904658,10


In [157]:
recs.sample(15, random_state=42)

Unnamed: 0,src_groupId,rec_groupId,similarity,rec_rank
10480,261551,262045,0.609698,8
7894,290298,294140,0.456657,7
5799,420293,420295,0.830767,2
4623,470112,490530,0.997813,3
7262,270614,270567,0.969417,2
11145,260276,210781,0.758801,2
7814,290245,290230,0.998826,4
6230,270249,270614,0.968618,4
7385,270577,294819,0.308128,4
7635,290290,290292,0.999812,1


In [158]:
target = "190041"  # example groupId

# pick columns and collapse art to one row per groupId
keep = [c for c in ["groupId","name","name.1","color","audience","category","priceband"] if c in art.columns]
details = art.drop_duplicates("groupId")[keep].copy()

view = (
    recs.loc[recs["src_groupId"].eq(target), ["rec_groupId","rec_rank","similarity"]]
        .merge(details, left_on="rec_groupId", right_on="groupId", how="left")
        .drop(columns=["groupId"])  # joined key
        [["rec_rank","similarity","rec_groupId"] + [c for c in keep if c != "groupId"]]
        .sort_values("rec_rank")
        .reset_index(drop=True)
)

view


Unnamed: 0,rec_rank,similarity,rec_groupId,name,name.1,audience,category,priceband
0,1,0.965562,521879,Innerkudde Rund,Linea,hemmet,"Innerkuddar,Bädd (linea)",Value
1,2,0.964736,576249,Sovkudde Hög,Linea,hemmet,"Innerkuddar,Bädd (linea),Kuddar",Value
2,3,0.964429,576231,Sovkudde Medium,Linea,hemmet,"Innerkuddar,Bädd (linea),Kuddar",Value
3,4,0.963842,576223,Sovkudde Låg,Linea,hemmet,"Innerkuddar,Bädd (linea)",Value
4,5,0.957764,522710,Sovkudde Hotell,Linea,hemmet,"Täcken,Bäddtillbehör,Bädd (linea),Kuddar,Innerkuddar",Premium
5,6,0.929578,579009,Täcke Kingsize,Linea,hemmet,"Täcken,Bädd (linea)",Premium
6,7,0.912374,503386,Örngott Extra stor,Linea,hemmet,"Lakan & örngott,Bädd (linea)",Budget
7,8,0.911209,525937,Täcke ultralätt,Linea,hemmet,"Täcken,Bäddtillbehör,Bädd (linea)",Premium
8,9,0.908524,503380,Örngott,Linea,hemmet,"Lakan & örngott,Bädd (linea)",Budget
9,10,0.904658,507707,Örngott vita,Linea,hemmet,"Lakan & örngott,Bädd (linea)",Budget


In [159]:
# Visualize the recommendations for the target itself (i.e., show the row for the target)
view_target = (
    details.loc[details["groupId"] == target]
        .assign(rec_rank=1, similarity=1.0, rec_groupId=target)
        [["rec_rank","similarity","rec_groupId"] + [c for c in keep if c != "groupId"]]
        .reset_index(drop=True)
)
view_target


Unnamed: 0,rec_rank,similarity,rec_groupId,name,name.1,audience,category,priceband
0,1,1.0,190041,Innerkudde,Linea,hemmet,"Kuddar,Innerkuddar,Bädd (linea)",Budget


In [162]:
out = (recs
       .sort_values(['src_groupId','rec_rank'])
       .groupby('src_groupId')['rec_groupId']
       .apply(list)
       .reset_index(name='recs'))

out.to_csv("../data/predictions/vector_similarity_recommendations.csv", index=False)


In [163]:
out

Unnamed: 0,src_groupId,recs
0,106065,"[292219, 242305, 240184, 294827, 338160, 338152, 241208, 294819, 240153, 470021]"
1,144001,"[261631, 261632, 260303, 147106, 261589, 260477, 261573, 260577, 146601, 262118]"
2,146601,"[260303, 260477, 261573, 261589, 261632, 260577, 147106, 262118, 175701, 261873]"
3,147106,"[260477, 260577, 261573, 261632, 261589, 260303, 146601, 262118, 261468, 261187]"
4,175701,"[261463, 261626, 261574, 261591, 261873, 261585, 263000, 261567, 267097, 146601]"
...,...,...
1117,590629,"[598005, 520039, 531005, 551094, 552001, 548001, 531301, 579838, 579830, 531327]"
1118,590833,"[590841, 530202, 530206, 590838, 535338, 500355, 537323, 500389, 525938, 525937]"
1119,590838,"[530206, 530202, 590841, 590833, 535338, 525938, 537323, 500389, 500355, 525937]"
1120,590841,"[590833, 590838, 530206, 530202, 535338, 500355, 500389, 525938, 537323, 525937]"
