In [21]:
import pandas as pd
# Raw product export; every column is read with the pandas string dtype.
full_articles = pd.read_csv("../data/external/products.csv", dtype='string')

# Cleaned articles, restricted to the rows whose `forSale` value is not missing.
articles_clean = pd.read_parquet("../data/processed/articles_clean.parquet")
articles_clean = articles_clean[articles_clean["forSale"].notna()]

In [22]:
# Columns of articles_clean that are removed before the merge
# (errors='ignore' below means a column that is absent is simply skipped).
cols_to_drop = ['priceEUR', 'priceNOK', 'priceDKK', 'forSale', 'sizeId', 'brandId', 'categoryId']
# Columns taken from full_articles and attached to each row by matching on `sku`.
cols_to_add = ['description', 'color']

# Left merge: every row of articles_clean is kept, whether or not full_articles has a matching sku.
articles = (
    articles_clean
    .drop(columns=cols_to_drop, errors='ignore')
    .merge(full_articles[['sku', *cols_to_add]], on='sku', how='left')
)


In [23]:
# Fraction of missing values per column after the merge.
articles.isna().mean()

sku            0.000000
groupId        0.000000
name           0.002362
brand          0.000000
audience       0.011809
audienceId     0.011809
category       0.000000
priceSEK       0.000000
description    0.004088
color          0.025617
dtype: float64

Removing bugs: drop articles that have no name

In [24]:
# Rows whose `name` is missing, shown for inspection before they are removed.
na_name = articles.loc[articles["name"].isna()]
na_name

Unnamed: 0,sku,groupId,name,brand,audience,audienceId,category,priceSEK,description,color
2551,280119-37,280117,,Embla of Sweden,,,unknown,1249,,
2552,280119-38,280117,,Embla of Sweden,,,unknown,1249,,
2553,280119-39,280117,,Embla of Sweden,,,unknown,1249,,
2554,280119-40,280117,,Embla of Sweden,,,unknown,1249,,
2555,280119-41,280117,,Embla of Sweden,,,unknown,1249,,
...,...,...,...,...,...,...,...,...,...,...
13103,270608-4042,270608,,Damella,dam,6,"Sovplagg,Pyjamas",899,,Grön
13104,270608-3638,270608,,Damella,dam,6,"Sovplagg,Pyjamas",899,,Grön
19720,551607,505254,,Redlunds,,,unknown,239,,
33022,262130-80D,262130,,Swegmark,,,unknown,874,,


In [25]:
# Keep only the rows that have a name and renumber the index from 0.
articles = articles.dropna(subset=["name"]).reset_index(drop=True)

In [26]:
# Print the repr of every distinct raw color value, one per line, so that
# repeated names and stray whitespace are visible.
distinct_colors = articles['color'].unique()
for c in distinct_colors:
    print(f"{c!r}")

<NA>
'Vit,Vit,Vit'
'Grå,Grå,Grå'
'Multi,Multi'
'Svart,Svart,Svart'
'Vit,Vit'
'Vit'
'Röd,Röd,Röd'
'Svart'
'Beige,Beige'
'Grå,Grå'
'Svart,Svart'
'Marin,Marin'
'Grön,Grön'
'Mellanblå,Mellanblå'
'Rosa,Rosa'
'Off-white'
'Ljusgrå'
'Beige'
'Rosa'
'Röd,Röd'
'Linne'
'Brun'
'Blå,Blå'
'Ljung,Ljung'
'Linne,Linne'
'Ljusblå,Ljusblå'
'Mörkblå,Mörkblå'
'Sand,Sand'
'Brun,Brun'
'Vit,Vit,Vit,Vit'
'Linne,Linne,Linne,Linne'
'Gul,Gul'
'Turkos,Turkos'
'Multi,Multi,Multi'
'Mörkgrön,Mörkgrön'
'Lila,Lila'
'Rost,Rost'
'Champagne'
'Gråblå,Gråblå'
'Off-white,Off-white'
'Indigo,Indigo'
'Antracit'
'Vinröd,Vinröd'
'Cognac,Cognac'
'Creme,Creme'
'Mörkgrå,Mörkgrå'
'Off-white,Off-white,Off-white,Off-white'
'Ljusgrå,Ljusgrå'
'Aqua,Aqua'
'Ljusbrun,Ljusbrun'
'Gammalrosa,Gammalrosa'
'Off-white,Off-white,Off-white,Off-white,Off-white,Off-white'
'Kaki,Kaki'
'Indigo,Indigo,Indigo,Indigo,Indigo,Indigo'
'Rosa,Rosa,Rosa,Rosa,Rosa,Rosa'
'Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå'
'Rost,Rost,Rost,Rost,Rost,Rost'
'Vit/beige,Vit

In [27]:
# Deduplicate repeated color names in the 'color' column
def dedup_color(val):
    """Collapse repeated names inside a comma-separated color string.

    A missing value is returned unchanged. Any other value is converted to
    ``str`` and split on commas; each piece is stripped of surrounding
    whitespace, empty pieces are discarded, and only the first occurrence of
    each remaining piece is kept, in its original order.

    Returns the surviving pieces joined with ``','``, or ``pd.NA`` when no
    piece survives.
    """
    if pd.isna(val):
        return val
    pieces = (piece.strip() for piece in str(val).split(','))
    # dict keys are unique and keep insertion order, which gives
    # first-seen-order de-duplication in a single pass.
    unique_pieces = dict.fromkeys(piece for piece in pieces if piece)
    if not unique_pieces:
        return pd.NA
    return ','.join(unique_pieces)

# Replace the `color` column with its de-duplicated form, one value at a time.
articles = articles.assign(color=articles['color'].apply(dedup_color))


Recommendations do not use `sku`; they are keyed on `groupId` instead, so the SKU-level rows are collapsed to one row per `groupId`.

In [28]:
# Deduplicate so that for each groupId, keep the first row for all columns except color, 
# which becomes a list of all colors from merged rows
def merge_colors(series):
    """Combine the color values of several rows into one sorted list of unique names.

    Every non-missing value is treated as a comma-separated list of color
    names, so a value such as ``'Vit,Svart'`` contributes ``'Vit'`` and
    ``'Svart'`` as separate entries instead of one combined string. Each name
    is stripped of surrounding whitespace; empty names and the placeholders
    ``"unknown"``, ``"nan"`` and ``"none"`` (compared case-insensitively) are
    discarded.

    Parameters
    ----------
    series : iterable
        The color values to merge (for example the `color` values of one group).

    Returns
    -------
    list of str
        The distinct color names in ascending sorted order; an empty list when
        no usable name is found.
    """
    placeholders = {"", "unknown", "nan", "none"}
    colors = set()
    for value in series:
        if pd.isna(value):
            continue
        # Split so that multi-color cells are merged name by name rather than
        # kept as one opaque "a,b" string.
        for token in str(value).split(','):
            name = token.strip()
            if name.lower() not in placeholders:
                colors.add(name)
    return sorted(colors)

# Aggregation per column: `color` is combined with merge_colors, every other
# column except `sku` uses the groupby "first" aggregation.
agg_spec = {
    column: (merge_colors if column == "color" else "first")
    for column in articles.columns
    if column != "sku"
}

# Sort by sku first so that "first" is evaluated in sku order within each group.
articles = (
    articles
    .sort_values("sku")
    .groupby("groupId", as_index=False)
    .agg(agg_spec)
)


In [29]:
# Fraction of missing values per column after collapsing to one row per groupId.
articles.isna().mean(axis="index")

groupId        0.000000
name           0.000000
brand          0.000000
audience       0.091120
audienceId     0.091120
category       0.000000
priceSEK       0.000000
description    0.009286
color          0.000000
dtype: float64

In [30]:

# Write the grouped table to parquet; the DataFrame index is not written.
articles.to_parquet(
    "../data/processed/articles_for_recs.parquet",
    index=False,
)
