In [40]:
import pandas as pd
# Full product catalogue; every column is loaded with the pandas "string" dtype.
full_articles = pd.read_csv("../data/external/products.csv", dtype="string")

# Cleaned articles, restricted to rows whose `forSale` value is not missing.
articles_clean = pd.read_parquet("../data/processed/articles_clean.parquet")
articles_clean = articles_clean.query("forSale.notna()")

In [None]:
# Build `articles`: articles_clean minus the price/availability/id columns,
# enriched with the descriptive columns looked up in full_articles by sku.
# errors='ignore' tolerates any of the dropped columns being absent.
cols_to_drop = ['priceEUR', 'priceNOK', 'priceDKK', 'forSale', 'sizeId', 'brandId', 'categoryId']
cols_to_add = ['description', 'color']

articles = (
    articles_clean
    .drop(columns=cols_to_drop, errors='ignore')
    # Left join: every row of articles_clean is kept, matched or not.
    .merge(full_articles[['sku', *cols_to_add]], on='sku', how='left')
)


In [44]:
# Share of missing values per column (0.0 = none missing, 1.0 = all missing)
articles.isna().mean()

sku            0.000000
groupId        0.000000
name           0.004318
brand          0.000000
category       0.000000
priceSEK       0.000000
description    0.006039
color          0.027778
dtype: float64

Removing data bugs: rows with missing names and repeated color values

In [45]:
# Inspect the rows that have no product name
na_name = articles.loc[articles["name"].isna()]
na_name

Unnamed: 0,sku,groupId,name,brand,category,priceSEK,description,color
2682,280119-40,280117,,Embla of Sweden,unknown,1249,,
2683,280119-39,280117,,Embla of Sweden,unknown,1249,,
2684,280119-38,280117,,Embla of Sweden,unknown,1249,,
2685,280119-37,280117,,Embla of Sweden,unknown,1249,,
2686,280119-36,280117,,Embla of Sweden,unknown,1249,,
...,...,...,...,...,...,...,...,...
33115,973036,973036,,unknown,unknown,72,,
33116,262130-80D,262130,,Swegmark,unknown,881,,
33117,262152-L,262150,,Swegmark,unknown,337,,
33118,262137-XL,262135,,Swegmark,unknown,190,,


In [46]:
# Keep only the rows that have a name, then renumber the index from 0
articles = articles.dropna(subset=["name"]).reset_index(drop=True)

In [47]:
# List every distinct color value on its own line so the output is not truncated;
# the repr form makes stray whitespace and missing values visible.
for color_value in articles['color'].unique():
    print(f"{color_value!r}")


<NA>
'Svart'
'Off-white'
'Svart,Svart'
'Grå,Grå'
'Vit,Vit'
'Rosa,Rosa'
'Grön,Grön'
'Blå,Blå'
'Vinröd,Vinröd'
'Röd,Röd'
'Cognac,Cognac'
'Linne,Linne'
'Brun,Brun'
'Gråblå,Gråblå'
'Champagne'
'Beige'
'Off-white,Off-white'
'Indigo,Indigo'
'Sand,Sand'
'Grön,Grön,Grön'
'Multi,Multi'
'Vit,Vit,Vit'
'Multi,Multi,Multi'
'Mörkgrön,Mörkgrön'
'Rost,Rost'
'Gul,Gul'
'Röd,Röd,Röd'
'Vit'
'Antracit'
'Creme,Creme'
'Beige,Beige'
'Ljusblå,Ljusblå'
'Ljung,Ljung'
'Turkos,Turkos'
'Vit,Vit,Vit,Vit'
'Linne,Linne,Linne,Linne'
'Mörkblå,Mörkblå'
'Grå,Grå,Grå'
'Svart,Svart,Svart'
'Marin,Marin'
'Mellanblå,Mellanblå'
'Ljusgrå'
'Lila,Lila'
'Rosa'
'Linne'
'Brun'
'Marin'
'Grön'
'Beige,Beige,Beige'
'Brun,Brun,Brun'
'Blå'
'Röd'
'Lila'
'Ljung'
'Ljusgrå,Ljusgrå'
'Vit/beige,Vit/beige'
'Mörkgrå,Mörkgrå'
'Aqua,Aqua'
'Ljusbrun,Ljusbrun'
'Gammalrosa,Gammalrosa'
'Rost,Rost,Rost,Rost,Rost,Rost'
'Kaki,Kaki'
'Off-white,Off-white,Off-white,Off-white,Off-white,Off-white'
'Indigo,Indigo,Indigo,Indigo,Indigo,Indigo'
'Rosa,Rosa,Rosa,Rosa

In [48]:
# Collapse repeated color names inside a single 'color' value
def dedup_color(val):
    """Remove repeated names from a comma-separated color string.

    The value is split on commas, each piece is stripped of surrounding
    whitespace, empty pieces are discarded, and only the first occurrence
    of each name is kept (original order preserved).

    Parameters
    ----------
    val : object
        A color value such as ``'Svart,Svart'``. Missing values
        (anything ``pd.isna`` reports as missing) are returned unchanged.

    Returns
    -------
    str or missing value
        The de-duplicated names joined with ``','``; ``pd.NA`` when no
        non-empty name remains; the input itself when it is missing.
    """
    if pd.isna(val):
        return val

    stripped = (piece.strip() for piece in str(val).split(','))
    # dict keys keep insertion order, so this drops repeats but keeps first-seen order
    unique_names = dict.fromkeys(name for name in stripped if name)

    if not unique_names:
        return pd.NA
    return ','.join(unique_names)

# Clean each value of the 'color' column element-wise with dedup_color
# (missing values are returned unchanged by that function).
articles['color'] = articles['color'].apply(dedup_color)


In [49]:
# Deduplicate so that for each groupId, keep the first row for all columns except 'color', 
# which should be a list of all colors from merged rows (excluding missing/unknown/nan/none).
def merge_colors(series):
    """Collect the distinct, usable color strings of one group.

    Each non-missing value is converted to ``str`` and stripped. Values whose
    lower-cased form is empty, ``"unknown"``, ``"nan"`` or ``"none"`` are
    skipped. Each remaining value is kept as one entry; comma-separated
    strings are not split here.

    Parameters
    ----------
    series : iterable
        The color values to merge (a pandas Series when used in ``groupby().agg``).

    Returns
    -------
    list of str
        The distinct values in sorted order; an empty list when none qualify.
    """
    placeholders = {"", "unknown", "nan", "none"}
    kept = set()
    for raw in series:
        if pd.isna(raw):
            continue
        text = str(raw).strip()
        if text.lower() in placeholders:
            continue
        kept.add(text)
    return sorted(kept)

# Collapse to one row per groupId.
# - Rows are sorted by sku first so that the value "first" picks is deterministic.
# - 'color' is aggregated with merge_colors; every other column uses "first".
#   Note: pandas' groupby "first" returns the first non-null entry of each column,
#   so values in one output row may come from different input rows.
# - sku is excluded from the aggregation and therefore absent from the result.
articles = (
    articles
    .sort_values("sku")
    .groupby("groupId", as_index=False)
    .agg({
        column: ("first" if column != "color" else merge_colors)
        for column in articles.columns
        if column != "sku"
    })
)


In [50]:

# Write the final articles table to parquet; the DataFrame index is not stored.
articles.to_parquet(path="../data/processed/articles_for_recs.parquet", index=False)
