In [2]:
import pandas as pd
# Raw product catalogue; dtype='string' makes every column a pandas string column.
full_articles = pd.read_csv("../data/external/products.csv", dtype='string')

# Cleaned article table, restricted to rows whose forSale value is not missing.
articles_clean = pd.read_parquet("../data/processed/articles_clean.parquet")
articles_clean = articles_clean.query("forSale.notna()")

In [3]:
# Columns of articles_clean that are left out of the merged table; names that
# are not present are skipped silently because of errors='ignore'.
cols_to_drop = ['priceEUR', 'priceNOK', 'priceDKK', 'forSale', 'sizeId', 'brandId', 'categoryId']
# Columns taken from the raw product catalogue, matched on sku.
cols_to_add = ['description', 'color']

# Left join: every remaining articles_clean row is kept, and SKUs without a
# match in full_articles get missing description/color.
articles = (
    articles_clean
    .drop(columns=cols_to_drop, errors='ignore')
    .merge(full_articles[['sku', *cols_to_add]], how='left', on='sku')
)


In [4]:
# Fraction of missing values in each column.
articles.isna().mean()

sku            0.000000
groupId        0.000000
name           0.004325
brand          0.000000
category       0.000000
priceSEK       0.000000
description    0.006049
color          0.027823
dtype: float64

Fixing data issues: rows with missing names and repeated color values

In [5]:
# Rows whose name is missing, displayed for inspection.
missing_name_mask = articles["name"].isna()
na_name = articles.loc[missing_name_mask]
na_name

Unnamed: 0,sku,groupId,name,brand,category,priceSEK,description,color
2619,280119-42,280117,,Embla of Sweden,unknown,1249,,
2620,280119-41,280117,,Embla of Sweden,unknown,1249,,
2621,280119-40,280117,,Embla of Sweden,unknown,1249,,
2622,280119-39,280117,,Embla of Sweden,unknown,1249,,
2623,280119-38,280117,,Embla of Sweden,unknown,1249,,
...,...,...,...,...,...,...,...,...
33061,973036,973036,,unknown,unknown,72,,
33062,262152-L,262150,,Swegmark,unknown,338,,
33063,262144-110BCD,262142,,Glamorise,unknown,500,,
33064,262137-XL,262135,,Swegmark,unknown,190,,


In [6]:
# Keep only rows that have a name, then renumber the index from 0.
articles = articles.dropna(subset=["name"]).reset_index(drop=True)

In [7]:
# Print each distinct raw color value via repr() so that stray whitespace and
# repeated comma-separated entries are visible.
distinct_colors = articles['color'].unique()
for color_value in distinct_colors:
    print(f"{color_value!r}")

<NA>
'Multi,Multi'
'Grå,Grå,Grå'
'Vit,Vit,Vit'
'Svart,Svart,Svart'
'Svart'
'Röd,Röd'
'Grön,Grön'
'Brun,Brun'
'Vit,Vit'
'Linne,Linne'
'Gråblå,Gråblå'
'Sand,Sand'
'Mörkgrön,Mörkgrön'
'Röd,Röd,Röd'
'Rosa,Rosa'
'Blå,Blå'
'Svart,Svart'
'Vinröd,Vinröd'
'Cognac,Cognac'
'Grå,Grå'
'Vit'
'Beige'
'Champagne'
'Off-white,Off-white'
'Antracit'
'Ljung,Ljung'
'Beige,Beige'
'Mörkblå,Mörkblå'
'Vit,Vit,Vit,Vit'
'Ljusblå,Ljusblå'
'Creme,Creme'
'Linne,Linne,Linne,Linne'
'Gul,Gul'
'Turkos,Turkos'
'Rosa'
'Off-white'
'Ljusgrå'
'Mellanblå,Mellanblå'
'Marin,Marin'
'Multi,Multi,Multi'
'Lila,Lila'
'Indigo,Indigo'
'Rost,Rost'
'Brun'
'Linne'
'Marin'
'Grön,Grön,Grön'
'Grön'
'Beige,Beige,Beige'
'Brun,Brun,Brun'
'Röd'
'Blå'
'Lila'
'Ljung'
'Vit/beige,Vit/beige'
'Mörkgrå'
'Mörkgrön'
'Ljusgrå,Ljusgrå'
'Rost,Rost,Rost,Rost,Rost,Rost'
'Indigo,Indigo,Indigo,Indigo,Indigo,Indigo'
'Rosa,Rosa,Rosa,Rosa,Rosa,Rosa'
'Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå'
'Mörkgrå,Mörkgrå'
'Off-white,Off-white,Off-white,Off-white,Off-wh

In [8]:
# Helper that removes repeated color names from a single 'color' value
def dedup_color(val):
    """Collapse repeated entries in a comma-separated color string.

    Missing values (as judged by ``pd.isna``) are returned unchanged.
    Anything else is converted to ``str`` and split on commas; each piece is
    stripped of surrounding whitespace, empty pieces are discarded, and only
    the first occurrence of each remaining piece is kept, in original order.

    Returns:
        The surviving pieces joined with ``','``, or ``pd.NA`` when no
        non-empty piece remains.
    """
    if pd.isna(val):
        return val
    pieces = (piece.strip() for piece in str(val).split(','))
    # dict keys keep insertion order, so this retains the first occurrence
    # of every non-empty piece.
    unique_pieces = dict.fromkeys(piece for piece in pieces if piece)
    if not unique_pieces:
        return pd.NA
    return ','.join(unique_pieces)

# Replace the color column with its de-duplicated version, value by value.
articles = articles.assign(color=articles['color'].apply(dedup_color))


Recommendations use `groupId` instead of `sku`, so the SKU-level rows are collapsed to one row per `groupId`.

In [9]:
# Collapse to one row per groupId: every column except color takes its first non-null value
# within the group (rows are sorted by sku first), and color becomes a sorted list of the
# distinct colors found across the merged rows
def merge_colors(series):
    """Combine the color values of one group into a sorted list of distinct colors.

    Each non-missing value is converted to ``str`` and split on commas, so a
    value that still holds several colors (for example ``"Röd,Blå"``) adds
    each color separately instead of being kept as one combined entry.
    Tokens are stripped of surrounding whitespace, and tokens that are empty
    or equal (case-insensitively) to "unknown", "nan" or "none" are dropped.

    Args:
        series: Iterable of color values; entries may be missing.

    Returns:
        A sorted list of the distinct color tokens, or an empty list when
        nothing usable remains.
    """
    placeholders = {"", "unknown", "nan", "none"}
    merged = set()
    for value in series:
        if pd.isna(value):
            continue
        # Split so multi-color strings are de-duplicated against single colors.
        for token in str(value).split(","):
            token = token.strip()
            if token.lower() not in placeholders:
                merged.add(token)
    return sorted(merged)

# Order rows by SKU so the "first" aggregation below picks values in SKU order.
articles = articles.sort_values(by="sku")

# One aggregation rule per column other than sku: colors are pooled with
# merge_colors, every other column uses pandas' "first" aggregation.
agg_spec = {}
for column in articles.columns:
    if column == "sku":
        continue
    agg_spec[column] = merge_colors if column == "color" else "first"

articles = articles.groupby("groupId", as_index=False).agg(agg_spec)


In [10]:

# Write the per-groupId table to disk; index=False keeps the DataFrame index out of the file.
articles.to_parquet(
    "../data/processed/articles_for_recs.parquet",
    index=False,
)
