In [68]:
import pandas as pd
# Raw product export; every column is read with the pandas 'string' dtype
full_articles = pd.read_csv("../data/external/products.csv", dtype='string')

# Cleaned article table, restricted to the rows that have a forSale value
articles_clean = pd.read_parquet("../data/processed/articles_clean.parquet")
articles_clean = articles_clean.loc[articles_clean["forSale"].notna()]

In [69]:
# Columns of articles_clean that are not carried into the merged table
# (errors='ignore' below means absent names are skipped rather than raising)
cols_to_drop = ['priceEUR', 'priceNOK', 'priceDKK', 'forSale', 'sizeId', 'brandId', 'categoryId']
# Columns pulled in from the raw product export, matched on sku
cols_to_add = ['description', 'color']

# Left join: every remaining articles_clean row is kept, with description/color
# left missing where full_articles has no matching sku
articles = pd.merge(
    articles_clean.drop(columns=cols_to_drop, errors='ignore'),
    full_articles[['sku', *cols_to_add]],
    how='left',
    on='sku',
)


In [70]:
# Share of missing values in each column of the merged table
articles.isna().mean()

sku            0.000000
groupId        0.000000
name           0.002364
brand          0.000000
size           0.031733
audience       0.009881
audienceId     0.009881
category       0.000000
priceSEK       0.000000
description    0.004092
color          0.025247
dtype: float64

Removing data bugs: rows with a missing name and repeated color values

In [71]:
# Rows without a product name, shown for inspection
na_name = articles.loc[lambda frame: frame["name"].isna()]
na_name

Unnamed: 0,sku,groupId,name,brand,size,audience,audienceId,category,priceSEK,description,color
2701,280024-0038,280024,,unknown,38,,,unknown,149,,
2709,280024-0041,280024,,unknown,41,,,unknown,149,,
2710,280024-0039,280024,,unknown,39,,,unknown,149,,
2769,270631-42,270627,,Miss Mary,42,dam,6,"Badkläder,Bikini",249,,Petrol
2770,270631-58,270627,,Miss Mary,58,dam,6,"Badkläder,Bikini",249,,Petrol
...,...,...,...,...,...,...,...,...,...,...,...
12826,270608-4042,270608,,Damella,40/42,dam,6,"Sovplagg,Pyjamas",899,,Grön
12834,270608-3638,270608,,Damella,36/38,dam,6,"Sovplagg,Pyjamas",899,,Grön
19696,551607,505254,,Redlunds,,,,unknown,239,,
32991,262130-80D,262130,,Swegmark,,,,unknown,876,,


In [72]:
# Discard the rows that have no name and renumber the index from 0
articles = articles.dropna(subset=["name"]).reset_index(drop=True)

In [73]:
# Print every distinct color value through repr() so that quoting, stray
# whitespace and the missing-value marker are visible
unique_colors = articles['color'].unique()
for c in unique_colors:
    print(repr(c))

<NA>
'Vit,Vit,Vit'
'Grå,Grå,Grå'
'Svart,Svart,Svart'
'Multi,Multi'
'Antracit'
'Rosa'
'Off-white'
'Svart'
'Beige'
'Grå,Grå'
'Gråblå,Gråblå'
'Vit,Vit'
'Ljusbrun,Ljusbrun'
'Vinröd,Vinröd'
'Mörkgrå,Mörkgrå'
'Ljusgrå,Ljusgrå'
'Grön,Grön'
'Rosa,Rosa'
'Aqua,Aqua'
'Mörkgrön,Mörkgrön'
'Indigo,Indigo'
'Gammalrosa,Gammalrosa'
'Off-white,Off-white'
'Sand,Sand'
'Off-white,Off-white,Off-white,Off-white'
'Champagne'
'Vit'
'Rosa,Rosa,Rosa,Rosa,Rosa,Rosa'
'Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå,Mörkgrå'
'Rost,Rost,Rost,Rost,Rost,Rost'
'Indigo,Indigo,Indigo,Indigo,Indigo,Indigo'
'Off-white,Off-white,Off-white,Off-white,Off-white,Off-white'
'Rost,Rost'
'Kaki,Kaki'
'Gul,Gul'
'Lila,Lila'
'Linne,Linne'
'Röd,Röd'
'Blå,Blå'
'Multi,Multi,Multi'
'Brun,Brun'
'Röd,Röd,Röd'
'Ljusgrå'
'Beige,Beige'
'Svart,Svart'
'Marin,Marin'
'Mellanblå,Mellanblå'
'Creme,Creme'
'Ljusblå,Ljusblå'
'Cognac,Cognac'
'Linne'
'Brun'
'Vit,Vit,Vit,Vit'
'Mörkblå,Mörkblå'
'Linne,Linne,Linne,Linne'
'Turkos,Turkos'
'Ljung,Ljung'
'Brun,Brun,Bru

In [74]:
# Collapse repeated color names inside a single 'color' value
def dedup_color(val):
    """Remove repeated tokens from a comma-separated color string.

    Tokens are stripped of surrounding whitespace, empty tokens are dropped,
    and the first occurrence of each token keeps its position.

    Args:
        val: A comma-separated string, or a missing value.

    Returns:
        The distinct tokens joined with ',' (no spaces). A missing input is
        returned unchanged; an input without any non-empty token gives pd.NA.
    """
    if pd.isna(val):
        return val

    stripped_parts = (part.strip() for part in str(val).split(','))
    # dict.fromkeys discards repeats while keeping first-seen order
    unique_tokens = dict.fromkeys(part for part in stripped_parts if part)

    if not unique_tokens:
        return pd.NA
    return ','.join(unique_tokens)

# Run dedup_color over every value of the 'color' column
articles['color'] = articles['color'].map(dedup_color)


Recommendations use groupId instead of sku, so the sku-level rows are collapsed into one row per groupId.

In [75]:
# Collapse to one row per groupId: 'color' and 'size' become sorted lists of the distinct
# values across the merged rows; every other column keeps its first non-null value per group
def merge_list(series):
    """Gather the distinct, meaningful values of a group into a sorted list.

    Each value is converted with str() and stripped. Missing values and the
    placeholders "", "unknown", "nan" and "none" (compared case-insensitively)
    are skipped.

    Args:
        series: An iterable of values, e.g. one column of a group.

    Returns:
        The distinct stripped strings as a list in ascending string order;
        an empty list when nothing remains.
    """
    placeholders = {"", "unknown", "nan", "none"}
    distinct = set()
    for raw in series:
        if pd.isna(raw):
            continue
        text = str(raw).strip()
        if text.lower() in placeholders:
            continue
        distinct.add(text)
    return sorted(distinct)

# Order rows by sku so that "first" sees each group's rows in sku order
articles = articles.sort_values("sku")

# color/size are gathered into lists by merge_list; every other column
# (sku excluded) is aggregated with "first".
# NOTE(review): groupId is both the grouping key and an aggregated column here;
# that works in the run recorded in this notebook, but confirm it on pandas upgrades.
agg_spec = {
    col: (merge_list if col in ("color", "size") else "first")
    for col in articles.columns
    if col != "sku"
}
articles = articles.groupby("groupId", as_index=False).agg(agg_spec)


In [77]:
# Share of missing values per column after collapsing to one row per groupId.
# size and color now hold Python lists (possibly empty); isna() does not count
# a list as missing, so both columns report 0 here even when a list is empty.
articles.isna().mean()

groupId        0.000000
name           0.000000
brand          0.000000
size           0.000000
audience       0.069267
audienceId     0.069267
category       0.000000
priceSEK       0.000000
description    0.009313
color          0.000000
dtype: float64

In [78]:

# Write the per-groupId table to parquet without the row index
articles.to_parquet(path="../data/processed/articles_for_recs.parquet", index=False)
