In [16]:
import pandas as pd
# Full product catalogue; every column is read with pandas' "string" dtype.
full_articles = pd.read_csv("../data/external/products.csv", dtype="string")
# Previously cleaned articles, restricted to rows whose `forSale` value is present.
articles_clean = pd.read_parquet("../data/processed/articles_clean.parquet").loc[
    lambda frame: frame["forSale"].notna()
]

In [17]:
# Build `articles`: the for-sale rows of `articles_clean`, enriched with descriptive
# columns looked up by SKU in the full product catalogue (`full_articles`).
# The listed columns are dropped first; errors='ignore' skips any that are absent.
cols_to_drop = ['priceEUR', 'priceNOK', 'priceDKK', 'forSale', 'sizeId', 'brandId', 'categoryId']
articles = articles_clean.drop(columns=cols_to_drop, errors='ignore').copy()
cols_to_add = ['description', 'color']
# Keep a single catalogue row per SKU before merging: a left merge repeats the
# left-hand row once per matching right-hand row, so a SKU listed twice in
# `full_articles` would otherwise silently duplicate that article.
articles = articles.merge(
    full_articles[['sku'] + cols_to_add].drop_duplicates(subset='sku'),
    on='sku',
    how='left',
)
# Invariant: enriching must never add or remove articles.
assert len(articles) == len(articles_clean), "merge changed the number of article rows"


In [18]:
# How many distinct groupId values exist (missing values are not counted)
print(articles["groupId"].nunique(dropna=True))


1738


In [19]:
# Peek at one random row; the fixed seed makes the displayed row reproducible on re-run.
articles.sample(n=1, random_state=42)

Unnamed: 0,sku,groupId,name,brand,category,priceSEK,description,color
31516,261525-F115,261396,Bh utan bygel Lovely Lace,Miss Mary,"Bh utan bygel,Bh,Underkläder",549,"Lovely Lace. En bh utan bygel från Miss Mary, ...",Grön


In [20]:
# Share of missing values per column (mean of the boolean "is missing" mask)
articles.isna().mean()

sku            0.000000
groupId        0.000000
name           0.004318
brand          0.000000
category       0.000000
priceSEK       0.000000
description    0.006039
color          0.027778
dtype: float64

Removing bugs: rows with a missing name and repeated color tokens

In [21]:
# Rows that have no product name, displayed for inspection
na_name = articles.loc[articles["name"].isna()]
na_name

Unnamed: 0,sku,groupId,name,brand,category,priceSEK,description,color
2682,280119-40,280117,,Embla of Sweden,unknown,1249,,
2683,280119-39,280117,,Embla of Sweden,unknown,1249,,
2684,280119-38,280117,,Embla of Sweden,unknown,1249,,
2685,280119-37,280117,,Embla of Sweden,unknown,1249,,
2686,280119-36,280117,,Embla of Sweden,unknown,1249,,
...,...,...,...,...,...,...,...,...
33115,973036,973036,,unknown,unknown,72,,
33116,262130-80D,262130,,Swegmark,unknown,881,,
33117,262152-L,262150,,Swegmark,unknown,337,,
33118,262137-XL,262135,,Swegmark,unknown,190,,


In [22]:
# Keep only the articles that have a name, then renumber the index from 0
articles = articles.dropna(subset=["name"]).reset_index(drop=True)

In [23]:
# List every distinct color value on its own line (repr form) so nothing is truncated
for color_value in articles["color"].unique():
    print(f"{color_value!r}")


<NA>
'Svart'
'Off-white'
'Svart,Svart'
'Grå,Grå'
'Vit,Vit'
'Rosa,Rosa'
'Grön,Grön'
'Blå,Blå'
'Vinröd,Vinröd'
'Röd,Röd'
'Cognac,Cognac'
'Linne,Linne'
'Brun,Brun'
'Gråblå,Gråblå'
'Champagne'
'Beige'
'Off-white,Off-white'
'Indigo,Indigo'
'Sand,Sand'
'Grön,Grön,Grön'
'Multi,Multi'
'Vit,Vit,Vit'
'Multi,Multi,Multi'
'Mörkgrön,Mörkgrön'
'Rost,Rost'
'Gul,Gul'
'Röd,Röd,Röd'
'Vit'
'Antracit'
'Creme,Creme'
'Beige,Beige'
'Ljusblå,Ljusblå'
'Ljung,Ljung'
'Turkos,Turkos'
'Vit,Vit,Vit,Vit'
'Linne,Linne,Linne,Linne'
'Mörkblå,Mörkblå'
'Grå,Grå,Grå'
'Svart,Svart,Svart'
'Marin,Marin'
'Mellanblå,Mellanblå'
'Ljusgrå'
'Lila,Lila'
'Rosa'
'Linne'
'Brun'
'Marin'
'Grön'
'Beige,Beige,Beige'
'Brun,Brun,Brun'
'Blå'
'Röd'
'Lila'
'Ljung'
'Ljusgrå,Ljusgrå'
'Vit/beige,Vit/beige'
'Mörkgrå,Mörkgrå'
'Aqua,Aqua'
'Ljusbrun,Ljusbrun'
'Gammalrosa,Gammalrosa'
'Rost,Rost,Rost,Rost,Rost,Rost'
'Kaki,Kaki'
'Off-white,Off-white,Off-white,Off-white,Off-white,Off-white'
'Indigo,Indigo,Indigo,Indigo,Indigo,Indigo'
'Rosa,Rosa,Rosa,Rosa

In [24]:
# Collapse repeated color names inside a single 'color' value
def dedup_color(val):
    """Remove repeated color tokens from a comma-separated color string.

    The value is split on ',', each token is stripped of surrounding
    whitespace, and empty tokens are discarded. The first occurrence of
    every token is kept in its original order and the survivors are
    re-joined with ','.

    Returns the input unchanged when ``pd.isna`` reports it as missing,
    and ``pd.NA`` when no non-empty token remains.
    """
    if pd.isna(val):
        return val
    stripped = (piece.strip() for piece in str(val).split(','))
    # dict keys keep insertion order, which gives an order-preserving de-duplication
    unique_tokens = dict.fromkeys(piece for piece in stripped if piece)
    if not unique_tokens:
        return pd.NA
    return ','.join(unique_tokens)

# Run dedup_color on every value of the 'color' column and store the result back.
# NOTE(review): the articles.info() output further down lists 'color' as `object`
# while the other columns are `string` — presumably a side effect of this .apply();
# confirm whether later steps rely on that dtype before changing it.
articles['color'] = articles['color'].apply(dedup_color)


## SKU

In [25]:
# Display the rows whose SKU occurs exactly once (all rows of a repeated SKU are hidden);
# the result is only shown, `articles` itself is not modified.
articles[~articles.duplicated(subset=['sku'], keep=False)]

Unnamed: 0,sku,groupId,name,brand,category,priceSEK,description,color
0,970100,970100,Frakt- & exp.avgift,unknown,unknown,69,,
1,790196,790196,Fingerborg 17 mm,Ateljé Margaretha,"Sytillbehör,Vardagshjälpmedel",19,"Fingerborg, storlek 17 mm.",
2,565301-7070,565301,Bygel-bh Fleur,Rosa Faia,"Bygel-bh,Bh,Underkläder",749,Elegant bygel- bh från Rosa Faias Fleur serie ...,Svart
3,565301-8095,565301,Bygel-bh Fleur,Rosa Faia,"Bygel-bh,Bh,Underkläder",749,Elegant bygel- bh från Rosa Faias Fleur serie ...,Svart
4,565301-8090,565301,Bygel-bh Fleur,Rosa Faia,"Bygel-bh,Bh,Underkläder",749,Elegant bygel- bh från Rosa Faias Fleur serie ...,Svart
...,...,...,...,...,...,...,...,...
32972,270548-L_XL,270547,Big-T Emilia av bambu,Åshild,"Sovkläder,Dam",298,"Big-T, tillverkat i mjuk bambu är den perfekta...",Svart
32973,260965-H115,260965,Body Happy Hearts,Miss Mary,"Bh utan bygel,Body,Underkläder",849,Formande body från Miss Mary med förstärkt fra...,Vit
32974,12025FI,12025FI,Åshild A-kat höst25 FI,unknown,unknown,0,,
32975,12025NO,12025NO,Åshild A-kat höst25 NO,unknown,unknown,0,,


In [26]:
# Write the enriched articles to Parquet. No dtype conversion is applied here and the
# row index is not stored (index=False).
# NOTE(review): this is the same path the notebook loads `articles_clean` from, so the
# input file is overwritten by a frame from which columns such as `forSale` were
# dropped — confirm that the loading cell (which filters on `forSale`) still works
# when the notebook is re-run.
articles.to_parquet("../data/processed/articles_clean.parquet", index=False)

# Summarise columns, non-null counts and dtypes of the frame that was just saved
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32977 entries, 0 to 32976
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          32977 non-null  string
 1   groupId      32977 non-null  string
 2   name         32977 non-null  string
 3   brand        32977 non-null  string
 4   category     32977 non-null  string
 5   priceSEK     32977 non-null  string
 6   description  32920 non-null  string
 7   color        32159 non-null  object
dtypes: object(1), string(7)
memory usage: 2.0+ MB


In [27]:
# Number of distinct SKUs and distinct groupIds (a missing value would count as one)
print(articles['sku'].nunique(dropna=False))
print(articles['groupId'].nunique(dropna=False))

32977
1730


In [28]:
# Collapse the table to one row per groupId: every column keeps its "first" value,
# except 'color', which becomes the list of all usable colors found in the group.
def merge_colors(series):
    """Return the sorted list of distinct usable color values in ``series``.

    Each non-missing value is converted to ``str`` and stripped. Values that
    are empty or equal (case-insensitively) to "unknown", "nan" or "none"
    are skipped. The result is an empty list when nothing usable remains.
    """
    placeholders = {"", "unknown", "nan", "none"}
    distinct = set()
    for raw in series:
        if pd.isna(raw):
            continue
        text = str(raw).strip()
        if text.lower() in placeholders:
            continue
        distinct.add(text)
    return sorted(distinct)

# Sort by SKU so that "first" is deterministic, then aggregate per groupId:
# 'color' goes through merge_colors, every other column except 'sku' uses "first".
# NOTE(review): per the pandas docs GroupBy "first" skips missing values by default,
# so a column's value may come from a later row of the group — confirm this is intended.
articles = (
    articles
    .sort_values("sku")
    .groupby("groupId", as_index=False)
    .agg({
        col: merge_colors if col == "color" else "first"
        for col in articles.columns
        if col != "sku"
    })
)


In [29]:

# Persist the one-row-per-groupId table for the recommender; the row index is not
# stored (index=False). At this point 'color' holds the lists built by merge_colors.
articles.to_parquet("../data/processed/articles_for_recs.parquet", index=False)
