In [1]:
import pandas as pd
# Load the raw product export. dtype="string" makes pandas read every column
# as its nullable string dtype.
articles = pd.read_csv(
    filepath_or_buffer="../data/external/products.csv",
    dtype="string",
)

In [2]:
# How many distinct groupId values exist (nunique() ignores missing values)
unique_group_count = articles['groupId'].nunique()
print(unique_group_count)


9177


In [3]:
# Total number of rows in the raw articles table, shown as a plain integer.
len(articles)

{110120}

In [4]:
# Columns discarded before cleaning; none of them is referenced by the
# cleaning steps further down in this notebook.
unused_columns = [
    'status', 'incommingQuantity',
    'length', 'width', 'height', 'weight',
    'fabricId', 'fabric', 'description',
    'colorId', 'color', 'sizeId', 'size',
    'audience', 'audienceId',
    'publishedDate', 'quantity',
]
articles = articles.drop(columns=unused_columns)

In [5]:
# Preview ten random rows. The fixed random_state makes the same rows appear
# on every "Restart & Run All", so the rendered preview is reproducible.
articles.sample(10, random_state=42)

Unnamed: 0,sku,groupId,brandId,name,brand,category,categoryId,priceSEK,priceEUR,priceNOK,priceDKK,forSale
77990,260919-4085,260919,50.0,Bh med dragkedja,Miss Mary,"Bh,Bh",2727,299,36.9,299,299,
36190,213777-0054,213777,,Topp,,REA,110,218,21.9,218,218,
90605,260364-4850,260364,,Alushousut,,,,209,24.9,209,209,
39103,210764-000l,210764,126.0,Flanellskjorta,Åshild,"Skjortor,Herr",1519162,349,34.9,349,349,1.0
54142,261122-0052,260533,3.0,Trosgördel Clara,Anita,"Underkläder,Gördlar",191797,429,44.95,499,329,
109121,262717-4095,260596,80.0,Bh utan bygel Stars,Swegmark,"Bh utan bygel,Bh utan bygel,Bh,Bh,Underkläder,...",5050272719195050,479,44.9,549,429,1.0
64793,267402-G080,267435,3.0,Bygel-bh Selma,Anita,"Bh,Bygel-bh,Underkläder,Bygel-bh",2722319223,849,82.95,899,599,
89345,260303-C095,260303,80.0,Sport-bh Magic,Swegmark,"Sport-bh,Bh utan bygel,Bh,Underkläder,Sport-bh...",61850271961850,629,62.9,749,579,1.0
3388,432060,432060,189.0,Pussel Tomens verkstad 500 bitar,Cobble Hill,"Hobbyhörnan,Hobbyhörnan,Hobbyhörnan,Pussel,Pus...",172172172365365365499499499,249,22.9,298,219,1.0
85583,260424-C115,260424,109.0,Bomulls-bh Louise skin,Louise,"Bh utan bygel,Framknäppt bh,Bh,Bh utan bygel,F...",501892750189,189,19.9,189,189,


In [6]:
# Share of missing values per column (0.0 = nothing missing, 1.0 = all missing)
articles.isna().mean()

sku           0.000000
groupId       0.000009
brandId       0.183300
name          0.013031
brand         0.183300
category      0.052370
categoryId    0.052370
priceSEK      0.001571
priceEUR      0.006402
priceNOK      0.006220
priceDKK      0.006720
forSale       0.699155
dtype: float64

Removing invalid rows (articles with no price in any currency)

In [7]:
# Keep only rows that carry a price in at least one currency
price_columns = ['priceSEK', 'priceEUR', 'priceNOK', 'priceDKK']
has_any_price = articles[price_columns].notna().any(axis=1)
articles = articles[has_any_price]
print(f"Rows after removing all-prices-NA: {len(articles)}")


Rows after removing all-prices-NA: 110099


# Category

In [8]:
# For every category string, list the distinct categoryId strings seen with it
# and the number of rows (rows without a categoryId are left out).
rows_with_id = articles[articles['categoryId'].notna()]
cat_stats = rows_with_id.groupby('category')['categoryId'].agg(['unique', 'count'])

print(f"{'category':50} {'categoryIds':40} count")
for category_name, stats_row in cat_stats.iterrows():
    # str() of a plain list is used because a list cannot take a width spec
    id_list_text = str(list(stats_row['unique']))
    print(f"{category_name:50} {id_list_text:40} {stats_row['count']}")


def _has_several_ids(ids):
    """Return True when more than one non-missing id is present."""
    return len([one_id for one_id in ids if pd.notna(one_id)]) > 1


# Flag categories that are paired with more than one categoryId string
multi_cat = cat_stats['unique'].apply(_has_several_ids)
if not multi_cat.any():
    print("\nNo category has >1 categoryId.")
else:
    print("\nCategories with >1 categoryId:")
    for category_name, id_values in cat_stats.loc[multi_cat, 'unique'].items():
        present_ids = [one_id for one_id in id_values if pd.notna(one_id)]
        print(f"{category_name}: {present_ids}")

missing_id_count = articles['categoryId'].isna().sum()
missing_category_count = articles['category'].isna().sum()
print(f"\nArticles without categoryId: {missing_id_count}")
print(f"Articles without category: {missing_category_count}")

category                                           categoryIds                              count
Accessoarer                                        ['454']                                  5
Accessoarer,Accessoarer,Bh,Bh,Underkläder,Underkläder,Bh-tillbehör,Bh-tillbehör ['454,454,27,27,19,19,691,691']          3
Accessoarer,Accessoarer,Handskar & vantar,Handskar & vantar ['454,454,1415,1415']                    1
Accessoarer,Accessoarer,Kepsar & mössor,Kepsar & mössor ['454,454,1447,1447']                    3
Accessoarer,Halsdukar & sjalar                     ['454,7']                                1
Accessoarer,Handskar & vantar                      ['454,1415']                             5
Accessoarer,Herr,Kepsar & mössor                   ['454,162,1447']                         3
Accessoarer,Kepsar & mössor                        ['454,1447']                             15
Accessoarer,Kepsar & mössor,Accessoarer,Kepsar & mössor ['454,1447,454,1447']                    2
Accesso

In [9]:
# category↔ID mapping

def dedup_csv(s):
    """Remove repeated tokens from a comma-separated string.

    Tokens are stripped of surrounding whitespace, empty tokens are dropped,
    and the first occurrence of each token keeps its position.

    Returns the re-joined string, or ``pd.NA`` when the input is missing or
    contains no non-empty token.
    """
    if pd.isna(s):
        return pd.NA
    stripped = (piece.strip() for piece in str(s).split(','))
    # dict keys preserve insertion order, which gives an ordered de-duplication
    unique_tokens = list(dict.fromkeys(piece for piece in stripped if piece))
    if not unique_tokens:
        return pd.NA
    return ','.join(unique_tokens)

def toks(s):
    """Split a comma-separated value into stripped, non-empty tokens.

    Duplicates are kept. A missing value yields an empty list.
    """
    if pd.isna(s):
        return []
    stripped = (piece.strip() for piece in str(s).split(','))
    return [piece for piece in stripped if piece]

# Step 1: remove repeated tokens inside each comma-separated cell.
# NOTE(review): the two columns are de-duplicated independently, so the
# positional pairing below assumes names and ids repeat in lockstep - confirm.
articles['category']   = articles['category'].apply(dedup_csv).astype('string')
articles['categoryId'] = articles['categoryId'].apply(dedup_csv).astype('string')

# Step 2: collect (category token, id token) pairs by position, using only
# rows where both columns are present. When the token counts differ the longer
# list is truncated and the row is counted in `mismatched`; rows where either
# side has no tokens are skipped before that count.
pairs, mismatched = [], 0
for cat, cid in articles[['category','categoryId']].dropna().itertuples(index=False):
    ct, it = toks(cat), toks(cid)
    n = min(len(ct), len(it))
    if n == 0: continue
    if len(ct) != len(it): mismatched += 1
    pairs.extend(zip(ct[:n], it[:n]))

if not pairs:
    print("No category↔id pairs available.")
else:
    dfp = pd.DataFrame(pairs, columns=['cat_tok','id_tok'])
    # Step 3: one id per category token - the id it was paired with most often.
    # The sort puts the highest pair count first within each token (ties are
    # ordered by id_tok ascending) and drop_duplicates keeps that first row.
    token2id = (dfp.groupby(['cat_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['cat_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('cat_tok')
                  .set_index('cat_tok')['id_tok'])

    def rebuild_ids(cat):
        """Translate a category string into a comma-joined id string via token2id.

        Tokens without an entry in token2id are left out; returns pd.NA when no
        token could be mapped (which includes a missing category).
        """
        ct = toks(cat)
        mapped = [token2id.get(t, pd.NA) for t in ct if t in token2id]
        return ','.join(mapped) if mapped else pd.NA

    # Step 4: overwrite categoryId with ids derived purely from the category text
    articles['categoryId'] = articles['category'].apply(rebuild_ids).astype('string')

# Replace missing category values with the literal 'unknown'
articles['category'] = articles['category'].fillna('unknown').astype('string')

# Stats: pair tokens again on the current columns to verify the mapping.
# Rows whose categoryId is missing are removed by dropna() and do not contribute.
pairs2 = []
for cat, cid in articles[['category','categoryId']].dropna().itertuples(index=False):
    ct, it = toks(cat), toks(cid)
    n = min(len(ct), len(it))
    pairs2.extend(zip(ct[:n], it[:n]))
d2 = pd.DataFrame(pairs2, columns=['cat_tok','id_tok'])

# Per category token: sorted list of distinct ids and number of pairs.
# The count is attached positionally (.values), which relies on both groupby
# calls returning the groups in the same order.
cat_stats = d2.groupby('cat_tok')['id_tok'].agg(lambda s: sorted(pd.unique(s.dropna()))).reset_index(name='unique_ids')
cat_stats['count'] = d2.groupby('cat_tok')['id_tok'].size().values

# `mismatched` comes from the first pairing pass above, not from this one
print(f"Mismatched token/id lengths: {mismatched}")
print(f"\n{'category token':40} {'unique_ids':25} count")
# Token and id list are cut to the column width so the table stays aligned
for _, r in cat_stats.sort_values('count', ascending=False).iterrows():
    print(f"{r['cat_tok'][:40]:40} {str(r['unique_ids'])[:25]:25} {int(r['count'])}")

# Check 1: does any category token resolve to more than one id?
multi = cat_stats['unique_ids'].apply(len) > 1
if multi.any():
    print("\nTokens mapping to >1 id:")
    for _, r in cat_stats[multi].iterrows():
        print(f"  {r['cat_tok']}: {r['unique_ids']}")
else:
    print("\n✓ Every category token maps to a single id.")

# Check 2: the reverse direction - is any id used by several category tokens?
# At most ten tokens are printed per id.
id_stats = d2.groupby('id_tok')['cat_tok'].agg(lambda s: sorted(pd.unique(s.dropna()))).reset_index(name='tokens')
multi_id = id_stats['tokens'].apply(len) > 1
if multi_id.any():
    print("\nIds used by multiple tokens:")
    for _, r in id_stats[multi_id].iterrows():
        print(f"  {r['id_tok']}: {r['tokens'][:10]}{' ...' if len(r['tokens'])>10 else ''}")
else:
    print("\n✓ No id is shared by multiple tokens.")

# Distinct ids are counted after splitting the comma-joined categoryId cells
print(f"\nUnique categoryIds: {articles['categoryId'].str.split(',').explode().nunique(dropna=True)}")
print(f"Unique category tokens: {d2['cat_tok'].nunique(dropna=True)}")


Mismatched token/id lengths: 0

category token                           unique_ids                count
Bh                                       ['27']                    60899
Underkläder                              ['19']                    52140
Bh utan bygel                            ['50']                    41402
Bygel-bh                                 ['223']                   20505
REA                                      ['110']                   19798
Sport-bh                                 ['618']                   8356
Badkläder                                ['470']                   4607
Framknäppt bh                            ['189']                   3813
Överdelar                                ['1552']                  3812
Tunikor                                  ['451']                   3805
Dam                                      ['471']                   3439
Byxor                                    ['689']                   3047
Trosor                    

# Brand

In [10]:
# --- Brand cleanup + backfill (concise, no warnings) ---

# Normalize
def _norm_brand(s):
    if pd.isna(s): return pd.NA
    s = ' '.join(str(s).strip().split())  # collapse whitespace
    return s or pd.NA

# Normalise both columns: brand names get their whitespace collapsed via
# _norm_brand, brand ids are only stripped of surrounding whitespace.
articles['brand']  = articles['brand'].astype('string').apply(_norm_brand)
articles['brandId'] = articles['brandId'].astype('string').str.strip()

# Every row where both the brand name and the brand id are known.
# Repeated pairs are kept on purpose: each row acts as one vote, so mode()
# below returns the partner seen most often. De-duplicating first would make
# every pair occur once and mode() would just return the smallest value.
known = articles.dropna(subset=['brand', 'brandId'])[['brand', 'brandId']]


def _most_common(values):
    """Return the most frequent value of a Series.

    Series.mode() returns its result sorted, so ties resolve to the smallest
    value. Falls back to the first element if mode() comes back empty.
    """
    modes = values.mode()
    return modes.iat[0] if len(modes) else values.iloc[0]


# Lookup tables in both directions, built from the known pairs
name_to_id = known.groupby('brand')['brandId'].agg(_most_common)
id_to_name = known.groupby('brandId')['brand'].agg(_most_common)

# Backfill brandId only where it is missing but the brand name is known.
# Brand names without an entry in name_to_id stay missing.
mask = articles['brandId'].isna() & articles['brand'].notna()
filled_ids = articles.loc[mask, 'brand'].map(name_to_id).astype('string')
# Align to the full index before the masked assignment
filled_ids = filled_ids.reindex(articles.index)
articles.loc[mask, 'brandId'] = filled_ids

# Fill the missing brand text last: doing it before the backfill would turn
# 'unknown' into a brand name that takes part in the lookup.
articles['brand'] = articles['brand'].fillna('unknown').astype('string')

# Price

In [11]:
# Number of rows that still lack a SEK price
articles['priceSEK'].isnull().sum()

152

In [12]:
import pandas as pd
import requests

# --- Rates (SEK per unit) ----------------------------------------------------
def sek_rates(timeout=8):
    """Return SEK-per-unit exchange rates for EUR, NOK and DKK.

    The rates are requested from api.frankfurter.app with EUR as base; the
    NOK and DKK rates are derived by dividing SEK-per-EUR by the respective
    per-EUR rate.

    Parameters
    ----------
    timeout : passed through to ``requests.get``.

    Returns
    -------
    dict with the keys ``"EUR"``, ``"NOK"``, ``"DKK"`` (floats) and ``"asof"``
    (the ``date`` field of the response, or ``"unknown"`` when absent).
    If anything in the request or parsing raises, a hard-coded set of rates
    with ``"asof": "fallback"`` is returned instead.
    """
    try:
        response = requests.get(
            "https://api.frankfurter.app/latest",
            params={"base": "EUR", "symbols": "SEK,NOK,DKK"},
            timeout=timeout,
        )
        response.raise_for_status()
        payload = response.json()
        per_eur = payload["rates"]
        sek_per_eur = float(per_eur["SEK"])
        return {
            "EUR": sek_per_eur,
            "NOK": sek_per_eur / float(per_eur["NOK"]),
            "DKK": sek_per_eur / float(per_eur["DKK"]),
            "asof": payload.get("date", "unknown"),
        }
    except Exception:
        # Best effort: any failure falls back to the fixed rates below
        return {"EUR": 11.5, "NOK": 1.0, "DKK": 1.55, "asof": "fallback"}

rates = sek_rates()
EUR_to_SEK = rates["EUR"]
NOK_to_SEK = rates["NOK"]
DKK_to_SEK = rates["DKK"]
print(f"Using rates (SEK per unit) as of {rates['asof']}: EUR={EUR_to_SEK:.6f}, NOK={NOK_to_SEK:.6f}, DKK={DKK_to_SEK:.6f}")

# --- Prepare the frame for filling priceSEK -----------------------------------
# Work on an explicit copy of the frame
articles = articles.copy()

# Add any price column that is absent, filled with missing values
absent_price_columns = [
    col for col in ("priceSEK", "priceEUR", "priceNOK", "priceDKK")
    if col not in articles.columns
]
for col in absent_price_columns:
    articles[col] = pd.NA

def _num(s):
    # normalize spaces + decimal comma, then coerce to float
    s = (s.astype("string")
           .str.replace(r"[ \u00A0]", "", regex=True)
           .str.replace(",", ".", regex=False))
    return pd.to_numeric(s, errors="coerce")

# Parse the three foreign-currency price columns into numbers
eur, nok, dkk = (
    _num(articles[column]) for column in ("priceEUR", "priceNOK", "priceDKK")
)

# Express each of them in SEK
eur_sek, nok_sek, dkk_sek = eur * EUR_to_SEK, nok * NOK_to_SEK, dkk * DKK_to_SEK

# Candidate SEK price: the EUR-based value where available, otherwise the
# NOK-based one, otherwise the DKK-based one
calc_sek = eur_sek.fillna(nok_sek)
calc_sek = calc_sek.fillna(dkk_sek)

# Rows that currently have no priceSEK
mask = articles["priceSEK"].isna()
num_to_replace = int(mask.sum())

# Round to zero decimals with Series.round, then store as text like the rest
# of the column; only the rows selected by `mask` are written.
filled = calc_sek.round().astype("Int64").astype("string")
articles.loc[mask, "priceSEK"] = filled[mask]

# Keep the column on a single dtype
articles["priceSEK"] = articles["priceSEK"].astype("string")

# Report how many of the previously missing rows now hold a value
num_replaced = int(articles["priceSEK"][mask].notna().sum())
print(f"Filled priceSEK for {num_replaced} rows (out of {num_to_replace} missing).")


Using rates (SEK per unit) as of 2025-10-03: EUR=11.003000, NOK=0.942481, DKK=1.473609
Filled priceSEK for 152 rows (out of 152 missing).


In [13]:
# Set priceSEK for specific SKUs as requested
articles.loc[articles['sku'] == '270607-5254', 'priceSEK'] = '1310'
articles.loc[articles['sku'] == '270534-03xl', 'priceSEK'] = '419'


In [14]:
# Output rows where priceSEK is more than 10000 (as SEK)
# First, create a mask for priceSEK > 10000 (as float), then use it to index articles
# priceSEK is stored as text, so convert it to numbers for the comparison
# (unparseable values become missing and never exceed the threshold)
priceSEK_numeric = pd.to_numeric(articles['priceSEK'], errors='coerce')
high_price_mask = priceSEK_numeric.gt(10000)
high_price_rows = articles[high_price_mask]
high_price_rows


Unnamed: 0,sku,groupId,brandId,name,brand,category,categoryId,priceSEK,priceEUR,priceNOK,priceDKK,forSale
9337,294850,294850,33.0,Scooter Leo,Invacare,Scootrar,2397,24998,2498.0,24998,24998.0,
9338,294843,294843,33.0,Scooter Leo,Invacare,Scootrar,2397,24998,2498.0,24998,24998.0,
9339,294835,294835,33.0,Scooter Colibri,Invacare,Scootrar,2397,19998,1998.0,19998,19998.0,
9508,290036,290036,33.0,Scooter Orion Metro 3-hjul,Invacare,"Scootrar,Rollatorer",23972069,31989,3367.0,31989,31989.0,
9521,290052,290052,33.0,Scooter Orion Metro 4 hjul,Invacare,Scootrar,2397,34998,3498.0,34998,34998.0,
9537,290045,290045,33.0,Scooter Orion Metro 3 hjul,Invacare,Scootrar,2397,34998,3498.0,34998,34998.0,
9770,294124,294124,33.0,Scooter Orion,Invacare,"Scootrar,Rollatorer",23972069,25500,2650.0,25500,25500.0,
9936,294447,294447,5.0,Massagestol,beurer,Massage,1369,16519,1698.0,17998,16519.0,
11105,340569,340569,132.0,Sy-Broderimaskin Singer SE9185,Singer,"Sytillbehör,Symaskiner och tillbehör",151228,20998,1898.9,21998,13998.0,1.0
93816,294884,294884,,Scooter Orion 3-hjul ink,unknown,"Scootrar,Rollatorer",23972069,28995,,28995,,


In [15]:
# Persist the cleaned table as parquet in the 'processed' directory; the
# DataFrame index is not written.
clean_output_path = "../data/processed/articles_clean.parquet"
articles.to_parquet(clean_output_path, index=False)
