In [1]:
import pandas as pd
articles = pd.read_csv("../data/external/products.csv", dtype='string')

In [2]:
print(articles['groupId'].nunique())


9198


In [3]:
{len(articles)}

{110445}

In [4]:
articles = articles.drop(columns=[
    'status', 'incommingQuantity', 'length', 'width', 'height', 'weight',
    'fabricId', 'fabric', 'description', 'colorId', 'color',
    'sizeId', 'size', 'publishedDate', 'quantity'
])

In [5]:
articles.sample(10)

Unnamed: 0,sku,groupId,brandId,name,brand,audience,audienceId,category,categoryId,priceSEK,priceEUR,priceNOK,priceDKK,forSale
66302,266891-B085,266882,50,Bygel-bh Jacquard&Lace,Miss Mary,Dam,6.0,"Bygel-bh,Bh,Underkläder,Bygel-bh",2232719223,599,59.99,679,449,1.0
82183,260294-B090,260294,50,Miss Mary Bh utan bygel Cooling,Miss Mary,,,"Bh utan bygel,Bh,Bh utan bygel",502750,449,50.0,449,449,
88359,260880-4095,260802,50,Bh utan bygel Broderie Anglaise,Miss Mary,Dam,6.0,"Bh utan bygel,Bh,Underkläder,Bh utan bygel",50271950,399,40.0,399,399,
82718,260129-F115,260129,50,Bh utan bygel,Miss Mary,,,"Bh utan bygel,Bh,Bh utan bygel",502750,379,50.0,379,379,
51875,261479-5100,261479,50,Bh utan bygel,Miss Mary,Dam,6.0,"Bh utan bygel,Underkläder,Framknäppt bh,Bh,Bh ...",50191892750189,349,49.0,349,349,
96437,241620-0042,241562,126,Velourbyxa med resårmidja,Åshild,"Dam,Dam",66.0,"Byxor,Byxor,Mjukisbyxor,Mjukisbyxor,Nederdelar...",689689211021101472147216491649,498,49.9,498,498,1.0
20580,261934-B100,261934,50,Framknäppt bh Nova,Miss Mary,,,"Bh utan bygel,Bh utan bygel,Bh,Bh,Underkläder,...",5050272719195050,649,64.99,749,499,1.0
104833,261604-E085,265843,127,Bomulls-bh utan bygel med Magic Lift-funktion ...,Glamorise,"Dam,Dam",66.0,"Bh utan bygel,Bh utan bygel,Bh,Bh,Underkläder,...",5050272719195050,519,58.9,519,419,1.0
101696,270300-BC54,270300,50,Baddräkt Aruba,Miss Mary,Dam,6.0,"Bh utan bygel,Baddräkter,Badkläder,Bh utan bygel",5046947050,799,69.99,799,529,1.0
45540,210284-5658,210284,126,Jumper,Åshild,,,REA,110,338,33.9,399,338,


In [6]:
articles.isnull().mean()

sku           0.000000
groupId       0.000009
brandId       0.182299
name          0.012776
brand         0.182299
audience      0.581765
audienceId    0.581765
category      0.051691
categoryId    0.051691
priceSEK      0.001530
priceEUR      0.006266
priceNOK      0.006121
priceDKK      0.006691
forSale       0.700901
dtype: float64

Removing bugs

In [7]:
price_columns = ['priceSEK', 'priceEUR', 'priceNOK', 'priceDKK']
articles = articles[~articles[price_columns].isna().all(axis=1)]
len(articles)


110424

# Category

In [8]:
# stats
cat_stats = (articles.loc[articles['categoryId'].notna(), ['category','categoryId']]
             .groupby('category')['categoryId']
             .agg(unique=lambda s: sorted(pd.unique(s)), count='size')
             .sort_values('count', ascending=False))

cat_stats.head()

print(f"missing_categoryId={articles['categoryId'].isna().sum()} "
      f"missing_category={articles['category'].isna().sum()}")



missing_categoryId=5688 missing_category=5688


In [9]:
# category↔ID mapping

def dedup_csv(s):
    if pd.isna(s): return pd.NA
    out, seen = [], set()
    for t in map(str.strip, str(s).split(',')):
        if t and t not in seen:
            seen.add(t); out.append(t)
    return ','.join(out) if out else pd.NA

def toks(s):
    return [t.strip() for t in str(s).split(',') if t.strip()] if pd.notna(s) else []

# Normalize inputs
articles['category']   = articles['category'].apply(dedup_csv).astype('string')
articles['categoryId'] = articles['categoryId'].apply(dedup_csv).astype('string')

# Learn mapping from aligned pairs
pairs = [
    p
    for cat, cid in articles[['category','categoryId']].dropna().itertuples(index=False)
    for p in zip(toks(cat), toks(cid))
]

if pairs:
    dfp = pd.DataFrame(pairs, columns=['cat_tok','id_tok'])
    token2id = (dfp.groupby(['cat_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['cat_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('cat_tok')
                  .set_index('cat_tok')['id_tok'])

    articles['categoryId'] = (articles['category']
        .apply(lambda c: ','.join([token2id.get(t, pd.NA) for t in toks(c) if t in token2id]) or pd.NA)
        .astype('string'))

articles['category'] = articles['category'].fillna('unknown').astype('string')



## Brand

In [10]:
def _norm_brand(s):
    if pd.isna(s): return pd.NA
    s = ' '.join(str(s).strip().split())
    return s or pd.NA

# Normalize
articles['brand']   = articles['brand'].apply(_norm_brand).astype('string')
articles['brandId'] = (articles['brandId'].astype('string').str.strip().replace('', pd.NA))

# Learn most frequent brand - id from known pairs
known = articles[['brand','brandId']].dropna()
name_to_id = known.groupby('brand')['brandId'].agg(lambda s: s.value_counts().idxmax())

# Backfill missing brandId from brand
articles['brandId'] = (
    articles['brandId'].where(articles['brandId'].notna(), articles['brand'].map(name_to_id))
).astype('string')

articles['brand'] = articles['brand'].fillna('unknown').astype('string')


# Price

In [11]:
import pandas as pd
import requests

def sek_rates(timeout=8):
    fallback = {"EUR": 11.5, "NOK": 1.0, "DKK": 1.55, "asof": "fallback"}
    try:
        r = requests.get(
            "https://api.frankfurter.app/latest",
            params={"base": "EUR", "symbols": "SEK,NOK,DKK"},
            timeout=timeout,
        )
        r.raise_for_status()
        data = r.json()
        eur_sek = float(data["rates"]["SEK"])
        return {
            "EUR": eur_sek,
            "NOK": eur_sek / float(data["rates"]["NOK"]),
            "DKK": eur_sek / float(data["rates"]["DKK"]),
            "asof": data.get("date", "unknown"),
        }
    except Exception:
        return fallback

rates = sek_rates()
EUR_to_SEK, NOK_to_SEK, DKK_to_SEK = rates["EUR"], rates["NOK"], rates["DKK"]

# Ensure cols exist
for c in ["priceSEK", "priceEUR", "priceNOK", "priceDKK"]:
    if c not in articles.columns:
        articles[c] = pd.NA

def _num(col: pd.Series) -> pd.Series:
    s = (col.astype("string")
            .str.replace(r"[ \u00A0]", "", regex=True)
            .str.replace(",", ".", regex=False))
    return pd.to_numeric(s, errors="coerce")

eur_sek = _num(articles["priceEUR"]) * EUR_to_SEK
nok_sek = _num(articles["priceNOK"]) * NOK_to_SEK
dkk_sek = _num(articles["priceDKK"]) * DKK_to_SEK

calc = eur_sek.fillna(nok_sek).fillna(dkk_sek).round(0).astype("Int64").astype("string")

articles["priceSEK"] = articles["priceSEK"].astype("string")
articles["priceSEK"] = articles["priceSEK"].where(articles["priceSEK"].notna(), calc)



In [12]:
# Set priceSEK for specific SKUs to fix bugs
articles.loc[articles['sku'] == '270607-5254', 'priceSEK'] = '1310'
articles.loc[articles['sku'] == '270534-03xl', 'priceSEK'] = '419'


In [13]:
# Audience cleanup

import re
import pandas as pd

AUD2ID = {'dam':'6','herr':'15','baby & barn':'12','barn & ungdom':'42','generic':'99','hemmet':'222'}

# 1) normalize what’s already there
def norm_audience(a):
    if pd.isna(a): return pd.NA
    toks = {t.strip().lower() for t in str(a).split(',') if t.strip()}
    if any('dam' in t for t in toks):  # “dam” anywhere wins
        return 'dam'
    keep = [t for t in toks if t in AUD2ID]
    return ','.join(keep) if keep else pd.NA

def to_ids(a):
    if pd.isna(a): return pd.NA
    ids = sorted({AUD2ID[t] for t in a.split(',') if t in AUD2ID}, key=int)
    return ','.join(ids) if ids else pd.NA

articles['audience'] = articles['audience'].apply(norm_audience).astype('string')

DAM = [
'dam','bh','trosor','underkläder','body','bodykorselett','korsett','korsetter',
'klänning','klänningar','tunika','tunikor','topp','toppar','kjol','kjolar',
'byxa','byxor','blus','blusar','nattlinne','bikinibh','bikini','t-shirt-bh',
'minimizer','kofta','koftor','väst','västar','skor','väskor','sjalar',
'Bh,Underkläder,Bygel-bh',
'Bygel-bh,Bh,Underkläder',
'Bh utan bygel,Bh,Underkläder',
'Bh utan bygel,Framknäppt bh,Bh,Underkläder',
'Framknäppt bh,Bh,Underkläder',
'Bh,Underkläder,Sport-bh',
'Sport-bh,Bh,Underkläder',
'Minimizer,Bh,Underkläder',
'Underkläder,Trosor',
'Underkläder,Trosor & gördlar',
'Underkläder,Trosor & gördlar,Trosor',
'Trosor,Underkläder,Gördlar',
'Underkjolar,Underkläder',
'Underkläder,Underklänningar',
'Underkläder,Mamelucker',
'Strumpbyxor,Underkläder',
'Baddräkter,Badkläder,Dam',
'Badkläder,Dam',
'Dam,Bikini,Badkläder',
'Dam,Badkläder,Tankini',
'Nattlinnen,Sovkläder,Dam',
'Sovkläder,Dam'
]
HEM = [
'frottéhanddukar','badlakan','bad','badrumsmattor','kökshanddukar','vaxdukar','dukar',
'pläd','plädar','kanallängder','kanalkappa','gardiner','påslakanset','bädd',
'lakan','örngott','hemtextil','kuddfodral','överkast','gardinstänger','kökshjälpmedel',
'dekorationer','metervara','prydnadssaker','belysning','servetter',
'Frottéhanddukar & badlakan',
'Frottéhanddukar & badlakan,Bad',
'Badrumsmattor,Bad',
'Duschdraperier,Bad',
'Kökshanddukar',
'Vaxdukar',
'Dukar',
'Vaxdukar,Dukar',
'Dukar,Vaxdukar',
'Påslakanset',
'Lakan & örngott,Bädd',
'Bädd',
'Bäddtillbehör,Bädd',
'Innerkuddar,Bädd (linea),Kuddar',
'Kuddar',
'Plädar',
'Gardinbåge',
'Kanallängder',
'Kanalkappa',
'Panellängder',
'Multibandslängder',
'Multibandslängder,Mörkläggningsgardiner',
'Öljettkappa',
'Tabletter/underlägg/brickor',
'Batteridrivna ljus',
'Synhjälpmedel,Belysning',
'Ljusstakar & lyktor,Juldekoration',
'Servetter'
]
GEN = [
'inkontinens','stödartiklar','vardagshjälpmedel','rollator','rollatorer','stödstrumpor',
'skotillbehör','fotvård','hobbyhörnan','pussel','sytillbehör','symaskiner','lust',
'massage','synhjälpmedel','medicin','böcker','halkskydd','träning & motion',
'Vardagshjälpmedel',
'Vardagshjälpmedel,Dynor & säten',
'Stödartiklar',
'Synhjälpmedel',
'Gånghjälpmedel',
'Rollatorer',
'Inkontinens',
'Intimvård',
'Fotvård',
'Skotillbehör',
'Stödstrumpor,Underkläder',
'Hobbyhörnan,Pussel',
'Hobbyhörnan,Pysselset',
'Sytillbehör,Symaskiner och tillbehör',
'Symaskiner och tillbehör,Sytillbehör',
'Tvätt & skötsel,Vardagshjälpmedel',
'Tvätt & skötsel,Vardagshjälpmedel,Hushåll övrigt',
'Träning & motion',
'Träning & motion,Hälsa',
'Massage,Kroppsvård,Hälsa',
'Medicin,Hälsa',
'Synhjälpmedel,Belysning,Vardagshjälpmedel',
'Virknålar,Vardagshjälpmedel',
'Halkskydd',
'Halkskydd,Gånghjälpmedel'
]
HERR = [
'herr','skjorta','skjortor','kostym','kavaj','boxer','kalsonger',
'Skjortor,Herr',
'Pyjamas,Herr,Sovkläder',
'Herr,Överdelar,T-shirts',
'Herr,Sovkläder,Nattskjortor',
'Accessoarer,Herr,Kepsar & mössor'
]


REA_TOKEN = re.compile(r'(^|,)\s*rea\s*(?=,|$)')

def strip_rea(s):
    s = s.lower()
    s = REA_TOKEN.sub(lambda m: ',' if m.group(1) else '', s)
    return re.sub(r',+', ',', s).strip(', ').strip()

def classify(cat):
    if pd.isna(cat): return pd.NA
    s = strip_rea(str(cat))
    if not s: return pd.NA
    if any(h in s for h in DAM): return 'dam'
    if any(h in s for h in HERR): return 'herr'
    if any(h in s for h in HEM): return 'hemmet'
    if any(h in s for h in GEN): return 'generic'
    return pd.NA

na_mask = articles['audience'].isna()
fill = articles.loc[na_mask, 'category'].apply(classify)
idx = fill.dropna().index
articles.loc[idx, 'audience'] = fill.loc[idx]

articles['audienceId'] = articles['audience'].apply(to_ids).astype('string')

def move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]

articles = move_after(articles, ['audienceId'], 'audience')

# report
filled = len(idx); total = int(na_mask.sum())
print(f"Filled audience for {filled}/{total} ({filled/max(total,1):.1%}). "
      f"Dam={int((articles.loc[idx,'audience']=='dam').sum())}, "
      f"Herr={int((articles.loc[idx,'audience']=='herr').sum())}, "
      f"Hemmet={int((articles.loc[idx,'audience']=='hemmet').sum())}, "
      f"Generic={int((articles.loc[idx,'audience']=='generic').sum())}.")


Filled audience for 51555/64232 (80.3%). Dam=48258, Herr=266, Hemmet=1737, Generic=1294.


In [14]:
articles.to_parquet("../data/processed/articles_clean.parquet", index=False)
