In [1]:
import pandas as pd
articles = pd.read_csv("../data/external/articles.csv", dtype='string')

#how many rows are there in articles?
print(len(articles))

108656


In [2]:
articles = articles.drop(columns=['length', 'width', 'height', 'weight', 'fabricId', 'fabric'])

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId
0,000DIV,000DIV,,active,,,,,,,,,,
1,052743,052743,,inactive,Lakan/örngott blå 4 del,,Blå,264.0,Blå,328.0,,,,
2,055522,055522,265.0,active,Beskrivning Tröja,Gjestal Garn,,,,,Dam,6.0,Tröjor,17.0
3,055573,055573,55.0,active,Beskrivning Luva,Novita,,,,,Dam,6.0,"Mössor & hattar,Mönster",393961.0
4,055575,055575,55.0,active,Beskrivning Vantar,Novita,,,,,Dam,6.0,Vantar,45.0


In [3]:
#count proportion of missing values in each column
articles.isnull().sum() / len(articles)

sku           0.000000
groupId       0.000009
brandId       0.185705
status        0.000009
name          0.008347
name.1        0.185705
color         0.169388
colorId       0.169388
size          0.033666
sizeId        0.033666
audience      0.582987
audienceId    0.582987
category      0.048474
categoryId    0.048474
dtype: float64

## Name

In [4]:
import re
import pandas as pd

def _is_missing(s):
    return pd.isna(s) or str(s).strip().lower() in {"", "unknown", "nan", "<na>"}

COLORS = [
    "svart","vit","offwhite","off white","grå","ljusgrå","mörkgrå","blå","ljusblå","mellanblå","mörkblå","marin","navy","turkos",
    "grön","mörkgrön","ljusgrön","oliv","khaki","röd","vinröd","rosa","cerise","lila","plommon","gul","orange",
    "beige","sand","natur","brun","kaffe","kamel","taupe","multi","multicolor","flerfärgad",
    "silver","silvergrå","guld","grårosa","vitblå","gråbeige","offvit","gråbrun","gråsvart"
]
_color_alt = [r"off\s*white" if c=="off white" else re.escape(c) for c in COLORS]
COLOR_RE = re.compile(r"\b(" + "|".join(_color_alt) + r")\b", re.IGNORECASE)
DIM_RE = re.compile(r"\b\d{1,3}\s*[x×]\s*\d{1,3}\s*(?:cm|mm)?\b", re.IGNORECASE)
LONE_DIM_RE = re.compile(r"\b\d{1,4}(?:[.,]\d+)?\s*(?:cm|mm)\b", re.IGNORECASE)
DIAM_RE = re.compile(r"[Øø]\s*\d{1,3}\s*(?:cm|mm)\b", re.IGNORECASE)
WEIGHT_RE = re.compile(r"\b\d+(?:[.,]\d+)?\s*(?:kg|g)\b", re.IGNORECASE)
PACK_ANY_RE = re.compile(r"\b(\d+)\s*[- ]?\s*(?:pack|pk|st\.?|st|p|d|del(?:ar)?)\b", re.IGNORECASE)
LETTER_SIZE_RE = re.compile(r"\b(XXXL|XXL|XL|XS|S|M|L)\b", re.IGNORECASE)
BRA_SIZE_RE = re.compile(r"\b([A-H][0-9]{2})\b", re.IGNORECASE)
EU_SIZE_RE = re.compile(r"\b([2-6][0-9])\b(?!\s*(?:cm|mm))", re.IGNORECASE)

def _canon_pack(num, unit):
    u = unit.lower().replace("st.", "st")
    if u in {"pack", "p"}: return f"{num}-pack"
    if u == "pk": return f"{num} pk"
    if u == "st": return f"{num} st"
    return f"{num} delar"

def extract_color(txt):
    if not isinstance(txt, str): return None
    m = COLOR_RE.search(txt)
    return m.group(0).lower() if m else None

def extract_sizes(txt):
    if not isinstance(txt, str): return []
    out = DIM_RE.findall(txt)
    out += [m.group(0) for m in DIAM_RE.finditer(txt)]
    out += LONE_DIM_RE.findall(txt)
    out += WEIGHT_RE.findall(txt)
    out += [_canon_pack(m.group(1), (re.search(r"(pack|pk|st\.?|st|p|d|del(?:ar)?)", m.group(0), re.IGNORECASE) or ["pack"])[0])
            for m in PACK_ANY_RE.finditer(txt)]
    out += [m.group(0).upper() for m in LETTER_SIZE_RE.finditer(txt)]
    out += [m.group(0).upper() for m in BRA_SIZE_RE.finditer(txt)]
    out += [m.group(1) for m in EU_SIZE_RE.finditer(txt)]
    seen = set()
    uniq = []
    for t in out:
        k = t.lower().strip()
        if k not in seen:
            seen.add(k)
            uniq.append(t.strip())
    return uniq

def clean_name(txt, found_color=None, count_only=False):
    if not isinstance(txt, str): return (txt, 0) if count_only else txt
    s, n = txt, 0
    # Remove "Beskrivning " (with space) wherever it appears in the string
    s, n_beskrivning = re.subn(r"\bBeskrivning\s+", "", s)
    n += n_beskrivning
    if found_color: s, n1 = COLOR_RE.subn(" ", s); n += n1
    for pat in (DIM_RE, DIAM_RE, LONE_DIM_RE, WEIGHT_RE, PACK_ANY_RE, LETTER_SIZE_RE, BRA_SIZE_RE, EU_SIZE_RE):
        s, n1 = pat.subn(" ", s); n += n1
    n += sum(s.count(c) for c in ['\\','“','”','"'])
    s = s.replace("\\"," ").replace("“"," ").replace("”"," ").replace('"'," ")
    for pat in [r"\(\s*\)", r"\s*[-–/]\s*", r"\s{2,}"]:
        s, n1 = re.subn(pat, " ", s); n += n1
    s = s.strip(" -–,.;").strip()
    return (s, n) if count_only else s

for col in ["name", "color", "size"]:
    if col in articles.columns:
        articles[col] = articles[col].astype("string")

found_colors = articles["name"].apply(extract_color)
found_sizes = articles["name"].apply(extract_sizes)

mask_c = articles["color"].apply(_is_missing)
color_replacements = (mask_c & found_colors.notna()).sum()
articles.loc[mask_c & found_colors.notna(), "color"] = found_colors[mask_c & found_colors.notna()].str.lower()

mask_s = articles["size"].apply(_is_missing)
joined_sizes = found_sizes.apply(lambda xs: " / ".join(xs) if xs else pd.NA)
size_replacements = (mask_s & joined_sizes.notna()).sum()
articles.loc[mask_s & joined_sizes.notna(), "size"] = joined_sizes[mask_s & joined_sizes.notna()]

cleaned_and_counts = [clean_name(n, c, count_only=True) for n, c in zip(articles["name"], found_colors)]
articles["name"] = [x[0] for x in cleaned_and_counts]
name_replacement_count = sum(x[1] for x in cleaned_and_counts)

print(f"Color replacements made: {color_replacements}")
print(f"Size replacements made: {size_replacements}")
print(f"Name clean replacements made: {name_replacement_count}")

_unwanted_phrases = [
    "Övrigt","Frakt & exp. avgift","Aviavgift","Administrationsavgift","Express","Pf avg","Svarsporto","Krav outl",
    "Hemleverans 1","Hemleverans 2","Hemleverans 3","Hemleverans 4","Tillägg frakt","Krav outlöst","Returporto",
    "Katalogporto","Färgkarta porto","Manual till 293076","Rabatt"
]
_unwanted_re = re.compile("|".join(map(re.escape, _unwanted_phrases)), re.IGNORECASE)
mask_unwanted = articles["name"].astype(str).apply(lambda x: bool(_unwanted_re.search(x)))
removed_rows_count = mask_unwanted.sum()
articles = articles.loc[~mask_unwanted].reset_index(drop=True)
print(f"Rows removed due to unwanted phrases: {removed_rows_count}")

Color replacements made: 1996
Size replacements made: 988
Name clean replacements made: 53617
Rows removed due to unwanted phrases: 26


In [5]:
# Display all 4781 unique values in the 'name' column
unique_names = articles['name'].unique()
print(f"Number of unique names: {len(unique_names)}")
for name in unique_names:
    print(name)


Number of unique names: 4256
<NA>
Lakan örngott
Tröja
Luva
Vantar
Benvärmare
Garn Drops Nepal
Drops Eskimo
Garnpaket Virkade Basketskor
Instruktioner Axelvärmare
Bh utan bygel
Bygel bh
Innerkudde
Swegmark Bh utan bygel
Swegmark Bygel bh
Sport bh
Trofé Bh utan bygel
Fyndpaket Stickgarn
Julbock N
Band my prince
Åshild A kat höst DK
Åshild A kat höst25 FI
Åshild A kat höst25 NO
Åshild A kat höst SE
Bh utan bygel Classic
Bh u.bygel Classic
Bygel bh Anais
Bygel bh Esmeralda
Sport bh Extreme Movement
Sport bh Kimberley
Bh utan bygel Agnes
Bh utan bygel Glitter
Bh utan bygel Corinne
Bh utan bygel Anais
Bh utan bygel Support
Sport bh Courage
Sport bh INCREDIBLE
Glödlampa
Protes bh
Bygel bh Smooth Line
Bygel bh Adamo basic
Sjal
Plånbok och sjal
Plånbok
Poncho
Toppar 3330 C
Toppar 4086B
Mössa
Forstørrelsesglass
Off white fingervante
Tumvantar
Ryggsäck
Väska
Bordsduk Äpple
Kumijalkineet
Hatt
Necessär Papegoja
Handledsvärmare i ull
Skinnhandskar dam
Skinnhandskar Herr
Reflexmössa
Reflexvantar
Öron

## Color

In [6]:
# Show unique colorIds per color and counts (excluding NA colorId)
color_stats = articles[articles['colorId'].notna()].groupby('color', dropna=False)['colorId'].agg(['unique', 'count'])
print(f"{'color':20} {'colorIds':30} count")
for color, (ids, cnt) in color_stats.iterrows():
    print(f"{str(color):20} {str(list(ids)):30} {cnt}")

# Colors with >1 unique colorId (excluding NA)
multi = color_stats['unique'].apply(lambda x: len([i for i in x if pd.notna(i)]) > 1)
if multi.any():
    print("\nColors with more than one colorId:")
    for color, ids in color_stats.loc[multi, 'unique'].items():
        print(f"{color}: {[i for i in ids if pd.notna(i)]}")
else:
    print("\nNo color has more than one colorId.")

print(f"\nNumber of articles without colorId: {articles['colorId'].isna().sum()}")
print(f"Number of articles without color: {articles['color'].isna().sum()}")

# --- Additional analysis: Are there colorIds shared by multiple colors? ---
colorid_to_colors = articles[articles['colorId'].notna()].groupby('colorId', dropna=False)['color'].agg(['unique', 'count'])
multi_colorid = colorid_to_colors['unique'].apply(lambda x: len([i for i in x if pd.notna(i)]) > 1)
if multi_colorid.any():
    print("\ncolorIds used by more than one color:")
    for colorid, colors in colorid_to_colors.loc[multi_colorid, 'unique'].items():
        print(f"{colorid}: {[i for i in colors if pd.notna(i)]}")
else:
    print("\nNo colorId is used by more than one color.")

print(f"\nNumber of unique colorIds: {articles['colorId'].nunique(dropna=True)}")
print(f"Number of unique colors: {articles['color'].nunique(dropna=True)}")

color                colorIds                       count
Antracit             ['1852']                       319
Aprikos              ['1295']                       108
Aqua                 ['1205']                       13
Aqua,Aqua,Aqua       ['660,660,1205']               3
Aqua,Aqua,Aqua,Aqua  ['660,660,1205,1205']          1
Aubergine            ['3760']                       10
Beige                ['311', '121']                 7029
Beige multi          ['3966']                       63
Beige,Beige          ['121,121']                    62
Beige,Beige,Beige    ['311,311,121', '121,121,311'] 3
Beige,Beige,Beige,Beige ['121,121,121,121', '311,311,121,121'] 5
Beige/brun           ['582']                        722
Blush                ['3188']                       1
Blå                  ['264', '269']                 8154
Blå,Blå              ['269,269', '264,264']         17
Blå,Blå,Blå          ['264,264,269']                8
Blå,Blå,Blå,Blå      ['264,264,269,269']          

In [7]:
# Color remapping solution - concise version
def dedup(val):
    if pd.isna(val): return pd.NA
    seen = set()
    tokens = [x.strip() for x in str(val).split(',') if x.strip() and not (x in seen or seen.add(x))]
    return ','.join(tokens) if tokens else pd.NA

def clean_color_name(color):
    if pd.isna(color): return color
    return str(color).replace('/', '-').lower()

def merge_comma_colors(color):
    if pd.isna(color) or ',' not in str(color): return color
    individual_colors = [c.strip() for c in str(color).split(',') if c.strip()]
    best_color = max(individual_colors, key=lambda c: len(articles[articles['color'] == c]), default=color)
    return best_color

# Clean and normalize colors
articles['color'] = articles['color'].apply(dedup).apply(clean_color_name).apply(merge_comma_colors).astype('string')
articles['colorId'] = articles['colorId'].apply(dedup).astype('string')

# Merge rare colors into major categories
rare_color_merges = [
    ("blush", "rosa"), ("cerise", "rosa"), ("grå-rosa", "rosa"), ("grålila", "lila"),
    ("havsblå", "blå"), ("jeansblå", "blå"), ("klarblå", "blå"), ("lavendel", "lila"),
    ("ljus beige", "beige"), ("ljus blå", "blå"), ("ljusgrå mix", "grå"), ("ljusturkos", "turkos"),
    ("marinblå", "marin"), ("mellanblå", "blå"), ("mellanbrun", "brun"), ("mellangrå", "grå"),
    ("mellanrosa", "rosa"), ("mintgrön", "grön"), ("mörkbrun", "brun"), ("mörkröd", "röd"),
    ("natur", "beige"), ("oblekt", "vit"), ("oliv", "grön"), ("orange mix", "orange"),
    ("puderrosa", "rosa"), ("rost", "röd"), ("svart-silver", "svart"), ("transparent", "vit"),
    ("violett", "lila"), ("off white", "offwhite"), ("silverfärgad", "silver"), ("guldgul", "gul"),
    ("ljusrosa", "rosa"), ("gråsvart", "svart"), ("gråbeige", "beige"), ("gråbrun", "brun"),
    ("vitblå", "blå"), ("grårosa", "rosa"), ("plommonlila", "lila"), ("vinröd", "röd"),
    ("multicolor", "multi"), ("flerfärgad", "multi"), ("jeans", "blå"), ("himmelblå", "ljusblå"),
    ("pärlvit", "vit"), ("naturvit", "vit"), ("sandfärgad", "sand"), ("kaffe", "brun"),
    ("kamel", "brun"), ("taupe", "brun"), ("offvit", "vit"), ("beigegrå", "beige")
]


for rare_color, target_color in rare_color_merges:
    mask = articles['color'] == rare_color
    if mask.any():
        articles.loc[mask, 'color'] = target_color
        print(f"'{rare_color}' → '{target_color}' ({mask.sum()} articles)")

# Remap colors to single colorIds
color_to_colorids = articles[articles['color'].notna()].groupby('color')['colorId'].agg(list)
multi_color = color_to_colorids[color_to_colorids.apply(lambda x: len(set([i for i in x if pd.notna(i)])) > 1)]

for color, colorid_list in multi_color.items():
    main_colorid = articles.loc[articles['color'] == color, 'colorId'].value_counts().idxmax()
    articles.loc[articles['color'] == color, 'colorId'] = main_colorid
    print(f"'{color}': {set(colorid_list)} → '{main_colorid}'")

# Fill NA in color column with 'unknown' in place
articles['color'] = articles['color'].fillna('unknown')

'blush' → 'rosa' (1 articles)
'cerise' → 'rosa' (21 articles)
'grå-rosa' → 'rosa' (5 articles)
'grålila' → 'lila' (1 articles)
'havsblå' → 'blå' (1 articles)
'jeansblå' → 'blå' (1 articles)
'klarblå' → 'blå' (1 articles)
'lavendel' → 'lila' (1 articles)
'ljus beige' → 'beige' (1 articles)
'ljus blå' → 'blå' (1 articles)
'ljusgrå mix' → 'grå' (1 articles)
'ljusturkos' → 'turkos' (1 articles)
'marinblå' → 'marin' (1 articles)
'mellanblå' → 'blå' (30 articles)
'mellanbrun' → 'brun' (1 articles)
'mellangrå' → 'grå' (1 articles)
'mellanrosa' → 'rosa' (1 articles)
'mintgrön' → 'grön' (4 articles)
'mörkbrun' → 'brun' (2 articles)
'mörkröd' → 'röd' (5 articles)
'natur' → 'beige' (5 articles)
'oblekt' → 'vit' (1 articles)
'oliv' → 'grön' (1 articles)
'orange mix' → 'orange' (1 articles)
'puderrosa' → 'rosa' (1 articles)
'rost' → 'röd' (5 articles)
'svart-silver' → 'svart' (1 articles)
'transparent' → 'vit' (1 articles)
'violett' → 'lila' (1 articles)
'ljusrosa' → 'rosa' (351 articles)
'gråsvart

### Inspect and transform Category and Category ID

In [8]:
# Show unique categoryIds per category (excluding NA)
cat_stats = articles[articles['categoryId'].notna()].groupby('category')['categoryId'].agg(['unique', 'count'])
print(f"{'category':50} {'categoryIds':40} count")
for cat, (ids, cnt) in cat_stats.iterrows():
    # Format the list of ids as a string, not using a format specifier for the list
    ids_str = str(list(ids))
    print(f"{cat:50} {ids_str:40} {cnt}")

# Categories with >1 unique categoryId
multi_cat = cat_stats['unique'].apply(lambda x: sum(pd.notna(x)) > 1)
if multi_cat.any():
    print("\nCategories with >1 categoryId:")
    for cat, ids in cat_stats.loc[multi_cat, 'unique'].items():
        print(f"{cat}: {[i for i in ids if pd.notna(i)]}")
else:
    print("\nNo category has >1 categoryId.")

print(f"\nArticles without categoryId: {articles['categoryId'].isna().sum()}")
print(f"Articles without category: {articles['category'].isna().sum()}")

category                                           categoryIds                              count
Accessoarer                                        ['454']                                  5
Accessoarer,Bh,Underkläder,Bh-tillbehör            ['454,27,19,691']                        3
Accessoarer,Halsdukar & sjalar                     ['454,7']                                1
Accessoarer,Handskar & vantar                      ['454,1415']                             3
Accessoarer,Herr,Kepsar & mössor                   ['454,162,1447']                         3
Accessoarer,Kepsar & mössor                        ['454,1447']                             18
Accessoarer,Kepsar & mössor,Accessoarer,Kepsar & mössor ['454,1447,454,1447']                    2
Ansiktsvård                                        ['2408']                                 4
Bad,Duschdraperier                                 ['646,338']                              3
Bad,Frottéhanddukar & badlakan                    

In [9]:
# category↔ID mapping

def dedup_csv(s):
    if pd.isna(s): return pd.NA
    out, seen = [], set()
    for t in map(str.strip, str(s).split(',')):
        if t and t not in seen: seen.add(t); out.append(t)
    return ','.join(out) if out else pd.NA

def toks(s):
    return [t.strip() for t in str(s).split(',') if t.strip()] if pd.notna(s) else []

articles['category']   = articles['category'].apply(dedup_csv).astype('string')
articles['categoryId'] = articles['categoryId'].apply(dedup_csv).astype('string')

pairs, mismatched = [], 0
for cat, cid in articles[['category','categoryId']].dropna().itertuples(index=False):
    ct, it = toks(cat), toks(cid)
    n = min(len(ct), len(it))
    if n == 0: continue
    if len(ct) != len(it): mismatched += 1
    pairs.extend(zip(ct[:n], it[:n]))

if not pairs:
    print("No category↔id pairs available.")
else:
    dfp = pd.DataFrame(pairs, columns=['cat_tok','id_tok'])
    token2id = (dfp.groupby(['cat_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['cat_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('cat_tok')
                  .set_index('cat_tok')['id_tok'])

    def rebuild_ids(cat):
        ct = toks(cat)
        mapped = [token2id.get(t, pd.NA) for t in ct if t in token2id]
        return ','.join(mapped) if mapped else pd.NA

    articles['categoryId'] = articles['category'].apply(rebuild_ids).astype('string')

# Replace missing category values with 'unknown' in place
articles['category'] = articles['category'].fillna('unknown').astype('string')

# missing flag
articles['category_missing'] = (articles['category'] == 'unknown').astype('int8')

# keep category_missing next to 'category'
def _move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]

articles = _move_after(articles, ['category_missing'], 'category')

# Stats
pairs2 = []
for cat, cid in articles[['category','categoryId']].dropna().itertuples(index=False):
    ct, it = toks(cat), toks(cid)
    n = min(len(ct), len(it))
    pairs2.extend(zip(ct[:n], it[:n]))
d2 = pd.DataFrame(pairs2, columns=['cat_tok','id_tok'])

cat_stats = d2.groupby('cat_tok')['id_tok'].agg(lambda s: sorted(pd.unique(s.dropna()))).reset_index(name='unique_ids')
cat_stats['count'] = d2.groupby('cat_tok')['id_tok'].size().values

print(f"Mismatched token/id lengths: {mismatched}")
print(f"\n{'category token':40} {'unique_ids':25} count")
for _, r in cat_stats.sort_values('count', ascending=False).iterrows():
    print(f"{r['cat_tok'][:40]:40} {str(r['unique_ids'])[:25]:25} {int(r['count'])}")

multi = cat_stats['unique_ids'].apply(len) > 1
if multi.any():
    print("\nTokens mapping to >1 id:")
    for _, r in cat_stats[multi].iterrows():
        print(f"  {r['cat_tok']}: {r['unique_ids']}")
else:
    print("\n✓ Every category token maps to a single id.")

id_stats = d2.groupby('id_tok')['cat_tok'].agg(lambda s: sorted(pd.unique(s.dropna()))).reset_index(name='tokens')
multi_id = id_stats['tokens'].apply(len) > 1
if multi_id.any():
    print("\nIds used by multiple tokens:")
    for _, r in id_stats[multi_id].iterrows():
        print(f"  {r['id_tok']}: {r['tokens'][:10]}{' ...' if len(r['tokens'])>10 else ''}")
else:
    print("\n✓ No id is shared by multiple tokens.")

print(f"\nUnique categoryIds: {articles['categoryId'].str.split(',').explode().nunique(dropna=True)}")
print(f"Unique category tokens: {d2['cat_tok'].nunique(dropna=True)}")


Mismatched token/id lengths: 0

category token                           unique_ids                count
Bh                                       ['27']                    60610
Underkläder                              ['19']                    51666
Bh utan bygel                            ['50']                    41392
Bygel-bh                                 ['223']                   20146
REA                                      ['110']                   19798
Sport-bh                                 ['618']                   8344
Badkläder                                ['470']                   4598
Tunikor                                  ['451']                   3805
Överdelar                                ['1552']                  3778
Framknäppt bh                            ['189']                   3765
Dam                                      ['471']                   3422
Byxor                                    ['689']                   2998
Trosor                    

## Audience

In [10]:
print("Unique values of 'audience' and their counts:")
print(articles['audience'].value_counts(dropna=False))


Unique values of 'audience' and their counts:
audience
<NA>                    63319
Dam                     43873
Herr                      823
Dam,Herr                  301
Dam,Dam                   181
Dam,Dam,Dam                80
Dam,Dam,Dam,Dam            18
Dam,Herr,Baby & barn       12
Baby & barn                 9
Barn & ungdom               8
Dam,Dam,Dam,Dam,Dam         5
Hemmet                      1
Name: count, dtype: Int64


In [11]:
# Audience cleanup: short, readable, does the job.

import re
import pandas as pd

AUD2ID = {'dam':'6','herr':'15','baby & barn':'12','barn & ungdom':'42','generic':'99','hemmet':'222'}

# 1) normalize what’s already there
def norm_audience(a):
    if pd.isna(a): return pd.NA
    toks = {t.strip().lower() for t in str(a).split(',') if t.strip()}
    if any('dam' in t for t in toks):  # “dam” anywhere wins
        return 'dam'
    keep = [t for t in toks if t in AUD2ID]
    return ','.join(keep) if keep else pd.NA

def to_ids(a):
    if pd.isna(a): return pd.NA
    ids = sorted({AUD2ID[t] for t in a.split(',') if t in AUD2ID}, key=int)
    return ','.join(ids) if ids else pd.NA

articles['audience'] = articles['audience'].apply(norm_audience).astype('string')

# 2) fill missing using category text (substring matches, ignore “REA” as a token)
DAM = [
    'dam','bh','trosor','underkläder','body','bodykorselett','korsett','korsetter',
    'klänning','klänningar','tunika','tunikor','topp','toppar','kjol','kjolar',
    'byxa','byxor','blus','blusar','nattlinne','bikinibh','bikini','t-shirt-bh',
    'minimizer','kofta','koftor','väst','västar','skor','väskor','sjalar'
]
HEM = [
    'frottéhanddukar','badlakan','bad','badrumsmattor','kökshanddukar','vaxdukar','dukar',
    'pläd','plädar','kanallängder','kanalkappa','gardiner','påslakanset','bädd',
    'lakan','örngott','hemtextil','kuddfodral','överkast','gardinstänger','kökshjälpmedel',
    'dekorationer','metervara','prydnadssaker','belysning','servetter'
]
GEN = [
    'inkontinens','stödartiklar','vardagshjälpmedel','rollator','rollatorer','stödstrumpor',
    'skotillbehör','fotvård','hobbyhörnan','pussel','sytillbehör','symaskiner','lust',
    'massage','synhjälpmedel','medicin','böcker','halkskydd','träning & motion'
]
HER = ['herr','skjorta','skjortor','kostym','kavaj','boxer']

REA_TOKEN = re.compile(r'(^|,)\s*rea\s*(?=,|$)')

def strip_rea(s):
    s = s.lower()
    s = REA_TOKEN.sub(lambda m: ',' if m.group(1) else '', s)
    return re.sub(r',+', ',', s).strip(', ').strip()

def classify(cat):
    if pd.isna(cat): return pd.NA
    s = strip_rea(str(cat))
    if not s: return pd.NA
    if any(h in s for h in DAM): return 'dam'
    if any(h in s for h in HER): return 'herr'
    if any(h in s for h in HEM): return 'hemmet'
    if any(h in s for h in GEN): return 'generic'
    return pd.NA

na_mask = articles['audience'].isna()
fill = articles.loc[na_mask, 'category'].apply(classify)
idx = fill.dropna().index
articles.loc[idx, 'audience'] = fill.loc[idx]

# 3) ids and missing flag; keep columns together
articles['audienceId']       = articles['audience'].apply(to_ids).astype('string')
articles['audience_missing'] = articles['audience'].isna().astype('int8')

def move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]

articles = move_after(articles, ['audienceId','audience_missing'], 'audience')

# tiny report
filled = len(idx); total = int(na_mask.sum())
print(f"Filled audience for {filled}/{total} ({filled/max(total,1):.1%}). "
      f"Dam={int((articles.loc[idx,'audience']=='dam').sum())}, "
      f"Herr={int((articles.loc[idx,'audience']=='herr').sum())}, "
      f"Hemmet={int((articles.loc[idx,'audience']=='hemmet').sum())}, "
      f"Generic={int((articles.loc[idx,'audience']=='generic').sum())}.")


Filled audience for 51279/63319 (81.0%). Dam=48059, Herr=266, Hemmet=1683, Generic=1271.


In [12]:
articles['audience'] = articles['audience'].fillna('unknown').astype('string')

## Size

In [13]:
# Ensure string dtype
articles['size'] = articles['size'].astype('string')
articles['sizeId'] = articles['sizeId'].astype('string')

# Unique sizeIds per size
size_stats = (articles[articles['sizeId'].notna()]
              .groupby('size', dropna=False)['sizeId']
              .agg(unique=lambda s: list(pd.unique(s)), count='size')
              .sort_values('count', ascending=False))

print(f"{'size':50} {'sizeIds':40} count")
for size, row in size_stats.iterrows():
    print(f"{size:50} {str(row['unique']):40} {row['count']}")

# Sizes with >1 sizeId
multi_size = size_stats['unique'].apply(lambda ids: sum(pd.notna(i) for i in ids) > 1)
if multi_size.any():
    print("\nSizes with >1 sizeId:")
    for size, ids in size_stats.loc[multi_size, 'unique'].items():
        print(f"{size}: {[i for i in ids if pd.notna(i)]}")
else:
    print("\nNo size has >1 sizeId.")

# sizeIds shared by multiple sizes
rev = (articles[articles['sizeId'].notna()]
       .groupby('sizeId')['size']
       .agg(unique=lambda s: list(pd.unique(s)), count='size'))
multi_id = rev['unique'].apply(lambda ss: sum(pd.notna(x) for x in ss) > 1)
if multi_id.any():
    print("\nsizeIds shared by multiple sizes:")
    for sid, sizes in rev.loc[multi_id, 'unique'].items():
        print(f"{sid}: {[s for s in sizes if pd.notna(s)]}")
else:
    print("\nNo sizeId shared by multiple sizes.")

print(f"\nArticles without sizeId: {articles['sizeId'].isna().sum()}")
print(f"Articles without size: {articles['size'].isna().sum()}")


size                                               sizeIds                                  count
38                                                 ['106']                                  2491
40                                                 ['107']                                  2486
42                                                 ['108']                                  2427
44                                                 ['111']                                  2321
46                                                 ['112']                                  2282
48                                                 ['113']                                  2087
50                                                 ['114']                                  1855
52                                                 ['115']                                  1811
D85                                                ['68']                                   1719
C85                          

In [14]:
# --- cleanup & mapping for size ---
import re

NOISE_TOKENS = {
    '***missing***','rosa','svart','blå','offwhite','vinröd','greige',
    'kuddfodral','påslakan','tomte','ostbricka','plommonlila'
}
SIZE_PATTERNS = [
    r'^\d{2}$', r'^\d{2}/\d{2}$', r'^[A-Z]{1,2}/?[A-Z]{0,2}\d{2}$',
    r'^\d{2}[A-Z]/[A-Z]$', r'^[A-Z]{1,3}$', r'^\d{2}x\d{2,3}\s*cm$',
    r'^\d{2,3}x\d{2,3}\s*cm$', r'^\d+(\.\d+)?\s*mm$', r'^\d{2}x\d{2}$',
    r'^\d+$', r'^\d+[- ]?PACK$', r'^[A-Z]/[A-Z]\d{2}$'
]
size_re = re.compile('|'.join(f'(?:{p})' for p in SIZE_PATTERNS), re.I)

def canon_size_token(t):
    t = t.strip().replace('×','x').replace(' X ','x').replace(' x ','x')
    t = re.sub(r'\s*x\s*', 'x', t)
    t = re.sub(r'\s+cm\b', ' cm', t, flags=re.I)
    t = re.sub(r'\s+mm\b', ' mm', t, flags=re.I)
    t = t.replace(',', '.') if re.search(r'\d,\d', t) else t
    t = t.upper() if t.lower() in {'xs','s','m','l','xl','xxl','3xl','4xl','5xl','6xl','one size'} else t
    t = t.replace('–','-').replace('—','-')
    t = t.replace('-', '/') if re.fullmatch(r'\d{2}-\d{2}', t) else t
    return t

def is_size_token(t):
    t0 = t.strip().lower()
    return t0 not in NOISE_TOKENS and bool(size_re.match(t.strip()))

def normalize_size_cell(s):
    if pd.isna(s): return pd.NA
    toks = [canon_size_token(t) for t in str(s).split(',') if t.strip()]
    toks = [t for t in toks if is_size_token(t)]
    out = []
    for t in toks:
        if t not in out: out.append(t)
    return ','.join(out) if out else pd.NA

articles['size'] = articles['size'].apply(normalize_size_cell).astype('string')
articles.loc[articles['size'].isna() | (articles['size']=='unknown'), 'sizeId'] = pd.NA

# Rebuild sizeId from valid tokens
def toks(s):
    return [t.strip() for t in str(s).split(',') if t.strip()] if pd.notna(s) else []

pairs = []
for sz, sid in articles[['size','sizeId']].dropna().itertuples(index=False):
    st = [t for t in toks(sz) if is_size_token(t)]
    it = toks(sid)
    n = min(len(st), len(it))
    pairs.extend(zip(st[:n], it[:n]))
if pairs:
    dfp = pd.DataFrame(pairs, columns=['size_tok','id_tok'])
    token2id = (dfp.groupby(['size_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['size_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('size_tok')
                  .set_index('size_tok')['id_tok'])
    def rebuild_ids(sz):
        st = [t for t in toks(sz) if is_size_token(t)]
        mapped = [token2id.get(t, pd.NA) for t in st if pd.notna(token2id.get(t, pd.NA))]
        return ','.join(mapped) if mapped else pd.NA
    articles['sizeId'] = articles['size'].apply(rebuild_ids).astype('string')


In [15]:
# Size ↔ sizeId: dedup, learn mapping, rebuild ids, fill missing, stats.

import pandas as pd

def dedup_csv(s):
    if pd.isna(s): return pd.NA
    out, seen = [], set()
    for t in map(str.strip, str(s).split(',')):
        if t and t not in seen: seen.add(t); out.append(t)
    return ','.join(out) if out else pd.NA

def toks(s):
    return [t.strip() for t in str(s).split(',') if t.strip()] if pd.notna(s) else []

# Deduplicate
articles['size']   = articles['size'].apply(dedup_csv).astype('string')
articles['sizeId'] = articles['sizeId'].apply(dedup_csv).astype('string')

# Learn mapping and rebuild ids
pairs, mismatched = [], 0
for sz, sid in articles[['size','sizeId']].dropna().itertuples(index=False):
    st, it = toks(sz), toks(sid)
    n = min(len(st), len(it))
    if n == 0: continue
    if len(st) != len(it): mismatched += 1
    pairs.extend(zip(st[:n], it[:n]))
if pairs:
    dfp = pd.DataFrame(pairs, columns=['size_tok','id_tok'])
    token2id = (dfp.groupby(['size_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['size_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('size_tok')
                  .set_index('size_tok')['id_tok'])
    articles['sizeId'] = articles['size'].apply(
        lambda sz: ','.join([token2id.get(t, pd.NA) for t in toks(sz) if pd.notna(token2id.get(t, pd.NA))]) if pd.notna(sz) and toks(sz) else pd.NA
    ).astype('string')
else:
    print("No size↔id pairs available to learn mapping.")

# Fill missing, add flag, move flag
articles['size'] = articles['size'].fillna('unknown').astype('string')
articles['size_missing'] = (articles['size'] == 'unknown').astype('int8')
def _move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]
articles = _move_after(articles, ['size_missing'], 'size')

# Stats
pairs2 = []
for sz, sid in articles[['size','sizeId']].dropna().itertuples(index=False):
    st, it = toks(sz), toks(sid)
    n = min(len(st), len(it))
    pairs2.extend(zip(st[:n], it[:n]))
d2 = pd.DataFrame(pairs2, columns=['size_tok','id_tok']) if pairs2 else pd.DataFrame(columns=['size_tok','id_tok'])

if not d2.empty:
    size_stats = (d2.groupby('size_tok')['id_tok']
                    .agg(lambda s: sorted(pd.unique(s.dropna())))
                    .reset_index(name='unique_ids'))
    size_stats['count'] = d2.groupby('size_tok')['id_tok'].size().values

    print(f"Mismatched token/id lengths: {mismatched}")
    for _, r in size_stats.sort_values('count', ascending=False).iterrows():
        print(f"{r['size_tok'][:30]:30} {str(r['unique_ids'])[:25]:25} {int(r['count'])}")

    multi = size_stats['unique_ids'].apply(len) > 1
    if multi.any():
        print("\nTokens mapping to >1 id:")
        for _, r in size_stats[multi].iterrows():
            print(f"  {r['size_tok']}: {r['unique_ids']}")
    else:
        print("\n✓ Every size token maps to a single id.")

    id_stats = (d2.groupby('id_tok')['size_tok']
                  .agg(lambda s: sorted(pd.unique(s.dropna())))
                  .reset_index(name='tokens'))
    multi_id = id_stats['tokens'].apply(len) > 1
    if multi_id.any():
        print("\nSizeIds shared by multiple tokens:")
        for _, r in id_stats[multi_id].iterrows():
            print(f"  {r['id_tok']}: {r['tokens'][:10]}{' ...' if len(r['tokens'])>10 else ''}")
    else:
        print("\n✓ No sizeId is shared by multiple size tokens.")
else:
    print("No size/id pairs to report stats on.")

print(f"\nArticles without sizeId: {int(articles['sizeId'].isna().sum())}")
print(f"Articles with size == 'unknown': {int((articles['size']=='unknown').sum())}")


Mismatched token/id lengths: 26
40                             ['107']                   2500
38                             ['106']                   2496
42                             ['108']                   2432
44                             ['111']                   2326
46                             ['112']                   2287
48                             ['113']                   2093
50                             ['114']                   1862
52                             ['115']                   1815
C85                            ['28']                    1724
D85                            ['68']                    1723
C90                            ['62']                    1689
D90                            ['69']                    1686
C80                            ['60']                    1684
D80                            ['67']                    1669
C95                            ['63']                    1605
E85                            ['77'] 

In [16]:
# Print a few rows where sizeId is not na
articles[articles['sizeId'].notna()].head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,size_missing,sizeId,audience,audienceId,audience_missing,category,category_missing,categoryId
49,0HP33S,0HP33S,,active,Instruktioner Axelvärmare,,unknown,,S,0,32,unknown,,1,unknown,1,
50,100501-2075,100501,80.0,inactive,Bh utan bygel,Swegmark,vit-beige,105.0,B75,0,51,dam,6.0,0,"Bh utan bygel,Bh",0,5027.0
51,100501-2080,100501,80.0,inactive,Bh utan bygel,Swegmark,vit-beige,105.0,B80,0,52,dam,6.0,0,"Bh utan bygel,Bh",0,5027.0
52,100501-2085,100501,80.0,inactive,Bh utan bygel,Swegmark,vit-beige,105.0,B85,0,54,dam,6.0,0,"Bh utan bygel,Bh",0,5027.0
53,100501-2090,100501,80.0,inactive,Bh utan bygel,Swegmark,vit-beige,105.0,B90,0,55,dam,6.0,0,"Bh utan bygel,Bh",0,5027.0


## Brand

In [17]:
# Ensure string dtype
articles['size'] = articles['size'].astype('string')
articles['sizeId'] = articles['sizeId'].astype('string')

# Unique sizeIds per size
size_stats = (articles[articles['sizeId'].notna()]
              .groupby('size', dropna=False)['sizeId']
              .agg(unique=lambda s: list(pd.unique(s)), count='size')
              .sort_values('count', ascending=False))

print(f"{'size':50} {'sizeIds':40} count")
for size, row in size_stats.iterrows():
    print(f"{size:50} {str(row['unique']):40} {row['count']}")

# Sizes with >1 sizeId
multi_size = size_stats['unique'].apply(lambda ids: sum(pd.notna(i) for i in ids) > 1)
if multi_size.any():
    print("\nSizes with >1 sizeId:")
    for size, ids in size_stats.loc[multi_size, 'unique'].items():
        print(f"{size}: {[i for i in ids if pd.notna(i)]}")
else:
    print("\nNo size has >1 sizeId.")

# sizeIds shared by multiple sizes
rev = (articles[articles['sizeId'].notna()]
       .groupby('sizeId')['size']
       .agg(unique=lambda s: list(pd.unique(s)), count='size'))
multi_id = rev['unique'].apply(lambda ss: sum(pd.notna(x) for x in ss) > 1)
if multi_id.any():
    print("\nsizeIds shared by multiple sizes:")
    for sid, sizes in rev.loc[multi_id, 'unique'].items():
        print(f"{sid}: {[s for s in sizes if pd.notna(s)]}")
else:
    print("\nNo sizeId shared by multiple sizes.")

print(f"\nArticles without sizeId: {articles['sizeId'].isna().sum()}")
print(f"Articles without size: {articles['size'].isna().sum()}")

size                                               sizeIds                                  count
38                                                 ['106']                                  2496
40                                                 ['107']                                  2491
42                                                 ['108']                                  2432
44                                                 ['111']                                  2326
46                                                 ['112']                                  2287
48                                                 ['113']                                  2093
50                                                 ['114']                                  1862
52                                                 ['115']                                  1815
C85                                                ['28']                                   1724
D85                          

In [18]:
# --- Brand cleanup + backfill + missing instrumentation (concise, no warnings) ---

# Normalize
def _norm_brand(s):
    if pd.isna(s): return pd.NA
    s = ' '.join(str(s).strip().split())  # collapse whitespace
    return s or pd.NA

articles['name.1']  = articles['name.1'].astype('string').apply(_norm_brand)
articles['brandId'] = articles['brandId'].astype('string').str.strip()

# Build maps from known (non-missing) pairs
known = articles.dropna(subset=['name.1', 'brandId'])[['name.1','brandId']].drop_duplicates()
name_to_id = (known.groupby('name.1')['brandId']
              .agg(lambda s: s.mode().iat[0] if len(s.mode()) else s.iloc[0]))

id_to_name = (known.groupby('brandId')['name.1']
              .agg(lambda s: s.mode().iat[0] if len(s.mode()) else s.iloc[0]))

# Backfill missing brandId from name.1
mask = articles['brandId'].isna() & articles['name.1'].notna()
filled_ids = articles.loc[mask, 'name.1'].map(name_to_id).astype('string')
# align and assign (avoids FutureWarning)
filled_ids = filled_ids.reindex(articles.index)
articles.loc[mask, 'brandId'] = filled_ids

# Now fill missing brand text with 'unknown' (do this AFTER normalization/backfill)
articles['name.1'] = articles['name.1'].fillna('unknown').astype('string')

# Missing flag for brandId
articles['brand_missing'] = articles['brandId'].isna().astype('int8')

# Keep flag next to brandId (optional)
def _move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]

articles = _move_after(articles, ['brand_missing'], 'brandId')

In [19]:
articles.head()

Unnamed: 0,sku,groupId,brandId,brand_missing,status,name,name.1,color,colorId,size,size_missing,sizeId,audience,audienceId,audience_missing,category,category_missing,categoryId
0,000DIV,000DIV,,1,active,,unknown,unknown,,unknown,1,,unknown,,1,unknown,1,
1,052743,052743,,1,inactive,Lakan örngott,unknown,blå,269.0,unknown,1,,unknown,,1,unknown,1,
2,055522,055522,265.0,0,active,Tröja,Gjestal Garn,unknown,,unknown,1,,dam,6.0,0,Tröjor,0,17.0
3,055573,055573,55.0,0,active,Luva,Novita,unknown,,unknown,1,,dam,6.0,0,"Mössor & hattar,Mönster",0,393961.0
4,055575,055575,55.0,0,active,Vantar,Novita,unknown,,unknown,1,,dam,6.0,0,Vantar,0,45.0


## Status

In [20]:
#status column
status = articles['status'].unique()
print(status)
articles['status'] = articles['status'].astype('string')
print(articles['status'].dtype)

<StringArray>
['active', 'inactive', 'discontinued', 'removed', <NA>]
Length: 5, dtype: string
string


In [21]:
articles['status'] = articles['status'].fillna('unknown').astype('string')


In [22]:
# How many rows are there in articles vs how many rows that are in status inactive, discontinued, or removed

total_rows = len(articles)
inactive_rows = articles['status'].isin(['inactive', 'discontinued', 'removed']).sum()

print(f"Total rows in articles: {total_rows}")
print(f"Rows with status inactive, discontinued, or removed: {inactive_rows}")


Total rows in articles: 108630
Rows with status inactive, discontinued, or removed: 75682


In [23]:
#count proportion of missing values in each column
articles.isnull().sum() / len(articles)

sku                 0.000000
groupId             0.000009
brandId             0.185520
brand_missing       0.000000
status              0.000000
name                0.008349
name.1              0.000000
color               0.000000
colorId             0.151082
size                0.000000
size_missing        0.000000
sizeId              0.222038
audience            0.000000
audienceId          0.110835
audience_missing    0.000000
category            0.000000
category_missing    0.000000
categoryId          0.048256
dtype: float64

In [24]:
# Save the cleaned articles DataFrame to CSV with all columns as string type
articles_clean = articles.astype('string')
articles_clean.to_csv("../data/processed/articles_clean.csv", index=False)

articles_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108630 entries, 0 to 108629
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   sku               108630 non-null  string
 1   groupId           108629 non-null  string
 2   brandId           88477 non-null   string
 3   brand_missing     108630 non-null  string
 4   status            108630 non-null  string
 5   name              107723 non-null  string
 6   name.1            108630 non-null  string
 7   color             108630 non-null  string
 8   colorId           92218 non-null   string
 9   size              108630 non-null  string
 10  size_missing      108630 non-null  string
 11  sizeId            84510 non-null   string
 12  audience          108630 non-null  string
 13  audienceId        96590 non-null   string
 14  audience_missing  108630 non-null  string
 15  category          108630 non-null  string
 16  category_missing  108630 non-null  str

In [25]:
articles_clean.head()

Unnamed: 0,sku,groupId,brandId,brand_missing,status,name,name.1,color,colorId,size,size_missing,sizeId,audience,audienceId,audience_missing,category,category_missing,categoryId
0,000DIV,000DIV,,1,active,,unknown,unknown,,unknown,1,,unknown,,1,unknown,1,
1,052743,052743,,1,inactive,Lakan örngott,unknown,blå,269.0,unknown,1,,unknown,,1,unknown,1,
2,055522,055522,265.0,0,active,Tröja,Gjestal Garn,unknown,,unknown,1,,dam,6.0,0,Tröjor,0,17.0
3,055573,055573,55.0,0,active,Luva,Novita,unknown,,unknown,1,,dam,6.0,0,"Mössor & hattar,Mönster",0,393961.0
4,055575,055575,55.0,0,active,Vantar,Novita,unknown,,unknown,1,,dam,6.0,0,Vantar,0,45.0


In [26]:
#how many rows are there in articles?
print(len(articles_clean))

108630
