In [462]:
import pandas as pd
articles = pd.read_csv("../data/external/articles.csv", dtype='string')

#how many rows are there in articles?
print(len(articles))

108656


In [463]:
articles = articles.drop(columns=['length', 'width', 'height', 'weight'])

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId,fabric,fabricId
0,000DIV,000DIV,,active,,,,,,,,,,,,
1,052743,052743,,inactive,Lakan/örngott blå 4 del,,Blå,264.0,Blå,328.0,,,,,,
2,055522,055522,265.0,active,Beskrivning Tröja,Gjestal Garn,,,,,Dam,6.0,Tröjor,17.0,,
3,055573,055573,55.0,active,Beskrivning Luva,Novita,,,,,Dam,6.0,"Mössor & hattar,Mönster",393961.0,,
4,055575,055575,55.0,active,Beskrivning Vantar,Novita,,,,,Dam,6.0,Vantar,45.0,,


In [464]:
#count proportion of missing values in each column
articles.isnull().sum() / len(articles)

sku           0.000000
groupId       0.000009
brandId       0.185705
status        0.000009
name          0.008347
name.1        0.185705
color         0.169388
colorId       0.169388
size          0.033666
sizeId        0.033666
audience      0.582987
audienceId    0.582987
category      0.048474
categoryId    0.048474
fabric        0.998813
fabricId      0.998813
dtype: float64

In [465]:
#remove fabric column as irrelevant
articles = articles.drop(columns=['fabric'])

#remove fabricId column as irrelevant
articles = articles.drop(columns=['fabricId'])

In [466]:
# Show unique colorIds per color and counts (excluding NA colorId)
color_stats = articles[articles['colorId'].notna()].groupby('color', dropna=False)['colorId'].agg(['unique', 'count'])
print(f"{'color':20} {'colorIds':30} count")
for color, (ids, cnt) in color_stats.iterrows():
    print(f"{str(color):20} {str(list(ids)):30} {cnt}")

# Colors with >1 unique colorId (excluding NA)
multi = color_stats['unique'].apply(lambda x: len([i for i in x if pd.notna(i)]) > 1)
if multi.any():
    print("\nColors with more than one colorId:")
    for color, ids in color_stats.loc[multi, 'unique'].items():
        print(f"{color}: {[i for i in ids if pd.notna(i)]}")
else:
    print("\nNo color has more than one colorId.")

print(f"\nNumber of articles without colorId: {articles['colorId'].isna().sum()}")
print(f"Number of articles without color: {articles['color'].isna().sum()}")

# --- Additional analysis: Are there colorIds shared by multiple colors? ---
colorid_to_colors = articles[articles['colorId'].notna()].groupby('colorId', dropna=False)['color'].agg(['unique', 'count'])
multi_colorid = colorid_to_colors['unique'].apply(lambda x: len([i for i in x if pd.notna(i)]) > 1)
if multi_colorid.any():
    print("\ncolorIds used by more than one color:")
    for colorid, colors in colorid_to_colors.loc[multi_colorid, 'unique'].items():
        print(f"{colorid}: {[i for i in colors if pd.notna(i)]}")
else:
    print("\nNo colorId is used by more than one color.")

print(f"\nNumber of unique colorIds: {articles['colorId'].nunique(dropna=True)}")
print(f"Number of unique colors: {articles['color'].nunique(dropna=True)}")

color                colorIds                       count
Antracit             ['1852']                       319
Aprikos              ['1295']                       108
Aqua                 ['1205']                       13
Aqua,Aqua,Aqua       ['660,660,1205']               3
Aqua,Aqua,Aqua,Aqua  ['660,660,1205,1205']          1
Aubergine            ['3760']                       10
Beige                ['311', '121']                 7029
Beige multi          ['3966']                       63
Beige,Beige          ['121,121']                    62
Beige,Beige,Beige    ['311,311,121', '121,121,311'] 3
Beige,Beige,Beige,Beige ['121,121,121,121', '311,311,121,121'] 5
Beige/brun           ['582']                        722
Blush                ['3188']                       1
Blå                  ['264', '269']                 8154
Blå,Blå              ['269,269', '264,264']         17
Blå,Blå,Blå          ['264,264,269']                8
Blå,Blå,Blå,Blå      ['264,264,269,269']          

In [467]:
# Color remapping solution - concise version
def dedup(val):
    if pd.isna(val): return pd.NA
    seen = set()
    tokens = [x.strip() for x in str(val).split(',') if x.strip() and not (x in seen or seen.add(x))]
    return ','.join(tokens) if tokens else pd.NA

def clean_color_name(color):
    if pd.isna(color): return color
    return str(color).replace('/', '-').lower()

def merge_comma_colors(color):
    if pd.isna(color) or ',' not in str(color): return color
    individual_colors = [c.strip() for c in str(color).split(',') if c.strip()]
    best_color = max(individual_colors, key=lambda c: len(articles[articles['color'] == c]), default=color)
    return best_color

# Clean and normalize colors
articles['color'] = articles['color'].apply(dedup).apply(clean_color_name).apply(merge_comma_colors).astype('string')
articles['colorId'] = articles['colorId'].apply(dedup).astype('string')

# Merge rare colors into major categories
rare_color_merges = [
    ("blush", "rosa"), ("cerise", "rosa"), ("grå-rosa", "rosa"), ("grålila", "lila"),
    ("havsblå", "blå"), ("jeansblå", "blå"), ("klarblå", "blå"), ("lavendel", "lila"),
    ("ljus beige", "beige"), ("ljus blå", "blå"), ("ljusgrå mix", "grå"), ("ljusturkos", "turkos"),
    ("marinblå", "marin"), ("mellanblå", "blå"), ("mellanbrun", "brun"), ("mellangrå", "grå"),
    ("mellanrosa", "rosa"), ("mintgrön", "grön"), ("mörkbrun", "brun"), ("mörkröd", "röd"),
    ("natur", "beige"), ("oblekt", "vit"), ("oliv", "grön"), ("orange mix", "orange"),
    ("puderrosa", "rosa"), ("rost", "röd"), ("svart-silver", "svart"), ("transparent", "vit"), ("violett", "lila")
]

for rare_color, target_color in rare_color_merges:
    mask = articles['color'] == rare_color
    if mask.any():
        articles.loc[mask, 'color'] = target_color
        print(f"'{rare_color}' → '{target_color}' ({mask.sum()} articles)")

# Remap colors to single colorIds
color_to_colorids = articles[articles['color'].notna()].groupby('color')['colorId'].agg(list)
multi_color = color_to_colorids[color_to_colorids.apply(lambda x: len(set([i for i in x if pd.notna(i)])) > 1)]

for color, colorid_list in multi_color.items():
    main_colorid = articles.loc[articles['color'] == color, 'colorId'].value_counts().idxmax()
    articles.loc[articles['color'] == color, 'colorId'] = main_colorid
    print(f"'{color}': {set(colorid_list)} → '{main_colorid}'")

# Create color facet column and insert it right after the color column
articles.insert(articles.columns.get_loc('color') + 1, 'color_facet', articles['color'].fillna('unknown'))

'blush' → 'rosa' (1 articles)
'cerise' → 'rosa' (1 articles)
'grå-rosa' → 'rosa' (5 articles)
'grålila' → 'lila' (1 articles)
'havsblå' → 'blå' (1 articles)
'jeansblå' → 'blå' (1 articles)
'klarblå' → 'blå' (1 articles)
'lavendel' → 'lila' (1 articles)
'ljus beige' → 'beige' (1 articles)
'ljus blå' → 'blå' (1 articles)
'ljusgrå mix' → 'grå' (1 articles)
'ljusturkos' → 'turkos' (1 articles)
'marinblå' → 'marin' (1 articles)
'mellanblå' → 'blå' (1 articles)
'mellanbrun' → 'brun' (1 articles)
'mellangrå' → 'grå' (1 articles)
'mellanrosa' → 'rosa' (1 articles)
'mintgrön' → 'grön' (4 articles)
'mörkbrun' → 'brun' (2 articles)
'mörkröd' → 'röd' (5 articles)
'natur' → 'beige' (3 articles)
'oblekt' → 'vit' (1 articles)
'oliv' → 'grön' (1 articles)
'orange mix' → 'orange' (1 articles)
'puderrosa' → 'rosa' (1 articles)
'rost' → 'röd' (5 articles)
'svart-silver' → 'svart' (1 articles)
'transparent' → 'vit' (1 articles)
'violett' → 'lila' (1 articles)
'aqua': {'1205', '660,1205'} → '1205'
'beige':

### Inspect and transform Category and Category ID

In [468]:
# Show unique categoryIds per category (excluding NA)
cat_stats = articles[articles['categoryId'].notna()].groupby('category')['categoryId'].agg(['unique', 'count'])
print(f"{'category':50} {'categoryIds':40} count")
for cat, (ids, cnt) in cat_stats.iterrows():
    # Format the list of ids as a string, not using a format specifier for the list
    ids_str = str(list(ids))
    print(f"{cat:50} {ids_str:40} {cnt}")

# Categories with >1 unique categoryId
multi_cat = cat_stats['unique'].apply(lambda x: sum(pd.notna(x)) > 1)
if multi_cat.any():
    print("\nCategories with >1 categoryId:")
    for cat, ids in cat_stats.loc[multi_cat, 'unique'].items():
        print(f"{cat}: {[i for i in ids if pd.notna(i)]}")
else:
    print("\nNo category has >1 categoryId.")

print(f"\nArticles without categoryId: {articles['categoryId'].isna().sum()}")
print(f"Articles without category: {articles['category'].isna().sum()}")

category                                           categoryIds                              count
Accessoarer                                        ['454']                                  5
Accessoarer,Bh,Underkläder,Bh-tillbehör            ['454,27,19,691']                        3
Accessoarer,Halsdukar & sjalar                     ['454,7']                                1
Accessoarer,Handskar & vantar                      ['454,1415']                             3
Accessoarer,Herr,Kepsar & mössor                   ['454,162,1447']                         3
Accessoarer,Kepsar & mössor                        ['454,1447']                             18
Accessoarer,Kepsar & mössor,Accessoarer,Kepsar & mössor ['454,1447,454,1447']                    2
Ansiktsvård                                        ['2408']                                 4
Bad,Duschdraperier                                 ['646,338']                              3
Bad,Frottéhanddukar & badlakan                    

In [469]:
# Concise category↔ID mapping and stats. No separate category_facet column; replace missing with 'unknown' in place.

def dedup_csv(s):
    if pd.isna(s): return pd.NA
    out, seen = [], set()
    for t in map(str.strip, str(s).split(',')):
        if t and t not in seen: seen.add(t); out.append(t)
    return ','.join(out) if out else pd.NA

def toks(s):
    return [t.strip() for t in str(s).split(',') if t.strip()] if pd.notna(s) else []

articles['category']   = articles['category'].apply(dedup_csv).astype('string')
articles['categoryId'] = articles['categoryId'].apply(dedup_csv).astype('string')

pairs, mismatched = [], 0
for cat, cid in articles[['category','categoryId']].dropna().itertuples(index=False):
    ct, it = toks(cat), toks(cid)
    n = min(len(ct), len(it))
    if n == 0: continue
    if len(ct) != len(it): mismatched += 1
    pairs.extend(zip(ct[:n], it[:n]))

if not pairs:
    print("No category↔id pairs available.")
else:
    dfp = pd.DataFrame(pairs, columns=['cat_tok','id_tok'])
    token2id = (dfp.groupby(['cat_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['cat_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('cat_tok')
                  .set_index('cat_tok')['id_tok'])

    def rebuild_ids(cat):
        ct = toks(cat)
        mapped = [token2id.get(t, pd.NA) for t in ct if t in token2id]
        return ','.join(mapped) if mapped else pd.NA

    articles['categoryId'] = articles['category'].apply(rebuild_ids).astype('string')

# Replace missing category values with 'unknown' in place
articles['category'] = articles['category'].fillna('unknown').astype('string')

# missing flag
articles['category_missing'] = (articles['category'] == 'unknown').astype('int8')

# keep category_missing next to 'category'
def _move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]

articles = _move_after(articles, ['category_missing'], 'category')

# Stats
pairs2 = []
for cat, cid in articles[['category','categoryId']].dropna().itertuples(index=False):
    ct, it = toks(cat), toks(cid)
    n = min(len(ct), len(it))
    pairs2.extend(zip(ct[:n], it[:n]))
d2 = pd.DataFrame(pairs2, columns=['cat_tok','id_tok'])

cat_stats = d2.groupby('cat_tok')['id_tok'].agg(lambda s: sorted(pd.unique(s.dropna()))).reset_index(name='unique_ids')
cat_stats['count'] = d2.groupby('cat_tok')['id_tok'].size().values

print(f"Mismatched token/id lengths: {mismatched}")
print(f"\n{'category token':40} {'unique_ids':25} count")
for _, r in cat_stats.sort_values('count', ascending=False).iterrows():
    print(f"{r['cat_tok'][:40]:40} {str(r['unique_ids'])[:25]:25} {int(r['count'])}")

multi = cat_stats['unique_ids'].apply(len) > 1
if multi.any():
    print("\nTokens mapping to >1 id:")
    for _, r in cat_stats[multi].iterrows():
        print(f"  {r['cat_tok']}: {r['unique_ids']}")
else:
    print("\n✓ Every category token maps to a single id.")

id_stats = d2.groupby('id_tok')['cat_tok'].agg(lambda s: sorted(pd.unique(s.dropna()))).reset_index(name='tokens')
multi_id = id_stats['tokens'].apply(len) > 1
if multi_id.any():
    print("\nIds used by multiple tokens:")
    for _, r in id_stats[multi_id].iterrows():
        print(f"  {r['id_tok']}: {r['tokens'][:10]}{' ...' if len(r['tokens'])>10 else ''}")
else:
    print("\n✓ No id is shared by multiple tokens.")

print(f"\nUnique categoryIds: {articles['categoryId'].str.split(',').explode().nunique(dropna=True)}")
print(f"Unique category tokens: {d2['cat_tok'].nunique(dropna=True)}")


Mismatched token/id lengths: 0

category token                           unique_ids                count
Bh                                       ['27']                    60610
Underkläder                              ['19']                    51666
Bh utan bygel                            ['50']                    41392
Bygel-bh                                 ['223']                   20146
REA                                      ['110']                   19798
Sport-bh                                 ['618']                   8344
Badkläder                                ['470']                   4598
Tunikor                                  ['451']                   3805
Överdelar                                ['1552']                  3778
Framknäppt bh                            ['189']                   3765
Dam                                      ['471']                   3422
Byxor                                    ['689']                   2998
Trosor                    

In [470]:
articles.head()


Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,color_facet,colorId,size,sizeId,audience,audienceId,category,category_missing,categoryId
0,000DIV,000DIV,,active,,,,unknown,,,,,,unknown,1,
1,052743,052743,,inactive,Lakan/örngott blå 4 del,,blå,blå,269.0,Blå,328.0,,,unknown,1,
2,055522,055522,265.0,active,Beskrivning Tröja,Gjestal Garn,,unknown,,,,Dam,6.0,Tröjor,0,17.0
3,055573,055573,55.0,active,Beskrivning Luva,Novita,,unknown,,,,Dam,6.0,"Mössor & hattar,Mönster",0,393961.0
4,055575,055575,55.0,active,Beskrivning Vantar,Novita,,unknown,,,,Dam,6.0,Vantar,0,45.0


## Fixing Audience and AudienceID

In [471]:
print("Unique values of 'audience' and their counts:")
print(articles['audience'].value_counts(dropna=False))


Unique values of 'audience' and their counts:
audience
<NA>                    63345
Dam                     43873
Herr                      823
Dam,Herr                  301
Dam,Dam                   181
Dam,Dam,Dam                80
Dam,Dam,Dam,Dam            18
Dam,Herr,Baby & barn       12
Baby & barn                 9
Barn & ungdom               8
Dam,Dam,Dam,Dam,Dam         5
Hemmet                      1
Name: count, dtype: Int64


In [472]:
# Audience cleanup: short, readable, does the job.

import re
import pandas as pd

AUD2ID = {'dam':'6','herr':'15','baby & barn':'12','barn & ungdom':'42','generic':'99','hemmet':'222'}

# 1) normalize what’s already there
def norm_audience(a):
    if pd.isna(a): return pd.NA
    toks = {t.strip().lower() for t in str(a).split(',') if t.strip()}
    if any('dam' in t for t in toks):  # “dam” anywhere wins
        return 'dam'
    keep = [t for t in toks if t in AUD2ID]
    return ','.join(keep) if keep else pd.NA

def to_ids(a):
    if pd.isna(a): return pd.NA
    ids = sorted({AUD2ID[t] for t in a.split(',') if t in AUD2ID}, key=int)
    return ','.join(ids) if ids else pd.NA

articles['audience'] = articles['audience'].apply(norm_audience).astype('string')

# 2) fill missing using category text (substring matches, ignore “REA” as a token)
DAM = [
    'dam','bh','trosor','underkläder','body','bodykorselett','korsett','korsetter',
    'klänning','klänningar','tunika','tunikor','topp','toppar','kjol','kjolar',
    'byxa','byxor','blus','blusar','nattlinne','bikinibh','bikini','t-shirt-bh',
    'minimizer','kofta','koftor','väst','västar','skor','väskor','sjalar'
]
HEM = [
    'frottéhanddukar','badlakan','bad','badrumsmattor','kökshanddukar','vaxdukar','dukar',
    'pläd','plädar','kanallängder','kanalkappa','gardiner','påslakanset','bädd',
    'lakan','örngott','hemtextil','kuddfodral','överkast','gardinstänger','kökshjälpmedel',
    'dekorationer','metervara','prydnadssaker','belysning','servetter'
]
GEN = [
    'inkontinens','stödartiklar','vardagshjälpmedel','rollator','rollatorer','stödstrumpor',
    'skotillbehör','fotvård','hobbyhörnan','pussel','sytillbehör','symaskiner','lust',
    'massage','synhjälpmedel','medicin','böcker','halkskydd','träning & motion'
]
HER = ['herr','skjorta','skjortor','kostym','kavaj','boxer']

REA_TOKEN = re.compile(r'(^|,)\s*rea\s*(?=,|$)')

def strip_rea(s):
    s = s.lower()
    s = REA_TOKEN.sub(lambda m: ',' if m.group(1) else '', s)
    return re.sub(r',+', ',', s).strip(', ').strip()

def classify(cat):
    if pd.isna(cat): return pd.NA
    s = strip_rea(str(cat))
    if not s: return pd.NA
    if any(h in s for h in DAM): return 'dam'
    if any(h in s for h in HER): return 'herr'
    if any(h in s for h in HEM): return 'hemmet'
    if any(h in s for h in GEN): return 'generic'
    return pd.NA

na_mask = articles['audience'].isna()
fill = articles.loc[na_mask, 'category'].apply(classify)
idx = fill.dropna().index
articles.loc[idx, 'audience'] = fill.loc[idx]

# 3) ids and missing flag; keep columns together
articles['audienceId']       = articles['audience'].apply(to_ids).astype('string')
articles['audience_missing'] = articles['audience'].isna().astype('int8')

def move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]

articles = move_after(articles, ['audienceId','audience_missing'], 'audience')

# tiny report
filled = len(idx); total = int(na_mask.sum())
print(f"Filled audience for {filled}/{total} ({filled/max(total,1):.1%}). "
      f"Dam={int((articles.loc[idx,'audience']=='dam').sum())}, "
      f"Herr={int((articles.loc[idx,'audience']=='herr').sum())}, "
      f"Hemmet={int((articles.loc[idx,'audience']=='hemmet').sum())}, "
      f"Generic={int((articles.loc[idx,'audience']=='generic').sum())}.")


Filled audience for 51280/63345 (81.0%). Dam=48059, Herr=266, Hemmet=1683, Generic=1272.


## Size

In [473]:
# Ensure strings (optional)
articles['size'] = articles['size'].astype('string')
articles['sizeId'] = articles['sizeId'].astype('string')

# Unique sizeIds per size
size_stats = (articles[articles['sizeId'].notna()]
              .groupby('size', dropna=False)['sizeId']
              .agg(unique=lambda s: list(pd.unique(s)),
                   count='size')
              .sort_values('count', ascending=False))

print(f"{'size':50} {'sizeIds':40} count")
for size, row in size_stats.iterrows():
    print(f"{str(size):50} {str(row['unique']):40} {row['count']}")

# Sizes with >1 sizeId
multi_size = size_stats['unique'].apply(lambda ids: len([i for i in ids if pd.notna(i)]) > 1)
if multi_size.any():
    print("\nSizes with >1 sizeId:")
    for size, ids in size_stats.loc[multi_size, 'unique'].items():
        print(f"{size}: {[i for i in ids if pd.notna(i)]}")
else:
    print("\nNo size has >1 sizeId.")

# Reverse check: sizeIds shared by multiple sizes
rev = (articles[articles['sizeId'].notna()]
       .assign(_id=lambda df: df['sizeId'])
       .groupby('_id')['size']
       .agg(unique=lambda s: list(pd.unique(s)), count='size'))
multi_id = rev['unique'].apply(lambda ss: len([x for x in ss if pd.notna(x)]) > 1)
if multi_id.any():
    print("\nsizeIds shared by multiple sizes:")
    for sid, sizes in rev.loc[multi_id, 'unique'].items():
        print(f"{sid}: {[s for s in sizes if pd.notna(s)]}")
else:
    print("\nNo sizeId shared by multiple sizes.")

print(f"\nArticles without sizeId: {int(articles['sizeId'].isna().sum())}")
print(f"Articles without size: {int(articles['size'].isna().sum())}")


size                                               sizeIds                                  count
38                                                 ['106']                                  2491
40                                                 ['107']                                  2486
42                                                 ['108']                                  2427
44                                                 ['111']                                  2321
46                                                 ['112']                                  2282
48                                                 ['113']                                  2087
50                                                 ['114']                                  1855
52                                                 ['115']                                  1811
D85                                                ['68']                                   1719
C85                          

In [474]:
# --- Extra cleanup & safety for size mapping ---

import re
def canon_size_token(t):
    t = t.strip()
    t = t.replace('×', 'x').replace(' X ', 'x').replace(' x ', 'x')
    t = re.sub(r'\s*x\s*', 'x', t)                      # "70 x 130" -> "70x130"
    t = re.sub(r'\s+cm\b', ' cm', t, flags=re.I)        # tidy "cm"
    t = re.sub(r'\s+mm\b', ' mm', t, flags=re.I)        # tidy "mm"
    t = t.replace(',', '.') if re.search(r'\d,\d', t) else t  # 0,5 -> 0.5
    t = t.upper() if t.lower() in {'xs','s','m','l','xl','xxl','3xl','4xl','5xl','6xl','one size'} else t
    t = t.replace('ONE SIZE', 'ONE SIZE')
    t = t.replace('–','-').replace('—','-')
    t = t.replace('-', '/') if re.fullmatch(r'\d{2}-\d{2}', t) else t  # 38-40 -> 38/40
    return t

NOISE_TOKENS = {
    '***missing***','rosa','svart','blå','offwhite','vinröd','greige',
    'kuddfodral','påslakan','tomte','ostbricka','plommonlila'
}

# broad-but-safe patterns of *actual* size tokens
SIZE_PATTERNS = [
    r'^\d{2}$',                        # 36, 38, 40...
    r'^\d{2}/\d{2}$',                  # 38/40, 42/44...
    r'^[A-Z]{1,2}/?[A-Z]{0,2}\d{2}$',  # B80, C/D40, 40B/C...
    r'^\d{2}[A-Z]/[A-Z]$',             # 40B/C
    r'^[A-Z]{1,3}$',                   # XS, S, M, L, XL, 3XL...
    r'^\d{2}x\d{2,3}\s*cm$',           # 70x130 cm, 90x150 cm
    r'^\d{2,3}x\d{2,3}\s*cm$',         # 150x210 cm, 180x200 cm
    r'^\d+(\.\d+)?\s*mm$',             # 0.5 mm, 3.0 mm
    r'^\d{2}x\d{2}$',                  # 30x50 (no unit)
    r'^\d+$',                          # 20, 21, 22 (shoe kids)
    r'^\d+[- ]?PACK$',                 # 3-pack, 4PACK
    r'^[A-Z]/[A-Z]\d{2}$',             # B/C50, D/E80
]

size_re = re.compile('|'.join(f'(?:{p})' for p in SIZE_PATTERNS), flags=re.I)

def is_size_token(t):
    t0 = t.strip().lower()
    if t0 in NOISE_TOKENS: return False
    return bool(size_re.match(t.strip()))

# 1) Normalize tokens in-place
def normalize_size_cell(s):
    if pd.isna(s): return pd.NA
    toks = [canon_size_token(t) for t in str(s).split(',') if t.strip()]
    # drop obvious noise tokens
    toks = [t for t in toks if is_size_token(t)]
    # dedup preserving order
    seen, out = set(), []
    for t in toks:
        if t not in seen:
            seen.add(t); out.append(t)
    return ','.join(out) if out else pd.NA

articles['size'] = articles['size'].apply(normalize_size_cell).astype('string')

# 2) If size is unknown, ensure sizeId is NaN
articles.loc[articles['size'].isna() | (articles['size']=='unknown'), 'sizeId'] = pd.NA

# 3) Sanity: assert duplicates like "38,38" no longer exist
def has_dupes(s):
    toks = [t.strip() for t in str(s).split(',')]
    return len(toks) != len(set(toks))

bad_dupe_rows = articles['size'].dropna().apply(has_dupes)

# 4) (Optional) rebuild sizeId again but only from *valid* tokens
# Relearn mapping with the filtered tokens to avoid noise poisoning
pairs = []
for sz, sid in articles[['size','sizeId']].dropna().itertuples(index=False):
    st = [t for t in toks(sz) if is_size_token(t)]
    it = toks(sid)
    n = min(len(st), len(it))
    pairs.extend(zip(st[:n], it[:n]))
if pairs:
    dfp = pd.DataFrame(pairs, columns=['size_tok','id_tok'])
    token2id = (dfp.groupby(['size_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['size_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('size_tok')
                  .set_index('size_tok')['id_tok'])
    def rebuild_ids(sz):
        st = [t for t in toks(sz) if is_size_token(t)]
        mapped = [token2id.get(t, pd.NA) for t in st]
        mapped = [m for m in mapped if pd.notna(m)]
        return ','.join(mapped) if mapped else pd.NA
    articles['sizeId'] = articles['size'].apply(rebuild_ids).astype('string')


In [475]:
# Size ↔ sizeId: dedup, learn mapping, rebuild ids, fill missing, stats.

import pandas as pd

def dedup_csv(s):
    if pd.isna(s): return pd.NA
    out, seen = [], set()
    for t in map(str.strip, str(s).split(',')):
        if t and t not in seen: seen.add(t); out.append(t)
    return ','.join(out) if out else pd.NA

def toks(s):
    return [t.strip() for t in str(s).split(',') if t.strip()] if pd.notna(s) else []

# 1) Per-row dedup
articles['size']   = articles['size'].apply(dedup_csv).astype('string')
articles['sizeId'] = articles['sizeId'].apply(dedup_csv).astype('string')

# 2) Learn token→id by zipping aligned positions across rows (majority vote)
pairs, mismatched = [], 0
for sz, sid in articles[['size','sizeId']].dropna().itertuples(index=False):
    st, it = toks(sz), toks(sid)
    n = min(len(st), len(it))
    if n == 0: continue
    if len(st) != len(it): mismatched += 1
    pairs.extend(zip(st[:n], it[:n]))

if pairs:
    dfp = pd.DataFrame(pairs, columns=['size_tok','id_tok'])
    token2id = (dfp.groupby(['size_tok','id_tok']).size()
                  .reset_index(name='n')
                  .sort_values(['size_tok','n','id_tok'], ascending=[True,False,True])
                  .drop_duplicates('size_tok')
                  .set_index('size_tok')['id_tok'])

    def rebuild_ids(sz):
        st = toks(sz)
        mapped = [token2id.get(t, pd.NA) for t in st]
        mapped = [m for m in mapped if pd.notna(m)]
        return ','.join(mapped) if mapped else pd.NA

    articles['sizeId'] = articles['size'].apply(rebuild_ids).astype('string')
else:
    print("No size↔id pairs available to learn mapping.")

# 3) Replace missing size with 'unknown' (IDs stay NaN if unknown), add missing flag
articles['size'] = articles['size'].fillna('unknown').astype('string')
articles['size_missing'] = (articles['size'] == 'unknown').astype('int8')

# (optional) keep flag next to 'size'
def _move_after(df, cols, after):
    cols_all = list(df.columns)
    for c in cols:
        if c in cols_all: cols_all.remove(c)
    i = cols_all.index(after) + 1 if after in cols_all else len(cols_all)
    return df[cols_all[:i] + cols + cols_all[i:]]
articles = _move_after(articles, ['size_missing'], 'size')

# 4) Stats (token-level, not list-level)
pairs2 = []
for sz, sid in articles[['size','sizeId']].dropna().itertuples(index=False):
    st, it = toks(sz), toks(sid)
    n = min(len(st), len(it))
    pairs2.extend(zip(st[:n], it[:n]))
d2 = pd.DataFrame(pairs2, columns=['size_tok','id_tok']) if pairs2 else pd.DataFrame(columns=['size_tok','id_tok'])

if not d2.empty:
    size_stats = (d2.groupby('size_tok')['id_tok']
                    .agg(lambda s: sorted(pd.unique(s.dropna())))
                    .reset_index(name='unique_ids'))
    size_stats['count'] = d2.groupby('size_tok')['id_tok'].size().values

    print(f"Mismatched token/id lengths: {mismatched}")
    print(f"\n{'size token':30} {'unique_ids':25} count")
    for _, r in size_stats.sort_values('count', ascending=False).iterrows():
        print(f"{r['size_tok'][:30]:30} {str(r['unique_ids'])[:25]:25} {int(r['count'])}")

    multi = size_stats['unique_ids'].apply(len) > 1
    if multi.any():
        print("\nTokens mapping to >1 id:")
        for _, r in size_stats[multi].iterrows():
            print(f"  {r['size_tok']}: {r['unique_ids']}")
    else:
        print("\n✓ Every size token maps to a single id.")

    id_stats = (d2.groupby('id_tok')['size_tok']
                  .agg(lambda s: sorted(pd.unique(s.dropna())))
                  .reset_index(name='tokens'))
    multi_id = id_stats['tokens'].apply(len) > 1
    if multi_id.any():
        print("\nSizeIds shared by multiple tokens:")
        for _, r in id_stats[multi_id].iterrows():
            print(f"  {r['id_tok']}: {r['tokens'][:10]}{' ...' if len(r['tokens'])>10 else ''}")
    else:
        print("\n✓ No sizeId is shared by multiple size tokens.")
else:
    print("No size/id pairs to report stats on.")

print(f"\nArticles without sizeId: {int(articles['sizeId'].isna().sum())}")
print(f"Articles with size == 'unknown': {int((articles['size']=='unknown').sum())}")


Mismatched token/id lengths: 26

size token                     unique_ids                count
40                             ['107']                   2500
38                             ['106']                   2496
42                             ['108']                   2432
44                             ['111']                   2326
46                             ['112']                   2287
48                             ['113']                   2093
50                             ['114']                   1861
52                             ['115']                   1815
C85                            ['28']                    1724
D85                            ['68']                    1723
C90                            ['62']                    1689
D90                            ['69']                    1686
C80                            ['60']                    1684
D80                            ['67']                    1669
C95                            ['63'

In [479]:
# Print a few rows where sizeId is not na
articles[articles['sizeId'].notna()].head()


Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,color_facet,colorId,size,size_missing,sizeId,audience,audienceId,audience_missing,category,category_missing,categoryId
50,100501-2075,100501,80,inactive,Bh utan bygel,Swegmark,vit-beige,vit-beige,105,B75,0,51,dam,6,0,"Bh utan bygel,Bh",0,5027
51,100501-2080,100501,80,inactive,Bh utan bygel,Swegmark,vit-beige,vit-beige,105,B80,0,52,dam,6,0,"Bh utan bygel,Bh",0,5027
52,100501-2085,100501,80,inactive,Bh utan bygel,Swegmark,vit-beige,vit-beige,105,B85,0,54,dam,6,0,"Bh utan bygel,Bh",0,5027
53,100501-2090,100501,80,inactive,Bh utan bygel,Swegmark,vit-beige,vit-beige,105,B90,0,55,dam,6,0,"Bh utan bygel,Bh",0,5027
54,100501-2095,100501,80,inactive,Bh utan bygel,Swegmark,vit-beige,vit-beige,105,B95,0,56,dam,6,0,"Bh utan bygel,Bh",0,5027


In [207]:
#status column
status = articles['status'].unique()
print(status)
articles['status'] = articles['status'].astype('string')
print(articles['status'].dtype)

<StringArray>
['active', 'inactive', 'discontinued', 'removed', <NA>]
Length: 5, dtype: string
string


In [36]:
# How many rows are there in articles vs how many rows that are in status inactive, discontinued, or removed

total_rows = len(articles)
inactive_rows = articles['status'].isin(['inactive', 'discontinued', 'removed']).sum()

print(f"Total rows in articles: {total_rows}")
print(f"Rows with status inactive, discontinued, or removed: {inactive_rows}")


Total rows in articles: 108656
Rows with status inactive, discontinued, or removed: 75689


In [37]:
#inspect name column, output all unique values
print(articles['name'].unique())
#count how many na in name column
print(articles['name'].isna().sum())
#print rows where name is na
articles[articles['name'].isna()]


<StringArray>
[                          <NA>,      'Lakan/örngott blå 4 del',
            'Beskrivning Tröja',            'Beskrivning Luva ',
           'Beskrivning Vantar',       'Beskrivning Benvärmare',
             'Garn Drops Nepal',      'Drops Eskimo brun/beige',
 'Garnpaket Virkade Basketskor',  'Instruktioner Axelvärmare S',
 ...
    'Bruksanvisning Pulsmätare',    'Brugsanvisning RollatorDK',
   'Bruksanvisning Rollator FI',    'Bruksanvisning Rollator N',
    'Bruksanvisning Rollator 3',      'Bruksanvisng Sofia N/DK',
       'Instruktioner S N F Dk',                  'guldarmband',
 'Bruksanvisning Dagsljuslampa',   'Instruktioner värmemadrass']
Length: 4781, dtype: string
907


Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId,fabric,fabricId
0,000DIV,000DIV,__UNK_BRANDID__,active,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
328,101901-7075,101901,80,inactive,,Swegmark,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
2449,200014,200014,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
2453,200030,200030,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
2460,200048,200048,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108119,590971,590971,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
108120,590985,590985,112,inactive,,Linea,Multi,86,,,Generic,99,Påslakanset,165,,
108537,973036,973036,__UNK_BRANDID__,active,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
108587,KHW041-4244,KHW041,25,active,,Funq Wear,__UNK_COLOR__,__UNK_COLORID__,42/44,22,Generic,99,,,,


In [38]:
#remove rows where name is na and save column as type string
articles = articles[articles['name'].notna()]
articles['name'] = articles['name'].astype('string')

In [58]:
#name.1	 unique values
print(articles['name.1'].unique()) 

#replace nan with __UNK_NAME__ and string type
articles['name.1'] = articles['name.1'].fillna('__UNK_NAME__').astype('string')

#check if name.1 is string
print(articles['name.1'].dtype)

articles.head()
#check if name.1 is string

<StringArray>
[     '__UNK_NAME__',      'Gjestal Garn',            'Novita',
      'Drops Design',      'Svarta Fåret',          'Swegmark',
             'Linea',             'Trofé',      'Knittingroom',
             'Järbo',
 ...
        'Stjernsund',      'Pixie Design',       'Noble House',
 'Arvidssons Textil',   'Nääsgränsgården',           'Fondaco',
    'Oehlenschläger',         'Rosa Faia',             'Coats',
            'Disney']
Length: 114, dtype: string
string


Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,audience,audienceId,category,categoryId,num_unique_categoryIds
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Generic,99,__UNK_CATEGORY__,__UNK_CATEGORYID__,1
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Tröjor,17,1
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,"Mössor & hattar,Mönster",393961,2
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Vantar,45,1
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Sockor & strumpor,16,1


In [40]:
#unique values of size
size = articles['size'].unique()
print(size)



<StringArray>
[        'Blå',          <NA>,         'B75',         'B80',         'B85',
         'B90',         'B95',        'B100',         'C75',         'C80',
 ...
   '35x120 cm',   '35x240 cm',  '150x210 cm',  '105x200 cm',  '120x200 cm',
  '140x200 cm',  '160x200 cm',    '65x90 cm', '40x30x10 cm',  'Kuddfodral']
Length: 958, dtype: string


In [41]:
#unique values of sizeId
sizeId = articles['sizeId'].unique()
print(sizeId)


<StringArray>
[ '328',   <NA>,   '51',   '52',   '54',   '55',   '56',   '57',   '59',
   '60',
 ...
  '443', '3722', '2225',  '971',  '974',  '366',  '972', '2007', '1368',
 '1242']
Length: 958, dtype: string


In [42]:
from IPython.display import display, HTML

# Drop rows where both are missing, then show all unique pairs and their counts in the original data
pairs = articles[['size', 'sizeId']].dropna(how='all')
pair_counts = pairs.value_counts().reset_index(name='count')

# Display as expandable output
display(HTML('<details><summary>Show (size, sizeId) pair counts</summary>' + pair_counts.style.set_sticky().to_html() + '</details>'))


Unnamed: 0,size,sizeId,count
0,38,106,2422
1,40,107,2417
2,42,108,2359
3,44,111,2258
4,46,112,2219
5,48,113,2036
6,50,114,1804
7,52,115,1760
8,D85,68,1718
9,C85,28,1718


In [43]:
#remove size column as irrelevant
articles = articles.drop(columns=['size'])

#remove sizeId column as irrelevant
articles = articles.drop(columns=['sizeId'])

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,audience,audienceId,category,categoryId,fabric,fabricId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Generic,99,,,,
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Tröjor,17.0,,
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,"Mössor & hattar,Mönster",393961.0,,
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Vantar,45.0,,
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Sockor & strumpor,16.0,,


In [54]:
#unique values of category
category = articles['category'].unique()
display(print(category))


<StringArray>
[                                  '__UNK_CATEGORY__',
                                             'Tröjor',
                           'Mössor & hattar,Mönster ',
                                             'Vantar',
                                  'Sockor & strumpor',
                              'REA,Sockor & strumpor',
                     'Bh utan bygel,Bh,Bh utan bygel',
                               'Bygel-bh,Bh,Bygel-bh',
                    'Kuddar,Innerkuddar,Bädd (linea)',
                               'Sport-bh,Bh,Sport-bh',
 ...
 'Frottéhanddukar & badlakan,REA,Picknick och uteliv',
                                    'Täcken,Bädd,REA',
                                            'Hushåll',
                                    'Vepor & bonader',
                           'Dörr- & trappstegsmattor',
               'REA,Kökshjälpmedel,Vardagshjälpmedel',
                    'REA,Vardagshjälpmedel,Belysning',
                            'Belysning,Fönster

None

In [57]:
# Count the number of unique categoryIds in each row (comma-separated values)
def count_unique_categoryids(val):
    if pd.isna(val):
        return 0
    ids = [v.strip() for v in str(val).split(',') if v.strip() != '']
    return len(set(ids))

articles['num_unique_categoryIds'] = articles['categoryId'].apply(count_unique_categoryids)
print(articles[['categoryId', 'num_unique_categoryIds']].head())

           categoryId  num_unique_categoryIds
1  __UNK_CATEGORYID__                       1
2                  17                       1
3             39,3961                       2
4                  45                       1
5                  16                       1


In [45]:
from IPython.display import display, HTML

# Drop rows where both are missing, then show all unique pairs and their counts in the original data
pairs = articles[['category', 'categoryId']].dropna(how='all')
pair_counts = pairs.value_counts().reset_index(name='count')

# Display as expandable output
display(HTML('<details><summary>Show (category, categoryId) pair counts</summary>' + pair_counts.style.set_sticky().to_html() + '</details>'))


Unnamed: 0,category,categoryId,count
0,"Bh utan bygel,Bh,Underkläder,Bh utan bygel",50271950,17779
1,"Bygel-bh,Bh,Underkläder,Bygel-bh",2232719223,9212
2,"Bh utan bygel,Bh,Bh utan bygel",502750,6750
3,REA,110,6057
4,"Sport-bh,Bh utan bygel,Bh,Underkläder,Sport-bh,Bh utan bygel",61850271961850,4417
5,"Bygel-bh,Bh,Bygel-bh",22327223,3161
6,"Underkläder,Trosor",1920,2657
7,Bh,27,2385
8,"REA,Tunikor",110451,2338
9,"Bygel-bh,Framknäppt bh,Bh,Underkläder,Bygel-bh,Framknäppt bh",2231892719223189,1603


In [46]:
articles['category'] = articles['category'].fillna('__UNK_CATEGORY__').astype('string')
articles['categoryId'] = articles['categoryId'].fillna('__UNK_CATEGORYID__').astype('string')

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,audience,audienceId,category,categoryId,fabric,fabricId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Generic,99,__UNK_CATEGORY__,__UNK_CATEGORYID__,,
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Tröjor,17,,
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,"Mössor & hattar,Mönster",393961,,
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Vantar,45,,
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Sockor & strumpor,16,,


In [47]:
from IPython.display import display, HTML

# Drop rows where both are missing, then show all unique pairs and their counts in the original data
pairs = articles[['fabric', 'fabricId']].dropna(how='all')
pair_counts = pairs.value_counts().reset_index(name='count')

# Display as expandable output
display(HTML('<details><summary>Show (fabric, fabricId) pair counts</summary>' + pair_counts.style.set_sticky().to_html() + '</details>'))


Unnamed: 0,fabric,fabricId,count
0,Färgtryckt väv,157,63
1,"Ullgarn,Alpackagarn",368664,34
2,Aida,104,12
3,Frotté,225,7
4,"Aida,Bakgrundstryckt",104333,4
5,Bomull,295,3
6,"Bakgrundstryckt,Aida",333104,1
7,"Bomull,Aida",295104,1
8,"Bomull,Ritade Broderier",295149,1
9,"Linne,Ritade Broderier",170149,1


In [49]:
#count proportion of missing values in each column
articles.isnull().sum() / len(articles)

sku           0.0
groupId       0.0
brandId       0.0
status        0.0
name          0.0
name.1        0.0
color         0.0
colorId       0.0
audience      0.0
audienceId    0.0
category      0.0
categoryId    0.0
dtype: float64

In [50]:
# Save the cleaned articles DataFrame to CSV with all columns as string type
articles_clean = articles.astype('string')
articles_clean.to_csv("../data/processed/articles_clean.csv", index=False)

articles_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 107749 entries, 1 to 108655
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   sku         107749 non-null  string
 1   groupId     107749 non-null  string
 2   brandId     107749 non-null  string
 3   status      107749 non-null  string
 4   name        107749 non-null  string
 5   name.1      107749 non-null  string
 6   color       107749 non-null  string
 7   colorId     107749 non-null  string
 8   audience    107749 non-null  string
 9   audienceId  107749 non-null  string
 10  category    107749 non-null  string
 11  categoryId  107749 non-null  string
dtypes: string(12)
memory usage: 10.7 MB


In [51]:
articles_clean.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,audience,audienceId,category,categoryId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Generic,99,__UNK_CATEGORY__,__UNK_CATEGORYID__
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Tröjor,17
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,"Mössor & hattar,Mönster",393961
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Vantar,45
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Sockor & strumpor,16


In [52]:
#how many rows are there in articles?
print(len(articles_clean))

107749
