In [144]:
import pandas as pd
# Load the raw article catalogue, reading every column with the pandas 'string' dtype.
articles = pd.read_csv(
    "../data/external/articles.csv",
    dtype='string',
)

# Report how many rows were loaded.
print(articles.shape[0])

108656


In [145]:
# Discard the length/width/height/weight columns, then preview what is left.
articles = articles.drop(['length', 'width', 'height', 'weight'], axis='columns')

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId,fabric,fabricId
0,000DIV,000DIV,,active,,,,,,,,,,,,
1,052743,052743,,inactive,Lakan/örngott blå 4 del,,Blå,264.0,Blå,328.0,,,,,,
2,055522,055522,265.0,active,Beskrivning Tröja,Gjestal Garn,,,,,Dam,6.0,Tröjor,17.0,,
3,055573,055573,55.0,active,Beskrivning Luva,Novita,,,,,Dam,6.0,"Mössor & hattar,Mönster",393961.0,,
4,055575,055575,55.0,active,Beskrivning Vantar,Novita,,,,,Dam,6.0,Vantar,45.0,,


In [146]:
# Share of missing values per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
articles.isna().mean()

sku           0.000000
groupId       0.000009
brandId       0.185705
status        0.000009
name          0.008347
name.1        0.185705
color         0.169388
colorId       0.169388
size          0.033666
sizeId        0.033666
audience      0.582987
audienceId    0.582987
category      0.048474
categoryId    0.048474
fabric        0.998813
fabricId      0.998813
dtype: float64

In [147]:
import pandas as pd
import numpy as np

# Lookup table from audience label to audience ID; IDs are kept as strings.
audience_to_id = dict([
    ('Dam', '6'),
    ('Herr', '15'),
    ('Baby & barn', '12'),
    ('Barn & ungdom', '42'),
    ('Generic', '99'),
    ('Hemmet', '222'),
])

# Rows without an audience are labelled 'Generic' so they resolve to an ID too.
articles = articles.assign(audience=articles['audience'].fillna('Generic'))

def map_audience_to_id(audience_val, mapping=None):
    """Translate a (possibly comma-separated) audience label into audience IDs.

    Parameters
    ----------
    audience_val : str or missing
        One audience label, or several separated by commas. Whitespace around
        each label is ignored.
    mapping : dict, optional
        Label -> ID lookup table. Defaults to the notebook-level
        ``audience_to_id`` table.

    Returns
    -------
    str or float
        The matching IDs joined with ',' in order of first appearance, each ID
        listed once. ``np.nan`` when the input is missing or no label is found
        in the lookup table.
    """
    if mapping is None:
        mapping = audience_to_id
    if pd.isna(audience_val):
        return np.nan
    # Split multi-valued cells on commas and ignore empty fragments.
    labels = [part.strip() for part in str(audience_val).split(',') if part.strip()]
    # Labels absent from the lookup table are skipped on purpose (best effort).
    ids = [mapping[label] for label in labels if label in mapping]
    # dict.fromkeys removes repeated IDs while keeping first-seen order, so a
    # cell such as 'Dam,Dam' yields '6' rather than '6,6'.
    unique_ids = list(dict.fromkeys(ids))
    return ','.join(unique_ids) if unique_ids else np.nan

# Rebuild audienceId from the audience labels, then preview the frame.
articles = articles.assign(
    audienceId=articles['audience'].apply(map_audience_to_id)
)


articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId,fabric,fabricId
0,000DIV,000DIV,,active,,,,,,,Generic,99,,,,
1,052743,052743,,inactive,Lakan/örngott blå 4 del,,Blå,264.0,Blå,328.0,Generic,99,,,,
2,055522,055522,265.0,active,Beskrivning Tröja,Gjestal Garn,,,,,Dam,6,Tröjor,17.0,,
3,055573,055573,55.0,active,Beskrivning Luva,Novita,,,,,Dam,6,"Mössor & hattar,Mönster",393961.0,,
4,055575,055575,55.0,active,Beskrivning Vantar,Novita,,,,,Dam,6,Vantar,45.0,,


In [148]:
# Distinct values present in the color column.
color = articles.loc[:, 'color'].unique()
print(color)

<StringArray>
[               <NA>,               'Blå',               'Gul',
            'Cerise',        'Ljusturkos',             'Natur',
               'Grå',           'Mörkgrå',         'Mellangrå',
             'Kamel',
 ...
       'Linne,Linne',        'Ljung,Lila',         'Grön,Grön',
   'Natur,Off-white',   'Svart,Svart/grå',         'Mellanblå',
     'Mullvad,Beige', 'Mullvad,Vit/beige',          'Vit,Rosa',
        'Turkos,Blå']
Length: 172, dtype: string


In [149]:
# Clean color and colorId columns by extracting the first unique non-empty token from comma-separated values
import pandas as pd

def clean_first(val):
    """Return the first non-empty token of a comma-separated value.

    The value is converted to text and split on commas; each token is stripped
    of surrounding whitespace. Missing input, or input without any non-empty
    token, yields ``pd.NA``.
    """
    if pd.isna(val):
        return pd.NA
    stripped_tokens = (piece.strip() for piece in str(val).split(','))
    # next() stops at the first non-empty token; pd.NA is the fallback.
    return next((piece for piece in stripped_tokens if piece), pd.NA)

# Reduce multi-valued color cells to their first token, keep both columns
# string-typed, and mark missing values with explicit placeholder tokens.
# NOTE(review): the (category, categoryId) and (fabric, fabricId) pair tables in
# this notebook show multi-valued IDs written without commas (e.g.
# 'Aida,Bakgrundstryckt' -> 104333). If colorId is encoded the same way,
# clean_first leaves it unchanged and it no longer matches the reduced color —
# confirm against the raw data.
articles['color'] = (
    articles['color']
    .apply(clean_first)
    .astype('string')
    .fillna('__UNK_COLOR__')
)
articles['colorId'] = (
    articles['colorId']
    .apply(clean_first)
    .astype('string')
    .fillna('__UNK_COLORID__')
)

articles.head()



Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId,fabric,fabricId
0,000DIV,000DIV,,active,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
1,052743,052743,,inactive,Lakan/örngott blå 4 del,,Blå,264,Blå,328.0,Generic,99,,,,
2,055522,055522,265.0,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,,,Dam,6,Tröjor,17.0,,
3,055573,055573,55.0,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,,,Dam,6,"Mössor & hattar,Mönster",393961.0,,
4,055575,055575,55.0,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,,,Dam,6,Vantar,45.0,,


In [150]:
# Replace missing brand/group identifiers with explicit placeholder tokens,
# keeping both columns string-typed.
articles = articles.assign(
    brandId=articles['brandId'].fillna('__UNK_BRANDID__').astype('string'),
    groupId=articles['groupId'].fillna('__UNK_GROUPID__').astype('string'),
)

In [151]:
# Status column: list its distinct values, cast it to string, confirm the dtype.
status = articles.loc[:, 'status'].unique()
print(status)
articles = articles.assign(status=articles['status'].astype('string'))
print(articles.dtypes['status'])

<StringArray>
['active', 'inactive', 'discontinued', 'removed', <NA>]
Length: 5, dtype: string
string


In [152]:
# Compare the total row count with the number of rows whose status is one of
# inactive / discontinued / removed.

inactive_rows = articles['status'].isin([
    'inactive',
    'discontinued',
    'removed',
]).sum()
total_rows = articles.shape[0]

print(f"Total rows in articles: {total_rows}")
print(f"Rows with status inactive, discontinued, or removed: {inactive_rows}")


Total rows in articles: 108656
Rows with status inactive, discontinued, or removed: 75689


In [153]:
# Name column: show every distinct value
print(articles.loc[:, 'name'].unique())
# Number of rows without a name
print(articles['name'].isnull().sum())
# The rows without a name, displayed as the cell result
articles.loc[articles['name'].isnull()]


<StringArray>
[                          <NA>,      'Lakan/örngott blå 4 del',
            'Beskrivning Tröja',            'Beskrivning Luva ',
           'Beskrivning Vantar',       'Beskrivning Benvärmare',
             'Garn Drops Nepal',      'Drops Eskimo brun/beige',
 'Garnpaket Virkade Basketskor',  'Instruktioner Axelvärmare S',
 ...
    'Bruksanvisning Pulsmätare',    'Brugsanvisning RollatorDK',
   'Bruksanvisning Rollator FI',    'Bruksanvisning Rollator N',
    'Bruksanvisning Rollator 3',      'Bruksanvisng Sofia N/DK',
       'Instruktioner S N F Dk',                  'guldarmband',
 'Bruksanvisning Dagsljuslampa',   'Instruktioner värmemadrass']
Length: 4781, dtype: string
907


Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId,fabric,fabricId
0,000DIV,000DIV,__UNK_BRANDID__,active,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
328,101901-7075,101901,80,inactive,,Swegmark,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
2449,200014,200014,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
2453,200030,200030,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
2460,200048,200048,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108119,590971,590971,__UNK_BRANDID__,inactive,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
108120,590985,590985,112,inactive,,Linea,Multi,86,,,Generic,99,Påslakanset,165,,
108537,973036,973036,__UNK_BRANDID__,active,,,__UNK_COLOR__,__UNK_COLORID__,,,Generic,99,,,,
108587,KHW041-4244,KHW041,25,active,,Funq Wear,__UNK_COLOR__,__UNK_COLORID__,42/44,22,Generic,99,,,,


In [154]:
# Keep only rows that have a name. The explicit .copy() makes the filtered frame
# an independent object, so the column assignments below and in later cells do
# not write into a slice of the unfiltered frame (the pattern that triggers
# pandas' SettingWithCopyWarning).
articles = articles[articles['name'].notna()].copy()
articles['name'] = articles['name'].astype('string')

In [155]:
# Distinct values of name.1
print(articles.loc[:, 'name.1'].unique())

# Missing name.1 values become the '__UNK_NAME__' placeholder; keep string dtype
articles['name.1'] = articles['name.1'].fillna('__UNK_NAME__').astype('string')

# Confirm the resulting dtype of name.1
print(articles.dtypes['name.1'])

articles.head()

<StringArray>
[               <NA>,      'Gjestal Garn',            'Novita',
      'Drops Design',      'Svarta Fåret',          'Swegmark',
             'Linea',             'Trofé',      'Knittingroom',
             'Järbo',
 ...
        'Stjernsund',      'Pixie Design',       'Noble House',
 'Arvidssons Textil',   'Nääsgränsgården',           'Fondaco',
    'Oehlenschläger',         'Rosa Faia',             'Coats',
            'Disney']
Length: 114, dtype: string
string


Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,size,sizeId,audience,audienceId,category,categoryId,fabric,fabricId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Blå,328.0,Generic,99,,,,
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,,,Dam,6,Tröjor,17.0,,
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,,,Dam,6,"Mössor & hattar,Mönster",393961.0,,
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,,,Dam,6,Vantar,45.0,,
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,,,Dam,6,Sockor & strumpor,16.0,,


In [156]:
# Distinct values present in the size column.
size = articles.loc[:, 'size'].unique()
print(size)



<StringArray>
[        'Blå',          <NA>,         'B75',         'B80',         'B85',
         'B90',         'B95',        'B100',         'C75',         'C80',
 ...
   '35x120 cm',   '35x240 cm',  '150x210 cm',  '105x200 cm',  '120x200 cm',
  '140x200 cm',  '160x200 cm',    '65x90 cm', '40x30x10 cm',  'Kuddfodral']
Length: 958, dtype: string


In [157]:
# Distinct values present in the sizeId column.
sizeId = articles.loc[:, 'sizeId'].unique()
print(sizeId)


<StringArray>
[ '328',   <NA>,   '51',   '52',   '54',   '55',   '56',   '57',   '59',
   '60',
 ...
  '443', '3722', '2225',  '971',  '974',  '366',  '972', '2007', '1368',
 '1242']
Length: 958, dtype: string


In [158]:
from IPython.display import display, HTML

# Count every distinct (size, sizeId) combination in the current articles frame.
# Rows where both fields are missing are dropped first. dropna=False is needed on
# value_counts because its default discards any row containing a missing value,
# which would hide pairs where only one of the two fields is present.
pairs = articles[['size', 'sizeId']].dropna(how='all')
pair_counts = pairs.value_counts(dropna=False).reset_index(name='count')

# Render the table inside a collapsible <details> element.
display(HTML(
    '<details><summary>Show (size, sizeId) pair counts</summary>'
    + pair_counts.style.set_sticky().to_html()
    + '</details>'
))


Unnamed: 0,size,sizeId,count
0,38,106,2422
1,40,107,2417
2,42,108,2359
3,44,111,2258
4,46,112,2219
5,48,113,2036
6,50,114,1804
7,52,115,1760
8,D85,68,1718
9,C85,28,1718


In [159]:
# size and sizeId are treated as irrelevant: drop both in a single call.
articles = articles.drop(columns=['size', 'sizeId'])

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,audience,audienceId,category,categoryId,fabric,fabricId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Generic,99,,,,
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Tröjor,17.0,,
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,"Mössor & hattar,Mönster",393961.0,,
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Vantar,45.0,,
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Sockor & strumpor,16.0,,


In [160]:
# Distinct category labels
category = articles.loc[:, 'category'].unique()
print(category)

# Distinct category IDs
categoryId = articles.loc[:, 'categoryId'].unique()
print(categoryId)


<StringArray>
[                                                <NA>,
                                             'Tröjor',
                           'Mössor & hattar,Mönster ',
                                             'Vantar',
                                  'Sockor & strumpor',
                              'REA,Sockor & strumpor',
                     'Bh utan bygel,Bh,Bh utan bygel',
                               'Bygel-bh,Bh,Bygel-bh',
                    'Kuddar,Innerkuddar,Bädd (linea)',
                               'Sport-bh,Bh,Sport-bh',
 ...
 'Frottéhanddukar & badlakan,REA,Picknick och uteliv',
                                    'Täcken,Bädd,REA',
                                            'Hushåll',
                                    'Vepor & bonader',
                           'Dörr- & trappstegsmattor',
               'REA,Kökshjälpmedel,Vardagshjälpmedel',
                    'REA,Vardagshjälpmedel,Belysning',
                            'Belysning,Fönster

In [161]:
from IPython.display import display, HTML

# Count every distinct (category, categoryId) combination in the current articles
# frame. Rows where both fields are missing are dropped first. dropna=False is
# needed on value_counts because its default discards any row containing a
# missing value, which would hide pairs where only one of the two fields is present.
pairs = articles[['category', 'categoryId']].dropna(how='all')
pair_counts = pairs.value_counts(dropna=False).reset_index(name='count')

# Render the table inside a collapsible <details> element.
display(HTML(
    '<details><summary>Show (category, categoryId) pair counts</summary>'
    + pair_counts.style.set_sticky().to_html()
    + '</details>'
))


Unnamed: 0,category,categoryId,count
0,"Bh utan bygel,Bh,Underkläder,Bh utan bygel",50271950,17779
1,"Bygel-bh,Bh,Underkläder,Bygel-bh",2232719223,9212
2,"Bh utan bygel,Bh,Bh utan bygel",502750,6750
3,REA,110,6057
4,"Sport-bh,Bh utan bygel,Bh,Underkläder,Sport-bh,Bh utan bygel",61850271961850,4417
5,"Bygel-bh,Bh,Bygel-bh",22327223,3161
6,"Underkläder,Trosor",1920,2657
7,Bh,27,2385
8,"REA,Tunikor",110451,2338
9,"Bygel-bh,Framknäppt bh,Bh,Underkläder,Bygel-bh,Framknäppt bh",2231892719223189,1603


In [162]:
# Missing category labels/IDs become explicit placeholder tokens; both columns
# stay string-typed.
articles = articles.assign(
    category=articles['category'].fillna('__UNK_CATEGORY__').astype('string'),
    categoryId=articles['categoryId'].fillna('__UNK_CATEGORYID__').astype('string'),
)

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,audience,audienceId,category,categoryId,fabric,fabricId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Generic,99,__UNK_CATEGORY__,__UNK_CATEGORYID__,,
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Tröjor,17,,
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,"Mössor & hattar,Mönster",393961,,
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Vantar,45,,
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Sockor & strumpor,16,,


In [163]:
from IPython.display import display, HTML

# Count every distinct (fabric, fabricId) combination in the current articles
# frame. Rows where both fields are missing are dropped first. dropna=False is
# needed on value_counts because its default discards any row containing a
# missing value, which would hide pairs where only one of the two fields is present.
pairs = articles[['fabric', 'fabricId']].dropna(how='all')
pair_counts = pairs.value_counts(dropna=False).reset_index(name='count')

# Render the table inside a collapsible <details> element.
display(HTML(
    '<details><summary>Show (fabric, fabricId) pair counts</summary>'
    + pair_counts.style.set_sticky().to_html()
    + '</details>'
))


Unnamed: 0,fabric,fabricId,count
0,Färgtryckt väv,157,63
1,"Ullgarn,Alpackagarn",368664,34
2,Aida,104,12
3,Frotté,225,7
4,"Aida,Bakgrundstryckt",104333,4
5,Bomull,295,3
6,"Bakgrundstryckt,Aida",333104,1
7,"Bomull,Aida",295104,1
8,"Bomull,Ritade Broderier",295149,1
9,"Linne,Ritade Broderier",170149,1


In [164]:
# fabric and fabricId are treated as irrelevant: drop both in a single call.
articles = articles.drop(columns=['fabric', 'fabricId'])

articles.head()

Unnamed: 0,sku,groupId,brandId,status,name,name.1,color,colorId,audience,audienceId,category,categoryId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,__UNK_NAME__,Blå,264,Generic,99,__UNK_CATEGORY__,__UNK_CATEGORYID__
2,55522,55522,265,active,Beskrivning Tröja,Gjestal Garn,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Tröjor,17
3,55573,55573,55,active,Beskrivning Luva,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,"Mössor & hattar,Mönster",393961
4,55575,55575,55,active,Beskrivning Vantar,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Vantar,45
5,55576,55576,55,active,Beskrivning Benvärmare,Novita,__UNK_COLOR__,__UNK_COLORID__,Dam,6,Sockor & strumpor,16


In [165]:
# Remove the name.1, color, audience and category columns.
articles = articles.drop(
    ['name.1', 'color', 'audience', 'category'],
    axis='columns',
)

In [166]:
# Share of missing values per column after cleaning: the mean of the boolean
# null mask equals (number of nulls) / (number of rows).
articles.isna().mean()

sku           0.0
groupId       0.0
brandId       0.0
status        0.0
name          0.0
colorId       0.0
audienceId    0.0
categoryId    0.0
dtype: float64

In [167]:
# Cast every column to the pandas 'string' dtype, write the frame to CSV without
# its index, then summarise the frame that was written.
articles_clean = articles.astype('string')
articles_clean.to_csv(
    path_or_buf="../data/processed/articles_clean.csv",
    index=False,
)

articles_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 107749 entries, 1 to 108655
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   sku         107749 non-null  string
 1   groupId     107749 non-null  string
 2   brandId     107749 non-null  string
 3   status      107749 non-null  string
 4   name        107749 non-null  string
 5   colorId     107749 non-null  string
 6   audienceId  107749 non-null  string
 7   categoryId  107749 non-null  string
dtypes: string(8)
memory usage: 7.4 MB


In [168]:
# Preview the first five rows of the cleaned frame.
articles.head(5)

Unnamed: 0,sku,groupId,brandId,status,name,colorId,audienceId,categoryId
1,52743,52743,__UNK_BRANDID__,inactive,Lakan/örngott blå 4 del,264,99,__UNK_CATEGORYID__
2,55522,55522,265,active,Beskrivning Tröja,__UNK_COLORID__,6,17
3,55573,55573,55,active,Beskrivning Luva,__UNK_COLORID__,6,393961
4,55575,55575,55,active,Beskrivning Vantar,__UNK_COLORID__,6,45
5,55576,55576,55,active,Beskrivning Benvärmare,__UNK_COLORID__,6,16


In [169]:
# Number of rows in the cleaned, saved frame.
print(articles_clean.shape[0])

107749
