# Fashion item classification - The Dataset
<img src="https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/fashionquote13-1624393596.jpg" alt="Getty." align="left" width="300" style="vertical-align:right;margin:0px 30px"/>

## How did we tackle this problem?
We started by looking for an appropriate dataset on which to train our classifier:

## Existing datasets

- [Fashion MNIST](https://github.com/zalandoresearch/fashion-mnist): the grand-daddy of fashion datasets. Academic, unrealistic and contains only 10 classes.
- [Fashion Gen](https://paperswithcode.com/dataset/fashion-gen): not representative of user-taken pictures.
- [Fashionpedia](https://fashionpedia.github.io/home/): More about modeling than the clothes themselves.
- [Deep Fashion](https://mmlab.ie.cuhk.edu.hk/projects/DeepFashion.html): More realistic. 50 classes. More annotations than needed so could be helpful for further work. Still not quite representative of the pictures people would take.
- [Alexey Grigorev's clothing dataset](https://medium.com/data-science-insider/clothing-dataset-5b72cd7c3f1f): Very representative pictures. Small dataset. Only 20 classes. Good for prototyping.

## But could we do better?
**Always.** As mentioned before, the closest existing product, to our minds, is Vinted.com:

- Vast amounts of pictures. A lot more than `100 000 unique pictures` on the US site alone.
- `Labels`: 
	- all items are categorized in an arborescence. Yeah! Quality labeling.
	- a search (`keyword`) will almost always return results. Powerful. But result quality needs to be assessed.
		- 	We could end up creating a classifier that behaves like Vinted's search engine instead of doing better.

We crawled their category arborescence (which they call the *catalog*) and discovered `575` categories (including parents). After removing the irrelevant ones, we were left with `491` categories: a vast improvement on our most promising existing dataset from Grigorev.

In [55]:
! pip install -q fake_useragent numpy pandas requests pillow tqdm kaggle

In [56]:
import os
import time
import shutil
import re
import json
import io
import datetime
import zipfile
import random
import itertools

import requests
import fake_useragent
import pandas as pd

import numpy as np
from urllib.error import HTTPError
from PIL import Image
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

In [57]:
# Seed Python's `random` module; `make_train_val()` below relies on it for `random.shuffle`.
random.seed(42) # for reproducibility

Initialize fake_useragent's DB. This cell can be commented out afterwards.

In [58]:
# Draw one random user-agent string; as the last expression of the cell it is displayed below.
fake_useragent.UserAgent().random

'Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27'

Here are a few helper functions:

* ``get_session()``: Initiates a session with the API and gets the necessary cookies.
* ``get_url()``: A fault tolerant ``requests.get()``

In [59]:
# Top-level domain of the Vinted site to crawl; every endpoint below hangs off it.
domain = "com"
VINTED_URL = "https://www.vinted." + domain
# Endpoint POSTed to by get_session() to obtain cookies.
VINTED_AUTH_URL = VINTED_URL + "/auth/token_refresh"
# Endpoint queried for the items of a category.
VINTED_API_URL = VINTED_URL + "/api/v2/catalog/items"

def beautiful_sleep(seconds, message):
    """Sleep for ``seconds`` seconds, rewriting a one-line countdown each second.

    The countdown runs from ``seconds - 1`` down to 0. Each status line ends
    with a carriage return and no newline, so the next one overwrites it.
    """
    for remaining in range(seconds - 1, -1, -1):
        status_line = f"{message}. Sleeping {remaining} seconds.    \r"
        print(status_line, end="")
        time.sleep(1)

def get_session(sleep_counter=2):
    """Return a ``requests.Session`` with a random user agent and Vinted cookies.

    Every attempt builds a brand-new session with a freshly drawn user agent
    and POSTs to ``VINTED_AUTH_URL`` so that the server sets its cookies.
    The function returns only once an attempt answers 200:

    * 429: sleep for the advertised ``Retry-After`` delay plus 10 seconds,
      then start over with the default back-off of 2 seconds.
    * any other status: sleep ``sleep_counter`` seconds, then retry with the
      delay doubled.

    Retries run in a loop rather than by recursion, so a long outage cannot
    end in ``RecursionError``.

    Args:
        sleep_counter: Initial back-off in seconds for answers that are
            neither 200 nor 429.

    Returns:
        The session whose auth request succeeded.
    """
    default_backoff = 2  # what a bare ``get_session()`` call starts from

    while True:
        session = requests.Session()
        user_agent = fake_useragent.UserAgent().random  # Maybe helpful?
        session.headers.update({"User-Agent": user_agent})

        response = session.post(VINTED_AUTH_URL, headers={"User-Agent": user_agent})  # Set cookies

        if response.status_code == 200:
            return session

        if response.status_code == 429:
            try:
                retry_after = int(response.headers["Retry-After"])
            except (KeyError, ValueError):
                # Header missing, or not a plain number of seconds:
                # fall back on the current back-off instead of crashing.
                retry_after = sleep_counter
            bench_time = retry_after + 10  # at this point...
            beautiful_sleep(bench_time, f"[get_session {response.status_code}]")
            sleep_counter = default_backoff
            continue

        time.sleep(sleep_counter)
        sleep_counter *= 2

def get_url(url, session=None, params=None, sleep_counter=2):
    """GET ``url``, retrying until the request succeeds or is given up.

    Outcomes:

    * 200: return ``(response, session)``.
    * 500: give up and return ``(response, None)``.
    * timeout (5 s): give up and return ``(placeholder, None)``. The
      placeholder is an empty ``requests.Response`` whose ``status_code`` is
      408, so callers that test ``response.status_code`` skip it like any
      other failure.
    * 429: sleep for ``Retry-After`` plus 10 seconds, then retry with a fresh
      session and the default back-off.
    * anything else: sleep ``sleep_counter`` seconds, then retry with a fresh
      session and the delay doubled.

    Retries run in a loop rather than by recursion, so a long outage cannot
    end in ``RecursionError``.

    Args:
        url: Address to fetch.
        session: Session to use for the first attempt; one is created with
            ``get_session()`` when omitted.
        params: Query parameters; defaults to none.
        sleep_counter: Initial back-off in seconds.

    Returns:
        A ``(response, session)`` tuple; ``session`` is ``None`` when the
        request was given up.
    """
    if session is None:
        session = get_session()
    if params is None:
        params = dict()

    while True:
        try:
            response = session.get(url, params=params, timeout=5)
        except (requests.exceptions.Timeout, TimeoutError):
            # requests signals timeouts with its own exception class, which
            # the built-in TimeoutError does not cover.
            print("Timeout. Skipping")
            placeholder = requests.Response()
            placeholder.status_code = 408
            return placeholder, None

        if response.status_code == 200:
            return response, session
        if response.status_code == 500:
            return response, None

        if response.status_code == 429:
            try:
                retry_after = int(response.headers["Retry-After"])
            except (KeyError, ValueError):
                # Header missing, or not a plain number of seconds.
                retry_after = sleep_counter
            bench_time = retry_after + 10  # at this point...
            beautiful_sleep(bench_time, f"[get_url {response.status_code}]")
            sleep_counter = 2
        else:
            time.sleep(sleep_counter)
            sleep_counter *= 2

        # As before, every retry starts from a brand-new session.
        session = get_session()

We start by getting the catalog. This is a tree of all the item categories sold on Vinted.com.

In [60]:
def get_catalog():
    """Get the catalog (category tree) from Vinted.

    The landing page embeds JSON blobs. The first blob mentioning ``code`` is
    parsed and its ``catalogTree`` entry returned.

    Returns:
        The value stored under ``catalogTree``, or ``None`` when the page
        holds no candidate blob or the blob lacks that key. Callers can
        simply retry on ``None``.
    """
    url = f"{VINTED_URL}/vetements?"
    res, _ = get_url(url)
    matches = re.findall(r'({.+})', res.text)
    candidates = [m for m in matches if 'code' in m]  # should be in the regex. Oh well.

    if not candidates:
        # Error page or empty body: nothing to parse, let the caller retry.
        print("[get_catalog] no embedded catalog found in the page")
        return None

    return json.loads(candidates[0]).get('catalogTree')

In [61]:
catalog = get_catalog() # this is a tree
# Ask again for as long as no catalog comes back (note: there is no retry limit).
while catalog is None:
    catalog = get_catalog()
# Last expression of the cell: display the first root category.
catalog[0]    

{'id': 1904,
 'title': 'Women',
 'code': 'WOMEN_ROOT',
 'material_group_id': None,
 'material_group_ids': [],
 'size_group_id': 4,
 'size_group_ids': [4, 7, 53, 52, 30],
 'shippable': True,
 'author_field_visibility': 0,
 'brand_field_visibility': 1,
 'book_title_field_visibility': 0,
 'color_field_visibility': 1,
 'isbn_field_visibility': 0,
 'size_field_visibility': 1,
 'material_field_visibility': 0,
 'location_field_visible': False,
 'condition_field_visible': True,
 'restricted_to_status_id': None,
 'landing': None,
 'allow_browsing_subcategories': True,
 'package_size_ids': [1, 2, 3, 4],
 'order': 0,
 'item_count': 231519112,
 'photo': {'url': 'https://images1.vinted.net/t/03_00ffd_tLaPvmCTXrgwxpvwY4mqZffA/1651062589.png?s=d74cf4c3d78c2156ebf59cda6b1a429af65b4a29',
  'thumbnails': [{'type': 'thumb24',
    'height': 24,
    'width': 24,
    'url': 'https://images1.vinted.net/t/03_00ffd_tLaPvmCTXrgwxpvwY4mqZffA/24x24/1651062589.png?s=3b0064bdf0a4f2d993b2dd8d9ef60b3b4d443023'},
   {

Get all the categories and their parents as a list.

In [62]:
# Category titles to leave out of the dataset; get_categories() skips any
# catalog node whose title is listed here, together with its whole subtree.
stop_words = {
    "Baby care",
    "Beauty",
    "Chairs",
    "Grooming",
    "Home",
    "Kids' furniture",
    "Other kids' items",
    "Ride-on toys",
    "School supplies",
    "Strollers",
    "Toys & games",
}

def get_categories(catalog, stop_words, parents=None):
    """Walk the catalog tree and yield one path per terminal category.

    A path is the list of ``(id, title)`` pairs leading from a root node down
    to a node without children. A node whose title is in ``stop_words`` is
    skipped together with everything below it.
    """
    trail = [] if parents is None else parents
    for node in catalog:
        title = node.get('title')
        if title in stop_words:
            continue
        path = trail + [(node.get('id'), title)]
        children = node.get('catalogs')
        if not children:
            # Terminal category: emit the full path.
            yield path
            continue
        yield from get_categories(children, stop_words, path)

In [63]:
# Materialize every terminal-category path, then show how many there are and the first one.
list_categories = list(get_categories(catalog, stop_words=stop_words))
print(len(list_categories))
print(list_categories[:1])

491
[[(1904, 'Women'), (4, 'Clothes'), (1037, 'Coats & jackets'), (1907, 'Coats'), (1087, 'Parkas')]]


In [64]:
def flatten_categories(list_categories):
    """Turn each category path into a ``(leaf_id, label)`` pair.

    ``leaf_id`` is the id of the last node of the path; ``label`` is the
    titles of all nodes on the path joined with a double underscore.
    """
    for path in list_categories:
        leaf_id = path[-1][0]
        titles = [node[-1] for node in path]
        yield leaf_id, '__'.join(titles)

In [65]:
# Replace the paths by (leaf id, joined label) pairs and show the first one.
list_categories = list(flatten_categories(list_categories))
print(len(list_categories))
print(list_categories[:1])

491
[(1087, 'Women__Clothes__Coats & jackets__Coats__Parkas')]


In [66]:
def sanitize_labels(categories):
    """Yield ``(id, label)`` pairs with each label lowercased and cleaned up.

    After lowercasing, the substitutions below are applied one after the
    other, in the order listed.
    """
    substitutions = (
        (" ", "_"),
        ("-", "_"),
        ("'", ""),
        ("&", "and"),
        ('¾', 'three_quarters'),
        (',', ''),
        ('/', '_'),
    )
    for category_id, raw_label in categories:
        cleaned = raw_label.lower()
        for old, new in substitutions:
            cleaned = cleaned.replace(old, new)
        yield category_id, cleaned

In [67]:
# Clean up the labels (see sanitize_labels) and show the first one.
list_categories = list(sanitize_labels(list_categories))
print(len(list_categories))
print(list_categories[:1])

491
[(1087, 'women__clothes__coats_and_jackets__coats__parkas')]


At this point, we still needed to assess whether these categories yielded enough results to be even attemptable. We sampled a small number of categories manually, particularly the niche ones and decided the dataset would be good enough.

#### We were in business!
How could we get the pictures and their labels easily? We found a sparsely documented library, [pyVinted](https://github.com/aime-risson/vinted-api-wrapper), which makes use of an undocumented Vinted API. We took the important parts and built an API-friendly, fault-tolerant crawler to:

1. (Try to) get 300 urls for each category. 300 is a soft limit for the maximum number of items per page.
2. Download the pictures.
3. Save them to disk into train and validation folders containing a flat hierarchy of folders, one for each class, with the matching pictures. Each picture is labeled by the item id given by Vinted.

Getting all picture urls... Time to go for a coffee.

``get_dataframe()`` has basic resuming capabilities. It will try to get 300 pictures of each terminal category but will also attribute them to their parent.

In [68]:
def get_picture_urls_by_category_id(category):
    """Get the picture urls for a given category.

    Args:
        category: A ``(category_id, label)`` pair.

    Returns:
        A list of ``(item_id, picture_url, label)`` tuples, one per item that
        has a photo. The list is empty when the request did not answer 200,
        so the result can always be iterated or chained.
    """
    category_id, label = category
    params = {"catalog_ids": category_id, "per_page": 300, 'page': 1}  # Seems like max per_page around 300+

    # No session is passed: get_url creates one.
    response, _ = get_url(VINTED_API_URL, params=params)
    if response.status_code != 200:
        return []  # skip this category

    items = response.json().get("items") or []

    res = []
    for item in items:
        try:
            res.append((item["id"], item["photo"]["url"], label))
        except (TypeError, KeyError):
            # Photo is None or a key is missing: nothing to download.
            continue
    return res
    
def get_picture_urls(categories, num_workers=1):
    """Collect the picture urls of every category using a pool of worker threads.

    Each category is handed to ``get_picture_urls_by_category_id`` through
    ``thread_map`` with ``num_workers`` workers; whatever ``thread_map``
    returns is passed back unchanged.
    """
    results_per_category = thread_map(
        get_picture_urls_by_category_id,
        categories,
        max_workers=num_workers,
    )
    return results_per_category

def make_df(gen):
    """Build the ``id``-indexed dataframe of pictures from per-category results.

    Args:
        gen: Iterable with one entry per category, each an iterable of
            ``(id, url, label)`` tuples. Entries that are ``None`` or empty
            (skipped categories) are ignored.

    Returns:
        A dataframe indexed by ``id`` with columns ``url`` and ``label``.
    """
    rows = list(itertools.chain.from_iterable(batch for batch in gen if batch))
    return pd.DataFrame(rows, columns=['id', 'url', 'label']).set_index('id')

In [69]:
# Development run: only the first three categories are crawled.
df = make_df(get_picture_urls(list_categories[:3], num_workers=1)) # For now, just one worker. Need to figure out how not to get banned.

100%|██████████| 3/3 [00:08<00:00,  2.76s/it]


In [70]:
# Save the crawled urls and labels so that the crawl does not have to be repeated.
df.to_csv("vinted_dev.csv")

In [71]:
#df = pd.read_csv("vinted.csv").set_index('id')

Take care of duplicates. Unisex items are both in women and men categories.

In [72]:
def remove_duplicates(df):
    """Collapse rows sharing an id into one row labelled as unisex.

    For every id that occurs more than once, the first row is kept and its
    label is replaced by the label of the last occurrence, with the root
    category (``women`` or ``men``) renamed to ``unisex``. Only the root
    segment is renamed, so a "men" inside another word (e.g. "garments") is
    left alone. Rows with a unique id are returned untouched.

    The input dataframe is not modified.

    Args:
        df: Dataframe indexed by item id with a ``label`` column.

    Returns:
        A new dataframe with a unique index.
    """
    is_repeat = df.index.duplicated(keep='first')
    unique_rows = df[~is_repeat].copy()
    repeats = df[is_repeat]
    if repeats.empty:
        return unique_rows

    # When an id shows up three times or more, the last occurrence wins.
    repeats = repeats[~repeats.index.duplicated(keep='last')]
    unisex_labels = repeats['label'].str.replace(
        r'^(?:women|men)(?=__|$)', 'unisex', regex=True
    )
    unique_rows.loc[repeats.index, 'label'] = unisex_labels.to_numpy()
    return unique_rows

In [73]:
# Report duplicated ids before and after remove_duplicates(), then the number of distinct labels left.
print(f"Has duplicates? {df.index.has_duplicates}")
df = remove_duplicates(df)
print(f"Has duplicates? {df.index.has_duplicates}")
print(f"Unique labels: {len(df.label.unique())}")

Has duplicates? False
Has duplicates? False
Unique labels: 3


For training purposes, it is easier to make sure all the classes are represented in both the training and the validation sets.

In [74]:
def remove_singleton_categories(df):
    """Keep only the rows whose label is shared by at least two rows."""
    pictures_per_label = df["label"].value_counts()
    has_company = df["label"].map(pictures_per_label) >= 2  # at least 2 pictures
    return df[has_company]

In [75]:
# Show how many pictures are left once labels with fewer than two pictures are dropped.
print(f"Before: {len(df.index)} pictures")
df = remove_singleton_categories(df)
print(f"After: {len(df.index)} pictures")

Before: 899 pictures
After: 899 pictures


Download the pictures, resize them and save them to disk in all the appropriate folders. Time for another coffee.

In [82]:
def params(df, target_folder):
    """Yield one ``(id, url, label, target_folder)`` tuple per row of ``df``.

    These tuples are the arguments expected by ``download_picture``.
    """
    for item_id, url, label in zip(df.index, df["url"], df["label"]):
        yield (item_id, url, label, target_folder)

def download_picture(args):
    """Download one picture, shrink it and save it as ``<target>/<label>/<id>.jpeg``.

    Best effort: the picture is skipped (nothing is written, ``None`` is
    returned) when the file already exists, the request fails or times out,
    the server does not answer 200, or the payload cannot be decoded as an
    image. One bad picture therefore never aborts a whole batch.

    Args:
        args: A ``(id, url, label, target_folder)`` tuple. The folder
            ``<target_folder>/<label>`` must already exist.
    """
    item_id, url, label, target_folder = args
    filename = os.path.join(target_folder, label, f"{item_id}.jpeg")

    if os.path.exists(filename):
        return  # already downloaded

    try:
        req = requests.get(url, timeout=30)
    except requests.exceptions.RequestException:
        return  # skip
    if req.status_code != 200:
        return  # skip

    # resize picture
    ratio = 800/224  # from 800 px to 224 px for the largest side
    try:
        picture = Image.open(io.BytesIO(req.content))
        if picture.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            # e.g. RGBA or palette images: JPEG cannot store them as they are.
            picture = picture.convert("RGB")
        new_size = (
            max(1, int(picture.width/ratio)),  # never collapse to 0 px
            max(1, int(picture.height/ratio)),
        )
        picture = picture.resize(new_size)
    except OSError:
        return  # not a readable image: skip

    # save picture
    picture.save(filename, quality=50)

def make_dirs(df, target_folder):
    """Create one sub-folder of ``target_folder`` per distinct label in ``df``.

    Folders that already exist are left as they are.
    """
    labels = df["label"].unique()
    class_dirs = (os.path.join(target_folder, name) for name in labels)
    for class_dir in class_dirs:
        os.makedirs(class_dir, exist_ok=True)

def download_pictures(df, target_folder, max_workers=128):
    """Download the picture of every row of ``df`` with a pool of threads.

    Args:
        df: Dataframe indexed by item id with ``url`` and ``label`` columns.
        target_folder: Root folder; pictures go to ``<target_folder>/<label>``.
        max_workers: Number of download threads. Lower it to be gentler on
            the image server.
    """
    # list() is only needed to make a pretty progress bar at the expense of memory usage
    jobs = list(params(df, target_folder))
    thread_map(download_picture, jobs, tqdm_class=tqdm, max_workers=max_workers)

In [83]:
# Root folder for the downloads; make_dirs() creates one sub-folder per label inside it.
target_folder = "vinted_dev"
make_dirs(df, target_folder)
download_pictures(df, target_folder)

100%|██████████| 899/899 [00:03<00:00, 243.26it/s]


Archive both the training and validation pictures for upload to Kaggle.

In [84]:
def make_train_val(folder):
    """Copy the pictures of ``folder`` into ``<folder>_train`` and ``<folder>_val``.

    Every sub-folder of ``folder`` is treated as one class and recreated in
    both destinations. Its files are shuffled with the ``random`` module and
    roughly one file in five goes to the validation set, with at least one
    per non-empty class. A class holding a single file therefore ends up in
    validation only.

    Class folders and file names are sorted before shuffling:
    ``os.listdir`` returns entries in arbitrary order, so without sorting the
    split would differ between machines even with a fixed seed.

    Args:
        folder: Folder holding one sub-folder per class.
    """
    train_folder, val_folder = f"{folder}_train", f"{folder}_val"

    for label in tqdm(sorted(os.listdir(folder)), desc="Making train and val sets"):
        source_dir = os.path.join(folder, label)
        if not os.path.isdir(source_dir):
            continue  # stray file next to the class folders

        train_dir = os.path.join(train_folder, label)
        val_dir = os.path.join(val_folder, label)
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)

        filenames = sorted(os.listdir(source_dir))
        random.shuffle(filenames)
        modulo = min(len(filenames), 5)  # val is 1/5th but should contain at least 1 element
        for position, filename in enumerate(filenames):
            destination = val_dir if position % modulo == 0 else train_dir
            shutil.copy(os.path.join(source_dir, filename), os.path.join(destination, filename))

def zip_train_val(prefix):
    """Zip ``<prefix>_train`` and ``<prefix>_val`` into ``<prefix>_<YYYY-MM-DD>.zip``.

    Files are stored under the same relative path they have on disk:
    ``<split folder>/<label>/<file>``.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d")
    splits = (
        (f"{prefix}_train", "Zipping the train set"),
        (f"{prefix}_val", "Zipping the val set"),
    )
    with zipfile.ZipFile(f"{prefix}_{stamp}.zip", "w") as archive:
        for split_folder, description in splits:
            for label in tqdm(os.listdir(split_folder), desc=description):
                label_dir = os.path.join(split_folder, label)
                for picture in os.listdir(label_dir):
                    archive.write(os.path.join(label_dir, picture))
                

In [85]:
# Split the downloaded pictures into "<target_folder>_train" / "<target_folder>_val", then zip both.
make_train_val(folder=target_folder)
zip_train_val(prefix=target_folder)

Making train and val sets: 100%|██████████| 3/3 [00:00<00:00, 16.54it/s]
Zipping the train set: 100%|██████████| 3/3 [00:00<00:00, 97.99it/s]
Zipping the val set: 100%|██████████| 3/3 [00:00<00:00, 370.65it/s]
