In [1]:
! pip install fake_useragent pandas requests pillow tqdm kaggle



In [2]:
import os
import time
import shutil
import re
import json
import io
import datetime
import zipfile
import random

import requests
import fake_useragent
import pandas as pd


from urllib.error import HTTPError
from PIL import Image
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
from sklearn import model_selection

In [3]:
random.seed(42) # for reproducibility: make_train_val() shuffles file names with random.shuffle

Initialize fake_useragent's database. This cell can be commented out afterwards.

In [4]:
# Draw one random User-Agent string; the value is displayed as the cell output.
# NOTE(review): presumably this first call also initialises fake_useragent's local data — confirm.
fake_useragent.UserAgent().random

'Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27'

Here are a few helper functions:

* ``get_session()``: Initiates a session with the API and gets the necessary cookies.
* ``get_url()``: A fault-tolerant ``requests.get()`` that backs off and retries, returning the response together with the session.

In [5]:
# Top-level domain of the Vinted site to crawl; every endpoint below is built from it.
domain = "com"
VINTED_URL = f"https://www.vinted.{domain}"
# Endpoint POSTed by get_session() to obtain cookies.
VINTED_AUTH_URL = f"{VINTED_URL}/auth/token_refresh"
# Catalog item listing endpoint queried by get_picture_urls().
VINTED_API_URL = f"{VINTED_URL}/api/v2/catalog/items"

def beautiful_sleep(seconds, message):
    """Sleep ``seconds`` seconds while showing a countdown on one console line.

    Before each one-second nap the line is rewritten (carriage return, no
    newline) with ``message`` and the number of seconds left after that nap,
    counting from ``seconds - 1`` down to ``0``.
    """
    for remaining in range(seconds - 1, -1, -1):
        status_line = f"{message}. Sleeping {remaining} seconds.    \r"
        print(status_line, end="")
        time.sleep(1)

def get_session(sleep_counter=2):
    """Open a ``requests.Session`` with a random User-Agent and Vinted cookies.

    The session POSTs to ``VINTED_AUTH_URL`` so that the server sets its
    cookies. On HTTP 429 the function waits for the advertised ``Retry-After``
    delay plus a 10-second margin and starts over; on any other non-200 status
    it sleeps ``sleep_counter`` seconds and retries with the delay doubled.

    Args:
        sleep_counter: Seconds to wait before retrying after a non-200,
            non-429 response. Doubled on each such retry.

    Returns:
        The ``requests.Session`` whose token-refresh request returned HTTP 200.
    """
    session = requests.Session()
    user_agent = fake_useragent.UserAgent().random # Maybe helpful?
    session.headers.update({"User-Agent": user_agent})

    # The session-level User-Agent header is sent with this request automatically.
    response = session.post(VINTED_AUTH_URL) # Set cookies
    status = response.status_code

    if status == 200:
        return session

    # This attempt failed: release its connections before building a new session.
    session.close()

    if status == 429:
        # Retry-After is optional on a 429 and is not always a plain number of
        # seconds; fall back to the safety margin alone when it cannot be read.
        try:
            retry_after = int(response.headers.get("Retry-After", 0))
        except (TypeError, ValueError):
            retry_after = 0
        bench_time = retry_after + 10 # at this point...
        beautiful_sleep(bench_time, f"[get_session {status}]")
        return get_session()

    time.sleep(sleep_counter)
    return get_session(sleep_counter * 2)

def get_url(url, session=None, params=None, sleep_counter=2):
    """GET ``url`` with retries and return ``(response, session)``.

    Args:
        url: Address to fetch.
        session: ``requests.Session`` to use; a new one is created with
            ``get_session()`` when omitted.
        params: Query-string parameters (defaults to none).
        sleep_counter: Seconds to wait before retrying after an unexpected
            status code. Doubled on each such retry.

    Returns:
        ``(response, session)`` on HTTP 200. When the request is skipped the
        second element is ``None``: on HTTP 500 the 500 response is returned,
        and on a timeout an empty ``requests.Response`` is returned (its
        ``status_code`` is ``None``) so callers can keep checking
        ``response.status_code``.
    """
    if session is None:
        session = get_session()
    if params is None:
        params = dict()

    try:
        response = session.get(url, params=params, timeout=5)
    except (requests.exceptions.Timeout, TimeoutError):
        # requests signals timeouts with its own exception class, and no
        # response object exists at this point, so hand back a placeholder.
        print("Timeout. Skipping")
        return requests.Response(), None

    if response.status_code == 500:
        #print(f"[get_url {response.status_code}] Skipping.")
        return response, None
    if response.status_code == 429:
        # Retry-After is optional on a 429 and is not always a plain number of
        # seconds; fall back to the safety margin alone when it cannot be read.
        try:
            retry_after = int(response.headers.get("Retry-After", 0))
        except (TypeError, ValueError):
            retry_after = 0
        bench_time = retry_after + 10 # at this point...
        beautiful_sleep(bench_time, f"[get_url {response.status_code}]")
        # The session is not forwarded, so the retry starts with fresh cookies.
        return get_url(url, params=params)
    if response.status_code != 200:
        #beautiful_sleep(sleep_counter, f"[get_url {response.status_code}]")
        time.sleep(sleep_counter)
        # The session is not forwarded, so the retry starts with fresh cookies.
        return get_url(url, params=params, sleep_counter=sleep_counter * 2)

    return response, session

def get_categories(catalog, parents=None):
    """Yield the root-to-leaf path of every terminal category in ``catalog``.

    ``catalog`` is an iterable of category dicts; a non-empty ``'catalogs'``
    entry holds the sub-categories. Each yielded path is a list of
    ``(id, title)`` tuples starting with ``parents`` and ending with the leaf.
    Categories titled ``'Home'`` are skipped together with their subtree.
    """
    ancestors = [] if parents is None else parents
    for node in catalog:
        if node.get('title') == 'Home':
            continue
        path = ancestors + [(node.get('id'), node.get('title'))]
        children = node.get('catalogs')
        if children:
            # Inner node: descend with the extended path.
            yield from get_categories(children, path)
        else:
            yield path

def get_catalog():
    """Fetch the Vinted catalog (category tree) embedded in the listing page.

    The page is scanned for ``{...}`` spans; the first one containing the text
    ``code`` is parsed as JSON and its ``catalogTree`` entry is returned.

    Returns:
        The ``catalogTree`` value, or ``None`` when the page holds no such
        span, the span is not valid JSON, or the key is absent.
    """
    url = f"{VINTED_URL}/vetements?"
    res, _ = get_url(url)
    matches = re.findall(r'({.+})', res.text)
    candidates = [m for m in matches if 'code' in m] # should be in the regex. Oh well.

    # A skipped or error response carries no embedded JSON at all.
    if not candidates:
        return None
    try:
        payload = json.loads(candidates[0])
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get('catalogTree')

def get_picture_urls(category_id, label):
    """Yield ``(item_id, photo_url)`` for the first page of items in a category.

    Args:
        category_id: Catalog id sent as ``catalog_ids`` to ``VINTED_API_URL``.
        label: Category title. Not used by the request; kept so existing
            callers keep working.

    Yields:
        One ``(id, url)`` tuple per item that carries a photo. Nothing is
        yielded when the request did not return HTTP 200 or the body is not
        the expected JSON object; items without a usable photo are skipped.
    """
    params = {"catalog_ids": category_id, "per_page": 300, 'page':1} # Seems like max per_page around 300+

    # No session is passed, so get_url creates one.
    response, _ = get_url(VINTED_API_URL, None, params=params)
    if response is None or response.status_code != 200:
        return

    try:
        items = response.json().get("items") or []
    except (ValueError, AttributeError):
        # Body is not JSON, or not a JSON object: nothing to collect here.
        return

    for item in items:
        try:
            yield item["id"], item["photo"]["url"]
        except (TypeError, KeyError):
            # "photo" is null or a key is missing: skip just this item.
            continue
            
def get_dataframe(categories, res=None):
    """Build a picture/label table for every category in ``categories``.

    Args:
        categories: Mapping ``{(category_id, label): [(parent_id, parent_label), ...]}``.
        res: Optional accumulator ``{label: {(picture_id, url): True}}``. Pass
            the same dict again after an interrupted run to resume: labels
            already present in it are not fetched again.

    Returns:
        A DataFrame indexed by picture id with a ``url`` column and one
        boolean column per label (``True`` when the picture was listed under
        that label or one of its descendants). An empty frame with only the
        ``url`` column is returned when nothing was collected.
    """
    if res is None:
        res = dict()
    for (category_id, label), parents in tqdm(categories.items()):
        if label in res:
            continue
        for picture_id, picture_url in get_picture_urls(category_id, label):
            key = (picture_id, picture_url)
            res.setdefault(label, dict()).setdefault(key, True) # {"coat":{(...,...):1,}}
            for _, parent_label in parents:
                res.setdefault(parent_label, dict()).setdefault(key, True) # add all parents

    if not res:
        # Without any row there is no two-level index for reset_index to split.
        return pd.DataFrame(columns=["url"])

    # Every stored value is True, so "present" is exactly the label flag; this
    # gives real boolean columns without relying on fillna's object downcasting.
    df = pd.DataFrame(res).notna()
    df = df.reset_index(level=1).rename(columns={"level_1":"url"})
    return df

We start by getting the catalog. This is a tree of all the item categories sold on Vinted.com.

In [6]:
# NOTE(review): get_catalog is also declared in the In [5] helper cell; whichever definition runs last is the one in effect.
def get_catalog():
    """Fetch the Vinted catalog (category tree) embedded in the listing page.

    The page is scanned for ``{...}`` spans; the first one containing the text
    ``code`` is parsed as JSON and its ``catalogTree`` entry is returned.

    Returns:
        The ``catalogTree`` value, or ``None`` when the page holds no such
        span, the span is not valid JSON, or the key is absent.
    """
    url = f"{VINTED_URL}/vetements?"
    res, _ = get_url(url)
    matches = re.findall(r'({.+})', res.text)
    candidates = [m for m in matches if 'code' in m] # should be in the regex. Oh well.

    # A skipped or error response carries no embedded JSON at all.
    if not candidates:
        return None
    try:
        payload = json.loads(candidates[0])
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get('catalogTree')

In [7]:
# get_catalog() can return None; retry a bounded number of times with a growing
# pause instead of re-requesting the page in a tight, endless loop.
catalog = get_catalog() # this is a tree
attempt = 0
while catalog is None and attempt < 5:
    attempt += 1
    time.sleep(2 ** attempt)
    catalog = get_catalog()
if catalog is None:
    raise RuntimeError("Could not retrieve the Vinted catalog after several attempts.")

Get all the categories and their parents as a list.

In [8]:
# NOTE(review): get_categories is also declared in the In [5] helper cell; whichever definition runs last is the one in effect.
def get_categories(catalog, parents=None):
    """Yield the root-to-leaf path of every terminal category in ``catalog``.

    ``catalog`` is an iterable of category dicts; a non-empty ``'catalogs'``
    entry holds the sub-categories. Each yielded path is a list of
    ``(id, title)`` tuples starting with ``parents`` and ending with the leaf.
    Categories titled ``'Home'`` are skipped together with their subtree.
    """
    ancestors = [] if parents is None else parents
    for node in catalog:
        if node.get('title') == 'Home':
            continue
        path = ancestors + [(node.get('id'), node.get('title'))]
        children = node.get('catalogs')
        if children:
            # Inner node: descend with the extended path.
            yield from get_categories(children, path)
        else:
            yield path

In [9]:
# Materialise every root-to-leaf category path of the catalog tree.
list_categories = [*get_categories(catalog)]

There are some unwanted classes. We filter them out.

In [10]:
# Category titles to exclude: filter_categories() drops every path that
# contains one of them at any level.
blacklist = {
    "Home",
    "Beauty",
    "Grooming",
    "Toys & games",
    "Baby care",
    "Strollers",
    "Ride-on toys",
    "Chairs",
    "Kids' furniture",
    "School supplies",
    "Other kids' items",
}

def filter_categories(list_categories):
    """Yield the category paths in which no title appears in ``blacklist``.

    Each path is a sequence of ``(id, title)`` pairs; a path is kept only when
    none of its titles is blacklisted.
    """
    for path in list_categories:
        titles = {entry[1] for entry in path}
        if titles.isdisjoint(blacklist):
            yield path

In [11]:
# Apply the blacklist, then report how many category paths survived.
filtered_categories = list(filter_categories(list_categories))
print(f" Before blacklisting: {len(list_categories)} categories.")
print(f" After blacklisting: {len(filtered_categories)} categories.")

 Before blacklisting: 575 categories.
 After blacklisting: 491 categories.


We transform the filtered categories into a dictionary:

``{'terminal_leaf':[parent, parent, ...], ...}``

In [12]:
# Map each leaf (id, title) to the list of its ancestors, root first.
dict_categories = {leaf: ancestors for *ancestors, leaf in filtered_categories}

Getting all picture urls... Time to go for a coffee.

``get_dataframe()`` has basic resuming capabilities. It tries to fetch up to 300 pictures for each terminal category and also attributes them to all of that category's parents.

In [13]:
# NOTE(review): get_picture_urls is also declared in the In [5] helper cell; whichever definition runs last is the one in effect.
def get_picture_urls(category_id, label):
    """Yield ``(item_id, photo_url)`` for the first page of items in a category.

    Args:
        category_id: Catalog id sent as ``catalog_ids`` to ``VINTED_API_URL``.
        label: Category title. Not used by the request; kept so existing
            callers keep working.

    Yields:
        One ``(id, url)`` tuple per item that carries a photo. Nothing is
        yielded when the request did not return HTTP 200 or the body is not
        the expected JSON object; items without a usable photo are skipped.
    """
    params = {"catalog_ids": category_id, "per_page": 300, 'page':1} # Seems like max per_page around 300+

    # No session is passed, so get_url creates one.
    response, _ = get_url(VINTED_API_URL, None, params=params)
    if response is None or response.status_code != 200:
        return

    try:
        items = response.json().get("items") or []
    except (ValueError, AttributeError):
        # Body is not JSON, or not a JSON object: nothing to collect here.
        return

    for item in items:
        try:
            yield item["id"], item["photo"]["url"]
        except (TypeError, KeyError):
            # "photo" is null or a key is missing: skip just this item.
            continue
            
# NOTE(review): get_dataframe is also declared in the In [5] helper cell; whichever definition runs last is the one in effect.
def get_dataframe(categories, res=None):
    """Build a picture/label table for every category in ``categories``.

    Args:
        categories: Mapping ``{(category_id, label): [(parent_id, parent_label), ...]}``.
        res: Optional accumulator ``{label: {(picture_id, url): True}}``. Pass
            the same dict again after an interrupted run to resume: labels
            already present in it are not fetched again.

    Returns:
        A DataFrame indexed by picture id with a ``url`` column and one
        boolean column per label (``True`` when the picture was listed under
        that label or one of its descendants). An empty frame with only the
        ``url`` column is returned when nothing was collected.
    """
    if res is None:
        res = dict()
    for (category_id, label), parents in tqdm(categories.items()):
        if label in res:
            continue
        for picture_id, picture_url in get_picture_urls(category_id, label):
            key = (picture_id, picture_url)
            res.setdefault(label, dict()).setdefault(key, True) # {"coat":{(...,...):1,}}
            for _, parent_label in parents:
                res.setdefault(parent_label, dict()).setdefault(key, True) # add all parents

    if not res:
        # Without any row there is no two-level index for reset_index to split.
        return pd.DataFrame(columns=["url"])

    # Every stored value is True, so "present" is exactly the label flag; this
    # gives real boolean columns without relying on fillna's object downcasting.
    df = pd.DataFrame(res).notna()
    df = df.reset_index(level=1).rename(columns={"level_1":"url"}) # nvm. Just to get the right format.
    return df

In [14]:
# Crawl every filtered category and build the picture/label table (network-bound; the recorded run below took about 7.5 minutes).
df = get_dataframe(dict_categories)

100%|██████████| 491/491 [07:37<00:00,  1.07it/s]


Take this step if you want to only keep the most common labels

In [15]:
def get_smaller_df(df, most_common_labels=50):
    """Return ``df`` reduced to ``url`` plus its most frequent label columns.

    Label frequency is the column sum over all rows. The
    ``most_common_labels`` highest-scoring labels are kept, most frequent
    first, after the leading ``url`` column.
    """
    label_columns = [name for name in df.columns if name != "url"]
    counts = df[label_columns].sum(axis=0)
    ranked = counts.sort_values(ascending=False)
    kept = ranked.head(most_common_labels).index.to_list()
    return df[["url", *kept]]

In [16]:
# Keep the "url" column plus the 50 labels that are set on the most pictures.
vinted50 = get_smaller_df(df, most_common_labels=50)
print(f"{df.shape} -> {vinted50.shape}") # 334 and not 491: classes overlap (Women's jeans, Men's jeans) => (Women, Men, Jeans)

(77548, 334) -> (77548, 51)


Take this step if you want only the most granular categories without their ancestors.

In [17]:
# Keep "url" plus the leaf (most granular) labels that made it into df.
# dict.fromkeys removes repeated titles while preserving catalog order, so the
# column order is identical on every run (iterating a set of strings is not).
leaf_labels = list(dict.fromkeys(
    path[-1][1] for path in list_categories if path[-1][1] in df.columns
))
vinted_all = df[['url'] + leaf_labels]
print(f"{df.shape} -> {vinted_all.shape}")

(77548, 334) -> (77548, 305)


Download the pictures, resize them and save them to disk in all the appropriate folders. Time for another coffee.

In [1]:
def dl_picture(kwargs):
    """Download one picture, shrink it and save a copy under each class folder.

    Args:
        kwargs: A ``(url, dests)`` pair, where ``dests`` lists every file path
            the picture must be written to. Packed into one argument because
            the function is mapped over ``dict.items()``.

    Only destinations that do not exist yet are written, so rerunning after a
    partial failure fills in the missing copies. The picture is skipped when
    the request times out or does not return HTTP 200.
    """
    url, dests = kwargs # thread_map only takes a single argument

    missing = [dest for dest in dests if not os.path.isfile(dest)]
    if not missing:
        return
    for dest in missing:
        # make sure dest's parent directories exist
        os.makedirs(os.path.dirname(dest), exist_ok=True)

    try:
        # A bounded wait keeps one stalled connection from blocking a worker forever.
        req = requests.get(url, timeout=30)
    except requests.exceptions.Timeout:
        return
    if req.status_code != 200:
        return

    # resize picture
    ratio = 800/224 # from 800 px to 224 px
    with Image.open(io.BytesIO(req.content)) as original:
        # Never ask for a zero-pixel side, which resize rejects.
        new_size = (max(1, int(original.width/ratio)), max(1, int(original.height/ratio)))
        picture = original.resize(new_size)
    if picture.mode != "RGB":
        # JPEG cannot store palette or alpha modes.
        picture = picture.convert("RGB")

    for dest in missing:
        picture.save(dest, quality=50)

        
def dl_pictures(res):
    """Run ``dl_picture`` over every ``(url, destinations)`` entry of ``res``.

    The entries are dispatched through ``thread_map`` with a tqdm progress bar.
    """
    jobs = res.items()
    thread_map(dl_picture, jobs, tqdm_class=tqdm)

def url_df_to_folder(df, which_dataset):
    """Download every picture of ``df`` into one folder per label it carries.

    Args:
        df: Frame indexed by picture id with a ``url`` column and one boolean
            column per label.
        which_dataset: Root folder; files go to
            ``{which_dataset}/{label}/{picture_id}.jpeg``.

    Builds the ``{url: [destinations]}`` mapping and hands it to
    ``dl_pictures``.
    """
    label_columns = [column for column in df.columns if column != "url"]
    res = dict()
    # iterrows yields exactly one Series per row, even when an id is repeated
    # in the index (a label-based lookup would return a frame in that case).
    for item, row in df.iterrows():
        flags = row[label_columns]
        keywords = flags[flags.eq(True)].index.to_list()
        res[row["url"]] = [f"{which_dataset}/{keyword}/{item}.jpeg" for keyword in keywords]
    dl_pictures(res)

def replace_special_chars(columns):
    """Return ``columns`` lower-cased with special characters replaced.

    Spaces and hyphens become underscores, apostrophes and commas are removed,
    ``&`` becomes ``and`` and ``¾`` becomes ``three_quarters``. The
    substitutions are applied in the order listed in ``substitutions``.
    """
    substitutions = (
        (" ", "_"),
        ("-", "_"),
        ("'", ""),
        ("&", "and"),
        ('¾', 'three_quarters'),
        (',', ''),
    )
    cleaned = columns.str.lower()
    for old, new in substitutions:
        # All patterns are literal text, not regular expressions.
        cleaned = cleaned.str.replace(old, new, regex=False)
    return cleaned

SyntaxError: invalid syntax (2281969397.py, line 44)

In [20]:
# Normalise the label names before they are used as folder names, then download every picture into its label folders.
vinted50.columns = replace_special_chars(vinted50.columns)
url_df_to_folder(vinted50, "vinted50") # if this fails, just rerun the cell

100%|██████████| 77548/77548 [06:12<00:00, 208.18it/s]   


In [26]:
# Normalise the label names before they are used as folder names, then download every picture into its label folders.
vinted_all.columns = replace_special_chars(vinted_all.columns)
url_df_to_folder(vinted_all, "vinted_all") # if this fails, just rerun the cell

100%|██████████| 77548/77548 [04:18<00:00, 299.67it/s]   


Save each dataset's DataFrame to a CSV file.

In [None]:
# Write the vinted50 table (index, url and label columns) to the current working directory.
vinted50.to_csv("vinted50.csv")

In [None]:
# Write the vinted_all table (index, url and label columns) to the current working directory.
vinted_all.to_csv("vinted_all.csv")

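Split the pictures into train and validation folders and zip them for upload to Kaggle. Note that the archive contains only the pictures; the CSV files are saved separately by the cells above.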

In [22]:
def make_train_val(folder):
    """Move the pictures of ``folder`` into ``{folder}_train`` and ``{folder}_val``.

    Every label sub-folder is split on its own: its file names are shuffled
    with the global ``random`` generator and every fifth file goes to the
    validation folder, the rest to the training folder. Files are moved, so
    the label folders under ``folder`` end up empty.

    Labels and file names are sorted before shuffling so that a fixed seed
    gives the same split regardless of the order in which the file system
    lists entries. Entries of ``folder`` that are not directories are ignored.
    """
    # NOTE(review): each label is split independently, so a picture saved under
    # several labels can land in train for one label and in val for another —
    # confirm this is acceptable for the intended evaluation.
    train_folder, val_folder = f"{folder}_train", f"{folder}_val"
    labels = sorted(
        entry for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    )
    for label in tqdm(labels):
        # make train and val sub folders
        os.makedirs(os.path.join(train_folder, label), exist_ok=True)
        os.makedirs(os.path.join(val_folder, label), exist_ok=True)

        filenames = sorted(os.listdir(os.path.join(folder, label)))
        random.shuffle(filenames)
        for position, filename in enumerate(filenames):
            target_root = val_folder if position % 5 == 0 else train_folder
            shutil.move(
                os.path.join(folder, label, filename),
                os.path.join(target_root, label, filename),
            )

def zip_train_val(prefix):
    """Zip ``{prefix}_train`` and ``{prefix}_val`` into ``{prefix}_{YYYY-MM-DD}.zip``.

    Every file found one level below each label folder is added under the
    path it has on disk; the training folder is written first, then the
    validation folder, each with its own progress bar.
    """
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    splits = (
        (f"{prefix}_train", "Train:"),
        (f"{prefix}_val", "Validation:"),
    )
    with zipfile.ZipFile(f"{prefix}_{today}.zip", "w") as zip_file:
        for split_folder, caption in splits:
            for label in tqdm(os.listdir(split_folder), desc=caption):
                label_dir = os.path.join(split_folder, label)
                for file in os.listdir(label_dir):
                    zip_file.write(os.path.join(label_dir, file))
                

In [23]:
# Move the downloaded vinted50 pictures into vinted50_train / vinted50_val (every fifth shuffled file goes to val), then zip both folders.
make_train_val(folder='vinted50')
zip_train_val(prefix='vinted50')

100%|██████████| 50/50 [00:22<00:00,  2.21it/s]


In [27]:
# Move the downloaded vinted_all pictures into vinted_all_train / vinted_all_val (every fifth shuffled file goes to val), then zip both folders.
make_train_val(folder='vinted_all')
zip_train_val(prefix='vinted_all')

100%|██████████| 303/303 [00:08<00:00, 35.07it/s]
Train:: 100%|██████████| 303/303 [00:12<00:00, 23.32it/s]
Validation:: 100%|██████████| 303/303 [00:03<00:00, 83.18it/s] 
