In [1]:
!pip install fake_useragent

Collecting fake_useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: fake_useragent
  Building wheel for fake_useragent (setup.py) ... done
  Created wheel for fake_useragent: filename=fake_useragent-0.1.11-py3-none-any.whl size=13502 sha256=a0ecedc3248adf210e0fcc0cbea5ad3ffc2f346fd8e81162889a624a59260be8
  Stored in directory: /root/.cache/pip/wheels/ed/f7/62/50ab6c9a0b5567267ab76a9daa9d06315704209b2c5d032031
Successfully built fake_useragent
Installing collected packages: fake_useragent
Successfully installed fake_useragent-0.1.11
[0m

In [2]:
import time
import shutil

from urllib.error import HTTPError

import requests
import fake_useragent
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

import pandas as pd

In [3]:
# Top-level domain of the Vinted site to query (e.g. "com").
domain = "com"

# Site root; the two endpoints below are built on top of it.
VINTED_URL = f"https://www.vinted.{domain}"
VINTED_AUTH_URL = f"{VINTED_URL}/auth/token_refresh"  # POSTed to by get_session() to obtain cookies
VINTED_API_URL = f"{VINTED_URL}/api/v2/catalog/items"  # catalog search endpoint

def get_session():
    """Return a new ``requests.Session`` with a random User-Agent and fresh cookies.

    A random User-Agent string is picked through ``fake_useragent`` and stored
    in the session's default headers. The session then POSTs once to
    ``VINTED_AUTH_URL`` so that any cookies sent back are kept on the session.
    The response of that POST is not inspected.
    """
    ua_header = {"User-Agent": fake_useragent.UserAgent().random} # Maybe helpful?

    new_session = requests.Session()
    new_session.headers.update(ua_header)
    new_session.post(VINTED_AUTH_URL, headers=ua_header) # Set cookies
    return new_session

In [4]:
def get_pictures(keyword, num_pictures=10):
    """Yield ``(item_id, photo_url)`` pairs for catalog items matching ``keyword``.

    Result pages are requested from ``VINTED_API_URL`` one after another until
    either ``num_pictures`` pairs have been yielded or a page comes back with
    no items (the search is exhausted), whichever happens first.

    Args:
        keyword: text sent as the ``search_text`` query parameter.
        num_pictures: maximum number of pairs to yield. Values <= 0 yield nothing.

    Yields:
        tuple: ``(item["id"], item["photo"]["url"])`` for every item that has a photo.

    Notes:
        A non-200 response is retried indefinitely with a fresh session, sleeping
        1, 2, 3, ... seconds between attempts.
    """
    if num_pictures <= 0:
        return # Nothing requested: do not touch the network at all

    session = get_session() # May help not to get flagged?

    params = {"search_text": keyword, "per_page": 300} # Seems like max per_page around 300

    cpt_page = 1
    cpt_items = 0

    with tqdm(total=num_pictures) as progress_bar:
        progress_bar.set_description(f"[{keyword}]")
        while True:
            params["page"] = cpt_page

            response = session.get(VINTED_API_URL, params=params)
            sleep_counter = 1
            while response.status_code != 200:
                print(f"[{response.status_code}] Sleeping {sleep_counter} seconds, then retrying.")
                time.sleep(sleep_counter)
                sleep_counter += 1
                session = get_session()
                response = session.get(VINTED_API_URL, params=params)

            items = response.json()["items"]
            if not items:
                # Fewer matches than requested: stop instead of paging through empty results forever
                return

            for item in items:
                # Only the field extraction is guarded, so unrelated TypeErrors are not swallowed
                try:
                    picture = (item["id"], item["photo"]["url"]) # The id is important for multilabels
                except TypeError:
                    continue # Stuff happens (e.g. "photo" is None), keep going

                yield picture
                cpt_items += 1
                progress_bar.update(1)
                if cpt_items >= num_pictures:
                    return
            cpt_page += 1
            
def get_dataframe(keywords, num_pictures):
    """Build a multi-label table of pictures found for several search keywords.

    Each keyword is searched with ``get_pictures(keyword, num_pictures)``. A
    picture returned for several keywords appears once, flagged for each of them.

    Args:
        keywords: iterable of search keywords; each becomes a boolean column.
        num_pictures: maximum number of pictures fetched per keyword.

    Returns:
        pandas.DataFrame: indexed by item id, with a ``url`` column followed by
        one ``bool`` column per keyword (True when the picture was returned for
        that keyword). A keyword without any result gets an all-False column,
        and no results at all gives an empty frame with the same columns.
    """
    keywords = list(keywords) # Accept any iterable (tuple, generator, ...)

    # (id, url) -> keywords that returned this picture, kept in first-seen order
    hits = {}
    for keyword in keywords:
        for item in get_pictures(keyword, num_pictures):
            hits.setdefault(item, set()).add(keyword)

    columns = {"url": [url for _, url in hits]}
    for keyword in keywords:
        columns[keyword] = [keyword in found for found in hits.values()]

    df = pd.DataFrame(columns, index=[item_id for item_id, _ in hits])
    # An empty column has no inferable dtype, so the label columns are forced to bool
    return df.astype({keyword: bool for keyword in keywords})

def dl_picture(kwargs):
    """Download one picture and save it as ``pictures/<id>.jpeg``.

    Args:
        kwargs: an ``(id, url)`` pair, packed into a single argument because
            ``dl_pictures`` hands each pair to this function as one item.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` for a 4xx/5xx response.
            In that case no file is created.
    """
    picture_id, url = kwargs # Each (id, url) pair arrives as a single argument
    # A streamed response keeps its connection until it is closed; the with-block
    # releases it even when raise_for_status() or the file write fails.
    with requests.get(url, stream=True) as req:
        req.raise_for_status()
        req.raw.decode_content = True # Have the raw stream decode any content-encoding
        with open(f"pictures/{picture_id}.jpeg", 'wb') as f:
            shutil.copyfileobj(req.raw, f)
        
def dl_pictures(df):
    """Download every picture listed in ``df`` using a pool of threads.

    Args:
        df: frame whose index holds the item ids and whose ``url`` column holds
            the picture URLs (the layout returned by ``get_dataframe``).
    """
    id_url_pairs = df["url"].to_dict().items()
    thread_map(dl_picture, id_url_pairs, tqdm_class=tqdm)

In [5]:
!mkdir pictures

In [6]:
# Search terms to scrape; get_dataframe() turns each one into a boolean label column
keywords = ["coat", "winter"]

In [7]:
# Upper bound on pictures fetched per keyword (passed to get_pictures for each keyword)
num_pictures = 1000

# One row per distinct picture, with its url and one boolean column per keyword
df = get_dataframe(keywords, num_pictures)

[coat]: 100%|██████████| 1000/1000 [00:07<00:00, 130.92it/s]
[winter]: 100%|██████████| 1000/1000 [00:08<00:00, 113.81it/s]


In [8]:
# Download every picture in df to pictures/<id>.jpeg (the directory must already exist)
dl_pictures(df)

100%|██████████| 1526/1526 [00:20<00:00, 72.71it/s]


In [9]:
# Total on-disk size of the downloaded pictures
!du -hs pictures/

157M	pictures/
