In [1]:
# Install the extra dependencies of this notebook. `requests` and `tqdm` are
# imported in the next cell but are not installed here, so they must already
# be available in the environment.
!pip install fake_useragent pandas

Collecting fake_useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: fake_useragent
  Building wheel for fake_useragent (setup.py) ... [?25l- \ done
[?25h  Created wheel for fake_useragent: filename=fake_useragent-0.1.11-py3-none-any.whl size=13502 sha256=1b583b2d1acb9970f649ea2912d1c9bb4afe475c6382e261b74685d7a18b7f17
  Stored in directory: /root/.cache/pip/wheels/ed/f7/62/50ab6c9a0b5567267ab76a9daa9d06315704209b2c5d032031
Successfully built fake_useragent
Installing collected packages: fake_useragent
Successfully installed fake_useragent-0.1.11
[0m

In [2]:
import os
import time
import shutil

from urllib.error import HTTPError

import requests
import fake_useragent
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

import pandas as pd

In [3]:
# Top-level domain of the Vinted site to query; every endpoint below is
# derived from the resulting base URL.
domain = "com"
VINTED_URL = f"https://www.vinted.{domain}"
VINTED_AUTH_URL = VINTED_URL + "/auth/token_refresh"
VINTED_API_URL = VINTED_URL + "/api/v2/catalog/items"

def get_session(timeout=10):
    """Create a `requests.Session` primed with a random User-Agent and cookies.

    Args:
        timeout: Seconds to wait for the cookie-priming request before
            `requests` raises a timeout error. Without it a stalled
            connection would block the notebook forever.

    Returns:
        The session, with its ``User-Agent`` header set and whatever cookies
        the POST to ``VINTED_AUTH_URL`` stored on it.
    """
    session = requests.Session()
    user_agent = fake_useragent.UserAgent().random  # Maybe helpful?
    session.headers.update({"User-Agent": user_agent})
    # The session-level header is already sent with this request, so it does
    # not need to be repeated; the call is only made to set cookies.
    session.post(VINTED_AUTH_URL, timeout=timeout)
    return session

In [4]:
def get_pictures(keyword, num_pictures=10, per_page=300):
    """Collect up to `num_pictures` distinct photo URLs for a search keyword.

    Pages of ``VINTED_API_URL`` are requested one after another until enough
    distinct items have been seen, or until a page brings nothing new (the
    search is exhausted). The original loop had no such exit and would spin
    forever when fewer than `num_pictures` results exist.

    Args:
        keyword: Text sent as the ``search_text`` query parameter.
        num_pictures: Maximum number of distinct items to return. Values
            <= 0 return an empty dict without any network access.
        per_page: Page size sent as ``per_page`` (seems like the max is
            around 300).

    Returns:
        dict mapping item id -> photo URL, in the order the items were seen.
        It may hold fewer than `num_pictures` entries if the search runs out.
    """
    res = dict()
    if num_pictures <= 0:
        return res

    session = get_session()  # May help not to get flagged?

    params = {"search_text": keyword, "per_page": per_page}

    cpt_page = 1
    with tqdm(total=num_pictures) as progress_bar:
        progress_bar.set_description(f"[{keyword}]")
        while True:
            params["page"] = cpt_page

            response = session.get(VINTED_API_URL, params=params)
            sleep_counter = 1
            # Linear back-off with a fresh session on every non-200 answer.
            while response.status_code != 200:
                print(f"[{response.status_code}] Sleeping {sleep_counter} seconds, then retrying.")
                time.sleep(sleep_counter)
                sleep_counter += 1
                session = get_session()
                response = session.get(VINTED_API_URL, params=params)

            items = response.json()["items"]
            found_before = len(res)
            for item in items:
                try:
                    item_id = item["id"]
                    url = item["photo"]["url"]
                except (TypeError, KeyError):
                    continue  # Item without a usable photo, keep going
                if item_id in res:
                    continue
                res[item_id] = url
                progress_bar.update(1)
                if len(res) >= num_pictures:
                    return res

            # An empty page, or one made only of already-seen items, means
            # there is nothing more to fetch: stop instead of paging forever.
            if len(res) == found_before:
                return res
            cpt_page += 1
            
def get_dataframe(keywords, num_pictures):
    """Scrape every keyword and gather the results into one labelled table.

    Args:
        keywords: Iterable of search keywords; each becomes a boolean column.
        num_pictures: Number of pictures requested per keyword (forwarded to
            `get_pictures`).

    Returns:
        DataFrame indexed by item id with a ``url`` column followed by one
        boolean column per keyword (True when the item was returned for that
        keyword). Rows are in first-seen order. The frame is built
        explicitly, so the columns are real booleans and an empty scrape
        yields an empty frame instead of raising.
    """
    keywords = list(dict.fromkeys(keywords))  # de-duplicate, keep order

    # (item id, url) -> set of keywords the item was found under
    found_under = dict()
    for keyword in keywords:
        for item_id, url in get_pictures(keyword, num_pictures).items():
            found_under.setdefault((item_id, url), set()).add(keyword)

    data = {"url": [url for _, url in found_under]}
    for keyword in keywords:
        data[keyword] = [keyword in hits for hits in found_under.values()]

    index = [item_id for item_id, _ in found_under]
    return pd.DataFrame(data, index=index, columns=["url", *keywords])

def dl_picture(kwargs):
    """Download one picture to disk unless it is already there.

    Args:
        kwargs: ``(dest, url)`` pair. It is packed in one value because
            thread_map passes each item as a single argument.

    Raises:
        Whatever `requests` raises for connection problems, timeouts or HTTP
        error statuses, and OSError for file-system failures.

    The body is streamed into ``<dest>.part`` and only renamed to `dest` once
    complete. A failed or interrupted download therefore never leaves a
    truncated file at `dest`, which the exists-check would skip on later runs.
    """
    dest, url = kwargs
    # check if file exists
    if os.path.isfile(dest):
        return
    # make sure dest's parent directories exist (none for a bare filename)
    parent = os.path.dirname(dest)
    if parent:
        os.makedirs(parent, exist_ok=True)

    partial = f"{dest}.part"
    try:
        # The context manager releases the streamed connection when done.
        with requests.get(url, stream=True, timeout=30) as req:
            req.raise_for_status()
            req.raw.decode_content = True
            with open(partial, "wb") as f:
                shutil.copyfileobj(req.raw, f)
        os.replace(partial, dest)
    finally:
        # Only still present when something above failed.
        if os.path.exists(partial):
            os.remove(partial)
        
def dl_pictures(urls):
    """Download, through a thread pool, every picture listed in `urls`.

    Args:
        urls: pandas Series whose index holds destination paths and whose
            values hold the picture URLs.
    """
    jobs = urls.to_dict().items()  # (destination, url) pairs
    thread_map(dl_picture, jobs, tqdm_class=tqdm)

def label_index(df, keyword, which_dataset):
    """Select the URLs labelled with `keyword`, indexed by destination path.

    Args:
        df: Table with a ``url`` column and a boolean `keyword` column.
        keyword: Label column used to filter the rows.
        which_dataset: Top-level folder name used in the destination paths.

    Returns:
        Series of URLs whose index is
        ``"<which_dataset>/<keyword>/<original index>.jpeg"``.
    """
    selected = df.loc[df[keyword], "url"]
    prefix = f"{which_dataset}/{keyword}/"
    selected.index = prefix + selected.index.astype(str) + ".jpeg"
    return selected

In [5]:
# Search terms to scrape. Each keyword becomes a boolean label column in the
# scraped table and a sub-folder name for the downloaded images.
keywords = ["coat", "winter"]

In [6]:
# Number of distinct pictures requested for each keyword.
num_pictures = 1000

# Query the search API for every keyword and collect the results in one
# table: photo URL plus one boolean column per keyword.
df = get_dataframe(keywords, num_pictures)

[coat]: 100%|██████████| 1000/1000 [00:04<00:00, 233.43it/s]
[winter]: 100%|██████████| 1000/1000 [00:04<00:00, 235.84it/s]


In [7]:
def df_to_folder(df, which_dataset):
    """Download the pictures of every label column of `df`.

    Args:
        df: Table with a ``url`` column; every other column is treated as a
            label and handled in turn.
        which_dataset: Top-level folder name handed to `label_index`.
    """
    label_columns = (column for column in df.columns if column != "url")
    for keyword in label_columns:
        dl_pictures(label_index(df, keyword, which_dataset))

In [8]:
# Download every labelled picture to train/<keyword>/<item id>.jpeg.
df_to_folder(df, "train")

100%|██████████| 1000/1000 [01:14<00:00, 13.37it/s]
100%|██████████| 1000/1000 [01:13<00:00, 13.56it/s]


In [9]:
# Recursively list the working directory to inspect the downloaded files.
!ls -R

.:
__notebook__.ipynb  train

./train:
coat  winter

./train/coat:
1368113692.jpeg  1755989318.jpeg  1771260993.jpeg  1777112272.jpeg
1372636172.jpeg  1755989458.jpeg  1771488171.jpeg  1777113298.jpeg
1382547290.jpeg  1755990248.jpeg  1771816442.jpeg  1777113440.jpeg
1397663920.jpeg  1755990625.jpeg  1772088934.jpeg  1777113524.jpeg
1422925488.jpeg  1755992618.jpeg  1772138767.jpeg  1777113681.jpeg
1671593417.jpeg  1755993024.jpeg  1772254536.jpeg  1777113797.jpeg
1693773530.jpeg  1755993207.jpeg  1772257847.jpeg  1777113993.jpeg
1711090412.jpeg  1755993389.jpeg  1772337764.jpeg  1777114325.jpeg
1713481371.jpeg  1755998081.jpeg  1772370636.jpeg  1777114750.jpeg
1717234186.jpeg  1755999036.jpeg  1772387300.jpeg  1777114792.jpeg
1721400776.jpeg  1755999548.jpeg  1772454275.jpeg  1777114902.jpeg
1722662514.jpeg  1755999644.jpeg  1772458472.jpeg  1777116509.jpeg
1725660727.jpeg  1755999737.jpeg  1772466066.jpeg  1777118252.jpeg
1728427435.jpeg  1755999901.jpeg  17724687

In [10]:
def get_files(folder):
    """Yield the path of every ``.jpeg`` file found under `folder`.

    Args:
        folder: Root directory to walk recursively. The original ignored
            this argument and always walked ``"train"``.

    Yields:
        ``os.path.join(root, filename)`` for each file whose name ends with
        ``.jpeg``, in `os.walk` order.
    """
    for root, _dirs, files in os.walk(folder):
        for filename in files:
            if filename.endswith(".jpeg"):
                yield os.path.join(root, filename)

def folder_to_df(folder):
    """Rebuild the label table from the pictures stored under `folder`.

    A file is labelled by the name of the directory that directly contains
    it, provided that name is one of the module-level `keywords`. The
    original matched keywords as substrings of the whole path, so e.g. a
    ``raincoat`` directory was labelled ``coat``.

    Args:
        folder: Root directory handed to `get_files`.

    Returns:
        DataFrame indexed by file id (file name up to its first dot, as a
        string) with one boolean column per keyword found, in first-seen
        order. Files outside any keyword directory get an all-False row.
    """
    labels_by_id = dict()  # file id -> set of keywords it was found under
    columns = []           # keywords in the order they are first met

    for path in get_files(folder):
        file_id = os.path.basename(path).split(".")[0]
        found = labels_by_id.setdefault(file_id, set())
        label = os.path.basename(os.path.dirname(path))
        if label in keywords:
            found.add(label)
            if label not in columns:
                columns.append(label)

    data = {
        label: [label in found for found in labels_by_id.values()]
        for label in columns
    }
    return pd.DataFrame(data, index=list(labels_by_id), columns=columns)


In [11]:
# Rebuild the label table from the files that were written to disk.
folder_to_df("train")

Unnamed: 0,winter,coat
1451307151,True,False
1765920167,True,True
1726271776,True,False
1736917028,True,False
1774479255,True,True
...,...,...
1777085628,False,True
1734735343,False,True
1741695337,False,True
1759740762,False,True


In [12]:
# Display the scraped table (URL plus one boolean column per keyword).
df

Unnamed: 0,url,coat,winter
1767975533,https://images1.vinted.net/t/03_0085e_3LA9t6JW...,True,False
1779529721,https://images1.vinted.net/t/03_00d5a_wf4MJH8o...,True,False
1779242336,https://images1.vinted.net/t/03_0134d_uv64T5Kt...,True,False
1777100641,https://images1.vinted.net/t/02_002ab_ZLxoQpSQ...,True,False
1780781565,https://images1.vinted.net/t/03_01204_4Pjga85B...,True,False
...,...,...,...
1481987284,https://images1.vinted.net/t/03_009bd_btmVTUdQ...,False,True
1663240236,https://images1.vinted.net/t/02_00f90_EcP8YLbA...,False,True
1632078345,https://images1.vinted.net/t/01_008f4_qLWXhcjd...,False,True
1458912770,https://images1.vinted.net/t/03_01d4c_4dcjpR8P...,False,True
