In [3]:
# Install third-party packages imported below (fake_useragent, pandas); the leading `!` runs pip through the shell.
!pip install fake_useragent pandas

Collecting pandas
  Downloading pandas-1.4.2-cp39-cp39-macosx_11_0_arm64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting pytz>=2020.1
  Downloading pytz-2022.1-py2.py3-none-any.whl (503 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.5/503.5 KB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.20.0
  Downloading numpy-1.22.3-cp39-cp39-macosx_11_0_arm64.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.22.3 pandas-1.4.2 pytz-2022.1


In [4]:
import os
import time
import shutil

from urllib.error import HTTPError

import requests
import fake_useragent
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

import pandas as pd

In [5]:
# Top-level domain of the Vinted site to query ("com" -> www.vinted.com).
domain = "com"
# Site root; the two endpoints below are built on top of it.
VINTED_URL = f"https://www.vinted.{domain}"
# Endpoint that get_session() POSTs to so the session picks up cookies.
VINTED_AUTH_URL = f"{VINTED_URL}/auth/token_refresh"
# Catalog search endpoint queried by get_pictures().
VINTED_API_URL = f"{VINTED_URL}/api/v2/catalog/items"

def get_session(timeout=10):
    """Create a ``requests`` session prepared for the Vinted endpoints.

    A random User-Agent is installed as a session-wide default header, then a
    POST is sent to ``VINTED_AUTH_URL`` so the cookies of that response are
    kept on the session.

    Args:
        timeout: Seconds to wait for the auth endpoint. Without a timeout
            ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The configured ``requests.Session``.

    Raises:
        requests.exceptions.RequestException: If the auth request cannot be
            completed (including when it times out).
    """
    session = requests.Session()
    user_agent = fake_useragent.UserAgent().random # Maybe helpful?
    session.headers.update({"User-Agent": user_agent})
    # The session defaults already carry the User-Agent, so it does not need
    # to be repeated on the request itself.
    session.post(VINTED_AUTH_URL, timeout=timeout) # Set cookies
    return session

In [156]:
def get_pictures(keyword, num_pictures=10):
    """Collect photo URLs of catalog items matching ``keyword``.

    Pages of the catalog API are requested one after another until
    ``num_pictures`` distinct items have been gathered, or until a page
    contributes nothing new (the search results are exhausted).

    Args:
        keyword: Search text sent to the catalog endpoint.
        num_pictures: Maximum number of distinct items to collect.

    Returns:
        A dict mapping item id to photo URL. It holds fewer than
        ``num_pictures`` entries when the search does not have that many
        usable results, and is empty when ``num_pictures`` is not positive.
    """
    res = dict()
    if num_pictures <= 0:
        return res # Nothing requested: do not even open a session

    session = get_session() # May help not to get flagged?

    params = {"search_text": keyword, "per_page": 300} # Seems like max per_page around 300

    cpt_page = 1
    with tqdm(total=num_pictures) as progress_bar:
        progress_bar.set_description(f"[{keyword}]")
        while True:
            params["page"] = cpt_page

            response = session.get(VINTED_API_URL, params=params)
            sleep_counter = 1
            # Back off a little longer after each failure and retry with a fresh session
            while response.status_code != 200:
                print(f"[{response.status_code}] Sleeping {sleep_counter} seconds, then retrying.")
                time.sleep(sleep_counter)
                sleep_counter += 1
                session = get_session()
                response = session.get(VINTED_API_URL, params=params)

            items = response.json().get("items") or []
            found_new = False
            for item in items:
                try:
                    item_id = item["id"]
                    url = item["photo"]["url"]
                except (TypeError, KeyError):
                    continue # Item without a usable photo entry, keep going
                if item_id in res:
                    continue
                res[item_id] = url
                found_new = True
                progress_bar.update(1)
                if len(res) >= num_pictures:
                    return res

            if not found_new:
                # Empty page or only already-seen items: the results are
                # exhausted, so return what was found instead of looping forever.
                return res
            cpt_page += 1
            
def get_dataframe(keywords, num_pictures):
    """Build a table of scraped pictures with one boolean column per keyword.

    Args:
        keywords: Iterable of search terms; repeated terms are queried once.
        num_pictures: Maximum number of pictures requested per keyword.

    Returns:
        A DataFrame indexed by item id with a ``url`` column followed by one
        boolean column per keyword (in the order given), True where the item
        was returned for that keyword. A keyword with no results still gets
        an all-False column, and the frame is empty (same columns) when
        nothing was found at all.
    """
    keywords = list(dict.fromkeys(keywords)) # Drop repeats, keep first-seen order

    res = dict()
    for keyword in keywords:
        for item_id, url in get_pictures(keyword, num_pictures).items():
            res.setdefault(keyword, dict())[(item_id, url)] = True # {"coat":{(...,...):True,}}

    if not res:
        # No (id, url) index exists to reset, so build the empty frame explicitly
        empty = {"url": pd.Series(dtype=object)}
        empty.update({keyword: pd.Series(dtype=bool) for keyword in keywords})
        return pd.DataFrame(empty)

    # Cells are either True or missing, so notna() yields the boolean label
    # matrix directly; reindex() adds the keywords that returned nothing.
    df = pd.DataFrame(res).reindex(columns=keywords).notna()
    # The tuple keys produce an (id, url) MultiIndex; move the url level into a column
    df = df.reset_index(level=1).rename(columns={"level_1":"url"})
    return df

def dl_picture(kwargs, timeout=30):
    """Download one picture to disk unless the destination file already exists.

    Args:
        kwargs: A ``(dest, url)`` pair, packed into a single argument because
            thread_map only takes a single argument per call.
        timeout: Seconds to wait on the server before ``requests`` gives up
            instead of blocking forever.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an error status.
        OSError: If the file cannot be written.
    """
    dest, url = kwargs
    # check if file exists
    if os.path.isfile(dest):
        return
    # make sure dest's parent directories exist (a bare filename has none)
    parent = os.path.dirname(dest)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Stream into a temporary sibling file and rename it once complete, so an
    # interrupted download never leaves a truncated file that the isfile()
    # check above would later mistake for a finished one.
    partial = f"{dest}.part"
    try:
        # The context manager releases the connection even when an error occurs
        with requests.get(url, stream=True, timeout=timeout) as req:
            req.raise_for_status()
            req.raw.decode_content = True
            with open(partial, 'wb') as f:
                shutil.copyfileobj(req.raw, f)
        os.replace(partial, dest)
    finally:
        # Only still present when something failed before the rename
        if os.path.exists(partial):
            os.remove(partial)
        
def dl_pictures(urls):
    """Download every picture of ``urls`` concurrently, with a progress bar.

    ``urls`` is a Series whose index holds destination paths and whose values
    hold picture URLs; each (destination, url) pair is handed to dl_picture.
    """
    jobs = urls.to_dict().items()
    thread_map(dl_picture, jobs, tqdm_class=tqdm)

def label_index(df, keyword, which_dataset):
    """Return the URLs labelled ``keyword``, indexed by their destination path.

    Rows whose ``keyword`` column is True are kept, and each item id in the
    index is replaced by ``<which_dataset>/<keyword>/<id>.jpeg``.
    """
    labelled_rows = df[df[keyword]]
    urls = labelled_rows["url"]
    prefix = f"{which_dataset}/{keyword}/"
    urls.index = prefix + urls.index.astype(str) + ".jpeg"
    return urls

In [157]:
# Search terms to scrape; also read as a global by folder_to_df when labelling files on disk.
keywords = ["coat", "winter"]

In [158]:
# Number of pictures requested per keyword.
num_pictures = 1000

# Query the catalog once per keyword and assemble the id / url / label table.
df = get_dataframe(keywords, num_pictures)

[coat]: 100%|██████████| 1000/1000 [00:04<00:00, 238.17it/s]
[winter]: 100%|██████████| 1000/1000 [00:04<00:00, 217.33it/s]


In [163]:
def df_to_folder(df, which_dataset):
    """Download the pictures of every label column of ``df``.

    Each column other than ``url`` is treated as a label: its pictures are
    mapped to destination paths by label_index and fetched by dl_pictures.
    """
    label_columns = [column for column in df.columns if column != "url"]
    for keyword in label_columns:
        dl_pictures(label_index(df, keyword, which_dataset))

In [167]:
# Download every labelled picture to train/<keyword>/<id>.jpeg.
df_to_folder(df, "train")

100%|██████████| 1000/1000 [00:07<00:00, 135.79it/s]
100%|██████████| 1000/1000 [00:07<00:00, 139.69it/s]


In [174]:
def get_files(folder):
    """Yield the path of every ``.jpeg`` file found under ``folder``.

    The directory tree is walked recursively; a folder that does not exist
    yields nothing.
    """
    # Walk the requested folder (previously the path "train" was hard-coded
    # here and the argument was ignored).
    for root, _dirs, files in os.walk(folder):
        for filename in files:
            if filename.endswith(".jpeg"):
                yield os.path.join(root, filename)

def folder_to_df(folder, labels=None):
    """Rebuild the boolean label table from pictures stored on disk.

    Files are expected in the layout written by label_index,
    ``<folder>/<label>/<id>.jpeg``: the name of the directory holding a file
    is its label and the file name up to the first dot is the picture id.

    Args:
        folder: Root directory to scan for ``.jpeg`` files.
        labels: Label names to produce columns for. Defaults to the
            notebook-level ``keywords`` list.

    Returns:
        A DataFrame indexed by picture id (as a string) with one boolean
        column per label, in the order given. A picture stored under several
        label directories is True in each of those columns.
    """
    if labels is None:
        labels = keywords # Fall back to the notebook-level keyword list
    labels = list(labels)

    found = dict() # picture id -> set of labels whose directory holds it
    for path in get_files(folder):
        picture_id = os.path.basename(path).split(".")[0]
        # Compare the parent directory name exactly: a substring test on the
        # whole path would also match e.g. "coat" inside "raincoat".
        label = os.path.basename(os.path.dirname(path))
        matches = found.setdefault(picture_id, set())
        if label in labels:
            matches.add(label)

    rows = [[label in found[picture_id] for label in labels] for picture_id in found]
    return pd.DataFrame(rows, index=list(found), columns=labels).astype(bool)


In [175]:
# Rebuild the label table from the files on disk; the result is shown as the cell output.
folder_to_df("train")

Unnamed: 0,coat,winter
1762203209,True,False
1776903340,True,False
1779804260,True,False
1744632064,True,False
1780366392,True,False
...,...,...
1772434192,False,True
1735978523,False,True
1753320811,False,True
1490413458,False,True


In [176]:
# Display the table built by get_dataframe.
df

Unnamed: 0,url,coat,winter
1779529721,https://images1.vinted.net/t/03_00d5a_wf4MJH8o...,True,False
1777086609,https://images1.vinted.net/t/03_01665_ZU5B1LEm...,True,False
1779242336,https://images1.vinted.net/t/03_0134d_uv64T5Kt...,True,False
1760057441,https://images1.vinted.net/t/03_00955_i3mek8YE...,True,False
1734777110,https://images1.vinted.net/t/01_02647_e3cKxyHP...,True,False
...,...,...,...
1708908610,https://images1.vinted.net/t/03_02098_ppF5UGtc...,False,True
1706296211,https://images1.vinted.net/t/03_01641_S6FU2imP...,False,True
1706075355,https://images1.vinted.net/t/02_00e6e_ESEYHTkt...,False,True
1704219467,https://images1.vinted.net/t/03_01e74_6SMhRMfe...,False,True
