In [1]:
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
from urllib.parse import urlparse, urlsplit, urljoin
import re
import os
from tqdm.auto import tqdm, trange
import pandas as pd
from pathlib import Path

# Request headers: a User-Agent string picked by fake_useragent's `.random`
# once, when this cell runs, and reused wherever `headers=HEADERS` is passed.
HEADERS = {"User-Agent": UserAgent().random}

# Short site identifier; used as a directory name under the output path and
# in progress messages.
WEBSITE_NAME = "sneakerbaas"
# Base listing URL. The trailing slash matters: collection slugs are resolved
# against it with urljoin, which would otherwise replace the last path segment.
COLLECTIONS_URL = "https://www.sneakerbaas.com/collections/sneakers/"
# Site root (note the trailing slash).
HOSTNAME_URL = "https://www.sneakerbaas.com/"
# Collection slugs that are scraped, in this order.
COLLECTIONS = [
    "category-kids",
    "category-unisex",
    "category-women",
    "category-men",
]


def get_hostname_url(url):
    """Return the ``scheme://netloc`` prefix of ``url``.

    Path, query string and fragment are dropped.
    """
    scheme, netloc = urlsplit(url)[:2]
    return "{}://{}".format(scheme, netloc)


def remove_query_from_url(url):
    """Return ``url`` with its query string removed.

    All other components (scheme, host, path, fragment) are kept as parsed.
    """
    parts = urlparse(url)
    without_query = parts._replace(query="")
    return without_query.geturl()


def add_https_to_url(url):
    """Return ``url`` with its scheme set to ``https``.

    Works for scheme-relative URLs (``//host/path``); an existing scheme is
    overwritten.
    """
    parts = urlparse(url)
    secured = parts._replace(scheme="https")
    return secured.geturl()


def get_image_extension(url):
    """Return the file extension (including the leading dot) of a URL's path.

    Only the path component is inspected, so query strings and fragments are
    ignored. The extension is the part after the *last* dot of the final path
    segment, so ``air.max.90.png`` yields ``".png"``. When the file name has
    no extension an empty string is returned instead of raising.

    Args:
        url: Image URL.

    Returns:
        The extension such as ``".jpg"``, or ``""`` if there is none.
    """
    # splitext only looks at the last path segment and the last dot in it.
    return os.path.splitext(urlparse(url).path)[1]


def get_max_file_name(folder):
    """Return the largest integer file stem found directly inside ``folder``.

    Entries whose stem is not a plain decimal number (for example
    ``.DS_Store`` or ``notes.txt``) are ignored rather than causing a
    ``ValueError``. The directory is listed only once.

    Args:
        folder: Path of an existing directory.

    Returns:
        The highest numeric stem as an ``int``, or ``-1`` when the directory
        contains no numerically named entries.
    """
    numeric_stems = (
        int(entry.stem)
        for entry in Path(folder).iterdir()
        if entry.stem.isdecimal()
    )
    return max(numeric_stems, default=-1)


def add_page_to_url(url, page_number):
    """Return ``url`` with its query string set to ``page=<page_number>``.

    Any query string already present on ``url`` is replaced, not extended.
    """
    page_query = "page={}".format(page_number)
    return urlparse(url)._replace(query=page_query).geturl()

In [2]:
# url = "https://www.sneakerbaas.com/collections/sneakers"

In [3]:
def get_collection_info(collection=""):
    """Fetch a collection listing page and read its product and page counts.

    Args:
        collection: Collection slug, resolved against ``COLLECTIONS_URL`` with
            ``urljoin``. The default empty string leaves ``COLLECTIONS_URL``
            itself as the target.

    Returns:
        dict with the keys

        * ``"url"`` - the listing URL that was requested,
        * ``"number_of_products"`` - first integer found in the text of the
          first element whose class matches ``collection-size``,
        * ``"number_of_pages"`` - number read from the pager, or ``1`` when
          the page has no pager element.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.Timeout: no response within the timeout.
        IndexError, AttributeError: the expected markup was not found.
    """
    info = {"url": urljoin(COLLECTIONS_URL, collection)}
    # A timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(info["url"], headers=HEADERS, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    products_string = soup.find_all(class_=re.compile("collection-size"))[
        0
    ].text.strip()
    info["number_of_products"] = int(re.search(r"\d+", products_string).group())

    # Raw string: "\S" is not a valid escape in an ordinary string literal.
    # The lookarounds match "pagination" only as a whole class token.
    pagination = soup.find(class_=re.compile(r"(?<!\S)pagination(?!\S)"))
    if pagination is None:
        # No pager on the page; assumed to mean all products fit on one page.
        info["number_of_pages"] = 1
    else:
        # The second-to-last <span> of the pager holds the last page link.
        info["number_of_pages"] = int(pagination.find_all("span")[-2].a.text)
    return info


In [4]:
def get_sneakers_urls(url, sneakers_url_path="/collections/sneakers/products"):
    """Collect the distinct product URLs linked from one listing page.

    Args:
        url: Listing page URL to download.
        sneakers_url_path: Pattern (used as a regular expression) that an
            element's ``href`` must match to count as a product link.

    Returns:
        set of absolute product URLs.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.Timeout: no response within the timeout.
    """
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # urljoin instead of string concatenation: HOSTNAME_URL ends with "/" and
    # the hrefs start with "/", so concatenating yields "//" after the host.
    return {
        urljoin(HOSTNAME_URL, item["href"])
        for item in soup.find_all(href=re.compile(sneakers_url_path))
    }

In [5]:
def get_sneakers_metadata(url, soup, collection):
    """Build the metadata record for one product page.

    Args:
        url: Product page URL, stored under ``"url"``.
        soup: Parsed product page.
        collection: Collection slug, stored under ``"collection"``.

    Returns:
        dict with ``"url"``, ``"collection"``, every ``<meta itemprop=...>``
        value among brand / description / priceCurrency / price found in the
        first ``page-row-content`` block (non-breaking spaces replaced by
        plain spaces), the brand lower-cased, and ``"title"``.
    """
    wanted_props = ("brand", "description", "priceCurrency", "price")
    product_row = soup.find_all(name="div", class_="page-row-content")[0]
    meta_tags = product_row.div.find_all(name="meta")

    record = {"url": url, "collection": collection}
    for tag in meta_tags:
        if not tag.has_attr("itemprop"):
            continue
        prop = tag["itemprop"]
        if prop not in wanted_props:
            continue
        record[prop] = tag["content"].replace("\xa0", " ")

    # The description is kept as one raw string; it is not split into
    # separate fields here.

    # Normalise the brand to lower case.
    record["brand"] = record["brand"].lower()

    # The title is the text of the third <span> inside <main id="MainContent">.
    main_section = soup.find(name="main", id="MainContent")
    record["title"] = main_section.find_all(name="span")[2].text
    return record


def get_sneakers_images(soup):
    """Download every gallery image referenced on a product page.

    Each ``div`` with class ``swiper-slide product-image`` is expected to
    contain an ``<a data-fancybox="productGallery">`` whose ``href`` is the
    image URL. Slides without such a link are skipped. The URL is stripped of
    its query string and forced to ``https`` before downloading.

    Args:
        soup: Parsed product page.

    Returns:
        list of ``(image_bytes, extension)`` tuples in page order.

    Raises:
        requests.HTTPError: an image request returned an error status, so an
            error page is never stored as image data.
        requests.Timeout: no response within the timeout.
    """
    images = []
    for product_image in soup.find_all(
        name="div", class_="swiper-slide product-image"
    ):
        link = product_image.find("a", {"data-fancybox": "productGallery"})
        if link is None or not link.has_attr("href"):
            # Slide without a gallery link: nothing to download.
            continue
        image_url = add_https_to_url(remove_query_from_url(link["href"]))
        # Same headers as the page requests, plus a timeout and status check.
        response = requests.get(image_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        images.append((response.content, get_image_extension(image_url)))
    return images


def save_sneakers_images_local(images, metadata, path="data"):
    """Write downloaded images into the product's photo directory.

    The directory is
    ``<path>/<WEBSITE_NAME>/<collection>/photos/<brand>/<title>`` and is
    created if missing. Files are named with consecutive integers that
    continue after the highest number already present in that directory.

    Args:
        images: Iterable of ``(image_bytes, extension)`` tuples.
        metadata: Mapping providing ``"collection"``, ``"brand"`` and
            ``"title"``.
        path: Root output directory.

    Returns:
        The directory the images were written to.
    """
    path_parts = (
        path,
        WEBSITE_NAME,
        metadata["collection"],
        "photos",
        metadata["brand"],
        metadata["title"],
    )
    target_dir = os.path.join(*path_parts)
    os.makedirs(target_dir, exist_ok=True)

    # Continue numbering right after the largest existing file name.
    first_index = get_max_file_name(target_dir) + 1
    for index, (image_binary, image_ext) in enumerate(images, start=first_index):
        file_path = os.path.join(target_dir, str(index) + image_ext)
        with open(file_path, "wb") as f:
            f.write(image_binary)

    return target_dir

In [6]:
def parse_sneakers(url, collection, path="data"):
    """Scrape one product page: metadata plus images saved to disk.

    Args:
        url: Product page URL.
        collection: Collection slug the product was found in.
        path: Root output directory for the images.

    Returns:
        The metadata dict from ``get_sneakers_metadata`` with an extra
        ``"photos_path"`` key holding the directory the images were saved to.

    Raises:
        requests.HTTPError: the product page returned an error status.
        requests.Timeout: no response within the timeout.
    """
    # Timeout and status check: fail with a clear HTTP error instead of
    # hanging or parsing an error page as if it were a product page.
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    metadata = get_sneakers_metadata(url, soup, collection)
    images = get_sneakers_images(soup)
    photos_path = save_sneakers_images_local(images, metadata, path)  # or S3
    metadata["photos_path"] = photos_path
    return metadata


In [2]:
# Scratch cell: download a single product page so its markup can be inspected.
# It repeats imports from the first cell and rebinds the notebook-level names
# HEADERS (a newly picked random User-Agent), r and soup.
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
HEADERS = {"User-Agent": UserAgent().random}
r = requests.get("https://www.sneakerbaas.com/collections/sneakers/products/palermo-lth-black", headers=HEADERS)
soup = BeautifulSoup(r.text, "html.parser")

In [3]:
# Display the <meta> tags of the first "page-row-content" block; this is the
# same selector chain that get_sneakers_metadata uses.
soup.find_all(name="div", class_="page-row-content")[0].div.find_all(
        name="meta"
    )

[<meta 46"="" black"-="" content="Palermo Lth " itemprop="name"/>,
 <meta content="https://www.sneakerbaas.com/products/palermo-lth-black?variant=48668399927622" itemprop="url"/>,
 <meta content="Puma Sportstyle" itemprop="brand"/>,
 <meta content="//www.sneakerbaas.com/cdn/shop/files/Pblack1_600x600.jpg?v=1697702678" itemprop="image"/>,
 <meta content='- Puma Palermo Lth "Black"- Colour: Black / White- Stijlcode: 396464-03' itemprop="description"/>,
 <meta content="EUR" itemprop="priceCurrency"/>,
 <meta content="89.95" itemprop="price"/>]

In [7]:
# page_url = f"https://www.sneakerbaas.com/collections/men?page={2}"
# temp = get_sneakers_info(page_url)
# sneakers_urls = temp["sneakers_urls"]
# number_of_urls = temp["number_of_urls"]
# print(f"{number_of_urls} кроссовок на странице")


In [8]:
# full_metadata = []
# for sneakers_url in sneakers_urls:
#     metadata = get_metadata_photos(sneakers_url)
#     full_metadata.append(metadata)

In [9]:
# import pandas as pd


# df = pd.DataFrame(full_metadata)
# df


In [10]:
# df.to_csv("parser/sneakerbaas/metadata.csv", index=False)


In [11]:
def parse_sneakerbaas(path="data"):
    """Scrape every collection in ``COLLECTIONS`` and write metadata CSVs.

    For each collection, every listing page is visited, each product on it is
    scraped with ``parse_sneakers`` (which also saves its images under
    ``path``), and the rows are written to
    ``<path>/<WEBSITE_NAME>/<collection>/metadata.csv``. A combined
    ``<path>/<WEBSITE_NAME>/metadata.csv`` is written at the end.

    Args:
        path: Root output directory for both images and CSV files.

    Returns:
        None. Progress is printed and results are written to disk.
    """
    full_collection = get_collection_info()
    print(
        f"{WEBSITE_NAME} website: {full_collection['number_of_products']} sneakers found"
    )
    print(f"{len(COLLECTIONS)} collections found")

    full_metadata = []

    for collection in COLLECTIONS:
        metadata_collection = []
        collection_info = get_collection_info(collection)
        print(
            f"Parsing collection: {collection}, found {collection_info['number_of_pages']} pages, "
            f"{collection_info['number_of_products']} products, {collection_info['url']}"
        )
        for page in trange(1, collection_info["number_of_pages"] + 1):
            page_url = add_page_to_url(collection_info["url"], page)
            print(page_url)
            sneakers_urls = get_sneakers_urls(page_url)
            # Sorted so the row order does not depend on set iteration order.
            for sneakers_url in tqdm(sorted(sneakers_urls)):
                # Forward `path` so images land under the same root as the CSVs.
                metadata = parse_sneakers(sneakers_url, collection, path)
                metadata_collection.append(metadata)

        print(
            f"Collected {len(metadata_collection)} sneakers out of {collection_info['number_of_products']} in "
            f"{collection} collection"
        )
        # The directory normally exists once images were saved, but not when a
        # collection yielded no products; create it so to_csv cannot fail.
        collection_dir = os.path.join(path, WEBSITE_NAME, collection)
        os.makedirs(collection_dir, exist_ok=True)
        df = pd.DataFrame(metadata_collection)
        df.to_csv(os.path.join(collection_dir, "metadata.csv"), index=False)

        full_metadata += metadata_collection

    website_dir = os.path.join(path, WEBSITE_NAME)
    os.makedirs(website_dir, exist_ok=True)
    df = pd.DataFrame(full_metadata)
    df.to_csv(os.path.join(website_dir, "metadata.csv"), index=False)

    # A product can be listed in several collections, so the number of rows
    # can exceed the site-wide product count; report unique URLs as well.
    unique_sneakers = len({entry["url"] for entry in full_metadata})
    print(
        f"Collected {len(full_metadata)} collection entries ({unique_sneakers} unique sneakers) out of "
        f"{full_collection['number_of_products']} in {WEBSITE_NAME} website"
    )

In [12]:
# Run the full scrape with the default output directory ("data"). This issues
# network requests and writes images and CSV files to disk.
parse_sneakerbaas()

sneakerbaas website: 1002 sneakers found
4 collections found
Parsing collection: category-kids, found 2 pages, 53 products, https://www.sneakerbaas.com/collections/sneakers/category-kids


  0%|          | 0/2 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-kids?page=1


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-kids?page=2


  0%|          | 0/9 [00:00<?, ?it/s]

Collected 53 sneakers out of 53 in category-kids collection
Parsing collection: category-unisex, found 10 pages, 405 products, https://www.sneakerbaas.com/collections/sneakers/category-unisex


  0%|          | 0/10 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=1


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=2


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=3


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=4


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=5


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=6


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=7


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=8


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=9


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-unisex?page=10


  0%|          | 0/9 [00:00<?, ?it/s]

Collected 405 sneakers out of 405 in category-unisex collection
Parsing collection: category-women, found 6 pages, 242 products, https://www.sneakerbaas.com/collections/sneakers/category-women


  0%|          | 0/6 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-women?page=1


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-women?page=2


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-women?page=3


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-women?page=4


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-women?page=5


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-women?page=6


  0%|          | 0/22 [00:00<?, ?it/s]

Collected 242 sneakers out of 242 in category-women collection
Parsing collection: category-men, found 10 pages, 400 products, https://www.sneakerbaas.com/collections/sneakers/category-men


  0%|          | 0/10 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=1


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=2


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=3


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=4


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=5


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=6


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=7


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=8


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=9


  0%|          | 0/44 [00:00<?, ?it/s]

https://www.sneakerbaas.com/collections/sneakers/category-men?page=10


  0%|          | 0/4 [00:00<?, ?it/s]

Collected 400 sneakers out of 400 in category-men collection
Collected 1100 sneakers out of 1002 in sneakerbaas website
