In [3]:
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
from urllib.parse import urlparse, urlunparse, urlsplit
import re
import os
from tqdm.auto import tqdm, trange
import pandas as pd

HEADERS = {"User-Agent": UserAgent().random}

WEBSITE_NAME = "sneakerbaas"


def get_home_url(url):
    parsed_url = urlsplit(url)
    home_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return home_url


def remove_query_from_url(url):
    parsed_url = urlparse(url)
    parsed_url = parsed_url._replace(query="")
    modified_url = urlunparse(parsed_url)
    return modified_url


def add_https_to_url(url):
    if not url.startswith("http://") and not url.startswith("https://"):
        corrected_url = "https://" + url
    else:
        corrected_url = url
    return corrected_url


def get_image_extension(url):
    parsed_url = urlparse(add_https_to_url(url))
    return "." + parsed_url.path.split("/")[-1].split(".")[1]

In [2]:
url = "https://www.sneakerbaas.com/collections/men"
# url = "https://www.sneakerbaas.com/collections/sneakers"

In [4]:
def get_collection_info(url):
    r = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")
    products_string = soup.find_all(class_=re.compile("collection-size"))[
        0
    ].text.strip()
    products_number = int(re.search(r"\d+", products_string).group())
    number_of_pages = int(
        soup.find_all(class_=re.compile("(?<!\S)pagination(?!\S)"))[0]
        .find_all("span")[-2]
        .a.text
    )
    return {"products_number": products_number, "number_of_pages": number_of_pages}

In [4]:
get_collection_info(url)

{'products_number': 399, 'number_of_pages': 10}

In [5]:
def get_sneakers_info(url):
    r = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")
    home_url = get_home_url(url)
    sneakers_urls = set(
        [
            home_url + item["href"]
            for item in soup.find_all(href=re.compile("/collections/men/products"))
        ]
    )
    return {"sneakers_urls": sneakers_urls, "number_of_urls": len(sneakers_urls)}


In [6]:
def get_metadata_photos(url):
    r = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")

    # metadata
    metadata_keys = ["brand", "description", "priceCurrency", "price"]
    meta_html = soup.find_all(name="div", class_="page-row-content")[0].div.find_all(
        name="meta"
    )
    metadata = {"url": url}
    for meta in meta_html:
        if meta.has_attr("itemprop"):
            if meta["itemprop"] in metadata_keys:
                metadata[meta["itemprop"]] = meta["content"].replace("\xa0", " ")

    # default_fields = ["Description", "Colors", "Stijlcode"]
    # for i, item in enumerate(metadata["description"].split("- ")[1:]):
    #     metadata[default_fields[i]] = item
    # metadata.pop("description")

    # title
    metadata["title"] = (
        soup.find(name="main", id="MainContent").find_all(name="span")[2].text
    )

    # images
    for i, product_image in enumerate(
        soup.find_all(name="div", class_="swiper-slide product-image")
    ):
        image_url = add_https_to_url(
            remove_query_from_url(
                product_image.find("a", {"data-fancybox": "productGallery"})["href"][2:]
            )
        )
        img_binary = requests.get(image_url).content
        os.makedirs(
            os.path.join(
                "parser", WEBSITE_NAME, "photos", metadata["brand"], metadata["title"]
            ),
            exist_ok=True,
        )
        with open(
            os.path.join(
                "parser",
                WEBSITE_NAME,
                "photos",
                metadata["brand"],
                metadata["title"],
                str(i) + get_image_extension(image_url),
            ),
            "wb",
        ) as f:
            f.write(img_binary)

    metadata["photos_path"] = os.path.join("parser", WEBSITE_NAME, metadata["title"])

    return metadata

In [7]:
page_url = f"https://www.sneakerbaas.com/collections/men?page={2}"
temp = get_sneakers_info(page_url)
sneakers_urls = temp["sneakers_urls"]
number_of_urls = temp["number_of_urls"]
print(f"{number_of_urls} кроссовок на странице")


44 кроссовок на странице


In [None]:
full_metadata = []
for sneakers_url in sneakers_urls:
    metadata = get_metadata_photos(sneakers_url)
    full_metadata.append(metadata)

In [None]:
import pandas as pd


df = pd.DataFrame(full_metadata)
df


In [21]:
df.to_csv("parser/sneakerbaas/metadata.csv", index=False)


In [7]:
def parse_sneakerbaas(collection_url):
    metadata_collector = []
    collection_info = get_collection_info(collection_url)
    print("collection:", collection_info)
    for page in trange(1, collection_info["number_of_pages"] + 1):
        page_url = f"https://www.sneakerbaas.com/collections/men?page={page}"
        sneakers_info = get_sneakers_info(page_url)
        # print(f"Number of sneakers on page {page}:", sneakers_info["number_of_urls"])
        for sneakers_url in tqdm(sneakers_info["sneakers_urls"]):
            metadata = get_metadata_photos(sneakers_url)
            metadata_collector.append(metadata)

    print(
        f"Collected {len(metadata_collector)} sneakers out of {collection_info['products_number']}"
    )
    df = pd.DataFrame(metadata_collector)
    df.to_csv("parser/sneakerbaas/metadata.csv", index=False)

In [8]:
parse_sneakerbaas("https://www.sneakerbaas.com/collections/men")

collection: {'products_number': 399, 'number_of_pages': 10}


  0%|          | 0/10 [00:00<?, ?it/s]

Number of sneakers on page: 44


  0%|          | 0/44 [00:00<?, ?it/s]

Number of sneakers on page: 44


  0%|          | 0/44 [00:00<?, ?it/s]

Number of sneakers on page: 44


  0%|          | 0/44 [00:00<?, ?it/s]

IndexError: list index out of range