In [1]:
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlsplit
import re
import os

ua = UserAgent()
headers = {"User-Agent": ua.random}


def get_home_link(url):
    parsed_url = urlsplit(url)
    home_link = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return home_link


def remove_query_from_url(url):
    parsed_url = urlparse(url)
    parsed_url = parsed_url._replace(query="")
    modified_url = urlunparse(parsed_url)
    return modified_url


def add_https_to_url(url):
    if not url.startswith("http://") and not url.startswith("https://"):
        corrected_url = "https://" + url
    else:
        corrected_url = url
    return corrected_url


def get_image_extension(url):
    parsed_url = urlparse(url)
    parsed_url = urlparse(
        add_https_to_url(
            "www.sneakerbaas.com/cdn/shop/files/SH_D1GA330906_03_2000x2000.png"
        )
    )
    return "." + parsed_url.path.split("/")[-1].split(".")[1]


WEBSITE_NAME = "sneakerbaas"

In [2]:
link = "https://www.sneakerbaas.com/collections/men"
# link = "https://www.sneakerbaas.com/collections/sneakers"


In [3]:
def get_website_info(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    products_string = soup.find_all(class_=re.compile("collection-size"))[
        0
    ].text.strip()
    products_number = int(re.search(r"\d+", products_string).group())
    number_of_pages = int(
        soup.find_all(class_=re.compile("(?<!\S)pagination(?!\S)"))[0]
        .find_all("span")[-2]
        .a.text
    )
    return {"products_number": products_number, "number_of_pages": number_of_pages}


In [4]:
get_website_info(link)


{'products_number': 399, 'number_of_pages': 10}

In [5]:
def get_sneakers_links(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    home_link = get_home_link(link)
    sneakers_links = set(
        [
            home_link + item["href"]
            for item in soup.find_all(href=re.compile("/collections/men/products"))
        ]
    )
    return {"sneakers_links": sneakers_links, "number_of_links": len(sneakers_links)}

In [6]:
def get_metadata_photos(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")

    # metadata
    metadata_keys = ["brand","description","priceCurrency", "price"]
    meta_html = soup.find_all(name="div",class_="page-row-content")[0].div.find_all(name="meta")
    print(meta_html)
    temp = {}
    for meta in meta_html:
        if meta.has_attr("itemprop" ):
            if meta["itemprop"] in metadata_keys:
                temp[meta["itemprop"]] = meta["content"].replace('\xa0', ' ')
    

    default_fields = ["Description", "Colors", "Stijlcode"]
    for i, item in enumerate(temp["description"].split("- ")[1:]):
        temp[default_fields[i]] = item
    temp.pop("description")

    # title
    temp["title"] = soup.find(name="main",id="MainContent").find_all(name="span")[2].text

    # images
    for i,product_image in enumerate(soup.find_all(name="div", class_="swiper-slide product-image")):
        image_link = add_https_to_url(remove_query_from_url(product_image.find("a",{'data-fancybox': 'productGallery'})["href"][2:]))
        img_binary = requests.get(image_link).content
        os.makedirs(os.path.join("parser",WEBSITE_NAME, temp["brand"],temp["title"]), exist_ok=True)
        with open(os.path.join("parser",WEBSITE_NAME,temp["brand"], temp["title"], str(i)+ get_image_extension(image_link)), "wb") as f:
            f.write(img_binary)

    temp["photos_path"] = os.path.join("parser",WEBSITE_NAME,temp["title"])

    return temp
    

In [10]:
page_link = f"https://www.sneakerbaas.com/collections/men?page={2}"
temp = get_sneakers_links(page_link)
sneakers_links = temp["sneakers_links"]
number_of_links = temp["number_of_links"]
print(f"{number_of_links} кроссовок на странице")

44 кроссовок на странице


In [11]:
full_metadata = []
for sneakers_link in sneakers_links:
    metadata = get_metadata_photos(sneakers_link)
    full_metadata.append(metadata)


[<meta 40"="" content="Shaq Attaq " gold"-="" itemprop="name"/>, <meta content="https://www.sneakerbaas.com/products/shaq-attaq-ftwwht-cblack-goldmt?variant=46877002563910" itemprop="url"/>, <meta content="Reebok Classics" itemprop="brand"/>, <meta content="//www.sneakerbaas.com/cdn/shop/files/0920095456_650a42e0051cb_600x600.png?v=1695885843" itemprop="image"/>, <meta content='- Reebok SHAQ ATTAQ "FTWWHT/CBLACK/GOLDMT"- Colour: Black / White / Gold- Stijlcode: 100032830' itemprop="description"/>, <meta content="EUR" itemprop="priceCurrency"/>, <meta content="179.95" itemprop="price"/>]
[<meta 42"="" black"-="" content="Shadow 6000 " itemprop="name" purple=""/>, <meta content="https://www.sneakerbaas.com/products/shadow-6000-purple-black?variant=47077545247046" itemprop="url"/>, <meta content="Saucony" itemprop="brand"/>, <meta content="//www.sneakerbaas.com/cdn/shop/files/Schermopname_205_fe054b19-8b5e-4fd2-bd0c-fb2550f5de2c_600x600.png?v=1691480471" itemprop="image"/>, <meta content=

In [20]:
import pandas as pd


df = pd.DataFrame(full_metadata)
df

Unnamed: 0,brand,priceCurrency,price,Description,Colors,Stijlcode,title,photos_path
0,Reebok Classics,EUR,179.95,"Reebok SHAQ ATTAQ ""FTWWHT/CBLACK/GOLDMT""",Colour: Black / White / Gold,Stijlcode: 100032830,"Shaq Attaq ""Gold""","parser/sneakerbaas/Shaq Attaq ""Gold"""
1,Saucony,EUR,154.95,"Saucony Shadow 6000 ""Purple / Black""",Colour: Purple / Coral / Black,Stijlcode: S70784-1,"Shadow 6000 ""Purple / Black""","parser/sneakerbaas/Shadow 6000 ""Purple / Black"""
2,Reebok Classics,EUR,159.95,"Reebok PUMP TZ ""FTWWHT/CBLACK/SOACYE""",Colour: White / Black / Neon,Stijlcode: 100033132,"Pump TZ ""Neon""","parser/sneakerbaas/Pump TZ ""Neon"""
3,Clarks,EUR,269.95,"Clarks Wallabee Eden ""Dark Sand""",Beige / Cream,Stijlcode: 26173319,"Wallabee Eden ""Dark Sand""","parser/sneakerbaas/Wallabee Eden ""Dark Sand"""
4,Autry,EUR,179.95,"Autry Medalist Low M ""Leat Wht/Malachi""",Colour: Mint / White / Cream,Stijlcode: AULMWB30,"Medalist Low M ""Malachi""","parser/sneakerbaas/Medalist Low M ""Malachi"""
5,Autry,EUR,189.95,"Autry CLC Low M ""Mat Wht/Pbl""",Colour: White / Blue,Stijlcode: ROLMMM06,"CLC Low M ""Blue""","parser/sneakerbaas/CLC Low M ""Blue"""
6,Saucony,EUR,109.95,"Saucony Grid Shadow 2 OG ""Blue / White""",Colour: White / Blue / Black,Stijlcode: S70772-1,Grid Shadow 2 OG,parser/sneakerbaas/Grid Shadow 2 OG
7,Saucony,EUR,129.95,"Saucony Grid Shadow 2 ""Creek Marsh""",Colour: Green / Blue / White,Stijlcode: S70782-1,Grid Shadow 2,parser/sneakerbaas/Grid Shadow 2
8,PUMA Sportstyle,EUR,139.95,"Puma Slipstream Xtreme Color ""Yellow Sizzle""",Colour: White / Green / Yellow,Stijlcode: 394695-1,"Slipstream Xtreme ""Yellow""","parser/sneakerbaas/Slipstream Xtreme ""Yellow"""
9,Saucony Originals,EUR,144.95,"Saucony Shadow 5000 ""Forest""",Colour: Green / Cream / White,Stijlcode: S70778-1,"Shadow 5000 ""Forest""","parser/sneakerbaas/Shadow 5000 ""Forest"""


In [21]:
df.to_csv("parser/sneakerbaas/metadata.csv", index=False)