## Algolia - CRM-система со встроенным ранжированием поисков пользователей. 

Используется в:
- https://www.highsnobiety.com/en-us/shop/category/footwear/sneakers/
- https://www.kickscrew.com/collections/

Search api reference: https://www.algolia.com/doc/api-reference/api-methods/search

### highsnobiety
Доступные категории:

- category/footwear/sneakers - все кроссовки
- brand/adidas/sneakers - по бренду

### kickscrew
Доступные категории:

- TODO
- TODO
 

In [1]:
# !pip install algoliasearch pandas beautifulsoup4

import os
from typing import Callable

import boto3
import pandas as pd
import requests
from algoliasearch.search_client import SearchClient
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from tqdm import tqdm

In [29]:
class ParsePipeline:
    def __init__(self, website_name, bucket_name, parser: Callable):
        self.WEBSITE_NAME = website_name
        self.parser = parser
        self.unique_items = set()
        self.bucket_name = bucket_name
        self.s3 = boto3.session.Session().client(service_name="s3", endpoint_url="https://storage.yandexcloud.net")

    @staticmethod
    def get_image_extension(url: str) -> str:
        return "." + url[-5:].split(".")[-1]

    def upload_to_s3(self, brand: str, name: str, image_url: str) -> str:
        img_binary = requests.get(image_url).content
        image_path = os.path.join("data", "raw", self.WEBSITE_NAME, brand, name)
        self.s3.put_object(Bucket=self.bucket_name, Key=image_path, Body=img_binary)
        print(f"uploaded: {image_path}")
        return image_path

    def process_item(self, item: dict) -> dict:
        return self.parser(item)

    def parse(self, filters: str, pages: int = 10, hit_per_page: int = 100) -> list:
        full_metadata = []
        for page in range(1, pages):
            resp = index.search("", {"filters": filters, "page": page, "hitsPerPage": hit_per_page})
            print(f'processing page {page} with {len(resp["hits"])} items')
            for hit in resp["hits"]:
                print(f'Processing brand: {hit.get("displayBrand") or hit.get("brand")}')
                metadata = self.process_item(hit)
                if metadata["url"] in self.unique_items:
                    print(metadata["title"], "is already parsed")
                    continue

                full_metadata.append(metadata)
                self.unique_items.add(metadata["title"])
        print(f"finished parse with {len(full_metadata)} items")
        return full_metadata

    def parse_more_images(self, metadata: dict, parsemore_func):
        if metadata.get("right-side-img") is not None:
            print(f"skipped {metadata['slug']}")
            return

        print(f"processing {metadata['slug']}")
        r = requests.get(metadata["url"], headers={"User-Agent": UserAgent().random})
        try:
            r.raise_for_status()
        except Exception as e:
            print(f"could not request page {metadata['url']} - {e}")
            return

        soup = BeautifulSoup(r.text, "html.parser")
        parsemore_func(self, soup, metadata)

        print(f"{metadata['slug']} is done")

In [30]:
#
# highsnobiety
#
client = SearchClient.create("KV0RT3PJZC", "37e783b9d54217566682938fa0e2b0f6")
index = client.init_index("production_ec_products_improved_newest_first")
WEBSITE_NAME = "highsnobiety"


def highsnobiety_parser(item: dict) -> dict:
    us_price = item["price"].get("US", {})
    return {
        "url": f'https://www.highsnobiety.com/en-us/shop/product/{item["slug"]["en"]}',
        "brand": item["displayBrand"],
        "slug": item["slug"]["en"],
        "priceCurrency": us_price["currencyCode"],
        "price": int(us_price.get("centAmount", 0)) / 100,
        "title": item.get("displayName", {}).get("en").replace("\\u", ""),
    }


def parse_result(cls, soup, metadata):
    metadata["description"] = soup.select("#acc-controls-details-editorsnote > div > p")[0].text
    image_side = {
        "right-side-img": "#pdp_image_zoom_0",
        "left-side-img": "#pdp_image_zoom_1",
        "front-both-img": "#pdp_image_zoom_2",
    }
    for side, tag in image_side.items():
        t = soup.select(tag)
        if len(t) > 0:
            img_link = t[0]["srcset"].split(", ")[0].split()[0]
            img_path = cls.upload_to_s3(  # can be replaced on cls.save_to_filesystem
                metadata["brand"],
                f"{metadata['slug']}-{side}{cls.get_image_extension(img_link)}",
                img_link,
            )
            metadata[side] = img_path


parser = ParsePipeline(WEBSITE_NAME, "sneakers-ml", highsnobiety_parser)
parse_result_ = parser.parse(
    "localizedPages.en:category/footwear/sneakers AND outOfStockForLongTime:false",
    10,
    100,
)

processing page 1 with 100 items
Processing brand: Norda
Processing brand: Salomon
Processing brand: Reebok
Processing brand: HOKA
Processing brand: HOKA
Processing brand: New Balance
Processing brand: Mizuno
Processing brand: Puma x Noah
Processing brand: asics
Processing brand: asics
Processing brand: Adidas
Processing brand: Adidas
Processing brand: ROA
Processing brand: ROA
Processing brand: Reebok
Processing brand: Reebok
Processing brand: Mizuno
Processing brand: Mizuno
Processing brand: Mizuno
Processing brand: Mizuno
Processing brand: asics x GmbH
Processing brand: Mizuno
Processing brand: Mizuno
Processing brand: Y-3
Processing brand: ROA
Processing brand: Converse
Processing brand: Converse
Processing brand: Merrell
Processing brand: Merrell
Processing brand: HOKA
Processing brand: HOKA
Processing brand: HOKA
Processing brand: New Balance
Processing brand: New Balance
Processing brand: New Balance
Processing brand: Salomon
Processing brand: Salomon
Processing brand: Adidas
Pr

KeyboardInterrupt: 

In [None]:
for metadata in tqdm(parse_result):
    parser.parse_more_images(metadata, parse_result)

df = pd.DataFrame(parse_result_)
df.shape

In [11]:
data = df.to_csv(index=False)
parser.s3.put_object(Bucket=parser.bucket_name, Key=f"data/raw/{WEBSITE_NAME}.csv", Body=data)

{'ResponseMetadata': {'RequestId': '0edc47065b47e69c',
  'HostId': '',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'nginx',
   'date': 'Thu, 19 Oct 2023 18:59:28 GMT',
   'content-type': 'application/octet-stream',
   'transfer-encoding': 'chunked',
   'connection': 'keep-alive',
   'keep-alive': 'timeout=60',
   'etag': '"c6b04682c624804aca9c6fa2b4a92cfa"',
   'x-amz-request-id': '0edc47065b47e69c'},
  'RetryAttempts': 0},
 'ETag': '"c6b04682c624804aca9c6fa2b4a92cfa"'}

In [12]:
df["brand"].value_counts()

brand
New Balance                       128
Converse                           49
Adidas                             38
Vans                               31
asics                              29
                                 ... 
Converse x Fragment                 1
Saucony x Colour Plus Companie      1
Adidas x Bad Bunny                  1
Sean Wotherspoon x Adidas           1
Puma x P.A.M.                       1
Name: count, Length: 72, dtype: int64

In [35]:
#
# kickscrew
#
client = SearchClient.create("7CCJSEVCO9", "173de9e561a4bc91ca6074d4dc6db17c")
index = client.init_index("prod_products")
WEBSITE_NAME = "kickscrew"


def kickscrew_parser(item: dict) -> dict:
    return {
        "url": f'https://www.kickscrew.com/products/{item["handle"]}',
        "brand": item["brand"],
        "slug": item["handle"],
        "priceCurrency": "USD",
        "price": int(item.get("lowest_price", 0)),
        "title": item["title"],
    }


def kickscrew_parsemore(cls, soup, metadata):
    image_side = {
        "right-side-img": "#product-grid-container > product-grid-item:nth-child(2) > div > div > img",  # noqa: E501
        "left-side-img": "#product-grid-container > product-grid-item:nth-child(1) > div > div > img",  # noqa: E501
        "front-both-img": "#product-grid-container > product-grid-item:nth-child(3) > div > div > img",  # noqa: E501
    }
    for side, tag in image_side.items():
        t = soup.select(tag)
        if len(t) > 0:
            img_link = f'https:{t[0]["src"]}'
            img_path = cls.upload_to_s3(  # can be replaced on cls.save_to_filesystem
                metadata["brand"],
                f"{metadata['slug']}-{side}{cls.get_image_extension(img_link)}",
                img_link,
            )
            metadata[side] = img_path


parser = ParsePipeline(WEBSITE_NAME, "sneakers-ml", kickscrew_parser)
parse_result = parser.parse("NOT class: 0", 52, 100)

processing page 1 with 100 items
Processing brand: New Balance
Processing brand: Nike
Processing brand: Nike
Processing brand: New Balance
Processing brand: Nike
Processing brand: Li-Ning
Processing brand: Nike
Processing brand: Onitsuka Tiger
Processing brand: Air Jordan
Processing brand: Nike
Processing brand: Air Jordan
Processing brand: Li-Ning
Processing brand: Li-Ning
Processing brand: Li-Ning
Processing brand: Li-Ning
Processing brand: Nike
Processing brand: Li-Ning
Processing brand: Onitsuka Tiger
Processing brand: Nike
Processing brand: Air Jordan
Processing brand: adidas
Processing brand: New Balance
Processing brand: Nike
Processing brand: Air Jordan
Processing brand: Nike
Processing brand: Nike
Processing brand: UGG
Processing brand: Li-Ning
Processing brand: Air Jordan
Processing brand: Nike
Processing brand: Li-Ning
Processing brand: Air Jordan
Processing brand: Air Jordan
Processing brand: Nike
Processing brand: Nike
Processing brand: Nike
Processing brand: Air Jordan
Pr

In [None]:
for metadata in tqdm(parse_result):
    parser.parse_more_images(metadata, kickscrew_parsemore)

df = pd.DataFrame(parse_result)
df.shape

In [None]:
data = df.to_csv(index=False)
parser.s3.put_object(Bucket=parser.bucket_name, Key=f"data/raw/{WEBSITE_NAME}.csv", Body=data)