## Algolia - CRM-система со встроенным ранжированием поисков пользователей. 

Используется в:
- https://www.highsnobiety.com/en-us/shop/category/footwear/sneakers/
- https://www.kickscrew.com/collections/

Search api reference: https://www.algolia.com/doc/api-reference/api-methods/search

### highsnobiety
Доступные категории:

- category/footwear/sneakers - все кроссовки
- brand/adidas/sneakers - по бренду

### kickscrew
Доступные категории:

- TODO
- TODO
 

In [2]:
# !pip install algoliasearch pandas beautifulsoup

import os
import requests
import pandas as pd

from algoliasearch.search_client import SearchClient
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from typing import Callable

In [5]:
from tqdm import tqdm


class ParsePipeline:

    def __init__(self, website_name, parser: Callable):
        self.WEBSITE_NAME = website_name
        self.parser = parser
        self.unique_items = set()

    @staticmethod
    def get_image_extension(url: str) -> str:
        return '.' + url[-5:].split('.')[-1]

    def save_to_filesystem(self, brand: str, slug: str, image_url: str) -> str:
        photo_folder = os.path.join(self.WEBSITE_NAME, "photos", brand)
        os.makedirs(photo_folder, exist_ok=True)

        image_path = os.path.join(
            photo_folder,
            slug + self.get_image_extension(image_url),  # TODO: more photos needed
        )

        img_binary = requests.get(image_url).content
        os.makedirs(photo_folder, exist_ok=True)
        with open(image_path, "wb") as f:
            f.write(img_binary)
        return image_path

    def process_item(self, item: dict) -> dict:
        return self.parser(item)

    def parse(self, filters: str, pages: int = 10, hit_per_page: int = 100) -> list:
        full_metadata = []
        for page in tqdm(range(1, pages)):
            resp = index.search('', {
                'filters': filters,
                'page': page,
                'hitsPerPage': hit_per_page
            })
            print(f'processing page {page} with {len(resp["hits"])} items')
            for hit in resp['hits']:
                print(f'Processing brand: {hit["displayBrand"]}')
                metadata = self.process_item(hit)
                if metadata['url'] in self.unique_items:
                    print(metadata['title'], 'is already parsed')
                    continue

                full_metadata.append(metadata)
                self.unique_items.add(metadata['title'])
        print(f"finished parse with {len(full_metadata)} items")
        return full_metadata

    async def parse_more_images(self, metadata: dict):
        if metadata.get('right-side-img') is not None:
            print(f"skipped {metadata['slug']}")
            return

        print(f"processing {metadata['slug']}")
        r = requests.get(metadata['url'], headers={"User-Agent": UserAgent().random})
        try:
            r.raise_for_status()
        except Exception as e:
            print(f"could not request page {metadata['url']} - {e}")
            return

        soup = BeautifulSoup(r.text, "html.parser")
        metadata['description'] = soup.select('#acc-controls-details-editorsnote > div > p')[0].text

        image_side = {
            'right-side-img': '#pdp_image_zoom_0',
            'left-side-img': '#pdp_image_zoom_1',
            'front-both-img': '#pdp_image_zoom_2'
        }
        for side, tag in image_side.items():
            t = soup.select(tag)
            if len(t) > 0:
                img_link = t[0]['srcset'].split(', ')[0].split()[0]
                img_path = self.save_to_filesystem(metadata['brand'], f"{metadata['slug']}-{side}", img_link)
                metadata[side] = img_path
        print(f"{metadata['slug']} is done")


In [None]:
#
# highsnobiety
# 
client = SearchClient.create('KV0RT3PJZC', '37e783b9d54217566682938fa0e2b0f6')
index = client.init_index('production_ec_products_improved_newest_first')
WEBSITE_NAME = 'highsnobiety'


def highsnobiety_parser(item: dict) -> dict:
    us_price = item['price'].get('US', {})
    return {
        'url': f'https://www.highsnobiety.com/en-us/shop/product/{item["slug"]["en"]}',
        'brand': item['displayBrand'],
        'slug': item["slug"]["en"],
        'priceCurrency': us_price['currencyCode'],
        'price': int(us_price.get('centAmount', 0)) / 100,
        'title': item.get('displayName', {}).get('en').replace('\\u', '')
    }


parser = ParsePipeline(WEBSITE_NAME, highsnobiety_parser)
parse_result = parser.parse(
    'localizedPages.en:category/footwear/sneakers AND outOfStockForLongTime:false',
    20,
    100
)


In [7]:
import asyncio

ioloop = asyncio.get_event_loop()
coroutines = [ioloop.create_task(parser.parse_more_images(metadata)) for metadata in parse_result]
ioloop.run_until_complete(asyncio.wait(coroutines))


created for salomon-xt-quest-2-falcon-cement-bright-green
created for salomon-acs-pro-pewter-monument-aegean-blue
created for adidas-gazelle-core-black-gum
created for converse-one-star-pro-ltd-ox-berlin-pink
created for saucony-progrid-triumph-4-green-silver
created for saucony-progrid-triumph-4-pink-silver
created for saucony-progrid-triumph-4-blue-silver
created for saucony-peregrine-13-st-glade-bronze
created for asics-gel-kayano-14-pure-silver-pure-silver
created for adidas-cg-split-stan-smith-beige-tonecore-black
created for adidas-cg-split-stan-smith-core-blackgranite
created for adidas-cg-split-stan-smith-whiteblack
created for on-cloudultra-2-black
created for on-cloudstratus-3-undyed-white-sand
created for converse-pl-vulc-pro-ox-forest-shelter-white
created for new-balance-bbw550sb-sea-salt-mercury-blue
created for new-balance-mt580ed2-black-grey-white
created for new-balance-bbw550sg-sea-salt-nori
created for roa-katharina-sneakers-olive-rust
created for roa-lhakpa-sneakers

In [8]:
df = pd.DataFrame(parse_result)
df.to_csv(f"{WEBSITE_NAME}/metadata.csv", index=False)
df.shape


on-the-roger-clubhouse-white-indigo is done


(590, 10)

In [9]:
df['right-side-img']

0      highsnobiety/photos/Salomon/salomon-xt-quest-2...
1      highsnobiety/photos/Salomon/salomon-acs-pro-pe...
2      highsnobiety/photos/Adidas/adidas-gazelle-core...
3      highsnobiety/photos/Converse/converse-one-star...
4      highsnobiety/photos/Saucony/saucony-progrid-tr...
                             ...                        
585    highsnobiety/photos/Puma x P.A.M./puma-p-a-m-s...
586    highsnobiety/photos/On/on-cloudvista-waterproo...
587    highsnobiety/photos/ROA/roa-khatarina-sneaker-...
588    highsnobiety/photos/On/on-cloudventure-ice-kel...
589    highsnobiety/photos/On/on-the-roger-clubhouse-...
Name: right-side-img, Length: 590, dtype: object