## Algolia - CRM-система со встроенным ранжированием поисков пользователей. 

Используется в:
- https://www.highsnobiety.com/en-us/shop/category/footwear/sneakers/
- https://www.kickscrew.com/collections/

Search api reference: https://www.algolia.com/doc/api-reference/api-methods/search

### highsnobiety
Доступные категории:

- category/footwear/sneakers - все кроссовки
- brand/adidas/sneakers - по бренду

### kickscrew
Доступные категории:

- TODO
- TODO
 

In [6]:
# !pip install algoliasearch pandas

import logging
import os
import requests
import pandas as pd

from typing import Callable

from algoliasearch.search_client import SearchClient

In [23]:
class ParsePipeline:

    def __init__(self, website_name, parser: Callable):
        self.WEBSITE_NAME = website_name
        self.parser = parser
        self.logger = logging.getLogger(website_name)
        self.logger.setLevel(logging.DEBUG)
        self.unique_items = set()

    @staticmethod
    def get_image_extension(url: str) -> str:
        return '.' + url[-5:].split('.')[-1]

    def save_to_filesystem(self, metadata: dict) -> str:
        photo_folder = os.path.join(self.WEBSITE_NAME, "photos", metadata['brand'])
        os.makedirs(photo_folder, exist_ok=True)

        image_path = os.path.join(
            photo_folder,
            metadata['slug'] + self.get_image_extension(metadata['image_url']),  # TODO: more photos needed
        )

        img_binary = requests.get(metadata['image_url']).content
        os.makedirs(photo_folder, exist_ok=True)
        with open(image_path, "wb") as f:
            f.write(img_binary)
        return image_path

    def process_item(self, item: dict, parser: Callable) -> dict:
        metadata = parser(item)
        path = self.save_to_filesystem(metadata)

        metadata['image_path'] = path
        del metadata['image_url']
        del metadata['slug']

        return metadata

    def parse(self, filters: str, pages: int = 10, hit_per_page: int = 100) -> list:
        full_metadata = []
        for page in range(1, pages):
            resp = index.search('', {
                'filters': filters,
                'page': page,
                'hitsPerPage': hit_per_page
            })
            self.logger.debug(f'processing page %d with %d items', page, len(resp["hits"]))
            for hit in resp['hits']:
                self.logger.debug('Processing brand: %s', hit["displayBrand"])
                metadata = self.process_item(hit, self.parser)
                if metadata['url'] in self.unique_items:
                    self.logger.warn('%s is already parsed', metadata['title'])
                    continue

                full_metadata.append(metadata)
                self.unique_items.add(metadata['title'])
            self.logger.info("finished parse with %d items", len(full_metadata))
            return full_metadata



In [24]:
#
# highsnobiety
# 
client = SearchClient.create('KV0RT3PJZC', '37e783b9d54217566682938fa0e2b0f6')
index = client.init_index('production_ec_products_improved_newest_first')
WEBSITE_NAME = 'highsnobiety'


def highsnobiety_parser(item: dict) -> dict:
    us_price = item['price'].get('US', {})
    return {
        'url': f'https://www.highsnobiety.com/en-us/shop/product/{item["slug"]["en"]}',
        'brand': item['displayBrand'],
        'slug': item["slug"]["en"],
        # 'description': item.get('displayName', {}).get('en'),
        'priceCurrency': us_price['currencyCode'],
        'price': int(us_price.get('centAmount', 0)) / 100,
        'title': item.get('displayName', {}).get('en').replace('\\u', ''),
        'image_url': item['image']
    }


full_metadata = ParsePipeline(WEBSITE_NAME, highsnobiety_parser).parse(
    'localizedPages.en:category/footwear/sneakers AND outOfStockForLongTime:false',
    2,
    100
)


In [25]:
df = pd.DataFrame(full_metadata)
df.to_csv(f"{WEBSITE_NAME}/metadata.csv", index=False)
df.shape


(100, 6)

In [26]:
df.head()

Unnamed: 0,url,brand,priceCurrency,price,title,image_path
0,https://www.highsnobiety.com/en-us/shop/produc...,Salomon,USD,205.0,Salomon – XT-QUEST 2 Falcon/Cement/Bright Green,highsnobiety/photos/Salomon/salomon-xt-quest-2...
1,https://www.highsnobiety.com/en-us/shop/produc...,Salomon,USD,215.0,Salomon – ACS PRO Pewter/Monument/Aegean Blue,highsnobiety/photos/Salomon/salomon-acs-pro-pe...
2,https://www.highsnobiety.com/en-us/shop/produc...,Adidas,USD,130.0,Adidas – Gazelle Core Black/Gum,highsnobiety/photos/Adidas/adidas-gazelle-core...
3,https://www.highsnobiety.com/en-us/shop/produc...,Converse,USD,110.0,Converse – One Star Pro Berlin Black/Pink,highsnobiety/photos/Converse/converse-one-star...
4,https://www.highsnobiety.com/en-us/shop/produc...,Saucony,USD,195.0,Saucony – ProGrid Triumph 4 Green/Silver,highsnobiety/photos/Saucony/saucony-progrid-tr...
