## Algolia - CRM-система со встроенным ранжированием поисков пользователей. 

Используется в:
- https://www.highsnobiety.com/en-us/shop/category/footwear/sneakers/
- https://www.kickscrew.com/collections/

In [None]:
!pip install algoliasearch

In [56]:
import json
import os
import urllib.parse
import requests
import pandas as pd
from fake_useragent import UserAgent


from algoliasearch.search_client import SearchClient

HEADERS = {"User-Agent": UserAgent().random}

In [35]:
client = SearchClient.create('KV0RT3PJZC', '37e783b9d54217566682938fa0e2b0f6')
index = client.init_index('production_ec_products_improved_newest_first')

WEBSITE_NAME = 'highsnobiety'

Search api reference: https://www.algolia.com/doc/api-reference/api-methods/search

Доступные категории:

- category/footwear/sneakers - все кроссовки
- brand/adidas/sneakers - по бренду

In [69]:
resp = index.search('', {
    'filters': 'localizedPages.en:category/footwear/sneakers AND outOfStockForLongTime:false',
    'page': 1
})

print(len(resp['hits']))
json.dumps(resp)

20


'{"hits": [{"filterBrand": [{"key": "salomon", "label": "Salomon"}, {"key": "and_wander", "label": "And Wander"}], "displayBrand": "Salomon x And Wander", "displayName": {"en": "Salomon x And Wander \\u2013 XA Pro 3D GORE-TEX Brown"}, "collaboration": null, "allCategories": [{"id": "e1981069-a727-454c-a6f9-fc31d6b18d6b", "name": {"en": "Sneakers"}}, {"id": "1e344ca3-2b83-411c-8037-5cd03e5de6b4", "name": {"en": "Footwear"}}], "categories": [{"id": "e1981069-a727-454c-a6f9-fc31d6b18d6b", "name": {"en": "Sneakers"}}], "outOfStockForLongTime": false, "campaignId": "HS_IO_EU_2023_4347", "colors": {"en": ["Brown"]}, "image": "https://www.highsnobiety.com/static-assets/thumbor/8bj_L6VZDJz9EjPtdo6RfR7dnQY=/fit-in/790x987/https://aaba6fc7dd05e6321705-d3c8e77fedf34b64ceac1fa28b6c145b.ssl.cf3.rackcdn.com/SALOMON%20x%20And%20Wander-GiujSbPT.jpg", "isAvailable": true, "isDiscounted": false, "key": "281568", "name": {"en": "XA Pro 3D GORE-TEX Brown"}, "productType": {"id": "9407b898-e03f-4bd6-9ff1-8

In [70]:
print(f"""INFO
"nbHits: {resp["nbHits"]}
"nbPages": {resp["nbPages"]},
"hitsPerPage": {resp["hitsPerPage"]}""")

INFO
"nbHits: 690
"nbPages": 35,
"hitsPerPage": 20


In [72]:
def get_image_extension(url):
    return '.' + url[-5:].split('.')[-1]

def get_metadata(item):
    slug = item["slug"]["en"]
    us_price = item['price'].get('US', {})
    price = int(us_price.get('centAmount', 0)) / 100
    brand = item['displayBrand']
    image_url = item['image']

    photo_folder = os.path.join(
        WEBSITE_NAME,
        "photos",
        brand
    )
    image_path = os.path.join(
        photo_folder,
        slug + get_image_extension(image_url), # TODO: more photos needed
    )

    metadata = {
        'url': f'https://www.highsnobiety.com/en-us/shop/product/{slug}',
        'brand': brand,
        # 'description': item.get('displayName', {}).get('en'),
        'priceCurrency': us_price['currencyCode'],
        'price': price,
        'title': item.get('displayName', {}).get('en').replace('\\u', ''),
        'photo_path': image_path,
    }

    img_binary = requests.get(image_url).content
    os.makedirs(photo_folder, exist_ok=True)
    with open(image_path,"wb",) as f:
        f.write(img_binary)

    return metadata


In [None]:
#
# MAIN
# 

# full_metadata = []
# full_metadata_items = set()

for page in range(1, 20):
    resp = index.search('', {
        'filters': 'localizedPages.en:category/footwear/sneakers AND outOfStockForLongTime:false',
        'page': page,
        'hitsPerPage': 100
    })
    print(f'processing page {page} with {len(resp["hits"])} items')
    for hit in resp['hits']:
        print('Processing brand:', hit["displayBrand"])
        metadata = get_metadata(hit)
        if metadata['url'] in full_metadata_items:
            print(metadata['title'], 'is already parsed')
            continue
        
        full_metadata.append(metadata)
        full_metadata_items.add(metadata['title'])

df = pd.DataFrame(full_metadata)
df.to_csv(f"{WEBSITE_NAME}/metadata.csv", index=False)

In [50]:
# test
assert '.jpeg' == get_image_extension('https://www.highsnobiety.com/static-assets/thumbor/IU5Ni6j11Tuupckbzwx8EnkKeZw=/fit-in/790x987/https://aaba6fc7dd05e6321705-d3c8e77fedf34b64ceac1fa28b6c145b.ssl.cf3.rackcdn.com/asics-gel-venture-6--qin0ZV1i.jpeg')
'ok'

'ok'

In [77]:
df.shape


(380, 6)