In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from requestium import Session, Keys
from selenium.webdriver.common.by import By

driver_path = ChromeDriverManager().install()
s = Session(webdriver_path=driver_path)
s.driver.implicitly_wait(10)

In [2]:
# Open the website and clear the captcha

s.driver.get("https://www.amazon.com")

In [3]:
s.transfer_driver_cookies_to_session()

In [4]:
from selenium.webdriver.common.by import By

def collect_asin_from_search_results():
    asin_list = list()
    while True:
        records = s.driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
        asins = [record.get_attribute("data-asin") for record in records]
        asin_list.extend(asins)
        next_page_link = s.driver.find_element(By.CSS_SELECTOR, ".s-pagination-next")
        if next_page_link.get_attribute("aria-disabled") == "true":
            break
        next_page_link.click()
        
    return asin_list

def get_seed_asins(query):
    s.driver.get("https://www.amazon.com")
    search_box = s.driver.find_element(By.ID, "twotabsearchtextbox")
    search_box.send_keys(query)
    search_box.submit()
    return collect_asin_from_search_results()

asin_list = get_seed_asins("knitted weighted blanket")
print(len(asin_list))

108


In [6]:
headers = s.headers
cookies = s.cookies

In [7]:
import multiprocess as mp
from pathlib import Path
import requests
from copy import deepcopy

from urllib.parse import urljoin
import lxml.etree as ET
import re
import inflection
import json
from itertools import chain


html_dir = Path("./pages")
data_dir = Path("./data")
image_dir = Path("./images")

html_dir.mkdir(exist_ok=True)
data_dir.mkdir(exist_ok=True)
image_dir.mkdir(exist_ok=True)

def download_product_page(asin):
    out_path = html_dir / f"{asin}.html"
    if out_path.exists() and out_path.stat().st_size > 0:
        return out_path
    url = f"https://www.amazon.com/dp/{asin}"
    response = requests.get(url, headers=headers, cookies=cookies)
    try:
        response.raise_for_status()
        tree = ET.HTML(response.text)
        captcha = tree.xpath('.//h4[contains(text(), "Enter the characters you see below")]')
        if captcha:
            print(f"Encountered captcha for {url}")
            raise ValueError("Captcha encountered")
        out_path.write_text(response.text)
        return out_path
    except requests.exceptions.HTTPError as e:
        print(f"Failed to download {url}: {e}")
        return None



product_base_url = "https://www.amazon.com/dp/"
seller_id_re = re.compile(r"seller=([^&]+)&")


def parse_product_details(page_el):
    detail_rows = page_el.cssselect("table#productDetails_detailBullets_sections1 tr")
    output = dict()
    for row in detail_rows:
        key = row.cssselect("th")[0].text.strip()
        value = row.cssselect("td")[0].text.strip()
        output[inflection.underscore(key.replace(" ", "_"))] = value
    return output

def parse_product_page(page_text):
    tree = ET.HTML(page_text)
    variant_el = tree.find(".//form[@id='twister']")
    try:
        variant_asins = [i for i in variant_el.xpath(".//li[@data-csa-c-item-id]/@data-csa-c-item-id") if i]
    except AttributeError:
        variant_asins = []
    try:
        cover_image_url = tree.xpath('.//img[@data-a-image-name="landingImage"]/@src')[0]
    except IndexError:
        cover_image_url = None
    try:
        price_div = tree.cssselect('div#corePriceDisplay_desktop_feature_div span.aok-offscreen')[0]
        price = "".join(price_div.itertext()).strip()
    except IndexError:
        price = None
    try:
        merchant_el = tree.cssselect("div#merchantInfoFeature_feature_div a#sellerProfileTriggerId")[0]
    except IndexError:
        merchant_el = None
    
    return {
        "product_title": tree.cssselect("span#productTitle")[0].text.strip(),
        "product_details": parse_product_details(tree),
        "variant_asins": variant_asins,
        "cover_image": urljoin(product_base_url, cover_image_url) if cover_image_url is not None else None,
        "price": price,
        "merchant_name": merchant_el.text if merchant_el is not None else None,
        "merchant_url": urljoin(product_base_url, merchant_el.get("href")) if merchant_el is not None else None,
        "seller_id": seller_id_re.search(merchant_el.get("href")).group(1) if merchant_el is not None else None
    }


def parse_path(path):
    data_path = data_dir / f"{path.stem}.json"
    if data_path.exists() and data_path.stat().st_size > 0:
        data = json.loads(data_path.read_text())
        return data['variant_asins']
    try:
        data = parse_product_page(path.read_text())
        data_path.write_text(json.dumps(data, indent=2))
        return data['variant_asins']
    except Exception as e:
        print(f"Failed to parse {path}: {e}")
        raise e
        

asin_list = set(asin_list)
to_search = deepcopy(asin_list)

with mp.Pool() as pool:
    print(f"Evaluating {len(to_search)} products")
    files = pool.map(download_product_page, to_search)
    files = [f for f in files if f]
    variants = pool.map(parse_path, files)
    variant_asins = set(chain.from_iterable(variants))
    to_search = variant_asins - asin_list
    asin_list |= variant_asins
    
 

Evaluating 104 products


In [12]:
with mp.Pool() as pool:
    print(f"Evaluating {len(to_search)} products")
    files = pool.map(download_product_page, to_search)
    files = [f for f in files if f]
    variants = pool.map(parse_path, files)
    variant_asins = set(chain.from_iterable(variants))
    to_search = variant_asins - asin_list
    asin_list |= variant_asins

Evaluating 17 products


In [13]:
html_files = list(html_dir.glob("*.html"))

with mp.Pool() as pool:
    pool.map(parse_path, html_files)

Failed to parse pages/B08R9CLM5Y.html: [Errno 2] No such file or directory: 'data/B08R9CLM5Y.json'Failed to parse pages/B0BPHLTM3D.html: [Errno 2] No such file or directory: 'data/B0BPHLTM3D.json'Failed to parse pages/B092HVK7GB.html: [Errno 2] No such file or directory: 'data/B092HVK7GB.json'Failed to parse pages/B0CPP49XFT.html: [Errno 2] No such file or directory: 'data/B0CPP49XFT.json'
Failed to parse pages/B0BS3QPYXW.html: [Errno 2] No such file or directory: 'data/B0BS3QPYXW.json'

Failed to parse pages/B08F5L6NLT.html: [Errno 2] No such file or directory: 'data/B08F5L6NLT.json'
Failed to parse pages/B0CNSSKQN8.html: [Errno 2] No such file or directory: 'data/B0CNSSKQN8.json'Failed to parse pages/B094V1VLRR.html: [Errno 2] No such file or directory: 'data/B094V1VLRR.json'



Failed to parse pages/B0BGXZV6LH.html: [Errno 2] No such file or directory: 'data/B0BGXZV6LH.json'
Failed to parse pages/B08BYFYCLW.html: [Errno 2] No such file or directory: 'data/B08BYFYCLW.json'
Failed to 

FileNotFoundError: [Errno 2] No such file or directory: 'data/B08R9CLM5Y.json'

In [5]:











def download_product_page(asin):
    out_path = html_dir / f"{asin}.html"
    if out_path.exists() and out_path.stat().st_size > 0:
        return out_path.read_text()
    print("downloading", asin)
    url = product_base_url + asin
    response = s.get(url)
    response.raise_for_status()
    out_path.write_text(response.text)
    return out_path.read_text()
    
def parse_asin_list(asin_list):
    variant_asin_list = list()
    
    for asin in asin_list:
        try:
            text = download_product_page(asin)
            data = parse_product_page(text)
            data_path = data_dir / f"{asin}.json"
            data_path.write_text(json.dumps(data, indent=2))
            image_extension = data["cover_image"].split(".")[-1]
            image_path = image_dir / f"{asin}.{image_extension}"
            if not image_path.exists():
                image_response = s.get(data["cover_image"])
                image_response.raise_for_status()
                image_path.write_bytes(image_response.content)
            variant_asin_list.extend(data["variant_asins"])
        except Exception as e:
            print(f"failed to process {asin}: {e}")
            raise e
    return variant_asin_list
        
        
        

def crawl(query):
    #s.driver.get("https://www.amazon.com")
    #search_box = s.driver.find_element(By.ID, "twotabsearchtextbox")
    #search_box.send_keys(query)
    #search_box.submit()
    #asin_list = collect_asin_from_search_results()
    #s.transfer_driver_cookies_to_session()
    asin_list = [p.name.split(".")[0] for p in html_dir.glob("*.html")]
    to_search = set(deepcopy(asin_list))
    while True:
        print(f"scanning {len(to_search)} asins")
        variant_asins = parse_asin_list(to_search)
        to_search = set(variant_asins) - set(asin_list)
        asin_list = set(asin_list) | set(variant_asins)
        if not to_search:
            break
        
        
crawl("weighted knitted blanket")
    
    



scanning 833 asins


  "merchant_name": merchant_el.text if merchant_el else None,
  "merchant_url": urljoin(product_base_url, merchant_el.get("href")) if merchant_el else None,
  "seller_id": seller_id_re.search(merchant_el.get("href")).group(1) if merchant_el else None


scanning 1495 asins
downloading B0CD7Z2NXT
downloading B0C1Z5WQ6W
downloading B0CK2KGWJT
downloading B0811Q9S36


KeyboardInterrupt: 

In [8]:


s.transfer_driver_cookies_to_session()

In [16]:
for asin in asin_list:
    download_product_page(asin)

In [9]:
import lxml.etree as ET
response = s.get(base_url + asin_list[0])
response.raise_for_status()

In [26]:

    
parse_product_page(response.text)

{'product_title': 'Chenille Chunky Knit Blanket Throw （40×50 Inch）, Handmade Warm & Cozy Blanket Couch, Bed, Home Decor, Soft Breathable Fleece Banket, Christmas Thick and Giant Yarn Throws, Cream',
 'product_details': {'material': 'Chenille',
  'color': 'Cream',
  'brand': 'Maetoow',
  'special_feature': 'Skin Friendly',
  'style': 'Modern',
  'blanket_form': 'Throw Blanket',
  'age_range_(description)': 'kids, adults',
  'product_dimensions': '50"L x 40"W',
  'theme': 'Space',
  'pattern': 'Solid',
  'recommended_uses_for_product': 'household',
  'seasons': 'Winter',
  'weave_type': 'Chunky Knit',
  'product_care_instructions': 'Machine Wash',
  'size': "40''x50''",
  'fabric_type': '100% Chenille',
  'unit_count': '1 Count',
  'number_of_items': '5',
  'fabric_warmth_description': 'Medium/Heavyweight',
  'sport': 'Camping',
  'model_name': 'small',
  'item_weight': '3.74 pounds',
  'manufacturer': 'wanhong',
  'asin': 'B0B766SCS5',
  'country_of_origin': 'China',
  'item_model_numbe

In [27]:
asin_list = [p.stem for p in workdir.glob("*.html")]
asin_list

['B08SQFBQV2',
 'B0CJPPZKJC',
 'B09XHC18H9',
 'B0B4VW9CZ1',
 'B09YS1214X',
 'B0CFQCT46L',
 'B0C6MQ2N3S',
 'B0BBZFBYJL',
 'B07SRY7GBK',
 'B0C1FBJGFR',
 'B096FQR7NP',
 'B09BC4YFJM',
 'B0BPSDDNHQ',
 'B0CGJF55C7',
 'B09QQ7CZY4',
 'B0B5GFQ9HW',
 'B0C4ZYX8WP',
 'B0B7WDGG2C',
 'B08FMFTVXC',
 'B0C9KW6THW',
 'B0CPF6G6FN',
 'B0B38YC8CK',
 'B0811Q54RQ',
 'B09D7J26XF',
 'B08BYGK4HH',
 'B0C4S7R8PN',
 'B0CF1TDCPK',
 'B0CCVJFG7Z',
 'B0CKT88DDV',
 'B0CJPQ7CVQ',
 'B0CGV9R2Q4',
 'B0CGWJT9PC',
 'B0CJPRG6GL',
 'B09T3CTKP2',
 'B092HVF7FT',
 'B096FPJCF2',
 'B0BR6LBH65',
 'B0BR728TB6',
 'B0CNV9TLLW',
 'B0B2NSM1HK',
 'B099FKPCDP',
 'B08T6Q716K',
 'B0CF1SV1RP',
 'B082XYS6HB',
 'B093Q6H1XJ',
 'B09514WX62',
 'B0CB48XLH3',
 'B0CJPR48QB',
 'B0CBRDTC6P',
 'B0B61WYSYK',
 'B099FLVTLL',
 'B08DNQLXTT',
 'B0B2NTWD7Q',
 'B0BW3QQJK4',
 'B07MW2YFQ7',
 'B0CJPQCKXY',
 'B0C2PN2SBD',
 'B0CM435Y91',
 'B0C4PHZF49',
 'B0C1BTZP93',
 'B08BL8BJ11',
 'B07TBJ1ZG6',
 'B0BFH6ZW3S',
 'B0CF5KZLGX',
 'B0BZSYMVR9',
 'B0B2NT2NMZ',
 'B0BB2H68