In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from requestium import Session, Keys
from selenium.webdriver.common.by import By

# Resolve a chromedriver binary through webdriver_manager and keep its path.
driver_path = ChromeDriverManager().install()
# `s` is the requestium Session used throughout the notebook: `s.driver` is the
# Selenium browser, and `s.get(...)` is used further down for plain HTTP requests.
s = Session(webdriver_path=driver_path)
# Let element lookups wait up to 10 seconds before failing.
s.driver.implicitly_wait(10)

In [2]:
# Open the website and clear the captcha: this loads the Amazon home page in the
# Selenium-driven browser. If a captcha is shown, solve it by hand in that browser
# window before running the next cell.

s.driver.get("https://www.amazon.com")

In [3]:
# Copy the browser's cookies onto the session's requests side (presumably so later
# non-browser requests are treated like the same visitor).
s.transfer_driver_cookies_to_session()

In [4]:
from selenium.webdriver.common.by import By

def collect_asin_from_search_results():
    """Collect ASINs from every page of the search results shown in the browser.

    Expects ``s.driver`` to already display an Amazon search-results page. The
    function reads the ``data-asin`` attribute of each result tile, then clicks
    the "next" pagination control until it is disabled or absent.

    Returns:
        list[str]: Non-empty ASINs in page order. The same ASIN may appear more
        than once if it is listed on several pages.
    """
    asin_list = list()
    while True:
        records = s.driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
        page_asins = (record.get_attribute("data-asin") for record in records)
        # Tiles with a missing/empty data-asin would otherwise become bogus
        # product ids (and an empty "/dp/" URL) downstream.
        asin_list.extend(asin for asin in page_asins if asin)

        # find_elements (plural) returns [] instead of raising when a result set
        # fits on one page and has no pagination control at all.
        next_links = s.driver.find_elements(By.CSS_SELECTOR, ".s-pagination-next")
        if not next_links:
            break
        next_page_link = next_links[0]
        if next_page_link.get_attribute("aria-disabled") == "true":
            break
        next_page_link.click()

    return asin_list

def get_seed_asins(query):
    """Run an Amazon search for ``query`` in the browser and return its ASINs.

    Loads the Amazon home page, types ``query`` into the search box, submits
    it, and hands over to ``collect_asin_from_search_results`` to walk the
    result pages.

    Args:
        query: Search text to type into the Amazon search box.

    Returns:
        The list produced by ``collect_asin_from_search_results``.
    """
    browser = s.driver
    browser.get("https://www.amazon.com")
    query_input = browser.find_element(By.ID, "twotabsearchtextbox")
    query_input.send_keys(query)
    query_input.submit()
    seed_asins = collect_asin_from_search_results()
    return seed_asins

# Seed the crawl with the ASINs returned for this search phrase and report how
# many were collected.
asin_list = get_seed_asins("knitted weighted blanket")
print(len(asin_list))

108


In [6]:
# Keep the session's headers and cookies under module-level names; the download
# helpers below pass them to plain `requests.get` calls.
headers = s.headers
cookies = s.cookies

In [14]:
import multiprocess as mp
from pathlib import Path
import requests
from copy import deepcopy

from urllib.parse import urljoin
import lxml.etree as ET
import re
import inflection
import json
from itertools import chain


# Working directories, relative to the notebook's current directory:
#   html_dir  - raw product pages, one "<asin>.html" per product
#   data_dir  - parsed product data, one "<asin>.json" per product
#   image_dir - cover images, one "<asin>.<ext>" per product
html_dir = Path("./pages")
data_dir = Path("./data")
image_dir = Path("./images")

# Create the directories on first run; exist_ok keeps re-runs from failing.
html_dir.mkdir(exist_ok=True)
data_dir.mkdir(exist_ok=True)
image_dir.mkdir(exist_ok=True)

def download_product_page(asin, timeout=30):
    """Download the product page for ``asin`` into ``html_dir`` (cached on disk).

    Args:
        asin: Amazon product id; the page is fetched from
            ``https://www.amazon.com/dp/<asin>``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang a pool worker indefinitely.

    Returns:
        Path of the saved ``<asin>.html`` file, or ``None`` when the request
        failed (HTTP error status, connection error, or timeout).

    Raises:
        ValueError: If the response is Amazon's captcha page. Nothing is written
            in that case, so the ASIN is retried on the next run.
    """
    out_path = html_dir / f"{asin}.html"
    # A non-empty file from an earlier run counts as already downloaded.
    if out_path.exists() and out_path.stat().st_size > 0:
        return out_path
    url = f"https://www.amazon.com/dp/{asin}"
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError and
        # Timeout; catching it keeps one bad request from aborting a whole
        # pool.map batch.
        print(f"Failed to download {url}: {e}")
        return None

    tree = ET.HTML(response.text)
    captcha = tree.xpath('.//h4[contains(text(), "Enter the characters you see below")]')
    if captcha:
        print(f"Encountered captcha for {url}")
        raise ValueError("Captcha encountered")
    out_path.write_text(response.text)
    return out_path



# Base URL of a product detail page: an ASIN is appended to build a page URL, and
# it is also the base against which links found inside a page are resolved.
product_base_url = "https://www.amazon.com/dp/"
# Captures the value of the `seller` query parameter of a merchant link. The value
# runs up to the next "&" or to the end of the string, so the id is still found
# when `seller` happens to be the last parameter in the URL.
seller_id_re = re.compile(r"seller=([^&]+)")


def parse_product_details(page_el):
    """Extract the key/value rows of the product-details table.

    Args:
        page_el: Parsed page (or sub-element) supporting ``cssselect``.

    Returns:
        dict mapping each row label, converted to snake_case, to the leading text
        of its value cell. Rows lacking a header cell, a value cell, or a label
        are skipped; a value cell with no leading text yields ``""``.
    """
    detail_rows = page_el.cssselect("table#productDetails_detailBullets_sections1 tr")
    output = dict()
    for row in detail_rows:
        label_cells = row.cssselect("th")
        value_cells = row.cssselect("td")
        # A malformed row should not abort parsing of the whole page.
        if not label_cells or not value_cells:
            continue
        # `.text` is None when the cell starts with a child element rather
        # than with text, so fall back to an empty string before stripping.
        key = (label_cells[0].text or "").strip()
        value = (value_cells[0].text or "").strip()
        if not key:
            continue
        output[inflection.underscore(key.replace(" ", "_"))] = value
    return output

def parse_product_page(page_text):
    """Parse the HTML of an Amazon product page into a plain dict.

    Args:
        page_text: HTML source of the product page.

    Returns:
        dict with the keys ``product_title``, ``product_details``,
        ``variant_asins``, ``cover_image``, ``price``, ``merchant_name``,
        ``merchant_url`` and ``seller_id``. Every field except the title and
        ``product_details`` is ``None`` (or ``[]`` for ``variant_asins``) when
        the corresponding element is not on the page.

    Raises:
        IndexError: If the page has no ``span#productTitle`` element.
    """
    tree = ET.HTML(page_text)

    # Variant ASINs are listed on the "twister" form's <li> items, when present.
    variant_el = tree.find(".//form[@id='twister']")
    if variant_el is None:
        variant_asins = []
    else:
        variant_asins = [i for i in variant_el.xpath(".//li[@data-csa-c-item-id]/@data-csa-c-item-id") if i]

    image_srcs = tree.xpath('.//img[@data-a-image-name="landingImage"]/@src')
    cover_image_url = image_srcs[0] if image_srcs else None

    price_els = tree.cssselect('div#corePriceDisplay_desktop_feature_div span.aok-offscreen')
    price = "".join(price_els[0].itertext()).strip() if price_els else None

    merchant_els = tree.cssselect("div#merchantInfoFeature_feature_div a#sellerProfileTriggerId")
    merchant_el = merchant_els[0] if merchant_els else None
    # The link may lack an href, and the href may lack a seller parameter;
    # either case yields None instead of raising.
    merchant_href = merchant_el.get("href") if merchant_el is not None else None
    seller_match = seller_id_re.search(merchant_href) if merchant_href else None

    return {
        "product_title": tree.cssselect("span#productTitle")[0].text.strip(),
        "product_details": parse_product_details(tree),
        "variant_asins": variant_asins,
        "cover_image": urljoin(product_base_url, cover_image_url) if cover_image_url is not None else None,
        "price": price,
        "merchant_name": merchant_el.text if merchant_el is not None else None,
        "merchant_url": urljoin(product_base_url, merchant_href) if merchant_href else None,
        "seller_id": seller_match.group(1) if seller_match else None
    }


def parse_path(path):
    """Parse one saved product page and return the variant ASINs it lists.

    The parsed data is cached as ``<stem>.json`` in ``data_dir``; when a
    non-empty cache file already exists it is read back instead of re-parsing.

    Args:
        path: Path of a saved product HTML file; its stem names the JSON file.

    Returns:
        The ``variant_asins`` list of the parsed (or cached) product data.

    Raises:
        Exception: Any error raised while parsing or writing is printed and
            then re-raised.
    """
    data_path = data_dir / f"{path.stem}.json"
    already_parsed = data_path.exists() and data_path.stat().st_size > 0
    if already_parsed:
        cached = json.loads(data_path.read_text())
        return cached['variant_asins']
    try:
        html_text = path.read_text()
        parsed = parse_product_page(html_text)
        serialized = json.dumps(parsed, indent=2)
        data_path.write_text(serialized)
        return parsed['variant_asins']
    except Exception as e:
        print(f"Failed to parse {path}: {e}")
        raise e
        


 

In [12]:
# One crawl step: download and parse every known ASIN, then fold the variant ASINs
# discovered on those pages back into `asin_list`.
# Convert to a set so the set arithmetic below works (a no-op copy on re-runs).
asin_list = set(asin_list)
# NOTE(review): every run of this cell resets `to_search` to the full `asin_list`,
# so the narrower `to_search` computed at the bottom is not consumed here.
to_search = deepcopy(asin_list)

with mp.Pool() as pool:
    print(f"Evaluating {len(to_search)} products")
    # Download in parallel; download_product_page returns None on a failed request.
    files = pool.map(download_product_page, to_search)
    files = [f for f in files if f]
    # parse_path returns the list of variant ASINs for each page.
    variants = pool.map(parse_path, files)
    variant_asins = set(chain.from_iterable(variants))
    # Variants not seen before are the candidates for the next step.
    to_search = variant_asins - asin_list
    asin_list |= variant_asins

Evaluating 17 products


In [15]:
# Parse every product page saved on disk; parse_path skips pages that already
# have a non-empty JSON file in data_dir.
html_files = list(html_dir.glob("*.html"))
with mp.Pool() as pool:
    pool.map(parse_path, html_files)

In [17]:
# Every parsed product JSON file currently in data_dir (snapshot at this point).
data_files = list(data_dir.glob("*.json"))
def download_image(data_path, timeout=30):
    """Download the cover image referenced by one parsed product JSON file.

    The image is stored in ``image_dir`` as ``<stem>.<ext>``, where ``<ext>`` is
    the text after the last "." of the image URL. Products without a cover
    image, and images already on disk (non-empty file), are skipped.

    Args:
        data_path: Path of a product JSON file holding a ``cover_image`` key.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Failed requests are reported with ``print`` and otherwise ignored.
    """
    data = json.loads(data_path.read_text())
    image_url = data['cover_image']
    # parse_product_page stores None when the page had no landing image.
    if not image_url:
        return
    out_path = image_dir / f"{data_path.stem}.{image_url.split('.')[-1]}"
    if out_path.exists() and out_path.stat().st_size > 0:
        return
    try:
        response = requests.get(image_url, headers=headers, cookies=cookies, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Also covers connection errors and timeouts, so a single bad image
        # does not abort the whole pool.map batch.
        print(f"Failed to download {image_url}: {e}")
        return
    out_path.write_bytes(response.content)
        
# Fetch the cover images for all parsed products in parallel worker processes.
with mp.Pool() as pool:
    pool.map(download_image, data_files)

In [19]:
from dateutil.parser import parse as parse_dt

def create_row(path):
    """Flatten one parsed product JSON file into a record for the DataFrame.

    Args:
        path: Path of a product JSON file; its stem is used as the ASIN.

    Returns:
        dict with the keys ``asin``, ``title``, ``price``,
        ``product_dimensions``, ``size``, ``item_weight``,
        ``date_first_available`` (parsed to a datetime, or ``None`` when
        absent), ``merchant_name``, ``merchant_url``, ``seller_id`` and
        ``cover_image`` (relative ``images/<asin>.<ext>`` path, or ``None`` when
        the product has no cover image).
    """
    data = json.loads(path.read_text())
    details = data['product_details']
    cover_image = data['cover_image']
    return {
        "asin": path.stem,
        "title": data['product_title'],
        "price": data['price'],
        "product_dimensions": details.get('product_dimensions', None),
        "size": details.get('size', None),
        "item_weight": details.get('item_weight', None),
        "date_first_available": parse_dt(details['date_first_available']) if 'date_first_available' in details else None,
        "merchant_name": data['merchant_name'],
        "merchant_url": data['merchant_url'],
        "seller_id": data['seller_id'],
        # cover_image is None for pages without a landing image; calling
        # .split on it would raise AttributeError.
        "cover_image": f"images/{path.stem}.{cover_image.split('.')[-1]}" if cover_image else None
    }
    
import pandas as pd

# One record per parsed product JSON file.
blanket_df = pd.DataFrame.from_records([create_row(f) for f in data_files])
# Bare last expression: show the frame as the cell output. (The previous
# `blanket_df.to_ex` was an incomplete attribute access that raises
# AttributeError; the Excel export is done in a later cell.)
blanket_df

Unnamed: 0,asin,title,price,product_dimensions,size,item_weight,date_first_available,merchant_name,merchant_url,seller_id,cover_image
0,B0C4SQG1BS,"Handmade Knitted Weighted Blanket,Breathable a...",$139.99,"60""L x 50""W","50""x60"" 10lbs",10 pounds,2023-05-10,GoldGuo,https://www.amazon.com/gp/help/seller/at-a-gla...,A1A5A1X4Y20J80,images/B0C4SQG1BS.jpg
1,B0C2Q385G6,Uttermara Jacquard Weighted Blanket for Adults...,$59.59 with 11 percent savings,"72""L x 48""W","48""×72"" 15lbs",15 pounds,2023-08-13,Buzio Direct,https://www.amazon.com/gp/help/seller/at-a-gla...,A2NGXTFJJJEGMR,images/B0C2Q385G6.jpg
2,B0CT6BK5SL,Bare Home Weighted Blanket Twin or Full Size 7...,$39.99 with 13 percent savings,"60""L x 40""W","40"" x 60"" 7 lbs",7 pounds,2024-01-24,Bare Home,https://www.amazon.com/gp/help/seller/at-a-gla...,A3UBXJ2R9SZ9Z4,images/B0CT6BK5SL.jpg
3,B098N6J3WQ,CREVENT Farmhouse Waffle Knit Throw Blanket fo...,$17.99,"50""L x 60""W","50""X60""",14.5 ounces,2021-09-16,CREVENT Home,https://www.amazon.com/gp/help/seller/at-a-gla...,A36DKLDIUU02F6,images/B098N6J3WQ.jpg
4,B0CQX62VRZ,Dachshund Blanket Love Dachshund Soft Warm Fla...,$25.99,"60""L x 50""W","60""x50""",1.36 pounds,2023-12-25,Yisimu,https://www.amazon.com/gp/help/seller/at-a-gla...,A1MMHBG7VIQV0W,images/B0CQX62VRZ.jpg
...,...,...,...,...,...,...,...,...,...,...,...
3067,B0CK8HJZ4L,"Chunky Knit Blanket Throw 51""x63"", Chenille Th...",$49.99 with 29 percent savings,,,,NaT,YAAPSU,https://www.amazon.com/gp/help/seller/at-a-gla...,AIEWDB29ZB09Y,images/B0CK8HJZ4L.jpg
3068,B093CW56QC,NEWCOSPLAY Super Soft Throw Blanket Pink Premi...,$9.99 with 23 percent savings,"50""L x 40""W","Throw(40""x50"")",13.1 ounces,NaT,BEDHOME,https://www.amazon.com/gp/help/seller/at-a-gla...,A1GUFEWKZ2CN0,images/B093CW56QC.jpg
3069,B0CT5YSNK2,"Plush Throw,Microfiber Blanket, Throw Blanket ...",$79.74,"3""L x 2""W",130x230cm,1.76 ounces,2024-01-24,DXHRY,https://www.amazon.com/gp/help/seller/at-a-gla...,A3C38ETESDS7UC,images/B0CT5YSNK2.jpg
3070,B0CH3JR644,TUNKENCE Fuzzy Throw Blanket for Couch Sofa Wi...,$6.99,"30""L x 40""W",30x40”,,2023-09-01,TUNKENCE（7-14 days delivery）,https://www.amazon.com/gp/help/seller/at-a-gla...,A1T041G4ZKEJD2,images/B0CH3JR644.jpg


In [27]:
# One row per merchant_name: take the first value seen in each group for the other
# columns, then keep only the three merchant columns.
merchant_df = blanket_df.groupby("merchant_name").first().reset_index()[["merchant_name", "merchant_url", "seller_id"]]

In [33]:
# (seller_id, merchant_url) pairs, in the argument order of download_merchant_page.
merchant_tuples = list(merchant_df[["seller_id", "merchant_url"]].itertuples(index=False, name=None))
# Directory for saved merchant pages, one "<seller_id>.html" per merchant.
merchant_dir = Path("./merchant_pages")
# Create it like the other output directories; without this the first write
# fails with FileNotFoundError on a fresh checkout.
merchant_dir.mkdir(exist_ok=True)

def download_merchant_page(seller_id, merchant_url, timeout=30):
    """Download one merchant profile page into ``merchant_dir`` (cached on disk).

    Args:
        seller_id: Seller id; names the output file ``<seller_id>.html``.
        merchant_url: URL of the merchant's profile page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Failed requests are reported with ``print`` and nothing is written,
        so the merchant is retried on the next run.
    """
    out_path = merchant_dir / f"{seller_id}.html"
    # A non-empty file from an earlier run counts as already downloaded.
    if out_path.exists() and out_path.stat().st_size > 0:
        return
    try:
        response = requests.get(merchant_url, headers=headers, cookies=cookies, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers HTTP error statuses as well as connection errors and timeouts,
        # so one failing merchant does not abort the whole starmap batch.
        print(f"Failed to download {merchant_url}: {e}")
        return
    out_path.write_text(response.text)
        
# Download all merchant pages in parallel; starmap unpacks each
# (seller_id, merchant_url) tuple into the function's two arguments.
with mp.Pool() as pool:
    pool.starmap(download_merchant_page, merchant_tuples)

Failed to download https://www.amazon.com/gp/help/seller/at-a-glance.html/ref=dp_merchant_link?ie=UTF8&seller=A2BMZVUOJQXIR4&asin=B0C48VPFL5&ref_=dp_merchant_link&isAmazonFulfilled=1: 503 Server Error: Service Unavailable for url: https://www.amazon.com/gp/help/seller/at-a-glance.html/ref=dp_merchant_link?ie=UTF8&seller=A2BMZVUOJQXIR4&asin=B0C48VPFL5&ref_=dp_merchant_link&isAmazonFulfilled=1
Failed to download https://www.amazon.com/gp/help/seller/at-a-glance.html/ref=dp_merchant_link?ie=UTF8&seller=AUYFSB2Q2KQH5&asin=B0C1Q7BW7Q&ref_=dp_merchant_link&isAmazonFulfilled=1: 503 Server Error: Service Unavailable for url: https://www.amazon.com/gp/help/seller/at-a-glance.html/ref=dp_merchant_link?ie=UTF8&seller=AUYFSB2Q2KQH5&asin=B0C1Q7BW7Q&ref_=dp_merchant_link&isAmazonFulfilled=1
Failed to download https://www.amazon.com/gp/help/seller/at-a-glance.html/ref=dp_merchant_link?ie=UTF8&seller=A14PHGEXMWRTYL&asin=B07BGD5S7W&ref_=dp_merchant_link&isAmazonFulfilled=1: 503 Server Error: Service Un

In [39]:
def parse_merchant_page(path):
    """Parse a saved merchant profile page into a flat record.

    Args:
        path: Path of a saved merchant HTML file; its stem is the seller id.

    Returns:
        dict with the keys ``seller_id``, ``seller_name``,
        ``seller_company_name``, ``seller_address`` and ``seller_description``.
        A field whose element is not on the page is ``None`` (``seller_address``
        is ``""``), so one incomplete page does not abort a batch.
    """
    tree = ET.HTML(path.read_text())

    business_name = None
    business_address = ''
    detail_divs = tree.cssselect("div#page-section-detail-seller-info")
    if detail_divs:
        detail_div = detail_divs[0]
        # NOTE(review): both XPaths start with "//", so they match anywhere in
        # the document rather than only inside detail_div - confirm whether
        # that is intended before scoping them with ".//".
        # The business name is the span right after the bold "Business Name:" label.
        business_name_xpath = "//span[contains(@class, 'a-text-bold') and contains(text(), 'Business Name:')]/following-sibling::span[1]/text()"
        business_names = detail_div.xpath(business_name_xpath)
        if business_names:
            business_name = business_names[0].strip()

        # The address is spread over the indented divs after the "Business Address:" label.
        business_address_parts_xpath = "//span[contains(@class, 'a-text-bold') and contains(text(), 'Business Address:')]/following::div[contains(@class, 'indent-left')]/span/text()"
        business_address_parts = detail_div.xpath(business_address_parts_xpath)
        business_address = ', '.join([part.strip() for part in business_address_parts])

    seller_description_divs = tree.cssselect("div#spp-expander-about-seller")
    if seller_description_divs:
        seller_description = "".join(seller_description_divs[0].itertext()).strip()
    else:
        seller_description = None

    seller_name_els = tree.cssselect("h1#seller-name")

    return {
        "seller_id": path.stem,
        "seller_name": seller_name_els[0].text if seller_name_els else None,
        "seller_company_name": business_name,
        "seller_address": business_address,
        "seller_description": seller_description,
    }
    
# Parse every saved merchant page into one record and assemble a DataFrame.
merchant_detail_df = pd.DataFrame.from_records([parse_merchant_page(p) for p in merchant_dir.glob("*.html")])
# Bare last expression: show the frame as the cell output.
merchant_detail_df

Unnamed: 0,seller_id,seller_name,seller_company_name,seller_address,seller_description
0,A1PH85GOILL0MS,feabuccy store,"Hangzhou Zhiyi E-Commerce Co., Ltd","富春峰景世纪花园10幢309室, 桐庐县城南街道春江路8号, 杭州市, 浙江, 311501...",feabuccy store is committed to providing each ...
1,ARSKUX0969NZ,QWINEE,GuangZhou XingMiao DianZi ShangWu YouXian GongSi,"荷光路137号103房L1129号, 广州市, 天河区, 广东省, 510000, CN",QWINEE is committed to providing each customer...
2,A38IQ6YFOIP0IB,ROMROL,jiandeshijunbobaozhuangyouxiangongsi,"建德市乾潭镇新程村, 杭州市建德市, 杭州市, 浙江省, 311602, CN",ROMROL is committed to providing each customer...
3,A3CTF0JIV8PE34,bedbest-US,"Changshu Yunkailai Textile Co., Ltd.","古里镇, 淼泉淼东路, 苏州市, 常熟市, 江苏省, 215000, CN",YUSOKI is a leading home textile manufacturer....
4,A1GEGQQYUCOAXE,LUX-US,su zhou modest Trading Company Ltd.,"涅阳菩提路南段锦汇广场, 南阳市, 镇平县, 河南, 474250, CN",LUX-US is committed to providing each customer...
...,...,...,...,...,...
177,A1OTKRPYD6BHGH,KASYLAN Home,"Hangzhou Yidu e-commerce Co., Ltd","丁兰街道西子智慧产业园17栋406室, 杭州市, 上城区, 浙江省, 310000, CN",KASYLAN Home is committed to providing each cu...
178,AJCJ2BCCP1BN6,Bertte,BERTTE INC,"15181 Fairfield Ranch Road, Unit 150, Chino Hi...",About Bertte Bertte is a top-rated home textil...
179,A2ZPAITC2XZZ4F,Xstar art,chongqingxinweijiajuyouxianzerengongsi,"永川区内环东路388号2幢8-2#, 重庆市, 重庆, 402160, CN",Xstar art is committed to providing each custo...
180,APA0RQGAGAGP1,ATMOSURELY,nantongyuanhejiajukejiyouxiangongsi,"温泉大道三段399号, 花样年花样城19栋3单元1801, 成都市, 温江区, 四川省, 6...",ATMOSURELY is committed to providing each cust...


In [52]:
# Number of listings (non-null ASINs) per seller_id.
listing_counts = blanket_df.groupby("seller_id").count()["asin"]
# Attach the counts to the merchant details as "num_asins", sort so the sellers
# with the most listings come first, and export to merchant_info.xlsx.
merchant_detail_df.merge(listing_counts.rename("num_asins").to_frame(), left_on="seller_id", right_on="seller_id").sort_values("num_asins", ascending=False).to_excel("merchant_info.xlsx")

In [53]:
# Export the per-product table to blanket_info.xlsx in the working directory.
blanket_df.to_excel("blanket_info.xlsx")

In [5]:











def download_product_page(asin):
    """Return the HTML text of the product page for ``asin``, cached on disk.

    When ``html_dir`` holds no non-empty ``<asin>.html`` file, the page is
    fetched through the session ``s`` and saved first. The text is always read
    back from the file.

    NOTE: this shares its name with the earlier path-returning
    ``download_product_page`` and replaces it once this cell runs.

    Args:
        asin: Amazon product id, appended to ``product_base_url``.

    Returns:
        str: Contents of the saved HTML file.

    Raises:
        requests.exceptions.HTTPError: If the response has an error status.
    """
    out_path = html_dir / f"{asin}.html"
    is_cached = out_path.exists() and out_path.stat().st_size > 0
    if not is_cached:
        print("downloading", asin)
        page_response = s.get(product_base_url + asin)
        page_response.raise_for_status()
        out_path.write_text(page_response.text)
    return out_path.read_text()
    
def parse_asin_list(asin_list):
    """Download, parse and store every ASIN given; return the variants found.

    For each ASIN the product page is fetched (or read from cache), parsed,
    written to ``data_dir`` as ``<asin>.json``, and its cover image is saved to
    ``image_dir`` unless it is already there or the product has none.

    Expects ``download_product_page`` to return the page's HTML text (the
    text-returning definition next to this function).

    Args:
        asin_list: Iterable of ASIN strings to process.

    Returns:
        list[str]: Variant ASINs listed on the processed pages (may repeat).

    Raises:
        Exception: Any failure is printed with the offending ASIN and re-raised.
    """
    variant_asin_list = list()

    for asin in asin_list:
        try:
            text = download_product_page(asin)
            data = parse_product_page(text)
            data_path = data_dir / f"{asin}.json"
            data_path.write_text(json.dumps(data, indent=2))
            cover_image = data["cover_image"]
            # cover_image is None for pages without a landing image; calling
            # .split on it would raise AttributeError and stop the crawl.
            if cover_image:
                image_extension = cover_image.split(".")[-1]
                image_path = image_dir / f"{asin}.{image_extension}"
                if not image_path.exists():
                    image_response = s.get(cover_image)
                    image_response.raise_for_status()
                    image_path.write_bytes(image_response.content)
            variant_asin_list.extend(data["variant_asins"])
        except Exception as e:
            print(f"failed to process {asin}: {e}")
            # Bare raise re-raises the same exception with its original traceback.
            raise
    return variant_asin_list
        
        
        

def crawl(query):
    """Expand the set of known ASINs by repeatedly following variant links.

    Seeds are the ASINs of the pages already saved in ``html_dir``. Each round
    processes the pending ASINs with ``parse_asin_list`` and queues the
    variants that were not known yet; the loop ends when a round finds
    nothing new.

    Args:
        query: Currently unused. The browser-search seeding that consumed it
            is disabled, so seeds come from the cached pages only.

    Returns:
        None.
    """
    # Live-search seeding (search box + collect_asin_from_search_results +
    # cookie transfer) is disabled; the file name up to its first "." is the ASIN.
    seed_asins = [p.name.split(".")[0] for p in html_dir.glob("*.html")]
    known_asins = set(seed_asins)
    to_search = set(seed_asins)
    while True:
        print(f"scanning {len(to_search)} asins")
        discovered = set(parse_asin_list(to_search))
        to_search = discovered - known_asins
        known_asins = known_asins | discovered
        if not to_search:
            break
        
        
# Run the variant crawl. NOTE(review): crawl() currently ignores its query
# argument and seeds from the pages already saved in html_dir.
crawl("weighted knitted blanket")
    
    



scanning 833 asins


  "merchant_name": merchant_el.text if merchant_el else None,
  "merchant_url": urljoin(product_base_url, merchant_el.get("href")) if merchant_el else None,
  "seller_id": seller_id_re.search(merchant_el.get("href")).group(1) if merchant_el else None


scanning 1495 asins
downloading B0CD7Z2NXT
downloading B0C1Z5WQ6W
downloading B0CK2KGWJT
downloading B0811Q9S36


KeyboardInterrupt: 

In [8]:


# Re-sync the browser's cookies onto the session's requests side.
s.transfer_driver_cookies_to_session()

In [16]:
# Sequentially download (or reuse the cached copy of) every known product page.
for asin in asin_list:
    download_product_page(asin)

In [9]:
import lxml.etree as ET
# Fetch one product page as a sample for developing the parser. `base_url` is
# not defined anywhere in this notebook, so use `product_base_url`;
# next(iter(...)) picks the first ASIN whether `asin_list` is a list or a set.
response = s.get(product_base_url + next(iter(asin_list)))
response.raise_for_status()

In [26]:

    
# Parse the sample page fetched above and show the resulting dict as cell output.
parse_product_page(response.text)

{'product_title': 'Chenille Chunky Knit Blanket Throw （40×50 Inch）, Handmade Warm & Cozy Blanket Couch, Bed, Home Decor, Soft Breathable Fleece Banket, Christmas Thick and Giant Yarn Throws, Cream',
 'product_details': {'material': 'Chenille',
  'color': 'Cream',
  'brand': 'Maetoow',
  'special_feature': 'Skin Friendly',
  'style': 'Modern',
  'blanket_form': 'Throw Blanket',
  'age_range_(description)': 'kids, adults',
  'product_dimensions': '50"L x 40"W',
  'theme': 'Space',
  'pattern': 'Solid',
  'recommended_uses_for_product': 'household',
  'seasons': 'Winter',
  'weave_type': 'Chunky Knit',
  'product_care_instructions': 'Machine Wash',
  'size': "40''x50''",
  'fabric_type': '100% Chenille',
  'unit_count': '1 Count',
  'number_of_items': '5',
  'fabric_warmth_description': 'Medium/Heavyweight',
  'sport': 'Camping',
  'model_name': 'small',
  'item_weight': '3.74 pounds',
  'manufacturer': 'wanhong',
  'asin': 'B0B766SCS5',
  'country_of_origin': 'China',
  'item_model_numbe

In [27]:
# Rebuild the ASIN list from the product pages saved on disk. `workdir` is not
# defined anywhere in this notebook; the saved pages live in `html_dir`.
asin_list = [p.stem for p in html_dir.glob("*.html")]
asin_list

['B08SQFBQV2',
 'B0CJPPZKJC',
 'B09XHC18H9',
 'B0B4VW9CZ1',
 'B09YS1214X',
 'B0CFQCT46L',
 'B0C6MQ2N3S',
 'B0BBZFBYJL',
 'B07SRY7GBK',
 'B0C1FBJGFR',
 'B096FQR7NP',
 'B09BC4YFJM',
 'B0BPSDDNHQ',
 'B0CGJF55C7',
 'B09QQ7CZY4',
 'B0B5GFQ9HW',
 'B0C4ZYX8WP',
 'B0B7WDGG2C',
 'B08FMFTVXC',
 'B0C9KW6THW',
 'B0CPF6G6FN',
 'B0B38YC8CK',
 'B0811Q54RQ',
 'B09D7J26XF',
 'B08BYGK4HH',
 'B0C4S7R8PN',
 'B0CF1TDCPK',
 'B0CCVJFG7Z',
 'B0CKT88DDV',
 'B0CJPQ7CVQ',
 'B0CGV9R2Q4',
 'B0CGWJT9PC',
 'B0CJPRG6GL',
 'B09T3CTKP2',
 'B092HVF7FT',
 'B096FPJCF2',
 'B0BR6LBH65',
 'B0BR728TB6',
 'B0CNV9TLLW',
 'B0B2NSM1HK',
 'B099FKPCDP',
 'B08T6Q716K',
 'B0CF1SV1RP',
 'B082XYS6HB',
 'B093Q6H1XJ',
 'B09514WX62',
 'B0CB48XLH3',
 'B0CJPR48QB',
 'B0CBRDTC6P',
 'B0B61WYSYK',
 'B099FLVTLL',
 'B08DNQLXTT',
 'B0B2NTWD7Q',
 'B0BW3QQJK4',
 'B07MW2YFQ7',
 'B0CJPQCKXY',
 'B0C2PN2SBD',
 'B0CM435Y91',
 'B0C4PHZF49',
 'B0C1BTZP93',
 'B08BL8BJ11',
 'B07TBJ1ZG6',
 'B0BFH6ZW3S',
 'B0CF5KZLGX',
 'B0BZSYMVR9',
 'B0B2NT2NMZ',
 'B0BB2H68