In [1]:
import json
import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site; the helper functions build every request URL from it.
BASE_URL = "https://www.homedepot.com"

# Store ids keyed by a short location label.
STORES = dict(
    ny="6177",
    dallas="589",
)

In [3]:
def get_navigation(timeout=30):
    """Download the site's primary navigation tree.

    Requests the header/footer flyout JSON document and returns the value
    stored under ``header -> primaryNavigation``.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``primaryNavigation`` value of the decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    url = f"{BASE_URL}/hdus/en_US/DTCCOMNEW/fetch/headerFooterFlyout-8.json"
    req = requests.get(
        url,
        headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        # Without a timeout requests can block forever on a stalled connection.
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    req.raise_for_status()
    return req.json()['header']['primaryNavigation']

In [4]:
def get_nav_url(department, sub_department, product_category=None, navigation=None):
    """Resolve a navigation entry to its absolute URL.

    Walks the navigation tree: a top-level entry whose ``title`` equals
    ``department``, then an ``l2`` child whose ``name`` equals
    ``sub_department`` and, when ``product_category`` is given, an ``l3``
    child whose ``name`` equals ``product_category``.

    Args:
        department: ``title`` of the top-level navigation entry.
        sub_department: ``name`` of the second-level entry.
        product_category: Optional ``name`` of the third-level entry.
        navigation: Optional pre-fetched navigation tree. When empty or
            omitted it is fetched with ``get_navigation()``.

    Returns:
        The matched entry's ``url`` with the ``SECURE_SUPPORTED`` placeholder
        replaced by ``BASE_URL``, or ``None`` when any level is not found
        (including when an entry has no ``l2``/``l3`` children at all).
    """
    if not navigation:
        navigation = get_navigation()

    department_entry = next(
        (d for d in navigation if d['title'] == department), None
    )
    if department_entry is None:
        return None

    # `.get(...) or []`: an entry without children is a miss, not a KeyError.
    entry = next(
        (sd for sd in (department_entry.get('l2') or []) if sd['name'] == sub_department),
        None,
    )
    if entry is None:
        return None

    if product_category:
        entry = next(
            (pc for pc in (entry.get('l3') or []) if pc['name'] == product_category),
            None,
        )
        if entry is None:
            return None

    return entry['url'].replace("SECURE_SUPPORTED", BASE_URL)

In [30]:
# Sample (department, sub-department, product category) lookups.
nav_examples = [
    ("Appliances", "Dishwashers", None),
    ("Appliances", "Refrigerators", None),
    ("Decor & Furniture", "Bedroom Furniture", "Mattresses"),
]

# Download the navigation tree once and reuse it for every lookup; without
# the `navigation` argument each get_nav_url call fetches it again.
nav_tree = get_navigation()

for dp, sbdp, prct in nav_examples:
    print(get_nav_url(dp, sbdp, prct, navigation=nav_tree))

https://www.homedepot.com/b/Appliances-Dishwashers/N-5yc1vZc3po
https://www.homedepot.com/b/Appliances-Refrigerators/N-5yc1vZc3pi
https://www.homedepot.com/b/Furniture-Bedroom-Furniture-Mattresses/N-5yc1vZc7oe


In [5]:
def get_body(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        standard-library ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    req = requests.get(
        url,
        headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=timeout,
    )
    # Do not parse error pages as if they were the requested document.
    req.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever parser is
    # installed, so the resulting tree can differ between environments.
    return BeautifulSoup(req.content, "html.parser")

In [6]:
def get_brands_from_soup(soup, brand_header="Brands"):
    """Extract the brand links from a parsed category page.

    Looks inside the element with id ``ColumnRail_thd_20cf`` for the first
    ``customNav__container`` section whose ``customNav__heading`` text equals
    ``brand_header`` and collects the links of its ``<li>`` items.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find``/``find_all`` API.
        brand_header: Exact heading text of the brand section.

    Returns:
        A dict mapping each link's text (with "®" removed) to ``BASE_URL``
        followed by the link's ``href``, or ``None`` when the rail element or
        the brand section is not present. List items without a link, and
        links without an ``href``, are skipped.
    """
    nav_pane = soup.find(id="ColumnRail_thd_20cf")
    if nav_pane is None:
        # Page layout without the rail: report "not found" like a missing section.
        return None

    brand_section = None
    for section in nav_pane.find_all(class_="customNav__container"):
        heading = section.find(class_="customNav__heading")
        # Sections without a heading cannot be the brand section.
        if heading is not None and heading.text == brand_header:
            brand_section = section
            break
    if brand_section is None:
        return None

    brand_urls = {}
    for item in brand_section.find_all("li"):
        anchor = item.a
        if not anchor:
            continue
        href = anchor.get("href")
        if href is None:
            continue
        brand_urls[anchor.text.replace("®", "")] = f"{BASE_URL}{href}"

    return brand_urls

def get_brands_from_api(nav_param, store_id, sub_department, timeout=30):
    """Fetch the brand refinements of a category from the search endpoint.

    Posts the ``searchModel`` GraphQL query to
    ``{BASE_URL}/product-information/model`` and reads the ``Brand`` entry of
    the returned ``dimensions``.

    Args:
        nav_param: Value sent as the ``navParam`` query variable.
        store_id: Value sent as the ``storeId`` query variable.
        sub_department: Selects the ``x-experience-name`` header:
            ``major-appliances`` for "Dishwashers"/"Refrigerators",
            ``hd-home`` otherwise.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping each brand refinement's ``label`` to its ``url``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
        IndexError: If the response contains no ``Brand`` dimension.
    """
    # One query string, written as adjacent literals so that no literal has to
    # span several physical lines; the concatenated text is sent unchanged.
    query = (
        'query searchModel($pageSize: Int, $startIndex: Int, $orderBy: ProductSort, $filter: ProductFilter, $storeId: String, $zipCode: String, $skipInstallServices: Boolean = true, $skipSpecificationGroup: Boolean = false, $keyword: String, $navParam: String, $storefilter: StoreFilter = ALL, $channel: Channel = DESKTOP, $additionalSearchParams: AdditionalParams) { searchModel(keyword: $keyword, navParam: $navParam, storefilter: $storefilter, storeId: $storeId, channel: $channel, additionalSearchParams: $additionalSearchParams) { metadata { categoryID analytics { semanticTokens dynamicLCA __typename } canonicalUrl searchRedirect clearAllRefinementsURL contentType cpoData { cpoCount cpoOnly totalCount __typename } isStoreDisplay productCount { inStore __typename } stores { storeId storeName address { postalCode __typename } nearByStores { storeId storeName distance address { postalCode __typename } __typename } __typename } __typename } products(pageSize: $pageSize, startIndex: $startIndex, orderBy: $orderBy, filter: $filter) { identifiers { storeSkuNumber canonicalUrl brandName modelNumber productType productLabel itemId parentId isSuperSku __typename } itemId dataSources availabilityType { discontinued type __typename } badges(storeId: $storeId) { name __typename } details { collection { collectionId name url __typename } __typename } favoriteDetail { count __typename } fulfillment(storeId: $storeId, zipCode: $zipCode) { fulfillmentOptions { type fulfillable services { type locations { inventory { isInStock isLimitedQuantity isOutOfStock isUnavailable quantity maxAllowedBopisQty minAllowedBopisQty __typename } curbsidePickupFlag isBuyInStoreCheckNearBy distance isAnchor locationId state storeName storePhone type __typename } deliveryTimeline deliveryDates { startDate endDate __typename } deliveryCharge dynamicEta { hours minutes __typename } hasFreeShipping freeDeliveryThreshold totalCharge __typename } __typename } anchorStoreStatus '
        'anchorStoreStatusType backordered backorderedShipDate bossExcludedShipStates excludedShipStates seasonStatusEligible onlineStoreStatus onlineStoreStatusType __typename } info { isBuryProduct isSponsored isGenericProduct isLiveGoodsProduct sponsoredBeacon { onClickBeacon onViewBeacon __typename } sponsoredMetadata { campaignId placementId slotId __typename } globalCustomConfigurator { customExperience __typename } returnable hidePrice productSubType { name link __typename } categoryHierarchy ecoRebate quantityLimit sskMin sskMax unitOfMeasureCoverage wasMaxPriceRange wasMinPriceRange swatches { isSelected itemId label swatchImgUrl url value __typename } totalNumberOfOptions __typename } installServices @skip(if: $skipInstallServices) { scheduleAMeasure __typename } media { images { url type subType sizes __typename } __typename } reviews { ratingsReviews { averageRating totalReviews __typename } __typename } pricing(storeId: $storeId) { value alternatePriceDisplay alternate { bulk { pricePerUnit thresholdQuantity value __typename } unit { caseUnitOfMeasure unitsOriginalPrice unitsPerCase value __typename } __typename } original mapAboveOriginalPrice message promotion { type description { shortDesc longDesc __typename } dollarOff percentageOff savingsCenter savingsCenterPromos specialBuySavings specialBuyDollarOff specialBuyPercentageOff dates { start end __typename } experienceTag __typename } specialBuy unitOfMeasure __typename } keyProductFeatures { keyProductFeaturesItems { features { name refinementId refinementUrl value __typename } __typename } __typename } specificationGroup @skip(if: $skipSpecificationGroup) { specifications { specName specValue __typename } specTitle __typename } sizeAndFitDetail { attributeGroups { attributes { attributeName dimensions __typename } dimensionLabel productType __typename } __typename } __typename } searchReport { totalProducts didYouMean correctedKeyword keyword pageSize searchUrl sortBy sortOrder startIndex __typename } '
        'relatedResults { universalSearch { title __typename } relatedServices { label __typename } visualNavs { label imageId webUrl categoryId imageURL __typename } visualNavContainsEvents relatedKeywords { keyword __typename } __typename } taxonomy { brandLinkUrl breadCrumbs { browseUrl creativeIconUrl deselectUrl dimensionId dimensionName label refinementKey url __typename } __typename } templates partialTemplates dimensions { label refinements { refinementKey label recordCount selected imgUrl url nestedRefinements { label url recordCount refinementKey __typename } __typename } collapse dimensionId isVisualNav nestedRefinementsLimit visualNavSequence __typename } orangeGraph { universalSearchArray { pods { title description imageUrl link __typename } info { title __typename } __typename } __typename } id appliedDimensions { label refinements { label refinementKey url __typename } __typename } __typename }}'
    )

    if sub_department in {"Dishwashers", "Refrigerators"}:
        experience_name = 'major-appliances'
    else:
        experience_name = 'hd-home'

    req = requests.post(
        f"{BASE_URL}/product-information/model",
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'x-experience-name': experience_name,
        },
        json={
            "operationName": "searchModel",
            "query": query,
            "variables": {
                "navParam": nav_param,
                "storeId": store_id,
            },
        },
        # Without a timeout requests can block forever on a stalled connection.
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as a JSON/KeyError below.
    req.raise_for_status()

    dims = req.json()['data']['searchModel']['dimensions']
    brand_dim = next((d for d in dims if d['label'] == "Brand"), None)
    if brand_dim is None:
        # Same exception type as the former `[...][0]` lookup, with a message.
        raise IndexError("search response contains no 'Brand' dimension")

    return {b['label']: b['url'] for b in brand_dim['refinements']}

In [7]:
def get_products(nav_param, store_id, sub_department, page_size=None, start_index=None, timeout=30):
    """Fetch the products of a category/brand from the search endpoint.

    Posts the ``searchModel`` GraphQL query to
    ``{BASE_URL}/product-information/model`` and returns the ``products``
    list of the response.

    Args:
        nav_param: Value sent as the ``navParam`` query variable.
        store_id: Value sent as the ``storeId`` query variable.
        sub_department: Selects the ``x-experience-name`` header:
            ``major-appliances`` for "Dishwashers"/"Refrigerators",
            ``hd-home`` otherwise.
        page_size: Optional value for the query's ``$pageSize`` variable.
            Omitted from the request when ``None``.
        start_index: Optional value for the query's ``$startIndex`` variable.
            Omitted from the request when ``None``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list found under ``data -> searchModel -> products``; an empty
        list when that value is null.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    # One query string, written as adjacent literals so that no literal has to
    # span several physical lines; the concatenated text is sent unchanged.
    query = (
        'query searchModel($pageSize: Int, $startIndex: Int, $orderBy: ProductSort, $filter: ProductFilter, $storeId: String, $zipCode: String, $skipInstallServices: Boolean = true, $skipSpecificationGroup: Boolean = false, $keyword: String, $navParam: String, $storefilter: StoreFilter = ALL, $channel: Channel = DESKTOP, $additionalSearchParams: AdditionalParams) { searchModel(keyword: $keyword, navParam: $navParam, storefilter: $storefilter, storeId: $storeId, channel: $channel, additionalSearchParams: $additionalSearchParams) { metadata { categoryID analytics { semanticTokens dynamicLCA __typename } canonicalUrl searchRedirect clearAllRefinementsURL contentType cpoData { cpoCount cpoOnly totalCount __typename } isStoreDisplay productCount { inStore __typename } stores { storeId storeName address { postalCode __typename } nearByStores { storeId storeName distance address { postalCode __typename } __typename } __typename } __typename } products(pageSize: $pageSize, startIndex: $startIndex, orderBy: $orderBy, filter: $filter) { identifiers { storeSkuNumber canonicalUrl brandName modelNumber productType productLabel itemId parentId isSuperSku __typename } itemId dataSources availabilityType { discontinued type __typename } badges(storeId: $storeId) { name __typename } details { collection { collectionId name url __typename } __typename } favoriteDetail { count __typename } fulfillment(storeId: $storeId, zipCode: $zipCode) { fulfillmentOptions { type fulfillable services { type locations { inventory { isInStock isLimitedQuantity isOutOfStock isUnavailable quantity maxAllowedBopisQty minAllowedBopisQty __typename } curbsidePickupFlag isBuyInStoreCheckNearBy distance isAnchor locationId state storeName storePhone type __typename } deliveryTimeline deliveryDates { startDate endDate __typename } deliveryCharge dynamicEta { hours minutes __typename } hasFreeShipping freeDeliveryThreshold totalCharge __typename } __typename } anchorStoreStatus '
        'anchorStoreStatusType backordered backorderedShipDate bossExcludedShipStates excludedShipStates seasonStatusEligible onlineStoreStatus onlineStoreStatusType __typename } info { isBuryProduct isSponsored isGenericProduct isLiveGoodsProduct sponsoredBeacon { onClickBeacon onViewBeacon __typename } sponsoredMetadata { campaignId placementId slotId __typename } globalCustomConfigurator { customExperience __typename } returnable hidePrice productSubType { name link __typename } categoryHierarchy ecoRebate quantityLimit sskMin sskMax unitOfMeasureCoverage wasMaxPriceRange wasMinPriceRange swatches { isSelected itemId label swatchImgUrl url value __typename } totalNumberOfOptions __typename } installServices @skip(if: $skipInstallServices) { scheduleAMeasure __typename } media { images { url type subType sizes __typename } __typename } reviews { ratingsReviews { averageRating totalReviews __typename } __typename } pricing(storeId: $storeId) { value alternatePriceDisplay alternate { bulk { pricePerUnit thresholdQuantity value __typename } unit { caseUnitOfMeasure unitsOriginalPrice unitsPerCase value __typename } __typename } original mapAboveOriginalPrice message promotion { type description { shortDesc longDesc __typename } dollarOff percentageOff savingsCenter savingsCenterPromos specialBuySavings specialBuyDollarOff specialBuyPercentageOff dates { start end __typename } experienceTag __typename } specialBuy unitOfMeasure __typename } keyProductFeatures { keyProductFeaturesItems { features { name refinementId refinementUrl value __typename } __typename } __typename } specificationGroup @skip(if: $skipSpecificationGroup) { specifications { specName specValue __typename } specTitle __typename } sizeAndFitDetail { attributeGroups { attributes { attributeName dimensions __typename } dimensionLabel productType __typename } __typename } __typename } searchReport { totalProducts didYouMean correctedKeyword keyword pageSize searchUrl sortBy sortOrder startIndex __typename } '
        'relatedResults { universalSearch { title __typename } relatedServices { label __typename } visualNavs { label imageId webUrl categoryId imageURL __typename } visualNavContainsEvents relatedKeywords { keyword __typename } __typename } taxonomy { brandLinkUrl breadCrumbs { browseUrl creativeIconUrl deselectUrl dimensionId dimensionName label refinementKey url __typename } __typename } templates partialTemplates dimensions { label refinements { refinementKey label recordCount selected imgUrl url nestedRefinements { label url recordCount refinementKey __typename } __typename } collapse dimensionId isVisualNav nestedRefinementsLimit visualNavSequence __typename } orangeGraph { universalSearchArray { pods { title description imageUrl link __typename } info { title __typename } __typename } __typename } id appliedDimensions { label refinements { label refinementKey url __typename } __typename } __typename }}'
    )

    if sub_department in {"Dishwashers", "Refrigerators"}:
        experience_name = 'major-appliances'
    else:
        experience_name = 'hd-home'

    variables = {
        "navParam": nav_param,
        "storeId": store_id,
    }
    # The query declares $pageSize/$startIndex; send them only on request so
    # the default call issues exactly the same variables as before.
    if page_size is not None:
        variables["pageSize"] = page_size
    if start_index is not None:
        variables["startIndex"] = start_index

    req = requests.post(
        f"{BASE_URL}/product-information/model",
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'x-experience-name': experience_name,
        },
        json={
            "operationName": "searchModel",
            "query": query,
            "variables": variables,
        },
        # Without a timeout requests can block forever on a stalled connection.
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as a JSON/KeyError below.
    req.raise_for_status()

    products = req.json()['data']['searchModel']['products']

    # A null product list is returned as an empty list so callers can iterate.
    return products or []

In [16]:
def transform_product(product):
    """Flatten one raw search-result product into a plain record.

    Args:
        product: A product dict as returned in the search response. It must
            contain ``identifiers`` and ``itemId``; ``availabilityType``,
            ``reviews``, ``pricing`` and ``keyProductFeatures`` may be
            missing, null or empty.

    Returns:
        A dict with the keys ``url``, ``brand``, ``model_number``,
        ``product_type``, ``product_label``, ``item_id``, ``availability``
        (``None`` for discontinued products), ``average_rating``,
        ``reviews_number``, ``price`` and ``features``. ``features`` maps the
        lower-cased, underscore-joined name of every feature in the first
        feature group to its value. Values whose source section is absent are
        ``None`` (``features`` is ``{}``).
    """
    identifiers = product['identifiers']

    # Optional sections: treat a missing key and an explicit null the same.
    availability = product.get('availabilityType') or {}
    ratings = (product.get('reviews') or {}).get('ratingsReviews') or {}
    pricing = product.get('pricing') or {}

    feature_groups = (product.get('keyProductFeatures') or {}).get('keyProductFeaturesItems') or []
    # Only the first feature group is exported.
    first_group = (feature_groups[0] if feature_groups else None) or {}
    features = first_group.get('features') or []

    return {
        "url": identifiers['canonicalUrl'],
        "brand": identifiers['brandName'],
        "model_number": identifiers['modelNumber'],
        "product_type": identifiers['productType'],
        "product_label": identifiers['productLabel'],
        "item_id": product['itemId'],
        "availability": None if availability.get('discontinued') else availability.get('type'),
        "average_rating": ratings.get('averageRating'),
        "reviews_number": ratings.get('totalReviews'),
        "price": pricing.get('value'),
        "features": {"_".join(f['name'].lower().split()): f['value'] for f in features},
    }

In [17]:
def get_appliances(brand, store_id, sub_department, *args, **kwargs):
    """Fetch one brand's products for an appliance sub-department.

    Resolves the sub-department page through the navigation, reads the brand
    links from that page and queries the products for the selected brand.

    Args:
        brand: Brand name as it appears in the page's brand section.
        store_id: Store id forwarded to ``get_products``.
        sub_department: Either "Dishwashers" or "Refrigerators".
        *args: Ignored.
        **kwargs: Ignored.

    Returns:
        The raw product list returned by ``get_products``.

    Raises:
        ValueError: If ``sub_department`` is not supported, or no navigation
            URL exists for it.
        KeyError: If the page has no brand section or ``brand`` is not
            listed in it.
    """
    if sub_department not in {"Dishwashers", "Refrigerators"}:
        raise ValueError("Wrong sub department")

    url = get_nav_url("Appliances", sub_department)
    if url is None:
        raise ValueError(f"No navigation entry for Appliances / {sub_department}")

    soup = get_body(url)
    # The brand section is titled differently on the two pages.
    brand_header = "Brands" if sub_department == "Dishwashers" else "Top Refrigerator Brands"
    nav_brands = get_brands_from_soup(soup, brand_header)
    # get_brands_from_soup returns None when the section is missing.
    if not nav_brands or brand not in nav_brands:
        raise KeyError(f"Brand {brand!r} not found for {sub_department}")

    brand_url = nav_brands[brand]
    # The navigation parameter is whatever follows the last "N-" in the URL.
    prods = get_products(brand_url.split("N-")[-1], store_id, sub_department)

    return prods

def get_mattresses(brand, store_id, sub_department, *args, **kwargs):
    """Fetch one brand's products from the Mattresses category.

    Resolves the "Decor & Furniture / Bedroom Furniture / Mattresses"
    navigation entry, asks the search endpoint for that category's brand
    refinements and queries the products for the selected brand.

    Args:
        brand: Brand label as returned by ``get_brands_from_api``.
        store_id: Store id forwarded to the API helpers.
        sub_department: Forwarded unchanged to ``get_brands_from_api`` and
            ``get_products``.
        *args: Ignored.
        **kwargs: Ignored.

    Returns:
        The raw product list returned by ``get_products``.

    Raises:
        ValueError: If the Mattresses navigation entry cannot be resolved.
        KeyError: If ``brand`` is not among the category's brands.
    """
    category_url = get_nav_url("Decor & Furniture", "Bedroom Furniture", "Mattresses")
    if category_url is None:
        # get_nav_url returns None for unknown entries; fail with a clear message.
        raise ValueError(
            "No navigation entry for Decor & Furniture / Bedroom Furniture / Mattresses"
        )

    # The navigation parameter is whatever follows the last "N-" in the URL.
    nv_prm = category_url.split("N-")[-1]
    nav_brands = get_brands_from_api(nv_prm, store_id, sub_department)
    if not nav_brands or brand not in nav_brands:
        raise KeyError(f"Brand {brand!r} not found for Mattresses")

    brand_url = nav_brands[brand]
    prods = get_products(brand_url.split("N-")[-1], store_id, sub_department)

    return prods

def products_factory(brand, store_id, sub_department):
    """Fetch and flatten the products for one brand/store combination.

    Picks the fetcher registered for ``sub_department`` ("Dishwashers" and
    "Refrigerators" use ``get_appliances``, "Decor & Furniture" uses
    ``get_mattresses``), calls it with the three arguments and converts every
    returned product with ``transform_product``.

    Raises:
        KeyError: If no fetcher is registered for ``sub_department``.
    """
    fetchers = dict.fromkeys(("Dishwashers", "Refrigerators"), get_appliances)
    fetchers["Decor & Furniture"] = get_mattresses

    fetch = fetchers[sub_department]
    raw_products = fetch(brand, store_id, sub_department)

    return list(map(transform_product, raw_products))

In [18]:
# (products_factory key, brand) pairs to scrape, in processing order.
data = [
    (category, brand_name)
    for category, brand_names in (
        ("Dishwashers", ("LG", "Samsung")),
        ("Refrigerators", ("Whirlpool", "GE Appliances")),
        ("Decor & Furniture", ("Sealy",)),
    )
    for brand_name in brand_names
]

In [24]:
def to_snake_case(string):
    """Lower-case ``string`` and join its whitespace-separated words with "_"."""
    lowered = string.lower()
    words = lowered.split()
    return '_'.join(words)

In [23]:
import os

# Directory that receives the exported JSON files.
prod_dir = "products"
# exist_ok=True replaces the separate exists() check (no check-then-create
# race) and makes re-running this cell a no-op.
os.makedirs(prod_dir, exist_ok=True)

In [28]:
import time

# For every (category, brand) pair and every store: fetch the products, time
# the fetch, and write the records to <store>_<category>_<brand>.json.
for sub_dep, brnd in data:
    for store_loc, store_id in STORES.items():
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        products = products_factory(brnd, store_id, sub_dep)
        end = time.perf_counter()

        file_path = os.path.join(prod_dir, f"{store_loc}_{to_snake_case(sub_dep)}_{to_snake_case(brnd)}.json")
        # Explicit encoding keeps the output independent of the platform default.
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(products, f)

        print(f"Elapsed: {end - start}; nb products: {len(products)}")

Elapsed: 1.343376636505127; nb products: 10
Elapsed: 1.5330893993377686; nb products: 10
Elapsed: 1.2989161014556885; nb products: 7
Elapsed: 1.0974972248077393; nb products: 7
Elapsed: 1.2269799709320068; nb products: 24
Elapsed: 1.4655628204345703; nb products: 24
Elapsed: 1.3327343463897705; nb products: 24
Elapsed: 1.7615044116973877; nb products: 24
Elapsed: 1.2806460857391357; nb products: 24
Elapsed: 1.3475773334503174; nb products: 24
