In [None]:
# !pip install -U crawl4ai

In [25]:
# Display helpers for rendering rich output in notebook cells.
from IPython.display import Markdown, HTML
from typing import Optional, Union, Dict, Any
import json
# NOTE(review): nest_asyncio's apply() is called once at import time —
# presumably so the asyncio.run(...) calls in later cells can run inside the
# notebook's already-running event loop; confirm against nest_asyncio docs.
from nest_asyncio import apply
apply()

import os
# Load environment variables from a .env file; later cells read PROXY_ and
# OPENAI_API_KEY through os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

In [13]:
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig

### Basics

In [2]:
async def main(url: str):
    """Crawl a single page, print its markdown, and return the crawl result.

    Args:
        url: Address of the page to crawl.

    Returns:
        Whatever ``AsyncWebCrawler.arun`` produced for ``url``.
    """
    # Default browser settings; the run itself asks for overlay elements
    # to be removed.
    default_browser = BrowserConfig()
    overlay_free_run = CrawlerRunConfig(remove_overlay_elements=True)

    async with AsyncWebCrawler(config=default_browser) as crawler:
        page = await crawler.arun(url=url, config=overlay_free_run)
        print(page.markdown)
        return page

In [None]:
# Lazada product page used as the crawl target. The long query string
# (clickTrackInfo, request_id, priceCompare, ...) is kept verbatim.
url = "https://www.lazada.co.th/products/asus-rog-zephyrus-g16-2024-gaming-laptop-16-240hz-wqxga-oled-nvidia-geforce-rtx-4070-intel-arc-graphics-intel-core-ultra-9-185h-32gb-16x2-lpddr5x-1tb-pcie-40-nvme-m2-ssd-rgb-keyboard-gu605mi-qr225wf-i5198958811-s22092173635.html?c=&channelLpJumpArgs=&clickTrackInfo=query%253Aasus%252Brog%252Blaptop%253Bnid%253A5198958811%253Bsrc%253ALazadaMainSrp%253Brn%253A77e039c3d48aee0916f89ae27d0f03c4%253Bregion%253Ath%253Bsku%253A5198958811_TH%253Bprice%253A85990%253Bclient%253Adesktop%253Bsupplier_id%253A100179987420%253Bbiz_source%253Ah5_internal%253Bslot%253A0%253Butlog_bucket_id%253A470687%253Basc_category_id%253A6372%253Bitem_id%253A5198958811%253Bsku_id%253A22092173635%253Bshop_id%253A1029865%253BtemplateInfo%253A107882_D_E%2523-1_A3_C%2523&freeshipping=0&fs_ab=2&fuse_fs=&lang=en&location=Bangkok&price=8.599E%204&priceCompare=skuId%3A22092173635%3Bsource%3Alazada-search-voucher%3Bsn%3A77e039c3d48aee0916f89ae27d0f03c4%3BoriginPrice%3A8599000%3BdisplayPrice%3A8599000%3BsinglePromotionId%3A-1%3BsingleToolCode%3AmockedSalePrice%3BvoucherPricePlugin%3A0%3Btimestamp%3A1744768368447&ratingscore=&request_id=77e039c3d48aee0916f89ae27d0f03c4&review=&sale=1&search=1&source=search&spm=a2o4m.searchlist.list.0&stock=1"
# Drive the async `main` coroutine to completion and keep what it returns.
result = asyncio.run(main(url))

In [None]:
# Render the crawled page's markdown as rich output. `.markdown` is read
# directly from `result`, the same way `main` does when it prints it, so this
# cell does not additionally require the return value to be subscriptable.
Markdown(result.markdown)

### Deep Crawling

In [3]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

In [None]:
async def main(url: str):
    """Deep-crawl starting at ``url`` and return every result collected.

    The run uses a BFS deep-crawl strategy (``max_depth=2``,
    ``include_external=False``), the LXML scraping strategy, and verbose
    logging. A short summary of the first three results is printed.

    Args:
        url: Starting address for the crawl.

    Returns:
        The results object returned by ``AsyncWebCrawler.arun``.
    """
    # Two-level breadth-first crawl.
    bfs_strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
    config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun(url, config=config)
        print(f"Crawled {len(results)} pages in total")

        # Only preview the first three pages to keep the output short.
        preview = results[:3]
        for page in preview:
            print(f"URL: {page.url}")
            print(f"Depth: {page.metadata.get('depth', 0)}")

        return results

# Starting page for the deep crawl: the Lazada product URL, query string
# kept verbatim.
url = "https://www.lazada.co.th/products/asus-rog-zephyrus-g16-2024-gaming-laptop-16-240hz-wqxga-oled-nvidia-geforce-rtx-4070-intel-arc-graphics-intel-core-ultra-9-185h-32gb-16x2-lpddr5x-1tb-pcie-40-nvme-m2-ssd-rgb-keyboard-gu605mi-qr225wf-i5198958811-s22092173635.html?c=&channelLpJumpArgs=&clickTrackInfo=query%253Aasus%252Brog%252Blaptop%253Bnid%253A5198958811%253Bsrc%253ALazadaMainSrp%253Brn%253A77e039c3d48aee0916f89ae27d0f03c4%253Bregion%253Ath%253Bsku%253A5198958811_TH%253Bprice%253A85990%253Bclient%253Adesktop%253Bsupplier_id%253A100179987420%253Bbiz_source%253Ah5_internal%253Bslot%253A0%253Butlog_bucket_id%253A470687%253Basc_category_id%253A6372%253Bitem_id%253A5198958811%253Bsku_id%253A22092173635%253Bshop_id%253A1029865%253BtemplateInfo%253A107882_D_E%2523-1_A3_C%2523&freeshipping=0&fs_ab=2&fuse_fs=&lang=en&location=Bangkok&price=8.599E%204&priceCompare=skuId%3A22092173635%3Bsource%3Alazada-search-voucher%3Bsn%3A77e039c3d48aee0916f89ae27d0f03c4%3BoriginPrice%3A8599000%3BdisplayPrice%3A8599000%3BsinglePromotionId%3A-1%3BsingleToolCode%3AmockedSalePrice%3BvoucherPricePlugin%3A0%3Btimestamp%3A1744768368447&ratingscore=&request_id=77e039c3d48aee0916f89ae27d0f03c4&review=&sale=1&search=1&source=search&spm=a2o4m.searchlist.list.0&stock=1"
# Run the deep-crawl coroutine defined above and keep all returned results.
results = asyncio.run(main(url))

### Customize

In [18]:
from crawl4ai import (
    AsyncWebCrawler, BrowserConfig, 
    CrawlerRunConfig, LLMConfig,
    LLMExtractionStrategy
)
from pydantic import BaseModel, Field

In [15]:
# Browser settings for the customised crawl. The "default" remarks are the
# notebook author's notes on the library defaults.
browser_config = BrowserConfig(
    # Run the browser headless (default is True).
    headless=True,
    # user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    # Proxy address comes from the PROXY_ environment variable (None if unset).
    proxy=os.getenv("PROXY_"),
    # Ignore HTTPS errors (default is True).
    ignore_https_errors=True,
    # Keep JavaScript enabled (default is True).
    java_script_enabled=True,
    # Default is False. If True, tries to disable images/other heavy content
    # for speed.
    text_mode=True,
)

In [19]:
class Product(BaseModel):
    """Schema describing the laptop product details to extract from a page.

    ``Product.model_json_schema()`` is passed to the LLM extraction strategy,
    so every field's ``description`` and example value ends up in the schema
    the model is asked to fill in.

    Example values are attached via ``json_schema_extra={"example": ...}``
    instead of an ad-hoc ``example=`` keyword: Pydantic v2 deprecates
    arbitrary extra keyword arguments on ``Field`` and emits a warning for
    them. The generated JSON schema still carries the same ``"example"`` key.

    Only ``brand``, ``model`` and ``current_price`` are non-nullable; every
    other field is typed ``Optional`` so it may be ``None`` when the page
    does not state it.
    """

    brand: str = Field(
        description="The brand of the product",
        json_schema_extra={"example": "ASUS"},
    )
    model: str = Field(
        description="The model of the product",
        json_schema_extra={"example": "ASUS ROG Zephyrus G16"},
    )
    current_price: float = Field(
        description="The current price of the product",
        json_schema_extra={"example": 1000},
    )
    original_price: Optional[float] = Field(
        description="The original price of the product",
        json_schema_extra={"example": 1000},
    )
    seller: Optional[str] = Field(
        description="The seller of the product. Include the name of the seller and the rating of the seller",
        json_schema_extra={"example": "Amazon"},
    )
    cpu: Optional[str] = Field(
        description="The cpu of the product. Include the type of cpu and the speed of the cpu",
        json_schema_extra={"example": "Intel Core i7-12700H"},
    )
    gpu: Optional[str] = Field(
        description="The gpu of the product. Include the type of gpu and the size of the gpu",
        json_schema_extra={"example": "NVIDIA GeForce RTX 4070"},
    )
    ram: Optional[str] = Field(
        description="The ram of the product. Include the type of ram and the size of the ram",
        json_schema_extra={"example": "16GB DDR5"},
    )
    storage: Optional[str] = Field(
        description="The storage of the product. Include the type of storage and the size of the storage",
        json_schema_extra={"example": "1TB SSD"},
    )
    display: Optional[str] = Field(
        description="The display of the product. Include the type of display and the size of the display",
        json_schema_extra={"example": "15.6 inch"},
    )
    warranty: Optional[str] = Field(
        description="The warranty of the product. Include the type of warranty and the duration of the warranty",
        json_schema_extra={"example": "2 years"},
    )
    color: Optional[str] = Field(
        description="The color of the product",
        json_schema_extra={"example": "Black"},
    )
    weight: Optional[str] = Field(
        description="The weight of the product",
        json_schema_extra={"example": "2.5 kg"},
    )
    dimensions: Optional[str] = Field(
        description="The dimensions of the product",
        json_schema_extra={"example": "39.6 x 26.5 x 2.1 cm"},
    )
    battery: Optional[str] = Field(
        description="The battery of the product",
        json_schema_extra={"example": "5000 mAh"},
    )

# LLM backend used for extraction; the API token is taken from the
# OPENAI_API_KEY environment variable (None if it is not set).
llm_config = LLMConfig(
    api_token=os.getenv("OPENAI_API_KEY"),
    provider="openai/gpt-4o-mini",
)

# LLM-driven extraction: the model is given the Product JSON schema and the
# instruction below, and works from the page's HTML (input_format="html").
# The instruction text is sent to the model verbatim, so its wording matters;
# the "User your best judgement" typo has been corrected to "Use".
extraction_strategy = LLMExtractionStrategy(
    llm_config=llm_config,
    schema=Product.model_json_schema(),
    extraction_type="schema",
    instruction="""
    Extract the product information from the text.
    You are extracting ecommerce product information.
    Specifically these are laptop information.
    Use your best judgement to extract the information.
    DO NOT make up any information.
    """,
    # Extra arguments for the LLM call: low temperature, capped token count.
    extra_args={"temperature": 0.1, "max_tokens": 5000},
    input_format="html"
)

# Per-run crawl options for the customised crawl. The "default" remarks are
# the notebook author's notes on the library defaults.
run_cfg = CrawlerRunConfig(
    # Default is False. If True, tries to extract text-only content.
    only_text=True,
    # Default is False. If True, tries to scan the full page.
    scan_full_page=True,
    # Default is False. If True, simulates user interactions (mouse
    # movements) to avoid bot detection.
    simulate_user=True,
    # Default is False. If True, automatic handling of popups/consent
    # banners. Experimental.
    magic=True,
    # Default is False. If True, tries to save the page as a PDF.
    pdf=True,
    # Run the LLM-based extraction strategy defined above on the page.
    extraction_strategy=extraction_strategy,
)

In [20]:
async def main(url: str):
    """Crawl ``url`` with the notebook-level browser and run configuration.

    Uses the module-level ``browser_config`` and ``run_cfg`` objects.

    Args:
        url: Address of the page to crawl.

    Returns:
        Whatever ``AsyncWebCrawler.arun`` produced for ``url``.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        return await crawler.arun(url=url, config=run_cfg)
    
# Target for the customised crawl: the Lazada product URL, query string kept
# verbatim.
url = "https://www.lazada.co.th/products/asus-rog-zephyrus-g16-2024-gaming-laptop-16-240hz-wqxga-oled-nvidia-geforce-rtx-4070-intel-arc-graphics-intel-core-ultra-9-185h-32gb-16x2-lpddr5x-1tb-pcie-40-nvme-m2-ssd-rgb-keyboard-gu605mi-qr225wf-i5198958811-s22092173635.html?c=&channelLpJumpArgs=&clickTrackInfo=query%253Aasus%252Brog%252Blaptop%253Bnid%253A5198958811%253Bsrc%253ALazadaMainSrp%253Brn%253A77e039c3d48aee0916f89ae27d0f03c4%253Bregion%253Ath%253Bsku%253A5198958811_TH%253Bprice%253A85990%253Bclient%253Adesktop%253Bsupplier_id%253A100179987420%253Bbiz_source%253Ah5_internal%253Bslot%253A0%253Butlog_bucket_id%253A470687%253Basc_category_id%253A6372%253Bitem_id%253A5198958811%253Bsku_id%253A22092173635%253Bshop_id%253A1029865%253BtemplateInfo%253A107882_D_E%2523-1_A3_C%2523&freeshipping=0&fs_ab=2&fuse_fs=&lang=en&location=Bangkok&price=8.599E%204&priceCompare=skuId%3A22092173635%3Bsource%3Alazada-search-voucher%3Bsn%3A77e039c3d48aee0916f89ae27d0f03c4%3BoriginPrice%3A8599000%3BdisplayPrice%3A8599000%3BsinglePromotionId%3A-1%3BsingleToolCode%3AmockedSalePrice%3BvoucherPricePlugin%3A0%3Btimestamp%3A1744768368447&ratingscore=&request_id=77e039c3d48aee0916f89ae27d0f03c4&review=&sale=1&search=1&source=search&spm=a2o4m.searchlist.list.0&stock=1"
# Run the crawl-and-extract coroutine and keep what it returns.
results = asyncio.run(main(url))

[INIT].... → Crawl4AI 0.5.0.post8
[EXPORT].. ℹ Exporting PDF and taking screenshot took 0.44s
[FETCH]... ↓ https://www.lazada.co.th/products/asus-rog-zephyru... | Status: True | Time: 4.71s
[SCRAPE].. ◆ https://www.lazada.co.th/products/asus-rog-zephyru... | Time: 0.149s
[EXTRACT]. ■ Completed for https://www.lazada.co.th/products/asus-rog-zephyru... | Time: 20.78199329203926s
[COMPLETE] ● https://www.lazada.co.th/products/asus-rog-zephyru... | Status: True | Total: 25.65s


In [26]:
# Parse the JSON text held in the first result's `extracted_content` and
# display it as Python objects.
# NOTE(review): the recorded output repeats the same product record several
# times — presumably one record per extracted chunk; confirm and de-duplicate
# if a single record is expected.
json.loads(results[0].extracted_content)

[{'brand': 'ASUS',
  'model': 'ROG Zephyrus G16',
  'current_price': 85990,
  'original_price': None,
  'seller': None,
  'cpu': 'Intel Core Ultra 9-185H',
  'gpu': 'NVIDIA GeForce RTX 4070',
  'ram': '32GB LPDDR5X',
  'storage': '1TB PCIe 4.0 NVMe M.2 SSD',
  'display': '16 inch WQXGA OLED 240Hz',
  'warranty': None,
  'color': None,
  'weight': None,
  'dimensions': None,
  'battery': None,
  'error': False},
 {'brand': 'ASUS',
  'model': 'ROG Zephyrus G16',
  'current_price': 85990,
  'original_price': None,
  'seller': None,
  'cpu': 'Intel Core Ultra 9-185H',
  'gpu': 'NVIDIA GeForce RTX 4070',
  'ram': '32GB LPDDR5X',
  'storage': '1TB PCIe 4.0 NVMe M.2 SSD',
  'display': '16 inch WQXGA OLED 240Hz',
  'warranty': None,
  'color': None,
  'weight': None,
  'dimensions': None,
  'battery': None,
  'error': False},
 {'brand': 'ASUS',
  'model': 'ROG Zephyrus G16',
  'current_price': 85990,
  'original_price': None,
  'seller': None,
  'cpu': 'Intel Core Ultra 9-185H',
  'gpu': 'NVID