# Crawl outline

1. Import Library — import required modules and verify availability.
2. Analyze URL — parse and validate the target URL (HEAD check optional).
3. Define function — implement an async wrapper that runs the crawler and returns results.
4. Run Crawl — execute the crawler for a sample URL and print/save the output.

In [12]:
# Step 1: Import Library
# asyncio is imported here for the later run step (not used in the cells shown so far).
import asyncio
# urlparse splits the target URL into its components in Step 2.
from urllib.parse import urlparse
# crawl4ai is a third-party package; these names are used by simple_crawl in Step 3.
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, JsonCssExtractionStrategy

# Reached only if every import above succeeded.
print('Imports OK')

Imports OK


In [13]:
# Step 2: Analyze URL
# Break the target URL into its components and echo both forms for inspection.
url = 'https://www.example.com/sample-article'  # replace with your target URL
parsed = urlparse(url)
print('URL:', url)
print('Parsed:', parsed)

# Optional: quick HEAD check (requires requests).
# This is a best-effort probe: a missing `requests` package or a network
# failure is reported and otherwise ignored so the notebook keeps running.
try:
    import requests
    r = requests.head(url, timeout=5)
except Exception as e:
    print('HEAD check failed (network may be blocked or requests missing):', e)
else:
    print('HEAD status_code:', r.status_code)

URL: https://www.example.com/sample-article
Parsed: ParseResult(scheme='https', netloc='www.example.com', path='/sample-article', params='', query='', fragment='')
HEAD status_code: 404


In [14]:
# Step 3: Define function (no top-level run here)
async def simple_crawl(url, headless=True, timeout=30):
    """Run a minimal AsyncWebCrawler on the given URL and return the result object.

    Adjust the extraction schema/strategy for your target pages as needed.

    Args:
        url: Address of the page to crawl.
        headless: Passed to ``BrowserConfig``; ``True`` runs the browser
            without a visible window.
        timeout: Page timeout in seconds. It is converted to milliseconds and
            forwarded as ``CrawlerRunConfig(page_timeout=...)``. Pass ``None``
            to keep the library's default timeout.

    Returns:
        The object returned by ``AsyncWebCrawler.arun`` for ``url``.
    """
    # The base element is the document root rather than <body>: field selectors
    # are resolved relative to the base element, and <title> lives in <head>,
    # so 'head > title' could never match underneath 'body'.
    schema = {
        'name': 'example',
        'baseSelector': 'html',
        'fields': [
            {'name': 'title', 'selector': 'head > title', 'type': 'text'},
            {'name': 'content', 'selector': 'article', 'type': 'text'},
        ]
    }
    extraction_strategy = JsonCssExtractionStrategy(schema)
    browser_cfg = BrowserConfig(headless=headless)

    run_kwargs = {
        'extraction_strategy': extraction_strategy,
        # BYPASS forces a fresh fetch instead of serving a cached copy.
        'cache_mode': CacheMode.BYPASS,
    }
    if timeout is not None:
        # Honor the caller's timeout (previously accepted but ignored);
        # crawl4ai expects page_timeout in milliseconds.
        run_kwargs['page_timeout'] = int(timeout * 1000)
    run_cfg = CrawlerRunConfig(**run_kwargs)

    # The context manager starts the browser and closes it again on exit,
    # including when arun raises.
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url=url, config=run_cfg)
    return result