# Crawl4AI with local LLMs

In [6]:
# imports
%run "../src/utils.py"

In [None]:
# Crawl4AI health check: run the `crawl4ai-doctor` command-line tool in a
# subshell (IPython `!` syntax) to confirm the installation can fetch and
# scrape a page before any of the examples below are executed.
!crawl4ai-doctor

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Running Crawl4AI health check[0m[36m...[0m[36m [0m
[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m6[0m[36m [0m
[1;36m[[0m[36mTEST[0m[1;36m][0m[36m...[0m[36m. ℹ Testing crawling capabilities[0m[36m...[0m[36m [0m
[1;36m[[0m[36mEXPORT[0m[1;36m][0m[36m.. ℹ Exporting media [0m[1;36m([0m[36mPDF/MHTML/screenshot[0m[1;36m)[0m[36m took [0m[1;36m0.[0m[36m38s [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m[4;32mhttps://crawl4ai.com[0m[32m                                                                    [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m3.[0m[32m91s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m[4;32mhttps://crawl4ai.com[0m[32m                                                                    [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m02s [0m
[1;32m[[0m[32mCOMPLETE[0m[

In [7]:
# Only needed when running inside a notebook: the kernel already has an event
# loop running, and the cells below call asyncio.run(main()). Applying
# nest_asyncio allows that nested use of the loop.
# NOTE(review): `nest_asyncio` is presumably imported by ../src/utils.py — confirm.
nest_asyncio.apply()

## Basic web crawling

Example provided in the docs:

In [None]:
async def main():
    """Fetch the Scrape This Site index page and print its markdown.

    The crawl uses a headless browser and passes ``CacheMode.BYPASS`` as the
    cache mode for the run.
    """
    headless_browser = BrowserConfig(headless=True)
    uncached_run = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        page = await crawler.arun(
            url="https://www.scrapethissite.com/pages/",
            config=uncached_run,
        )
        # `markdown` is the crawl result's markdown rendering of the page.
        print(page.markdown)


if __name__ == "__main__":
    asyncio.run(main())

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m6[0m[36m [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m[4;32mhttps://www.scrapethissite.com/pages/[0m[32m                              [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m3.[0m[32m15s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m[4;32mhttps://www.scrapethissite.com/pages/[0m[32m                              [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m01s [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m[4;32mhttps://www.scrapethissite.com/pages/[0m[32m                              [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m3.[0m[32m16s [0m
  * [ ![](https://www.scrapethissite.com/static/images/scraper-icon.png) Scrape This Site ](https://www.scrapethissite.com/)
  * [ ](https://www.scrapethissite.com/pages/)
  * [ ](https://www.scrapethissite.com/lessons/)
  * [ ](https://www.scrapethissite.com/f

## Use with local LLM

In [None]:
# Download your preferred model from a terminal before running the cells below, e.g.:
# ollama pull qwen2.5:3b

In [None]:
# Schema for one extracted entry. Documented with comments rather than a
# docstring so the JSON schema handed to the LLM is unchanged (pydantic copies
# a model docstring into the schema's "description").
class Product(BaseModel):
    name: str         # title of the listed page
    description: str  # descriptive text that accompanies the title


async def main():
    """Crawl the Scrape This Site index and extract entries with a local LLM.

    Returns:
        The parsed JSON produced by the extraction strategy, or ``None`` when
        the crawl did not succeed (the error message is printed instead).
    """
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="ollama/qwen2.5:3b", api_token=None),
        schema=Product.model_json_schema(),
        extraction_type="schema",
        # The example object must be valid JSON and use the same keys as the
        # Product schema ("name", "description"); otherwise the prompt and the
        # schema give the model conflicting targets.
        instruction=""" 
        From the crawled content
        extract the titles and the descriptions in JSON format like this:
        {"name": "title text", "description": "description text"}
        """,
        chunk_token_threshold=1000,
        overlap_rate=0.0,
        apply_chunking=False,
        input_format="markdown",   # or "html", "fit_markdown"
        extra_args={"temperature": 0.0, "max_tokens": 500}
    )

    # 2. Build the crawler config
    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS
    )

    # 3. Create a browser config if needed
    browser_cfg = BrowserConfig(
        headless=True,
        text_mode=True,
        light_mode=True
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # 4. Crawl a single page
        result = await crawler.arun(
            url="https://www.scrapethissite.com/pages/",
            config=crawl_config
        )

        # Bail out early on failure so `data` is never referenced unbound.
        if not result.success:
            print("Error:", result.error_message)
            return None

        # 5. The extracted content is presumably JSON
        data = json.loads(result.extracted_content)
        print("Extracted items:", data)

        # 6. Show usage stats
        llm_strategy.show_usage()  # prints token usage

        return data


if __name__ == "__main__":
    asyncio.run(main())


[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m6[0m[36m [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m[4;32mhttps://www.scrapethissite.com/pages/[0m[32m                              [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m1.[0m[32m80s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m[4;32mhttps://www.scrapethissite.com/pages/[0m[32m                              [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m00s [0m
[1;32m[[0m[32mEXTRACT[0m[1;32m][0m[32m. ■ [0m[4;32mhttps://www.scrapethissite.com/pages/[0m[32m                              [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m6.[0m[32m25s [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m[4;32mhttps://www.scrapethissite.com/pages/[0m[32m                              [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m8.[0m[32m06s [0m
Extracted items: [{'name': 'Countries of the World: A Sim

Extracted items: [{'name': 'Countries of the World: A Simple Example', 'description': 'A single page that lists information about all the countries in the world. Good for those just get started with web scraping.'}, {'name': 'Hockey Teams: Forms, Searching and Pagination', 'description': 'Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.'}, {'name': 'Oscar Winning Films: AJAX and Javascript', 'description': 'Click through a bunch of great films. Learn how content is added to the page asynchronously with Javascript and how you can scrape it.'}, {'name': 'Turtles All the Way Down: Frames & iFrames', 'description': 'Some older sites might still use frames to break up thier pages. Modern ones might be using iFrames to expose data. Learn about turtles as you scrape content inside frames.'}, {'name': "Advanced Topics: Real World Challenges You'll Encounter", 'description': "Scraping real websites, you're likely run into a number of common gotchas. Get practice with spoofing headers, handling logins & session cookies, finding CSRF tokens, and other common network errors."}]

## Real website

Let's try with a real-world website.

In [11]:

# Fields the LLM is asked to fill in for the product page.
class Product(BaseModel):
    name: str
    price: str


async def main():
    """Crawl a Micro Center product page and print the LLM-extracted name and price.

    On success the parsed extraction result and the strategy's usage report
    are printed; otherwise the crawl's error message is printed.
    """
    # Prompt text, spelled out line by line (leading whitespace included).
    extraction_prompt = (
        " \n"
        "        From the crawled content\n"
        "        extract the name and the price in JSON format like this:\n"
        '        {"name": "product name", "price": "price value"}\n'
        "        "
    )
    local_model = LLMConfig(provider="ollama/qwen2.5:3b", api_token=None)
    generation_args = {"temperature": 0.0, "max_tokens": 1000}

    # Schema-driven extraction over the page's markdown, without chunking.
    llm_strategy = LLMExtractionStrategy(
        llm_config=local_model,
        schema=Product.model_json_schema(),
        extraction_type="schema",
        instruction=extraction_prompt,
        chunk_token_threshold=500,
        overlap_rate=0.0,
        apply_chunking=False,
        input_format="markdown",
        extra_args=generation_args,
    )

    run_settings = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS,
    )
    browser_settings = BrowserConfig(headless=True, text_mode=True, light_mode=True)

    product_url = (
        "https://www.microcenter.com/product/670842/"
        "intel-core-i7-14700k-raptor-lake-s-refresh-34ghz-twenty-core-lga-1700-"
        "boxed-processor-heatsink-not-included"
    )

    async with AsyncWebCrawler(config=browser_settings) as crawler:
        outcome = await crawler.arun(url=product_url, config=run_settings)

        if not outcome.success:
            print("Error:", outcome.error_message)
            return

        # The extraction strategy hands back its result as a JSON string.
        parsed = json.loads(outcome.extracted_content)
        print("Extracted item:", parsed)
        llm_strategy.show_usage()  # token usage report


if __name__ == "__main__":
    asyncio.run(main())

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m6[0m[36m [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m
[4;32mhttps://www.microcenter.com/product/670842/intel...e-lga-1700-boxed-processor-he[0m
[4;32matsink-not-included[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m1.[0m[32m57s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m
[4;32mhttps://www.microcenter.com/product/670842/intel...e-lga-1700-boxed-processor-he[0m
[4;32matsink-not-included[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m00s [0m
[1;32m[[0m[32mEXTRACT[0m[1;32m][0m[32m. ■ [0m
[4;32mhttps://www.microcenter.com/product/670842/intel...e-lga-1700-boxed-processor-he[0m
[4;32matsink-not-included[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m1.[0m[32m53s [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m
[4;32mhttps://www.microcenter.com/product/670842/intel...e-lga-1700-boxed-processor-he[0m
[4;

---

## Limitations with complex websites

In [12]:

# Name/price pair requested from the product page.
class Product(BaseModel):
    name: str
    price: str


async def main():
    """Run the name/price extraction against an Amazon product page.

    Prints the parsed extraction result and the usage report when the crawl
    succeeds, or the crawl's error message when it does not.
    """
    prompt_lines = [
        " ",
        "        From the crawled content",
        "        extract the name and the price in JSON format like this:",
        '        {"name": "product name", "price": "price value"}',
        "        ",
    ]
    strategy_options = {
        "llm_config": LLMConfig(provider="ollama/qwen2.5:3b", api_token=None),
        "schema": Product.model_json_schema(),
        "extraction_type": "schema",
        "instruction": "\n".join(prompt_lines),
        "chunk_token_threshold": 500,
        "overlap_rate": 0.0,
        "apply_chunking": False,
        "input_format": "markdown",  # alternatives: "html", "fit_markdown"
        "extra_args": {"temperature": 0.0, "max_tokens": 1000},
    }
    llm_strategy = LLMExtractionStrategy(**strategy_options)

    # Per-run settings: attach the strategy and pass BYPASS as the cache mode.
    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy, cache_mode=CacheMode.BYPASS
    )

    browser_options = {"headless": True, "text_mode": True, "light_mode": True}
    browser_cfg = BrowserConfig(**browser_options)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        response = await crawler.arun(
            url="https://www.amazon.com/Bose-Cancelling-Wireless-Bluetooth-Headphones/dp/B07Q9MJKBV/ref=sr_1_1?sr=8-1",
            config=crawl_config,
        )

        if not response.success:
            print("Error:", response.error_message)
        else:
            items = json.loads(response.extracted_content)
            print("Extracted item:", items)
            llm_strategy.show_usage()  # prints token usage


if __name__ == "__main__":
    asyncio.run(main())

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m6[0m[36m [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m
[4;32mhttps://www.amazon.com/Bose-Cancelling-Wireless-Bluetooth-Headphones/dp/B07Q9MJK[0m
[4;32mBV/[0m[4;32mref[0m[4;32m=[0m[4;32msr_1_1[0m[4;32m?[0m[4;32msr[0m[4;32m=[0m[4;32m8[0m[4;32m-1[0m[32m | [0m[32m✓[0m[32m | ⏱: [0m[1;32m4.[0m[32m86s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m
[4;32mhttps://www.amazon.com/Bose-Cancelling-Wireless-Bluetooth-Headphones/dp/B07Q9MJK[0m
[4;32mBV/[0m[4;32mref[0m[4;32m=[0m[4;32msr_1_1[0m[4;32m?[0m[4;32msr[0m[4;32m=[0m[4;32m8[0m[4;32m-1[0m[32m | [0m[32m✓[0m[32m | ⏱: [0m[1;33m0.[0m[33m25[0m[32ms [0m
[1;32m[[0m[32mEXTRACT[0m[1;32m][0m[32m. ■ [0m
[4;32mhttps://www.amazon.com/Bose-Cancelling-Wireless-Bluetooth-Headphones/dp/B07Q9MJK[0m
[4;32mBV/[0m[4;32mref[0m[4;32m=[0m[4;32msr_1_1[0m[4

Extracted item: [{'index': 0, 'error': True, 'tags': ['error'], 'content': ['{\n  {\n    "name": "Bose Cancelling Wireless Bluetooth Headphones",\n    "price": "$249.00"\n  }\n}']}]


## Where LLM shines

Constantly updating websites.

In [13]:
# Single free-text field that holds the generated summary.
class Product(BaseModel):
    summary: str


async def main():
    """Summarise the Harvard Extension CS master's programme page with a local LLM.

    Prints the parsed extraction result and the strategy's usage report on
    success, or the crawl's error message on failure.
    """
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="ollama/qwen2.5:3b", api_token=None),
        schema=Product.model_json_schema(),
        extraction_type="schema",
        # "admission" was misspelled in the prompt; it is the keyword for one
        # of the three topics the summary is asked to cover.
        instruction=""" 
        From the crawled content make a summary including the program description, courses and admission dates
        """,
        chunk_token_threshold=500,
        overlap_rate=0.0,
        apply_chunking=False,
        input_format="markdown",   # or "html", "fit_markdown"
        extra_args={"temperature": 0.0, "max_tokens": 1000}
    )

    # 2. Build the crawler config
    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS
    )

    # 3. Create a browser config if needed
    browser_cfg = BrowserConfig(
        headless=True,
        text_mode=True,
        light_mode=True
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # 4. Crawl a single page
        result = await crawler.arun(
            url="https://extension.harvard.edu/academics/programs/computer-science-masters-degree-program/#program-overview",
            config=crawl_config
        )

        if result.success:
            # 5. The extracted content
            data = json.loads(result.extracted_content)
            print("Extracted item:", data)

            # 6. Show usage stats
            llm_strategy.show_usage()  # prints token usage
        else:
            print("Error:", result.error_message)


if __name__ == "__main__":
    asyncio.run(main())

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m6[0m[36m [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;32mm/#program-overview[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m2.[0m[32m43s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;32mm/#program-overview[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m05s [0m
[1;32m[[0m[32mEXTRACT[0m[1;32m][0m[32m. ■ [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;32mm/#program-overview[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m6.[0m[32m01s [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;

Extracted item: [{'summary': 'The Computer Science Master’s Degree Program at Harvard Extension School is an advanced degree program that can be completed in 2-5 years depending on the pace and number of courses taken each semester. The program offers year-round study, allowing students to take courses in fall, January, spring, and summer. Eligible students receive grant funds to cover a portion of tuition costs each term, in addition to federal financial aid options. Students can explore courses at https://courses.dce.harvard.edu/ today.'}]


Structure the desired information

In [None]:
# One string field per page section requested in the prompt.
class Product(BaseModel):
    program: str
    courses: str
    admission: str


async def main():
    """Extract programme, courses and admission text from the Harvard page.

    Prints the parsed extraction result and the usage report when the crawl
    succeeds; prints the crawl's error message otherwise.
    """
    page_url = (
        "https://extension.harvard.edu/academics/programs/"
        "computer-science-masters-degree-program/#program-overview"
    )

    # Prompt text, written out one line per literal (indentation included).
    section_prompt = (
        " \n"
        "        From the crawled content\n"
        "        extract the program overview, the courses and the admission sections in JSON format like this:\n"
        '        {"program": "program overview", "courses": "courses", "admission":"admission"}\n'
        "        "
    )

    ollama_model = LLMConfig(provider="ollama/qwen2.5:3b", api_token=None)
    llm_strategy = LLMExtractionStrategy(
        llm_config=ollama_model,
        schema=Product.model_json_schema(),
        extraction_type="schema",
        instruction=section_prompt,
        chunk_token_threshold=500,
        overlap_rate=0.0,
        apply_chunking=False,
        input_format="markdown",  # "html" and "fit_markdown" are the alternatives
        extra_args={"temperature": 0.0, "max_tokens": 1000},
    )

    run_cfg = CrawlerRunConfig(extraction_strategy=llm_strategy, cache_mode=CacheMode.BYPASS)
    browser_cfg = BrowserConfig(headless=True, text_mode=True, light_mode=True)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        crawl = await crawler.arun(url=page_url, config=run_cfg)

        if not crawl.success:
            print("Error:", crawl.error_message)
            return

        sections = json.loads(crawl.extracted_content)
        print("Extracted item:", sections)
        llm_strategy.show_usage()  # token usage report


if __name__ == "__main__":
    asyncio.run(main())

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.7[0m[36m.[0m[1;36m6[0m[36m [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;32mm/#program-overview[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m1.[0m[32m71s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;32mm/#program-overview[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m0.[0m[32m07s [0m
[1;32m[[0m[32mEXTRACT[0m[1;32m][0m[32m. ■ [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;32mm/#program-overview[0m[32m  | [0m[32m✓[0m[32m | ⏱: [0m[1;32m5.[0m[32m15s [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m
[4;32mhttps://extension.harvard.edu/academics/programs...science-masters-degree-progra[0m
[4;

Extracted item: [{'program': "The program offers a Master's degree in Computer Science that can be pursued online, during evenings, or at your own pace. The program is designed for lifelong learners from high school to retirement age.", 'courses': 'Courses are available on various topics such as Artificial Intelligence, Cybersecurity, Data Science, and Programming. Students have the option to explore these courses in a variety of formats including online, evening classes, and self-paced learning.', 'admission': "To be admitted into the program, applicants should ideally possess a bachelor's degree from an accredited institution, proficiency in programming languages such as Java, Python, or C++, and some work experience in a technical field. Additionally, they must have excellent problem-solving skills, attention to detail, and critical thinking abilities.", 'error': False}]
