In [2]:
import sys
import asyncio
import os
import json

# Make the parent directory importable so the `src` package imported below
# resolves; the entry is appended only once, even if this cell is re-run.
_PARENT_DIR = '..'
if _PARENT_DIR not in sys.path:
    sys.path.append(_PARENT_DIR)
from src.crawler.news.real.VnExpressCrawler import VnExpressCrawler

async def main():
    """Run the VnExpressCrawler demo steps one after another.

    Steps, in order:
      1. ``arun`` with ``save_to_file=True`` and ``save_format=".json"``.
      2. ``arun`` with ``save_to_file=True`` and ``save_format=".jpg"``.
      3. ``extract_with_schema``; the first two results are printed as JSON.
      4. ``arun`` in ``mode="deep"`` on a single article URL.
      5. ``deep_crawl_with_config`` starting from the site home page.

    Returns nothing; progress is reported with ``print``. Exceptions raised
    by any crawler call are not handled here, so a failure in one step
    aborts the remaining steps and propagates to the caller.
    """
    # Instantiate the crawler
    crawler = VnExpressCrawler()

    # --- Demo saving files ---
    print("--- Testing file saving ---")
    # Save as JSON
    print("Saving as JSON...")
    await crawler.arun(save_to_file=True, save_format=".json")
    print("Done.")

    # Save all images as jpg
    print("\nSaving all images as .jpg...")
    await crawler.arun(save_to_file=True, save_format=".jpg")
    print("Done.")

    # --- Demo structured data extraction ---
    print("\n--- Testing structured data extraction ---")
    articles = await crawler.extract_with_schema()
    if articles:
        print("Extracted articles:")
        # Pretty-print only the first two articles. ensure_ascii=False keeps
        # non-ASCII characters (the article text is presumably Vietnamese)
        # readable instead of emitting \uXXXX escape sequences.
        print(json.dumps(articles[:2], indent=2, ensure_ascii=False))
    else:
        print("No articles extracted.")

    # --- Demo deep crawling a sub-url ---
    print("\n--- Testing deep crawling a sub-url ---")
    sub_url = "https://vnexpress.net/thu-tuong-xu-nghiem-can-bo-thieu-trach-nhiem-sai-pham-trong-giai-ngan-dau-tu-cong-4952952.html"
    print(f"Crawling {sub_url} with max_depth=1, max_pages=5...")
    # NOTE(review): the message above mentions max_pages=5, but no max_pages
    # argument is passed to arun here -- confirm whether arun applies that
    # limit by default or whether the argument is missing.
    await crawler.arun(url=sub_url, mode="deep", max_depth=1, save_to_file=True, save_format=".json")
    print("Done.")

    # --- Demo deep crawling with config ---
    print("\n--- Testing deep crawling with config ---")
    print("Crawling with max_depth=1, max_pages=5, and css_selector='article.item-news'")
    await crawler.deep_crawl_with_config(
        start_url="https://vnexpress.net/",
        max_depth=1,
        max_pages=5,
        css_selector="article.item-news"
    )
    print("Done.")

# Run the async main function.
# nest_asyncio is optional: it is only needed when an event loop is already
# running (as in a notebook). Import it separately so that a missing package
# is not reported as a crawler failure.
try:
    import nest_asyncio
except ImportError:
    nest_asyncio = None

try:
    if nest_asyncio is not None:
        # Patch asyncio so run_until_complete can be nested inside the
        # already-running notebook loop.
        nest_asyncio.apply()
        asyncio.get_event_loop().run_until_complete(main())
    else:
        # Without nest_asyncio, fall back to the plain entry point; this
        # works when no event loop is running (e.g. a regular script).
        asyncio.run(main())
except Exception as e:
    import traceback

    print('Failed to run crawler in notebook environment:', e)
    print('If running outside notebook, use: asyncio.run(main())')
    # The short message alone (e.g. "Invalid expression") does not show
    # which crawl step failed, so print the full traceback as well.
    traceback.print_exc()

--- Testing file saving ---
Saving as JSON...


Failed to run crawler in notebook environment: Invalid expression
If running outside notebook, use: asyncio.run(main())
