In [1]:
import os
import asyncio
import json
import nest_asyncio
import dotenv
dotenv.load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

nest_asyncio.apply()

In [15]:
import newspaper
from newspaper import Config

# 1. Create a configuration object
config = Config()
# 2. Tell it to allow "binary" urls (bypassing the bad check)
config.allow_binary_content = True 

urls = [
    "https://developers.llamaindex.ai/python/framework/understanding/",
    "https://developers.llamaindex.ai/python/framework/understanding/using_llms/",
    "https://developers.llamaindex.ai/python/framework/understanding/rag/indexing/",
    "https://developers.llamaindex.ai/python/framework/understanding/rag/querying/",
]   


pages_content = []

# Retrieve the Content
for url in urls:
    try:
        article = newspaper.Article(url, config=config)
        article.download()
        article.parse()
        if len(article.text) > 0:
            pages_content.append(
                {"url": url, "title": article.title, "text": article.text}
            )
    except:
        print(f"Failed to retrieve content from {url}")
        continue

print(pages_content[0])
print(len(pages_content))

{'url': 'https://developers.llamaindex.ai/python/framework/understanding/', 'title': 'Building an LLM application', 'text': 'Using LLMs: hit the ground running by getting started working with LLMs. We’ll show you how to use any of our dozens of supported LLMs, whether via remote API calls or running locally on your machine.\n\nBuilding agents: agents are LLM-powered knowledge workers that can interact with the world via a set of tools. Those tools can retrieve information (such as RAG, see below) or take action. This tutorial includes:\n\nBuilding a single agent: We show you how to build a simple agent that can interact with the world via a set of tools.\n\nUsing existing tools: LlamaIndex provides a registry of pre-built agent tools at LlamaHub that you can incorporate into your agents.\n\nMaintaining state: agents can maintain state, which is important for building more complex applications.\n\nStreaming output and events: providing visibility and feedback to the user is important, a

In [16]:
pages_content

[{'url': 'https://developers.llamaindex.ai/python/framework/understanding/',
  'title': 'Building an LLM application',
  'text': 'Using LLMs: hit the ground running by getting started working with LLMs. We’ll show you how to use any of our dozens of supported LLMs, whether via remote API calls or running locally on your machine.\n\nBuilding agents: agents are LLM-powered knowledge workers that can interact with the world via a set of tools. Those tools can retrieve information (such as RAG, see below) or take action. This tutorial includes:\n\nBuilding a single agent: We show you how to build a simple agent that can interact with the world via a set of tools.\n\nUsing existing tools: LlamaIndex provides a registry of pre-built agent tools at LlamaHub that you can incorporate into your agents.\n\nMaintaining state: agents can maintain state, which is important for building more complex applications.\n\nStreaming output and events: providing visibility and feedback to the user is importa

Convert that into the Document so Llama-index understand it

In [17]:
# Convert to Document
from llama_index.core.schema import Document

documents = [
    Document(text=row["text"], metadata={"title": row["title"], "url": row["url"]})
    for row in pages_content
]


## Now we will use the **crawl4ai** for scrapt the page

In [22]:
# from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# urls_to_crawl = [
#     "https://docs.llamaindex.ai/en/stable/understanding/",
# ]

# def crawl_sync():
#     async def crawl_with_crawl4ai():
#         config = CrawlerRunConfig(
#             cache_mode=CacheMode.BYPASS,
#             page_timeout=80000,
#             word_count_threshold=50
#         )

#         data_res = {"data": []}

#         async with AsyncWebCrawler() as crawler:
#             results = await crawler.arun(
#                 urls=urls_to_crawl,
#                 run_config=config
#             ) 

#             for result in results:
#                 if result.success:
#                     title = result.metadata.get("title" , "")

#                     if not title and result.markdown:
#                         lines = result.markdown.raw_markdown.split("\n")
#                         for line in lines:
#                             if line.startswith("#"):
#                                 title = line.lstrip("#").strip()
#                                 break

#                     data_res["data"].append({
#                         "text": result.markdown.raw.markdown if result.markdown else "",
#                         "url": result.url,
#                         "meta": {
#                             "meta": {
#                                 "title": title
#                             }
#                         }
#                     })
#         return data_res

    
#     # Handle async execution
#     nest_asyncio.apply()
#     loop = asyncio.get_event_loop()
#     data = loop.run_until_complete(crawl_with_crawl4ai())
#     return data

# data_res = crawl_sync()

# # Print results (same format as before)
# print("URL:", data_res["data"][0]["meta"]["url"])
# print("Title:", data_res["data"][0]["meta"]["meta"]["title"])
# print("Content:", data_res["data"][0]["text"][0:500], "...")





In [2]:
import asyncio
import sys
import nest_asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# --- FIX 1: Windows Subprocess Support ---
# This must run before any async code
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# --- FIX 2: Apply Nest Asyncio Globally ---
# This allows us to run "sync" wrappers inside a Notebook
nest_asyncio.apply()

urls_to_crawl = [
    "https://docs.llamaindex.ai/en/stable/understanding/",
]

def crawl_sync():
    async def crawl_with_crawl4ai():
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            page_timeout=80000,
            word_count_threshold=50
        )

        data_res = {"data": []}

        async with AsyncWebCrawler() as crawler:
            results = await crawler.arun_many(
                urls_to_crawl, 
                config=config
            )

            for result in results:
                if result.success:
                    # Logic to find title if missing
                    title = result.metadata.get("title", "")
                    if not title and result.markdown:
                        lines = result.markdown.raw_markdown.split('\n')
                        for line in lines:
                            if line.startswith('#'):
                                title = line.strip('#').strip()
                                break
                    
                    data_res["data"].append({
                        "text": result.markdown.raw_markdown if result.markdown else "",
                        "meta": {
                            "url": result.url,
                            "meta": {
                                "title": title
                            }
                        }
                    })
        return data_res

    # --- FIX 3: Re-use the Existing Loop ---
    # Instead of creating a new loop, we ask Jupyter for the current one
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(crawl_with_crawl4ai())

# Run the crawler
data_res = crawl_sync()

# Print results
if data_res["data"]:
    print("URL:", data_res["data"][0]["meta"]["url"])
    print("Title:", data_res["data"][0]["meta"]["meta"]["title"])
    print("Content Sample:", data_res["data"][0]["text"][0:200], "...")
else:
    print("No data found.")

Task exception was never retrieved
future: <Task finished name='Task-5' coro=<Connection.run() done, defined at e:\Extra Learning Resources\Extra Learning Resources\NLP\RAG\Practice Learning\venv_rag\Lib\site-packages\playwright\_impl\_connection.py:305> exception=NotImplementedError()>
Traceback (most recent call last):
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py", line 314, in __step_run_and_handle_result
    result = coro.send(None)
             ^^^^^^^^^^^^^^^
  File "e:\Extra Learning Resources\Extra Learning Resources\NLP\RAG\Practice Learning\venv_rag\Lib\site-packages\playwright\_impl\_connection.py", line 312, in run
    await self._transport.connect()
  File "e:\Extra Learning Resources\Extra Learning Resources\NLP\RAG\Practice Learning\venv_rag\Lib\site-packages\playwright\_impl\_transport.py", line 133, in connect
    raise exc
  File "e:\Extra Learning Resources\Extra Learning Resources\NLP\RAG\Practice Learning\venv_rag\Lib\site-pack

NotImplementedError: 

In [2]:
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Create an instance of AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        # Run the crawler on a URL
        result = await crawler.arun(url="https://crawl4ai.com")

        # Print the extracted content
        print(result.markdown)

# Run the async main function, adapting for Colab environment
await main()

NotImplementedError: 