In [1]:
import os
import asyncio
import json
import nest_asyncio
import dotenv
# Load variables from a local .env file (if present) into the process environment.
dotenv.load_dotenv()

# API credentials are looked up in the environment; a missing variable yields None.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

# Patch asyncio so that nested event loops are allowed inside the notebook kernel.
nest_asyncio.apply()


## Scrape vs. Crawl
The `mode` parameter passed when initializing `FireCrawlWebReader` specifies how the web content should be processed. There are two main modes, crawl and scrape, which are related but distinct processes in web data extraction:

* Crawling (crawl) involves systematically navigating through a website and following links to discover and collect data from multiple pages. It’s ideal for gathering comprehensive datasets from an entire site or a section of it. For example, crawling can retrieve all blog posts from a website.
* Scraping (scrape) focuses on extracting specific data from a single webpage or a limited set of pages. It’s best for targeted data collection, such as pulling the title and description from a single product page. 

In short, crawling is about exploring multiple pages, while scraping is about extracting data from specific pages. Choosing the appropriate mode depends on your specific use case and the amount of data you need to extract from the target website.

In [None]:
from llama_index.readers.web import FireCrawlWebReader

# Firecrawl scrape formats (https://docs.firecrawl.dev/features/scrape#scrape-formats):
#   * "markdown" - the content of the webpage converted into Markdown syntax
#                  (e.g., # Title, **bold**, [link](url)).
#   * "html"     - the raw HTML content of the webpage.
# Only "markdown" is requested here. "html" can also be used, but it ran into a
# max-chunk issue; it could be added if the embedding model supports larger chunks.
reader = FireCrawlWebReader(
    mode="scrape",
    params=dict(formats=["markdown"]),
    api_key=FIRECRAWL_API_KEY,
)

# Scrape the single landing page and keep the resulting documents for indexing.
documents = reader.load_data(url="https://towardsai.net/")

In [17]:
# Display the first loaded document, including the full metadata returned by the scrape.
documents[0]

Document(id_='a709554a-0c3e-471b-8b92-f20909f1d800', embedding=None, metadata={'title': 'Towards AI', 'description': 'Master AI with Towards AI. We offer practical AI Engineering courses, corporate AI bootcamps, and LLM development consultancy for individuals and companies.', 'url': 'https://towardsai.net/', 'language': 'en-US', 'keywords': None, 'robots': 'max-image-preview:large', 'og_title': 'Towards AI', 'og_description': 'Master AI with Towards AI. We offer practical AI Engineering courses, corporate AI bootcamps, and LLM development consultancy for individuals and companies.', 'og_url': 'https://towardsai.net/', 'og_image': 'https://towardsai.net/wp-content/uploads/2024/09/towards-ai-og-graph.jpg', 'og_audio': None, 'og_determiner': None, 'og_locale': None, 'og_locale_alternate': None, 'og_site_name': None, 'og_video': None, 'favicon': 'https://b3688296.smushcdn.com/3688296/wp-content/uploads/2019/05/cropped-towards-ai-square-circle-png-32x32.png?lossy=0&strip=1&webp=1', 'dc_term

> I ran into an issue where the metadata became larger than the maximum context window of the embedding model.
I resolved this by removing irrelevant fields; in real production, we could instead use a different embedding model with a larger context window.

In [18]:
# Inspect the first document's metadata: list its keys, then report how many
# characters each value takes up once converted to a string.
doc = documents[0]
print("Keys in Metadata:", doc.metadata.keys())

for key in doc.metadata:
    value = doc.metadata[key]
    size = len(str(value))
    print(f"Key: {key} | Length: {size} characters")

Keys in Metadata: dict_keys(['title', 'description', 'url', 'language', 'keywords', 'robots', 'og_title', 'og_description', 'og_url', 'og_image', 'og_audio', 'og_determiner', 'og_locale', 'og_locale_alternate', 'og_site_name', 'og_video', 'favicon', 'dc_terms_created', 'dc_date_created', 'dc_date', 'dc_terms_type', 'dc_type', 'dc_terms_audience', 'dc_terms_subject', 'dc_subject', 'dc_description', 'dc_terms_keywords', 'modified_time', 'published_time', 'article_tag', 'article_section', 'source_url', 'status_code', 'scrape_id', 'num_pages', 'content_type', 'proxy_used', 'timezone', 'cache_state', 'cached_at', 'credits_used', 'concurrency_limited', 'concurrency_queue_duration_ms', 'error', 'generator', 'fb:app_id', 'og:url', 'foundingDate', 'twitter:site', 'bestRating', 'viewport', 'og:image', 'msapplication-TileColor', 'og:title', 'worstRating', 'twitter:description', 'twitter:image', 'twitter:title', 'twitter:card', 'msapplication-TileImage', 'theme-color', 'target', 'og:description', 

In [20]:
# Metadata fields to keep on every document; every other key is dropped so the
# metadata attached to each chunk stays small.
# NOTE(review): 'sourceURL' does not appear among the metadata keys printed in the
# previous cell (the reader exposes 'source_url'), so that entry currently matches
# nothing. It is left in place so the retained fields stay exactly as before.
to_keep = {'url', 'title', 'description', 'sourceURL'}

# Loop through all documents and remove heavy fields
for document in documents:
    # Overwrite the metadata with a clean dictionary holding only the whitelisted keys
    document.metadata = {k: v for k, v in document.metadata.items() if k in to_keep}

# Check if it worked
print(documents[0].metadata)

{'title': 'Towards AI', 'description': 'Master AI with Towards AI. We offer practical AI Engineering courses, corporate AI bootcamps, and LLM development consultancy for individuals and companies.', 'url': 'https://towardsai.net/'}


In [21]:
# Display the full list of documents after the metadata cleanup.
documents

[Document(id_='a709554a-0c3e-471b-8b92-f20909f1d800', embedding=None, metadata={'title': 'Towards AI', 'description': 'Master AI with Towards AI. We offer practical AI Engineering courses, corporate AI bootcamps, and LLM development consultancy for individuals and companies.', 'url': 'https://towardsai.net/'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='- [Latest](https://towardsai.net/p/)\n- [Trends](https://medium.com/towards-artificial-intelligence/trending)\n- [Shop](https://gumroad.com/towardsai)\n\n[Google News](https://news.google.com/publications/CAAqBwgKMNiLmgswgpayAw)[Medium](https://pub.towardsai.net/)[Linkedin](https://linkedin.com/company/towards-artificial-intelligence)[Twitter](https://twitter.com/towards_ai?lang=en)[Facebook](https://www.facebook.com/towardsAl/)[Instagram](https://instagram.com/towards_ai/)[Gith

In [4]:
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.core import VectorStoreIndex
import google.genai.types as types

# Gemini generation settings: a thinking budget of 0, at most 1024 output tokens,
# and a sampling temperature of 0.6.
config = types.GenerateContentConfig(
    temperature=0.6,
    max_output_tokens=1024,
    thinking_config=types.ThinkingConfig(thinking_budget=0),
)

llm = GoogleGenAI(model="gemini-2.5-flash", generation_config=config)

# Register the global LlamaIndex defaults: LLM, embedding model and text splitter.
Settings.llm = llm
Settings.embed_model = CohereEmbedding(
    api_key=COHERE_API_KEY,
    model_name="embed-english-v3.0",
    input_type="search_document",
)
Settings.text_splitter = SentenceSplitter(chunk_overlap=50, chunk_size=384)

In [22]:
# Build a vector index over the scraped documents, showing progress bars while
# nodes are parsed and embeddings are generated.
index = VectorStoreIndex.from_documents(documents, show_progress=True)
# Create a query engine on top of the index using its default settings.
query_engine = index.as_query_engine()

Parsing nodes: 100%|██████████| 1/1 [00:12<00:00, 12.59s/it]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 17.27it/s]


In [24]:
# Ask a question against the index and print the synthesized answer.
res = query_engine.query("What is towards AI aim?")

print(res.response)

print("-----------------")
# List every retrieved node that backed the answer, together with its score.
for src in res.source_nodes:
    node_meta = src.metadata
    print("Node ID\t", src.node_id)
    print("Title\t", node_meta['title'])
    print("URL\t", node_meta['url'])
    print("Score\t", src.score)
    print("Description\t", node_meta.get("description"))
    print("-_" * 20)

Towards AI aims to help individuals and companies master AI through practical AI Engineering courses, corporate AI bootcamps, and LLM development consultancy. It also serves as a leading artificial intelligence and technology publication, read by thought-leaders and decision-makers globally.
-----------------
Node ID	 f6254025-da11-408c-ae9e-cb4d443d4637
Title	 Towards AI
URL	 https://towardsai.net/
Score	 0.5347926030189166
Description	 Master AI with Towards AI. We offer practical AI Engineering courses, corporate AI bootcamps, and LLM development consultancy for individuals and companies.
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 6097ac95-92dd-436f-963a-89a445fcb8ba
Title	 Towards AI
URL	 https://towardsai.net/
Score	 0.5275377338361159
Description	 Master AI with Towards AI. We offer practical AI Engineering courses, corporate AI bootcamps, and LLM development consultancy for individuals and companies.
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_


# CRAWL A WEBSITE

## Load The CSV

The CSV contains a list of tools and, for each one, the URL of the page we use to get information about that tool.

In [25]:
import requests
import csv

import io

# Google Sheets file URL (CSV export link)
url = 'https://docs.google.com/spreadsheets/d/1gHB-aQJGt9Nl3cyOP2GorAkBI_Us2AqkYnfqrmejStc/export?format=csv'

# Send a GET request to fetch the CSV file. The timeout stops the cell from
# hanging indefinitely if the connection stalls.
response = requests.get(url, timeout=30)

# Stays empty when the download fails, so later cells still find the name defined.
response_list = []
# Check if the request was successful
if response.status_code == 200:
    # Decode the content to a string
    content = response.content.decode('utf-8')

    # Parse from a file-like object created with newline='' rather than from
    # content.splitlines(): splitting the text first strips the line breaks that
    # may legitimately occur inside quoted CSV fields.
    csv_reader = csv.DictReader(io.StringIO(content, newline=''), delimiter=',')
    # One dict per CSV row, keyed by the header row.
    response_list = list(csv_reader)
else:
    print(f"Failed to retrieve the file: {response.status_code}")


In [26]:
import random

# Number of consecutive CSV rows (websites) to crawl.
num_websites = 10

# Pick a random window of `num_websites` consecutive rows. The latest allowed start
# is capped so the window always fits inside the list; with fewer rows than the
# window size the start is 0 and every available row is returned.
max_start = max(0, len(response_list) - num_websites)
start_index = random.randint(0, max_start)
website_list = response_list[start_index:start_index + num_websites]
website_list

[{'Name': 'Jax',
  'Tool Type': 'Library',
  'Parent': '',
  'Company': '',
  'Description': 'Autograd and XLA for high-performance machine learning research',
  'Category': 'Scientific Computing',
  'URL': 'https://jax.readthedocs.io/en/latest/',
  'Is a direct URL company /tool website?': 'Yes',
  '': ''},
 {'Name': 'Keras',
  'Tool Type': 'Library',
  'Parent': '',
  'Company': '',
  'Description': 'Open-source library that provides a Python interface for artificial neural networks',
  'Category': 'Model Development',
  'URL': 'https://keras.io/',
  'Is a direct URL company /tool website?': 'Yes',
  '': ''},
 {'Name': 'Koalas',
  'Tool Type': 'Library',
  'Parent': '',
  'Company': '',
  'Description': 'pandas API on Apache Spark',
  'Category': 'Big Data Processing',
  'URL': 'https://koalas.readthedocs.io/en/latest/',
  'Is a direct URL company /tool website?': 'Yes',
  '': ''},
 {'Name': 'Librosa',
  'Tool Type': 'Library',
  'Parent': '',
  'Company': '',
  'Description': 'Pytho

## Initialize Firecrawl

In [27]:
import os
from firecrawl import FirecrawlApp
# Firecrawl client used for the crawl requests, authenticated with the API key read from the environment.
app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)

In [31]:
import time

# Crawl websites and handle responses.
# url_response maps each crawled URL to its scraped pages plus the CSV row it came from.
url_response = {}
crawl_per_min = 1  # Max crawl per minute

# Track crawls
crawled_websites = 0
scraped_pages = 0

# Index of the final website; no pause is needed once it has been crawled.
last_index = len(website_list) - 1

for i, website_dict in enumerate(website_list):
    url = website_dict.get('URL')
    print(f"Crawling: {url}")

    try:
        response = app.crawl(
            url,
            limit=2,
            scrape_options={
                'formats': ['markdown', 'html']
            }
        )
    except Exception as exc:
        # Best effort: report the failure and move on to the next website.
        print(f"Failed to fetch {url} -> {exc}")
        continue

    crawled_websites += 1
    # Count the pages returned for this site (treating a missing result list as empty).
    scraped_pages += len(response.data or [])

    # Store the scraped data and associated info in the response dict
    url_response[url] = {
        "scraped_data": response.data,
        "csv_data": website_dict
    }

    # Pause to comply with the crawl-per-minute limit (1 crawl per minute on the
    # free version). Skipped after the last website, where waiting serves no purpose.
    if i != last_index and (i + 1) % crawl_per_min == 0:
        print("Pausing for 1 minute to comply with crawl limit...")
        time.sleep(60)  # Pause for 1 minute after every crawl


Crawling: https://jax.readthedocs.io/en/latest/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://keras.io/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://koalas.readthedocs.io/en/latest/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://librosa.org/doc/latest/tutorial.html
Pausing for 1 minute to comply with crawl limit...
Crawling: https://lime-ml.readthedocs.io/en/latest/#
Pausing for 1 minute to comply with crawl limit...
Crawling: https://luigi.readthedocs.io/en/stable/workflows.html
Pausing for 1 minute to comply with crawl limit...
Crawling: https://marshmallow.readthedocs.io/en/stable/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://matplotlib.org/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://mimesis.name/en/v13.1.0/
Pausing for 1 minute to comply with crawl limit...
Crawling: https://rasbt.github.io/mlxtend/


KeyboardInterrupt: 

In [None]:
from llama_index.core import Document
def _read_field(record, *names):
    """Return the first non-None value stored under any of ``names``.

    Works for plain dicts (key lookup) as well as attribute-style objects
    (``getattr``). Returns None when ``record`` is None or no name matches.
    """
    for name in names:
        if isinstance(record, dict):
            value = record.get(name)
        else:
            value = getattr(record, name, None)
        if value is not None:
            return value
    return None


documents = []

for scraped_content in url_response.values():
    csv_data = scraped_content.get("csv_data")
    # Treat a missing result list as "no pages" instead of failing on iteration.
    scraped_results = scraped_content.get("scraped_data") or []

    # Build exactly one Document per crawled page. A single loop is enough here:
    # iterating the same result list in two nested loops would append every page
    # once per page, i.e. N*N duplicates for a site with N pages.
    for result in scraped_results:
        # NOTE(review): the crawl cell reads `response.data` as an attribute, so the
        # pages may be attribute-style objects rather than dicts. _read_field handles
        # both shapes; confirm against the installed Firecrawl SDK.
        markdown_content = _read_field(result, "markdown")
        if not markdown_content:
            # Nothing to index for a page that returned no markdown.
            continue

        page_metadata = _read_field(result, "metadata")
        title = _read_field(page_metadata, "title")
        # Accept both the camelCase and the snake_case spelling of the source URL field.
        url = _read_field(page_metadata, "sourceURL", "source_url")

        documents.append(
            Document(
                text=markdown_content,
                metadata={
                    "title": title,
                    "url": url,
                    "description": csv_data.get("Description"),
                    "category": csv_data.get("Category")
                }
            )
        )


In [None]:
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.core import VectorStoreIndex
import google.genai.types as types

# Gemini generation settings for the crawl section: a thinking budget of 0,
# at most 1024 output tokens, and a sampling temperature of 1.
config = types.GenerateContentConfig(
    temperature=1,
    max_output_tokens=1024,
    thinking_config=types.ThinkingConfig(thinking_budget=0),
)

llm = GoogleGenAI(model="gemini-2.5-flash", generation_config=config)

# Register the global LlamaIndex defaults: LLM, embedding model and text splitter.
Settings.llm = llm
Settings.embed_model = CohereEmbedding(
    api_key=COHERE_API_KEY,
    model_name="embed-english-v3.0",
    input_type="search_document",
)
Settings.text_splitter = SentenceSplitter(chunk_overlap=50, chunk_size=300)

In [None]:
# Build a vector index over the documents created from the crawled pages.
index = VectorStoreIndex.from_documents(documents)
# Create a query engine on top of the index using its default settings.
query_engine = index.as_query_engine()

In [None]:
# Ask a question against the crawled-pages index and print the synthesized answer.
res = query_engine.query("Explain GraphRAG")
print(res.response)

print("-----------------")
# List every retrieved node that backed the answer, together with its score.
for src in res.source_nodes:
    node_meta = src.metadata
    print("Node ID\t", src.node_id)
    print("Title\t", node_meta['title'])
    print("URL\t", node_meta['url'])
    print("Score\t", src.score)
    print("Description\t", node_meta.get("description"))
    print("Category\t", node_meta.get("category"))
    print("-_" * 20)