In [1]:
import os
from firecrawl import FirecrawlApp
from google import genai
from google.genai import types
import pathlib
import httpx
import json

In [2]:
# Gemini client; the API key is read from the GOOGLE_API_KEY environment variable.
client = genai.Client(api_key=os.environ['GOOGLE_API_KEY'])

In [3]:
# arXiv PDF that is downloaded and sent to Gemini in the cells below.
doc_url = 'https://arxiv.org/pdf/2412.15605v1'

In [None]:
# Download the PDF to disk so later cells can read it from a local file.
filepath = pathlib.Path('./data/file.pdf')
# Create ./data if needed; write_bytes fails when the parent directory is missing.
filepath.parent.mkdir(parents=True, exist_ok=True)
pdf_response = httpx.get(doc_url)
# Fail loudly on an HTTP error instead of saving an error page as file.pdf.
pdf_response.raise_for_status()
# Last expression: the number of bytes written is shown as the cell output.
filepath.write_bytes(pdf_response.content)

121164

In [5]:
# Send the saved PDF together with a plain-text question to Gemini.
prompt = "who are the authors"
pdf_part = types.Part.from_bytes(
  data=filepath.read_bytes(),
  mime_type='application/pdf',
)
response = client.models.generate_content(
  model='gemini-2.0-flash',
  contents=[pdf_part, prompt],
)

In [6]:
# Print the model's text answer.
print(response.text)

The authors are:

*   Brian J Chan
*   Chao-Ting Chen
*   Jui-Hung Cheng
*   Hen-Hsen Huang


In [7]:
# Map the Firecrawl docs site to collect its page URLs.
fire = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
url = "https://docs.firecrawl.dev/"
map_result = fire.map_url(url)
# Drop the first entry -- presumably the root URL itself; TODO confirm.
all_links = map_result.get('links')[1:]

# all_links

In [8]:
# Number of links kept from the site map.
len(all_links)

154

In [9]:
def filter_api_urls(docs_links, search_query):
  """Ask Gemini to pick the URLs in `docs_links` that relate to `search_query`.

  Args:
    docs_links: List of documentation URLs; it is interpolated into the prompt
      using its string representation.
    search_query: Topic the returned URLs should relate to (e.g. "REST api").

  Returns:
    The list decoded from the model's JSON response (the request asks for a
    JSON array of strings). An empty `docs_links` returns `[]` without calling
    the model.
  """
  # Nothing to filter: skip the model call entirely.
  if not docs_links:
    return []

  response = client.models.generate_content(
    model='gemini-2.0-flash',
    # The topic comes from `search_query` rather than being hard-coded.
    contents=f"{docs_links}. \n\nAbove is a list of urls. Please return only the urls that are related to the {search_query}.",
    config={
      # Request structured output so the reply can be parsed with json.loads.
      "response_mime_type": "application/json",
      "response_schema": list[str]
    }
  )
  return json.loads(response.candidates[0].content.parts[0].text)

# Keep only the documentation links related to the REST API.
api_links = filter_api_urls(docs_links=all_links, search_query="REST api")

# Count of links the model kept.
len(api_links)


30

In [10]:
# Show the URLs the model selected as REST-API related.
api_links

['https://docs.firecrawl.dev/api-reference/introduction',
 'https://docs.firecrawl.dev/api-reference/endpoint/search',
 'https://docs.firecrawl.dev/api-reference/endpoint/extract',
 'https://docs.firecrawl.dev/api-reference/endpoint/scrape',
 'https://docs.firecrawl.dev/api-reference/endpoint/map',
 'https://docs.firecrawl.dev/v0/api-reference/introduction',
 'https://docs.firecrawl.dev/api-reference/endpoint/crawl-get',
 'https://docs.firecrawl.dev/api-reference/endpoint/batch-scrape',
 'https://docs.firecrawl.dev/api-reference/endpoint/crawl-delete',
 'https://docs.firecrawl.dev/api-reference/endpoint/crawl-post',
 'https://docs.firecrawl.dev/api-reference/endpoint/extract-get',
 'https://docs.firecrawl.dev/api-reference/endpoint/batch-scrape-get',
 'https://docs.firecrawl.dev/api-reference/endpoint/crawl-get-errors',
 'https://docs.firecrawl.dev/v0/api-reference/endpoint/crawl-cancel',
 'https://docs.firecrawl.dev/api-reference/endpoint/batch-scrape-get-errors',
 'http://docs.firecr

In [11]:
# Now we can use the API links to fetch the relevant Firecrawl documentation pages
# and pass that documentation to the LLM to answer questions.