# Index Benchmark

---

In [70]:
import os
import statistics
import time
from datetime import datetime

from dotenv import load_dotenv
from firecrawl import FirecrawlApp, ScrapeOptions

load_dotenv()

True

In [73]:
# Staging endpoint for the benchmark, read from the environment.
# An unset or empty variable collapses to None.
STAGING_API_URL = os.environ.get("STAGING_API_URL") or None

# Fail fast so no later cell runs without a configured endpoint.
if not STAGING_API_URL:
    raise ValueError("STAGING_API_URL is not set")

# Client shared by the scrape, crawl and map cells.
app = FirecrawlApp(api_url=STAGING_API_URL, api_key='no-auth')

In [66]:
# Pages timed by the scrape benchmark, one absolute URL per entry.
scrape_urls = [
    # Hacker News (simple, fast-loading)
    "https://news.ycombinator.com",
    # HTTP testing service (very reliable)
    "https://httpbin.org",
    # Standard test domain (minimal content)
    "https://example.com",
    # GitHub repo page (structured content)
    "https://github.com/microsoft/vscode",
    # Stack Overflow questions page
    "https://stackoverflow.com/questions",
    # Wikipedia main page (rich content)
    "https://www.wikipedia.org",
    # Fake API for testing
    "https://jsonplaceholder.typicode.com",
    # HTTP status testing (minimal response)
    "https://httpstat.us/200",
    # Reddit programming subreddit
    "https://www.reddit.com/r/programming",
    # Python documentation (structured docs)
    "https://docs.python.org/3/",
]


# Start URLs for the crawl and map benchmarks.
#
# Each entry is handed, one at a time, to `app.crawl_url(...)` and
# `app.map_url(...)`, so every entry is a single absolute URL string
# (scheme included, no wildcard suffix and no list wrapper).
crawl_urls = [
    "https://docs.python.org/3/tutorial/",
    "https://httpbin.org",
    "https://example.com",
    "https://jsonplaceholder.typicode.com",
    "https://httpstat.us",
    "https://postman-echo.com",
    "https://docs.github.com",
    "https://news.ycombinator.com",
    "https://dev.to",
    "https://reqres.in",
]


# The map benchmark reuses the crawl targets (same list object).
map_urls = crawl_urls

## Scrape

Hypothesis: Indexed scrapes are faster

In [67]:
# Per-URL scrape durations in seconds; a failed request is stored as None so
# both lists stay index-aligned with `scrape_urls`.
scrape_times_no_cache = []
scrape_times_cached = []

# (label printed in the log, maxAge passed to scrape_url, list collecting the timings).
# Order matters: for each URL the maxAge=1 request runs first, then the
# maxAge=100000 request.
scrape_modes = [
    ("No cache", 1, scrape_times_no_cache),
    ("Cached", 100000, scrape_times_cached),
]

for i, url in enumerate(scrape_urls):  # every URL in scrape_urls
    print(f"Testing {i+1}/{len(scrape_urls)}: {url}")

    for label, max_age, timings in scrape_modes:
        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            start = time.perf_counter()
            app.scrape_url(url, maxAge=max_age)
            elapsed = time.perf_counter() - start
        except Exception as e:
            # Best-effort benchmark: log the failure and keep going.
            print(f"  {label}: FAILED - {e}")
            timings.append(None)
        else:
            timings.append(elapsed)
            print(f"  {label}: {elapsed:.2f}s")

Testing 1/10: https://news.ycombinator.com
  No cache: 2.15s
  Cached: 0.98s
Testing 2/10: https://httpbin.org
  No cache: 11.61s
  Cached: 0.59s
Testing 3/10: https://example.com
  No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.
  Cached: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.
Testing 4/10: https://github.com/microsoft/vscode
  No cache: 15.73s
  Cached: 1.12s
Testing 5/10: https://stackoverflow.com/questions
  No cache: 2.74s
  Cached: 0.98s
Testing 6/10: https://www.wikipedia.org
  No cache: 3.84s
  Cached: 0.58s
Testing 7/10: https://jsonplaceholder.typicode.com
  No cache: 4.09s
  Cached: 0.81s
Testing 8/10: https://httpstat.us/200
  No cache: 10.84s
  Cached: 0.56s
Testing 9/10: https://www.reddit.com/r/programming
  No cache: 3.63s
  Cached: 0.92s
Testing 10/10: https://docs.python.org/3/
  No cac

In [71]:
# Calculate averages
#
# Only URLs for which BOTH the no-cache and the cached request succeeded are
# compared. Filtering the two lists independently would let a URL that failed
# in just one mode skew the averages, because they would then cover different
# URL sets.
scrape_pairs = [
    (no_cache_s, cached_s)
    for no_cache_s, cached_s in zip(scrape_times_no_cache, scrape_times_cached)
    if no_cache_s is not None and cached_s is not None
]
valid_no_cache = [no_cache_s for no_cache_s, cached_s in scrape_pairs]
valid_cached = [cached_s for no_cache_s, cached_s in scrape_pairs]

if scrape_pairs:
    avg_no_cache = statistics.mean(valid_no_cache)
    avg_cached = statistics.mean(valid_cached)
    # Guard against a zero average so the ratio never divides by zero.
    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0

    print("SCRAPE RESULTS:")
    print(f"Average no cache: {avg_no_cache:.2f}s")
    print(f"Average cached: {avg_cached:.2f}s")
    print(f"Speedup: {speedup:.1f}x faster with cache")
    print(f"Time saved: {avg_no_cache - avg_cached:.2f}s per request")

SCRAPE RESULTS:
Average no cache: 6.25s
Average cached: 0.80s
Speedup: 7.8x faster with cache
Time saved: 5.45s per request


---

## Crawl

Note: for now, this is only used to improve map.

Hypothesis: Indexed crawls are faster

In [None]:
# I think this one only improves /map for now...

In [None]:
# Per-target crawl durations in seconds; a failed crawl is stored as None so
# both lists stay index-aligned with `crawl_urls`.
crawl_times_no_cache = []
crawl_times_cached = []

# (label printed in the log, maxAge put into ScrapeOptions, list collecting the timings).
# Order matters: for each target the maxAge=1 crawl runs first, then the
# maxAge=100000 crawl.
crawl_modes = [
    ("No cache", 1, crawl_times_no_cache),
    ("Cached", 100000, crawl_times_cached),
]

for i, url in enumerate(crawl_urls):
    print(f"Testing {i+1}/{len(crawl_urls)}: {url}")

    for label, max_age, timings in crawl_modes:
        try:
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            start = time.perf_counter()
            # limit=5 is applied to every crawl in both modes.
            app.crawl_url(url, scrape_options=ScrapeOptions(maxAge=max_age), limit=5)
            elapsed = time.perf_counter() - start
        except Exception as e:
            # Best-effort benchmark: log the failure and keep going.
            print(f"  {label}: FAILED - {e}")
            timings.append(None)
        else:
            timings.append(elapsed)
            print(f"  {label}: {elapsed:.2f}s")

In [None]:
# Calculate averages
#
# Only targets for which BOTH the no-cache and the cached crawl succeeded are
# compared. Filtering the two lists independently would let a target that
# failed in just one mode skew the averages, because they would then cover
# different target sets.
crawl_pairs = [
    (no_cache_s, cached_s)
    for no_cache_s, cached_s in zip(crawl_times_no_cache, crawl_times_cached)
    if no_cache_s is not None and cached_s is not None
]
valid_no_cache = [no_cache_s for no_cache_s, cached_s in crawl_pairs]
valid_cached = [cached_s for no_cache_s, cached_s in crawl_pairs]

if crawl_pairs:
    avg_no_cache = statistics.mean(valid_no_cache)
    avg_cached = statistics.mean(valid_cached)
    # Guard against a zero average so the ratio never divides by zero.
    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0

    print("CRAWL RESULTS:")
    print(f"Average no cache: {avg_no_cache:.2f}s")
    print(f"Average cached: {avg_cached:.2f}s")
    print(f"Speedup: {speedup:.1f}x faster with cache")
    print(f"Time saved: {avg_no_cache - avg_cached:.2f}s per crawl")

---

## Map

Hypothesis: Indexed map requests should return more URLs

In [72]:
# TODO: use crawl urls, compare with prod

In [None]:
# Per-target map duration in seconds (None on failure) and number of links
# returned (0 on failure); both lists stay index-aligned with `map_urls`.
map_times = []
map_url_counts = []

for i, url in enumerate(map_urls):
    print(f"Testing {i+1}/{len(map_urls)}: {url}")

    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments. Only the map_url call is timed.
        start = time.perf_counter()
        result = app.map_url(url)
        map_time = time.perf_counter() - start
    except Exception as e:
        print(f"  FAILED - {e}")
        map_times.append(None)
        map_url_counts.append(0)
    else:
        # A missing `links` attribute, or one set to None, counts as zero
        # links rather than turning a successful call into a failure.
        url_count = len(getattr(result, 'links', None) or [])

        map_times.append(map_time)
        map_url_counts.append(url_count)

        print(f"  Time: {map_time:.2f}s")
        print(f"  URLs found: {url_count}")

In [None]:
# Calculate averages
#
# A run counts as successful when its time is not None. URL counts are taken
# from exactly those runs, so a successful map that returned zero URLs still
# contributes to the average instead of being mistaken for a failure.
successful_maps = [
    (elapsed_s, found)
    for elapsed_s, found in zip(map_times, map_url_counts)
    if elapsed_s is not None
]
valid_times = [elapsed_s for elapsed_s, found in successful_maps]
valid_counts = [found for elapsed_s, found in successful_maps]

if valid_times:
    avg_time = statistics.mean(valid_times)
    avg_urls = statistics.mean(valid_counts)
    # Guard against a zero average time so the rate never divides by zero.
    urls_per_second = avg_urls / avg_time if avg_time > 0 else 0

    print("MAP RESULTS:")
    print(f"Average time: {avg_time:.2f}s")
    print(f"Average URLs found: {avg_urls:.1f}")
    print(f"URLs per second: {urls_per_second:.1f}")

---