# Index Benchmark

---

In [104]:
from firecrawl import FirecrawlApp, ScrapeOptions
import os
from dotenv import load_dotenv
from datetime import datetime
import statistics
import requests

load_dotenv()

True

In [82]:
STAGING_API_URL = 'http://default-firecrawl-app-staging-service:3002/' # os.getenv("STAGING_API_URL") or None

if STAGING_API_URL is None:
    raise ValueError("STAGING_API_URL is not set")
    
app = FirecrawlApp(api_url=STAGING_API_URL, api_key='no-auth')

In [83]:
STAGING_API_URL

'http://default-firecrawl-app-staging-service:3002/'

In [141]:
scrape_urls=[
    'https://news.ycombinator.com', # - Hacker News (simple, fast-loading)
    'https://httpbin.org', # - HTTP testing service (very reliable)
    'https://example.com', # - Standard test domain (minimal content)
    'https://github.com/microsoft/vscode', # - GitHub repo page (structured content)
    'https://stackoverflow.com/questions', # - Stack Overflow questions page
    'https://www.wikipedia.org', # - Wikipedia main page (rich content)
    'https://jsonplaceholder.typicode.com', # - Fake API for testing
    'https://httpstat.us/200', # - HTTP status testing (minimal response)
    'https://www.reddit.com/r/programming', # - Reddit programming subreddit
    'https://docs.python.org/3/' # - Python documentation (structured docs)
]


crawl_urls = [
    "https://www.pcbgogo.com/*", # 7825
    "https://github.com/Uniswap/v4-core/*", # 7353
    "http://arcep.fr/actualites/*", # 9764
    "https://www.synapticure.com/*", # 7746
    "https://www.elecrow.com/*", # 8025
    "https://idfcfirstbank.com/*", # 9912
    "https://www.todaytix.com/*", # 7532
    "https://www.wheel-size.com/size/*", # 7102
    "http://www.drymerge.com/*", # 8422
    "https://telegramindex.org/*" # 5335
]


map_urls = []
for i in crawl_urls:
    map_urls.append(i.replace('*',''))

## Scrape

Hypothesis: Indexed scrapes are faster

In [85]:
scrape_times_no_cache = []
scrape_times_cached = []

for i, url in enumerate(scrape_urls):  # Test first 5 URLs
    print(f"Testing {i+1}/{len(scrape_urls)}: {url}")
    
    # No cache (maxAge=1)
    try:
        start = datetime.now()
        doc = app.scrape_url(url, maxAge=1)
        no_cache_time = (datetime.now() - start).total_seconds()
        scrape_times_no_cache.append(no_cache_time)
        print(f"  No cache: {no_cache_time:.2f}s")
    except Exception as e:
        print(f"  No cache: FAILED - {e}")
        scrape_times_no_cache.append(None)
    
    # Cached (maxAge=100000)
    try:
        start = datetime.now()
        doc = app.scrape_url(url, maxAge=100000)
        cached_time = (datetime.now() - start).total_seconds()
        scrape_times_cached.append(cached_time)
        print(f"  Cached: {cached_time:.2f}s")
    except Exception as e:
        print(f"  Cached: FAILED - {e}")
        scrape_times_cached.append(None)

Testing 1/10: https://news.ycombinator.com
  No cache: 2.39s
  Cached: 0.93s
Testing 2/10: https://httpbin.org
  No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.
  Cached: 4.79s
Testing 3/10: https://example.com
  No cache: FAILED - Request Timeout: Failed to scrape URL as the request timed out. Request timed out - No additional error details provided.
  Cached: 17.33s
Testing 4/10: https://github.com/microsoft/vscode
  No cache: 2.81s
  Cached: 1.06s
Testing 5/10: https://stackoverflow.com/questions
  No cache: 2.92s
  Cached: 1.16s
Testing 6/10: https://www.wikipedia.org
  No cache: 3.53s
  Cached: 0.78s
Testing 7/10: https://jsonplaceholder.typicode.com
  No cache: 3.54s
  Cached: 0.80s
Testing 8/10: https://httpstat.us/200
  No cache: 10.79s
  Cached: 0.54s
Testing 9/10: https://www.reddit.com/r/programming
  No cache: 4.53s
  Cached: 0.79s
Testing 10/10: https://docs.python.org/3/
  No cach

In [86]:
# Calculate averages
valid_no_cache = [t for t in scrape_times_no_cache if t is not None]
valid_cached = [t for t in scrape_times_cached if t is not None]

if valid_no_cache and valid_cached:
    avg_no_cache = statistics.mean(valid_no_cache)
    avg_cached = statistics.mean(valid_cached)
    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0
    
    print("SCRAPE RESULTS:")
    print(f"Average no cache: {avg_no_cache:.2f}s")
    print(f"Average cached: {avg_cached:.2f}s")
    print(f"Speedup: {speedup:.1f}x faster with cache")
    print(f"Time saved: {avg_no_cache - avg_cached:.2f}s per request")

SCRAPE RESULTS:
Average no cache: 4.31s
Average cached: 2.87s
Speedup: 1.5x faster with cache
Time saved: 1.44s per request


---

## Crawl

--- for now used to improve map 

Hypothesis: Indexed crawls are faster

In [142]:
crawl_times_no_cache = []
crawl_times_cached = []

for i, url in enumerate(crawl_urls):
    try:
        print(f"Crawling {i+1}/{len(crawl_urls)}: {url}")
        result = app.crawl_url(url)
    except Exception as e:
        print(f"{url[0]} - Crawl FAILED - {e}")

Crawling 1/10: https://www.pcbgogo.com/*
Crawling 2/10: https://github.com/Uniswap/v4-core/*
Crawling 3/10: http://arcep.fr/actualites/*
Crawling 4/10: https://www.synapticure.com/*
Crawling 5/10: https://www.elecrow.com/*
Crawling 6/10: https://idfcfirstbank.com/*
Crawling 7/10: https://www.todaytix.com/*
Crawling 8/10: https://www.wheel-size.com/size/*
Crawling 9/10: http://www.drymerge.com/*
h - Crawl FAILED - HTTPConnectionPool(host='default-firecrawl-app-staging-service', port=3002): Max retries exceeded with url: /v1/crawl/31485d6f-ceeb-458b-b010-f5efc3286a0e (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1195ac0d0>: Failed to establish a new connection: [Errno 65] No route to host'))
Crawling 10/10: https://telegramindex.org/*


---

## Map

Hypothesis: Indexed Map should get more urls after crawl

In [143]:
STAGING_API_URL

'http://default-firecrawl-app-staging-service:3002/'

In [144]:
def map_request(url, ignore_index):
    """
    Make a map request and return the links
    """
    payload = {"url": url, "ignoreIndex": ignore_index, "limit": 10000}
    headers = {'Content-Type': 'application/json', "Authorization": "Bearer no-auth"}
    response = requests.post(STAGING_API_URL + "v1/map", headers=headers, json=payload)
    
    if response.status_code == 200:
        data = response.json()
        return data.get('links', [])
    else:
        return []

In [147]:
map_times_no_cache = []
map_times_cached = []
map_url_counts_no_cache = []
map_url_counts_cached = []

for i, url in enumerate(map_urls):
    print(f"Testing {i+1}/{len(map_urls)}: {url}")
    
    # No index (ignoreIndex=True)
    start = datetime.now()
    links_no_index = map_request(url, True)
    time_no_index = (datetime.now() - start).total_seconds()
    
    map_times_no_cache.append(time_no_index)
    map_url_counts_no_cache.append(len(links_no_index))
    print(f"  No index: {time_no_index:.2f}s, {len(links_no_index)} URLs")
    
    # With index (ignoreIndex=False)
    start = datetime.now()
    links_indexed = map_request(url, False)
    time_indexed = (datetime.now() - start).total_seconds()
    
    map_times_cached.append(time_indexed)
    map_url_counts_cached.append(len(links_indexed))
    print(f"  With index: {time_indexed:.2f}s, {len(links_indexed)} URLs")

Testing 1/10: https://www.pcbgogo.com/
  No index: 1.44s, 664 URLs
  With index: 1.70s, 664 URLs
Testing 2/10: https://github.com/Uniswap/v4-core/
  No index: 0.72s, 235 URLs
  With index: 0.75s, 235 URLs
Testing 3/10: http://arcep.fr/actualites/
  No index: 6.23s, 8059 URLs
  With index: 6.71s, 9048 URLs
Testing 4/10: https://www.synapticure.com/
  No index: 0.72s, 365 URLs
  With index: 0.94s, 365 URLs
Testing 5/10: https://www.elecrow.com/
  No index: 4.03s, 2747 URLs
  With index: 4.13s, 2747 URLs
Testing 6/10: https://idfcfirstbank.com/
  No index: 2.62s, 279 URLs
  With index: 4.02s, 279 URLs
Testing 7/10: https://www.todaytix.com/
  No index: 4.53s, 8933 URLs
  With index: 3.63s, 8933 URLs
Testing 8/10: https://www.wheel-size.com/size/
  No index: 10.89s, 10000 URLs
  With index: 10.73s, 10000 URLs
Testing 9/10: http://www.drymerge.com/
  No index: 32.94s, 3602 URLs
  With index: 37.07s, 3602 URLs
Testing 10/10: https://telegramindex.org/
  No index: 0.98s, 2 URLs
  With index: 

In [149]:
# Calculate averages
avg_time_no_cache = statistics.mean(map_times_no_cache)
avg_time_cached = statistics.mean(map_times_cached)
avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)
avg_urls_cached = statistics.mean(map_url_counts_cached)

time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0
url_difference = avg_urls_cached - avg_urls_no_cache
url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0

print("MAP RESULTS:")
print(f"Average time (no cache): {avg_time_no_cache:.2f}s")
print(f"Average time (cached): {avg_time_cached:.2f}s")
print(f"Time speedup: {time_speedup:.2f}x faster with cache")
print(f"Average URLs found (no cache): {avg_urls_no_cache:.1f}")
print(f"Average URLs found (cached): {avg_urls_cached:.1f}")
print(f"URL difference: {url_difference:+.1f} URLs with cache")
print(f"URL percentage: {url_percentage:.1f}% of no-cache results")

if url_difference > 0:
    print("✅ Cache finds MORE URLs")
elif url_difference < 0:
    print("⚠️  Cache finds FEWER URLs")
else:
    print("➡️  Cache finds SAME number of URLs")

MAP RESULTS:
Average time (no cache): 6.51s
Average time (cached): 7.09s
Time speedup: 0.92x faster with cache
Average URLs found (no cache): 3488.6
Average URLs found (cached): 3587.5
URL difference: +98.9 URLs with cache
URL percentage: 102.8% of no-cache results
✅ Cache finds MORE URLs


---