In [4]:
import asyncio
import re
from urllib.parse import urlparse
from crawl4ai import (
    AsyncUrlSeeder,
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    SeedingConfig,
)
from url_normalize import url_normalize
from datetime import datetime


async def neuracrawl_discovery_fast(
    domain_url: str,
    max_pages: int = 5000,
    concurrency: int = 100,
    allowed_subdomains: list[str] | None = None,
    url_exclusion_patterns: list[str] | None = None,
    crawl_sample: int = 0,
    log_file_path: str = "neuracrawl_discovery.log",
) -> dict:
    """Fast URL discovery - sitemap+cc seeding, with optional crawl fallback.

    Phase 1 seeds URLs through ``AsyncUrlSeeder`` using the ``"sitemap+cc"``
    source, normalizes them and keeps only those that are in scope. Phase 2
    (optional) crawls pages in waves of at most ``concurrency`` URLs and adds
    every in-scope link found on successfully crawled pages.

    Args:
        domain_url: Start URL of the site; also used as the crawl entry point
            when seeding yields nothing usable.
        max_pages: Upper bound on the number of URLs returned in ``"urls"``.
        concurrency: Seeder/crawler concurrency and maximum wave size.
        allowed_subdomains: Extra hosts (full URLs or bare host names) that are
            in scope in addition to ``domain_url``'s host. Sub-domains of every
            allowed host are in scope as well.
        url_exclusion_patterns: Regex patterns; a URL is dropped when any of
            them matches anywhere in it (``re.search``).
        crawl_sample: Crawl N pages for link discovery. If no seeded URL
            survives filtering, at least 100 pages are crawled regardless.
        log_file_path: Path to the log file. It is overwritten on every call.

    Returns:
        A dict with the keys ``"domain"`` (``domain_url``), ``"seed_count"``
        (raw seeded URLs), ``"filtered_count"`` (in-scope URLs after seeding
        and crawling), ``"total"`` (size of the capped set) and ``"urls"``
        (the capped ``set`` of URLs).
    """
    exclusion_regexes = [re.compile(p) for p in (url_exclusion_patterns or [])]

    def strip_www(host: str) -> str:
        """Lower-case a host and drop a leading ``www.`` (prefix only)."""
        return host.lower().removeprefix("www.")

    def configured_host(value: str) -> str:
        """Host of a configured entry; accepts full URLs and bare host names."""
        # A bare "sub.example.org" has no netloc when parsed, so fall back to
        # the text before the first slash instead of silently yielding "".
        return strip_www(urlparse(value).netloc or value.split("/", 1)[0])

    allowed_domains = {configured_host(domain_url)}
    allowed_domains |= {configured_host(u) for u in (allowed_subdomains or [])}
    # An empty entry would put every host-less URL in scope.
    allowed_domains.discard("")

    def is_allowed(url: str) -> bool:
        """True when ``url`` is on an allowed host and matches no exclusion pattern."""
        try:
            host = strip_www(urlparse(url).netloc)
        except Exception:
            # Best effort: an unparsable URL is simply out of scope.
            return False
        if not host:
            return False
        in_scope = host in allowed_domains or any(host.endswith(f".{d}") for d in allowed_domains)
        return in_scope and not any(r.search(url) for r in exclusion_regexes)

    def normalize(url: str) -> str | None:
        """Normalized form of ``url``, or ``None`` when it cannot be normalized."""
        try:
            return url_normalize(url) or None
        except Exception:
            # One malformed href must not abort the whole discovery run.
            return None

    def log(message: str, log_file) -> None:
        """Write to log file with timestamp and echo the message to stdout."""
        log_file.write(f"[{datetime.now().isoformat()}] {message}\n")
        log_file.flush()
        print(message)

    # Explicit UTF-8: the log messages contain non-ASCII characters and the
    # platform default encoding may not be able to represent them.
    with open(log_file_path, "w", buffering=1, encoding="utf-8") as log_file:
        log(f"üîç Seeding URLs for {domain_url} (sitemap+cc)...", log_file)

        # Phase 1: Sitemap + CommonCrawl seeding
        async with AsyncUrlSeeder() as url_seeder:
            # NOTE(review): positional arguments are kept as in the original call;
            # the third one (True) is presumably the live-check switch - confirm
            # against the SeedingConfig signature. hits_per_sec evaluates to 0
            # for concurrency < 10 - confirm that this is intended.
            seeding_config = SeedingConfig(
                "sitemap+cc",
                "*",
                True,
                concurrency=concurrency,
                hits_per_sec=int(concurrency / 10),
            )
            urls_data = await url_seeder.urls(domain_url, seeding_config)
            seed_urls = {url_data["url"] for url_data in urls_data}

        log(f"‚úì Seeded {len(seed_urls)} URLs", log_file)

        # Log all seed URLs (file only, not echoed to stdout)
        for url in sorted(seed_urls):
            log_file.write(f"  SEED: {url}\n")
        log_file.flush()

        filtered_urls = set()
        for url in seed_urls:
            norm = normalize(url)
            if norm and is_allowed(norm):
                filtered_urls.add(norm)

        log(f"‚úì After filtering: {len(filtered_urls)} URLs", log_file)

        # Phase 2: Crawl if needed (either explicitly requested or nothing usable was seeded)
        pages_to_crawl = crawl_sample
        if not filtered_urls:
            log("‚ö†Ô∏è No sitemap found, falling back to spider crawl...", log_file)
            pages_to_crawl = max(crawl_sample, 100)  # Crawl at least 100 pages if no sitemap
            # Start from the homepage, normalized like every other URL so the
            # same page is not queued again when a link to it is discovered.
            filtered_urls.add(normalize(domain_url) or domain_url)

        if pages_to_crawl > 0:
            log(f"üï∑Ô∏è Spider crawl of up to {pages_to_crawl} pages...", log_file)

            browser_config = BrowserConfig(viewport_width=1920, viewport_height=1080)
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                page_timeout=15000,
                semaphore_count=concurrency,
                scan_full_page=False,
                wait_until="domcontentloaded",
            )

            # Sorted so the selected start set is reproducible between runs.
            to_crawl = sorted(filtered_urls)[:pages_to_crawl]
            # Every URL that was ever queued. Failed URLs never reach `crawled`,
            # so without this set they would be re-queued after every wave.
            queued = set(to_crawl)
            crawled = set()
            discovered_links = set()
            wave_limit = max(1, concurrency)  # a wave must make progress

            async with AsyncWebCrawler(config=browser_config) as crawler:
                # Wave-based crawling like neuracrawl
                while to_crawl and len(crawled) < pages_to_crawl:
                    # Never submit more URLs than the remaining page budget.
                    batch_size = min(wave_limit, pages_to_crawl - len(crawled))
                    batch, to_crawl = to_crawl[:batch_size], to_crawl[batch_size:]

                    results = await crawler.arun_many(batch, crawler_config)

                    for result in results:  # arun_many returns list, not async iterator
                        if not result.success:
                            log(f"  ‚úó {result.url} ({result.error_message})", log_file)
                            continue

                        crawled.add(result.url)
                        log(f"  ‚úì {result.url}", log_file)

                        links = result.links or {}
                        for link in (links.get("internal") or []) + (links.get("external") or []):
                            href = link.get("href")
                            norm = normalize(href) if href else None
                            if norm and is_allowed(norm) and norm not in crawled:
                                discovered_links.add(norm)

                    # Add new URLs to crawl queue, at most as many as the budget allows
                    remaining = pages_to_crawl - len(crawled)
                    new_urls = sorted(discovered_links - queued)[:remaining]
                    to_crawl.extend(new_urls)
                    queued.update(new_urls)

                    log(f"  Wave: Crawled {len(crawled)} | Discovered {len(discovered_links)} | Queue {len(to_crawl)}", log_file)

            filtered_urls |= discovered_links
            log(f"‚úì Total discovered: {len(filtered_urls)} URLs", log_file)

        # Sorted before capping so the same input always yields the same subset.
        capped = set(sorted(filtered_urls)[:max_pages])
        log(f"üìä Final: {len(capped)} URLs", log_file)

        return {
            "domain": domain_url,
            "seed_count": len(seed_urls),
            "filtered_count": len(filtered_urls),
            "total": len(capped),
            "urls": capped,
        }

In [3]:


from nest_asyncio import apply
apply()

# ===== CUSTOMER NAME =====
CUSTOMER = "dachau"
# =====================================

# Derived paths
DOMAIN_URL = f"https://www.{CUSTOMER}.de"
LOG_FILE = f"{CUSTOMER}_discovery.log"
URLS_FILE = f"{CUSTOMER}_urls.txt"

result = await neuracrawl_discovery_fast(
    DOMAIN_URL,
    max_pages=5000,
    url_exclusion_patterns=[
        r"(aktuelles|meldungen|veranstaltungen-details|event|currentPage|dachauer-volksblatt|/dokumente/)",
        r"\.(ics|pdf|json|vcf|kml|gpx)",
        r"\?",
    ],
    crawl_sample=10000,
    log_file_path=LOG_FILE,
)

print(f"\nüìä Total: {result['total']} URLs")

with open(URLS_FILE, "w") as f:
    f.write("\n".join(sorted(result["urls"])))
print(f"‚úì Saved to {URLS_FILE}")
print(f"‚úì Log saved to {LOG_FILE}")

NameError: name 'neuracrawl_discovery_fast' is not defined

In [5]:
# neuracrawl-tuner
# Import all functionality from the library module

from neuracrawl_tuner_lib import (
    PROJECT_MANAGER,
    create_sitemap_urls_file,
    extract_sitemap_urls,
    extract_frequent_sitemap_urls,
    extract_url_extensions,
    extract_url_regexes,
    extract_interesting_urls,
    download_interesting_urls,
    extract_css_selectors,
    apply_css_selectors,
)


# neuracrawl Tuner


In [6]:
# Select the tuner project. The project name is taken from CUSTOMER, which is
# assigned in the URL-discovery cell, so that cell must have been run first.

PROJECT_MANAGER.set_project(CUSTOMER)
# NOTE(review): presumably creates the project's sitemap URL file - confirm in
# neuracrawl_tuner_lib what input it reads.
create_sitemap_urls_file()

## Sitemap

In [3]:
# NOTE(review): presumably loads the sitemap URLs of the currently selected
# project - confirm in neuracrawl_tuner_lib.
extract_sitemap_urls()

In [4]:
# 5 is the minimum frequency: the recorded run reports
# "common URL areas with frequency >= 5".
extract_frequent_sitemap_urls(5)
# NOTE(review): presumably collects the file extensions that occur in the
# sitemap URLs - confirm in neuracrawl_tuner_lib.
extract_url_extensions()

Found 36 common URL areas with frequency >= 5


## Exclusion URL Regexes

In [5]:
# Natural-language instruction from which exclusion regexes are derived. The
# recorded run logged 3 regexes, which excluded 239 sitemap URLs and kept 2131.
await extract_url_regexes(
    "Exclude all events and news. Also exclude all pdf, json, xml, ics, vcf, txt."
)

INFO:neuracrawl_tuner_lib:Extracting URL regexes...
INFO:neuracrawl_tuner_lib:Analyzing frequent URL paths...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:neuracrawl_tuner_lib:Found 3 URL regexes.
INFO:neuracrawl_tuner_lib:Extracted 3 URL regexes.
INFO:neuracrawl_tuner_lib:Applying URL regexes to sitemap URLs...
INFO:neuracrawl_tuner_lib:Applied URL regexes. Found 239 excluded and 2131 non-excluded URLs.
INFO:neuracrawl_tuner_lib:Saved excluded and non-excluded URLs.


## Interesting URLs

In [6]:
# Called with an empty instruction string. The recorded run worked on the 2131
# non-excluded URLs and processed 5 prompts.
# NOTE(review): presumably "" means "no extra guidance" - confirm.
await extract_interesting_urls("")

INFO:neuracrawl_tuner_lib:Extracting interesting URLs...
INFO:neuracrawl_tuner_lib:Found 2131 non-excluded URLs.
INFO:neuracrawl_tuner_lib:Processing prompt (1/5)...
INFO:neuracrawl_tuner_lib:Processing prompt (2/5)...
INFO:neuracrawl_tuner_lib:Processing prompt (3/5)...
INFO:neuracrawl_tuner_lib:Processing prompt (4/5)...
INFO:neuracrawl_tuner_lib:Processing prompt (5/5)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:neuracrawl_tuner_lib:Processed prompt (3/5).
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:neuracrawl_tuner_lib:Processed prompt (2/5).
INFO:neuracrawl_tuner_lib:Processed prompt (1/5).
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:neuracrawl_tuner_lib:Proces

In [7]:
# Downloads the selected interesting URLs; the recorded run found and
# downloaded 75 of them.
await download_interesting_urls()

INFO:neuracrawl_tuner_lib:Downloading interesting URLs...
INFO:neuracrawl_tuner_lib:Found 75 URLs.


INFO:neuracrawl_tuner_lib:Downloaded 75 URLs.


## CSS Selectors

In [None]:
# Extracts CSS selectors from the downloaded pages; the recorded run pre-cleaned
# the HTML of 75 pages and then processed one prompt per page.
# NOTE(review): the bare numbers in the recorded output look like leftover debug
# prints from the library (possibly per-page sizes) - confirm and remove there.
await extract_css_selectors(
)

INFO:neuracrawl_tuner_lib:Extracting CSS selectors...
INFO:neuracrawl_tuner_lib:Found 75 pages.
INFO:neuracrawl_tuner_lib:Pre-cleaning HTML for 75 pages...
INFO:neuracrawl_tuner_lib:Pre-cleaned HTML for 75 pages.
634701
538804
635214
564481
538261
545865
569643
1667657
573302
630466
571582
637713
729703
542760
596786
540300
539066
655132
655136
655038
655038
545260
564955
560204
559440
559532
560203
559417
558276
560559
557517
550740
558395
558383
557212
558398
558393
558400
560374
560204
557613
560733
552906
555288
655037
19466
556285
551324
551342
587663
551002
553070
582715
552198
655132
655132
634701
559574
634676
635090
530522
635090
561510
635090
634677
635090
634676
634701
634677
634677
520631
634701
655132
520631
655132
INFO:neuracrawl_tuner_lib:Processing prompt (1/75)...
INFO:neuracrawl_tuner_lib:Processing prompt (2/75)...
INFO:neuracrawl_tuner_lib:Processing prompt (3/75)...
INFO:neuracrawl_tuner_lib:Processing prompt (4/75)...
INFO:neuracrawl_tuner_lib:Processing prompt (5

[] viewer should use save_state.json, maybe only work with save_state.json instead of all the other files? then have a result object that gets plugged into the save_state.json that can include extra data?
- better method splitting, service classes, more models for example for counts of file extensions, frequent urls
- one file with all kept urls and one with all not kept urls
- say with what version we are working when starting sth, like v_000 (latest) or so
- auto ai feedback
- links are getting bad?
- maybe make feedback tab field editable?
- <a aria-label="Online-Dienst 'Bewohnerparkausweis beantragen / verwalten' starten" class="linklist-boxed__link" href="#" onclick="window.open('https://eservice.siegburg.de/bewohnerparken');return false;">
- remove img and figures
- error analysis like look at page xyz, why is this text at the bottom missing, ai gets the text, css selectors and html and markdown and will figure out the exact reason
- deduplicate sitemap URLs using normalization
- rendered view of the raw html, to see whether the content was even present in the original html
- for downloading the pages, use crawl4ai instead of plain httpx, WITH THE SAME SCROLLING ETC. SETTINGS AS IN NEURACRAWL
- split up css selector extraction and application to be able to change underlying interesting urls easily without purging the selectors
- exclude by css selector on website
- show website urls in the select of xcode app, also at bottom left, not only folder name

# Manual CSS Selector Application

In [8]:
# Applies the stored CSS selectors to the downloaded pages; the recorded run
# applied 32 selectors to 75 pages and saved the result as version v_007.
apply_css_selectors()

INFO:neuracrawl_tuner_lib:Applying CSS selectors...
INFO:neuracrawl_tuner_lib:Loaded 32 CSS selectors.
INFO:neuracrawl_tuner_lib:Processing 75 pages...
INFO:neuracrawl_tuner_lib:Processed 75 pages.
INFO:neuracrawl_tuner_lib:Applied 32 CSS selectors to 75 pages. Saved as v_007.
