In [None]:
# Common

import asyncio
import logging
import os
import re
import shutil
import sys
from collections import Counter
from logging import getLogger
from pathlib import Path

import httpx
from bs4 import BeautifulSoup
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

# Logging

logger = getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Utilities


def convert_url_to_file_name(url: str) -> str:
    filename = url.replace("https://", "").replace("http://", "").replace("/", "_")
    return filename


def batch_items[T](
    items: list[T],
    max_item_count_per_batch: int,
) -> list[list[T]]:
    batches = [
        items[x_index : x_index + max_item_count_per_batch]
        for x_index in range(0, len(items), max_item_count_per_batch)
    ]
    return batches


## LLMs


async def call_structured_llm[T: BaseModel](
    system_prompt: str,
    output_model: type[T],
) -> T:
    system_message = SystemMessage(system_prompt)
    human_message = HumanMessage("Erledige die Aufgabe.")
    messages = [system_message, human_message]
    llm = ChatOpenAI(
        base_url="http://127.0.0.1:50025",
        api_key="litellm-api-key-1234",
        # model="bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0",
        model="gemini/gemini-2.5-pro",
        timeout=120,
        temperature=0.1,
    )
    structured_llm = llm.with_structured_output(
        output_model,
        method="json_schema",
    )
    llm_output = await structured_llm.ainvoke(messages)
    return llm_output


MAX_CONCURRENT_BATCH_COUNT = 3


async def call_structured_llm_batch[T: BaseModel](
    system_prompts: list[str],
    output_model: type[T],
) -> list[T]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_BATCH_COUNT)

    async def process_rate_limited(
        batch_index: int,
        system_prompt: str,
    ) -> T:
        async with semaphore:
            log_message = (
                f"Processing prompt ({batch_index + 1}/{len(system_prompts)})..."
            )
            logger.info(log_message)

            result = await call_structured_llm(system_prompt, output_model)

            log_message = f"Processed prompt ({batch_index + 1}/{len(system_prompts)})."
            logger.info(log_message)

            return result

    process_tasks = [
        process_rate_limited(x_index, x_system_prompt)
        for x_index, x_system_prompt in enumerate(system_prompts)
    ]

    all_results = await asyncio.gather(*process_tasks)
    return all_results


# Save state


class SaveStateDataPack(BaseModel):
    url: str = Field()  # The URL of the sample site
    raw_html: str = Field()  # The raw downloaded HTML
    cleaned_html: str = Field()  # Exclusion CSS selectors applied
    raw_markdown: str = Field()  # The raw HTML converted to Markdown
    cleaned_markdown: str = Field()  # The cleaned Markdown
    feedback: str = Field()  # AI generated feedback for iterations


class SaveState(BaseModel):
    versions_folder_path: Path = Field()
    cloneable_result_folder_path: Path = Field()
    data_packs: list[SaveStateDataPack] = Field()


def get_current_save_state_versions(folder_path: Path) -> list[str]:
    versions: list[str] = []

    for x_folder_path in folder_path.iterdir():
        if x_folder_path.is_dir() and x_folder_path.name.startswith("v_"):
            versions.append(x_folder_path.name)

    sorted_versions = sorted(versions)
    return sorted_versions


def get_latest_save_state_version(folder_path: Path) -> str:
    versions = get_current_save_state_versions(folder_path)
    if len(versions) == 0:
        return "v_000"
    latest_version = versions[-1]
    return latest_version


# AIGENERATED
def get_next_save_state_version(folder_path: Path) -> str:
    latest_version = get_latest_save_state_version(folder_path)
    if latest_version == "v_000" and not (folder_path / latest_version).exists():
        return "v_000"

    # Extract number from latest version (e.g., "v_000" -> 0)
    version_number = int(latest_version.split("_")[1])
    next_version_number = version_number + 1
    next_version = f"v_{next_version_number:03d}"
    return next_version


# AIGENERATED
def save_save_state(save_state: SaveState) -> None:
    # Create version folder
    next_version = get_next_save_state_version(save_state.versions_folder_path)
    version_folder_path = save_state.versions_folder_path / next_version
    version_folder_path.mkdir(exist_ok=True, parents=True)

    # Save the save state as JSON
    save_state_file_path = version_folder_path / "save_state.json"
    save_state_json = save_state.model_dump_json(indent=2)
    save_state_file_path.write_text(save_state_json)

    # Create results folder
    results_folder_path = version_folder_path / "results"
    results_folder_path.mkdir(exist_ok=True, parents=True)

    # Copy files from cloneable_result_folder_path to version's results folder for version history
    if save_state.cloneable_result_folder_path.exists():
        for x_item in save_state.cloneable_result_folder_path.iterdir():
            if x_item.is_file():
                shutil.copy2(x_item, results_folder_path / x_item.name)
            elif x_item.is_dir():
                shutil.copytree(
                    x_item, results_folder_path / x_item.name, dirs_exist_ok=True
                )

    # Save each data pack
    for x_index, x_data_pack in enumerate(save_state.data_packs):
        # Create sanitized folder name from URL using convert_url_to_file_name
        data_pack_folder_name = (
            f"{x_index:03d}_{convert_url_to_file_name(x_data_pack.url)}"
        )
        data_pack_folder_path = version_folder_path / data_pack_folder_name
        data_pack_folder_path.mkdir(exist_ok=True, parents=True)

        # Save URL
        url_file_path = data_pack_folder_path / "05_url.txt"
        url_file_path.write_text(x_data_pack.url)

        # Save raw HTML
        raw_html_file_path = data_pack_folder_path / "10_raw_html.html"
        raw_html_file_path.write_text(x_data_pack.raw_html)

        # Save cleaned HTML
        cleaned_html_file_path = data_pack_folder_path / "20_cleaned_html.html"
        cleaned_html_file_path.write_text(x_data_pack.cleaned_html)

        # Save raw markdown
        raw_markdown_file_path = data_pack_folder_path / "30_raw_markdown.md"
        raw_markdown_file_path.write_text(x_data_pack.raw_markdown)

        # Save cleaned markdown
        cleaned_markdown_file_path = data_pack_folder_path / "40_cleaned_markdown.md"
        cleaned_markdown_file_path.write_text(x_data_pack.cleaned_markdown)

        # Save feedback
        feedback_file_path = data_pack_folder_path / "50_feedback.txt"
        feedback_file_path.write_text(x_data_pack.feedback)


# AIGENERATED
def get_save_state(versions_folder_path: Path, version: str | None = None) -> SaveState:
    """
    Load a SaveState from a specific version or the latest version.

    Args:
        versions_folder_path: Path to the versions folder
        version: Version string (e.g., "v001") or None to get the latest version

    Returns:
        SaveState object loaded from the specified or latest version
    """
    if version is None:
        # Get the latest version by finding the highest version number
        version_folders = sorted(
            [
                d
                for d in versions_folder_path.iterdir()
                if d.is_dir() and d.name.startswith("v")
            ]
        )

        if not version_folders:
            raise ValueError(f"No version folders found in {versions_folder_path}")

        version_folder_path = version_folders[-1]
    else:
        version_folder_path = versions_folder_path / version

        if not version_folder_path.exists():
            raise ValueError(f"Version folder {version_folder_path} does not exist")

    # Load the save state JSON file
    save_state_file_path = version_folder_path / "save_state.json"

    if not save_state_file_path.exists():
        raise ValueError(f"Save state file not found at {save_state_file_path}")

    save_state_json = save_state_file_path.read_text(encoding="utf-8")
    save_state = SaveState.model_validate_json(save_state_json)

    return save_state


# Converters


def preclean_html(html: str) -> str:
    pass


def clean_html(html: str, exclusion_css_selectors: list[str]) -> str:
    pass


def convert_html_to_markdown(html: str) -> str:
    pass


def clean_markdown(markdown: str) -> str:
    pass


In [144]:
# Project

project: str = ""


def set_project(project_name) -> None:
    global project
    project = project_name


def get_project() -> str:
    global project
    return project


def get_project_dir_path() -> Path:
    global project
    project_dir_path = Path(os.getcwd(), "projects", project)
    project_dir_path.mkdir(exist_ok=True, parents=True)
    return project_dir_path


In [145]:
# Sitemap

## File helpers

### Sitemap URLS folder


def get_sitemap_urls_folder_path() -> Path:
    project_folder_path = get_project_dir_path()
    folder_path = project_folder_path / "05_sitemap_urls"
    folder_path.mkdir(exist_ok=True, parents=True)
    return folder_path


### Sitemap URLs file


def get_sitemap_urls_txt_file_path() -> Path:
    sitemap_urls_folder_path = get_sitemap_urls_folder_path()
    file_path = sitemap_urls_folder_path / "sitemap_urls.txt"
    return file_path


def save_sitemap_urls(urls: list[str]) -> None:
    urls_txt_file_path = get_sitemap_urls_txt_file_path()
    urls_text = "\n".join(urls)
    with open(urls_txt_file_path, "w") as file:
        file.write(urls_text)


def get_saved_sitemap_urls() -> list[str]:
    urls_txt_file_path = get_sitemap_urls_txt_file_path()
    with open(urls_txt_file_path, "r") as file:
        urls = [
            x_line.strip() for x_line in file.readlines() if len(x_line.strip()) > 0
        ]
    return urls


### Frequent Sitemap URLs file


def get_frequent_sitemap_urls_txt_file_path() -> Path:
    sitemap_urls_folder_path = get_sitemap_urls_folder_path()
    file_path = sitemap_urls_folder_path / "frequent_sitemap_urls.txt"
    return file_path


def save_frequent_sitemap_urls(frequent_urls_text: str) -> None:
    frequent_urls_txt_file_path = get_frequent_sitemap_urls_txt_file_path()
    with open(frequent_urls_txt_file_path, "w") as file:
        file.write(frequent_urls_text)


## Methods


def extract_sitemap_urls() -> None:
    urls_txt_file_path = get_sitemap_urls_txt_file_path()
    url_regex = r"https?://[^\s<>\"']+"
    with open(urls_txt_file_path, "r") as urls_txt_file:
        text = urls_txt_file.read()
    urls = re.findall(url_regex, text)
    save_sitemap_urls(urls)


# AIGENERATED
def extract_frequent_sitemap_urls(min_frequency) -> None:
    """This function extracts common URL paths or path segments that appear frequently across URLs."""
    urls = get_saved_sitemap_urls()

    # Extract path segments from each URL
    path_segments = []

    for x_url in urls:
        # Remove protocol and domain
        if "://" in x_url:
            url_without_protocol = x_url.split("://", 1)[1]
            if "/" in url_without_protocol:
                domain_and_path = url_without_protocol.split("/", 1)
                if len(domain_and_path) > 1:
                    path = domain_and_path[1]
                    # Build up all parent paths
                    parts = path.rstrip("/").split("/")
                    for i in range(1, len(parts) + 1):
                        segment = "/".join(parts[:i]) + "/"
                        path_segments.append(segment)

    # Count frequency of each path segment
    segment_counter = Counter(path_segments)

    # Filter segments by minimum frequency
    common_segments = {
        segment: count
        for segment, count in segment_counter.items()
        if count >= min_frequency
    }

    # Sort by frequency (descending) and then alphabetically
    sorted_segments = sorted(common_segments.items(), key=lambda x: (-x[1], x[0]))

    # Write to output file
    frequent_sitemap_urls_text = "\n".join(
        [f"{x_count}\t{x_segment}" for x_segment, x_count in sorted_segments]
    )
    save_frequent_sitemap_urls(frequent_sitemap_urls_text)

    print(
        f"Found {len(sorted_segments)} common URL areas with frequency >= {min_frequency}"
    )

In [146]:
GENERAL_SYSTEM_PROMPT = """<Generell>
- neuracrawl
    - neuracrawl ist ein Webcrawler, welcher eine Ausgangsdomain bekommt und von dort dann aus deepcrawlt, also sich durch alle Links der Seite hangelt und immer weiter nach neuen Links sucht.
    - Er ist sehr gut darin, eine einzige Webseite sehr ausführlich zu crawlen.
    - URL Ausschließungen
        - Dabei schließt neuracrawl aber auch bestimmte URL Gruppen/Subpfade aus.
        - Zum Beispiel, kann es sein, dass wir bei einer Webseite alle Veranstaltungen oder Newsartikel ausschließen wollen, da wir die zum Beispiel nochmal getrennt über eine API strukturiert auslesen.
    - Markdown Extraktion
        - Dabei extrahiert er extrem sauberes Markdown, ohne Header, Footer, Cookiebanner, Werbeinhalten, etc.
        - Die Daten am Ende enthalten nur den reinen Inhalt der Webseite.
        - Dabei geht er subtraktiv vor, also entfernt alle Elemente, welche "Verschmutzungen" darstellen.
        - Dies ist immer ein Spiel zwischen "wir wollen alles entfernen, was nicht wirklicher Inhalt ist" und "wir wollen nichts entfernen, was zum wirklichen Inhalt gehört".
        - Unser Grundsatz ist, dass wir so nah wie möglich an den wirklichen Inhalt rankommen wollen, ohne dabei aber Informationen zu verlieren. Wir dürfen auf keinen Fall echte Informationen verlieren, egal, wo diese auf der Webseite stehen.
    - neuracrawl benötigt generell die folgenden Einstellungen:
        - Ausgangsdomain und erlaubt andere Domains, auf welche er kommen und crawlen darf.
        - URL-Ausschließ-Regexes, welche bestimmte URL Gruppen/Subpfade ausschließen
            - Zum Beispiel : "^.*/(aktuelles|amtsblatt)/.*$" oder "^.*\\.(?:ics|pdf).*$"
        - CSS-Ausschließ-Selektoren, welche bestimmte HTML-Elemente auf allen Seiten ausschließen
            - Zum Beispiel : "header", ".front-left" oder "#cc-size"

- neuracrawl tuner ist eine Sammlung an Funktionen, welche dabei helfen, die perfekten Werte für die obigen Einstellungen zu finden.
</Generell>"""

In [None]:
# Interesting URLs

## Prompts

INTERESTING_URLS_EXTRACTION_BATCH_SIZE = 500

INTERESTING_URLS_EXTRACTION_COMMON_SYSTEM_PROMPT = """<Prozess>
- Du bist Teil des folgenden Prozesses:
    - Um die CSS-Ausschließ-Selektoren zu bestimmen, müssen einige Sample Seiten der Webseite analysiert werden und auf diesen dann die CSS-Selektoren angewendet werden um zu schauen, ob sie den gewünschten Effekt haben.
    - Dazu werden zu erst aus der Sitemap einer Webseite interessante, diverse URLs ausgewählt, welche das Sample Set darstellen.
    - Dabei sollten diese Seiten besonders repräsentativ für die gesamte Webseite sein. Z. B. einmal die Startseite, dann eine Veranstaltungsseite, eine Newsseite, eine Übersichtsseite, eine Archivseite, eine Kontaktseite, eine Impressumseite, etc.
    - Also ein Set an Seiten, bei dem wir auch unterschiedliche Inhalte und Layoutstrukturen erwarten.
    - Natürlich können wir das nicht genau wissen, da wir nur die URLs sehen und aus diesen einfach von außen auswählen müssen. Tortzdem lässt sich an den URLs und Pfadsegmenten schon sehr gut ablesen, welche Seiten unterschiedliche Inhalte enthalten sollten.
    - Da eine Webseite tausende Seiten enthalten kann, gehen wir hierbei in Batches vor. Zuerst extrahieren mehrere KI-Agenten aus jeweils 500 URLs ein Sample Set und begründen ihre Auswahlen. Es ist wichtig eine gute Begründung zu geben, damit der zusammenfassende KI-Agent die Gedanken hinter den Auswahlen besser versteht.
    - Dann nimmt ein zweiter KI-Agent die Batches und kombiniert diese zu einem finalen Sample Set, indem er versucht die besten URLs auszuwählen. Dabei versucht er auf maximal 20 URLs zu kommen.
    - Das Sample Set wird dann später heruntergeladen und vom Nutzer analysiert.
</Prozess>"""

INTERESTING_URLS_EXTRACTION_BATCH_SYSTEM_PROMPT = """<Aufgabe>
- Genauer gesagt, bist du der KI-Agent, welcher die Auswahl der URLs für das Sample Set vornimmt und dabei einen Batch von maximal 500 URLs bearbeitet. Du bist also nicht der, welcher am Ende die ganzen Batches zusammenfasst.
</Aufgabe>

<URLs>
- Hier sind die URLs, aus welchen du ein Sample Set auswählen sollst:
{urls_text}
</URLs>

<Zusatzanweisungen>
- Eventuell gibt der Nutzer die ein paar Zusatzanweisungen, um dich etwas mehr zu leiten. Die ursprüngliche Aufgabe bleibt, aber die Zusatzanweisungen können dir helfen, eine Auswahl zu treffen, welche mehr den Vorstellungen des Nutzers entspricht.
{custom_instructions}
</Zusatzanweisungen>"""

INTERESTING_URLS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT = """<Aufgabe>
- Genauer gesagt, bist du der KI-Agent, welche die ganzen Batches zu einem finalen Sample Set zusammenfasst.
</Aufgabe>

<Batch Sample Sets>
- Hier sind die Batches, welche du zusammenfassen sollst:
{batch_llm_outputs_text}
</Batch Sample Sets>

<Zusatzanweisungen>
- Eventuell gibt der Nutzer die ein paar Zusatzanweisungen, um dich etwas mehr zu leiten. Die ursprüngliche Aufgabe bleibt, aber die Zusatzanweisungen können dir helfen, eine Auswahl zu treffen, welche mehr den Vorstellungen des Nutzers entspricht.
{custom_instructions}
</Zusatzanweisungen>"""

## LLM Output models


class InterestingUrlsSingleResultlLlmOutput(BaseModel):
    url: str = Field()
    reason: str = Field()


class InterestingUrlsFullResultLlmOutput(BaseModel):
    url_infos: list[InterestingUrlsSingleResultlLlmOutput] = Field()


## System prompt methods


def generate_interesting_urls_batch_system_prompt(
    urls: list[str], custom_instructions: str
) -> str:
    parts: list[str] = []
    parts.append(GENERAL_SYSTEM_PROMPT)
    parts.append(INTERESTING_URLS_EXTRACTION_COMMON_SYSTEM_PROMPT)

    urls_text = "\n".join(
        [f"{x_index + 1}. {x_url}" for x_index, x_url in enumerate(urls)]
    )
    batch_system_prompt = INTERESTING_URLS_EXTRACTION_BATCH_SYSTEM_PROMPT.format(
        urls_text=urls_text, custom_instructions=custom_instructions
    )
    parts.append(batch_system_prompt)

    system_prompt = "\n\n".join(parts)
    return system_prompt


def generate_interesting_urls_summarizer_system_prompt(
    batch_full_result_llm_outputs: list[InterestingUrlsFullResultLlmOutput],
    custom_instructions: str,
) -> str:
    parts: list[str] = []
    parts.append(GENERAL_SYSTEM_PROMPT)
    parts.append(INTERESTING_URLS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT)

    batch_llm_outputs_text = "\n\n".join(
        [
            f"Batch {x_index + 1}:\n"
            + "\n".join(
                [
                    f"- {y_single_result_llm_output.url}\n  Reason: {y_single_result_llm_output.reason}"
                    for y_single_result_llm_output in x_batch_full_resultllm_output.url_infos
                ]
            )
            for x_index, x_batch_full_resultllm_output in enumerate(
                batch_full_result_llm_outputs
            )
        ]
    )
    system_prompt = INTERESTING_URLS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT.format(
        batch_llm_outputs_text=batch_llm_outputs_text,
        custom_instructions=custom_instructions,
    )
    return system_prompt


## File helpers

### Interesting URLs folder


def get_interesting_urls_folder_path() -> Path:
    project_folder_path = get_project_dir_path()
    file_path = project_folder_path / "10_interesting_urls"
    file_path.mkdir(exist_ok=True, parents=True)
    return file_path


### Results folder


def get_interesting_urls_results_folder_path() -> Path:
    interesting_urls_folder_path = get_interesting_urls_folder_path()
    results_folder_path = interesting_urls_folder_path / "results"
    results_folder_path.mkdir(exist_ok=True, parents=True)
    return results_folder_path


### Interesting URLs file


def save_interesting_urls(urls: list[str]) -> None:
    interesting_urls_results_folder_path = get_interesting_urls_results_folder_path()
    file_path = interesting_urls_results_folder_path / "interesting_urls.txt"
    urls_text = "\n".join(urls)
    with open(file_path, "w") as file:
        file.write(urls_text)


def get_interesting_urls() -> list[str]:
    interesting_urls_results_folder_path = get_interesting_urls_results_folder_path()
    interesting_urls_file_path = (
        interesting_urls_results_folder_path / "interesting_urls.txt"
    )
    with open(interesting_urls_file_path, "r") as file:
        urls = [
            x_line.strip() for x_line in file.readlines() if len(x_line.strip()) > 0
        ]
    return urls


### Interesting URLs reason file


def save_interesting_urls_reason(
    llm_output: InterestingUrlsFullResultLlmOutput,
) -> None:
    interesting_urls_results_folder_path = get_interesting_urls_results_folder_path()
    file_path = interesting_urls_results_folder_path / "interesting_urls_reasons.txt"
    reason = llm_output.model_dump_json(indent=2)
    with open(file_path, "w") as file:
        file.write(reason)


### Interesting URLs downloads folder


def get_interesting_urls_downloads_folder_path() -> Path:
    interesting_urls_folder_path = get_interesting_urls_folder_path()
    downloaded_folder_path = interesting_urls_folder_path / "downloads"
    downloaded_folder_path.mkdir(exist_ok=True)
    return downloaded_folder_path


### Interesting URLs downloads file


def save_downloaded_url(index: int, url: str, html_content: str) -> None:
    downloads_folder_path = get_interesting_urls_downloads_folder_path()
    index_file_name_part = f"{index:03d}"
    url_file_name_part = convert_url_to_file_name(url)
    full_file_name = f"{index_file_name_part}_{url_file_name_part}.html"
    file_path = downloads_folder_path / full_file_name
    with open(file_path) as file:
        file.write(html_content)


## Methods


async def download_interesting_urls() -> None:
    log_message = "Downloading interesting URLs..."
    logger.info(log_message)

    urls = get_interesting_urls()

    log_message = f"Found {len(urls)} URLs."
    logger.info(log_message)

    async with httpx.AsyncClient(timeout=60) as http_client:
        download_tasks = [http_client.get(x_url) for x_url in urls]
        responses = await asyncio.gather(*download_tasks)

    data_packs: list[SaveStateDataPack] = []

    for x_url, x_response in zip(urls, responses):
        soup = BeautifulSoup(x_response.text, "html.parser")
        prettified_html = soup.prettify()

        data_pack = SaveStateDataPack(
            url=x_url,
            raw_html=prettified_html,
            cleaned_html="",
            raw_markdown="",
            cleaned_markdown="",
            feedback="",
        )
        data_packs.append(data_pack)

    results_folder_path = get_interesting_urls_results_folder_path()
    downloads_folder_path = get_interesting_urls_downloads_folder_path()
    save_state = SaveState(
        versions_folder_path=downloads_folder_path,
        cloneable_result_folder_path=results_folder_path,
        data_packs=data_packs,
    )
    save_save_state(save_state)

    log_message = f"Downloaded {len(urls)} URLs."
    logger.info(log_message)


async def extract_interesting_urls(custom_instructions: str) -> None:
    log_message = "Extracting interesting URLs..."
    logger.info(log_message)

    urls = get_saved_sitemap_urls()

    log_message = f"Found {len(urls)} URLs."
    logger.info(log_message)

    url_batches = batch_items(urls, INTERESTING_URLS_EXTRACTION_BATCH_SIZE)
    batch_full_result_llm_outputs: list[InterestingUrlsFullResultLlmOutput] = []

    batch_system_prompts = [
        generate_interesting_urls_batch_system_prompt(x_url_batch, custom_instructions)
        for x_url_batch in url_batches
    ]
    batch_full_result_llm_outputs = await call_structured_llm_batch(
        batch_system_prompts,
        InterestingUrlsFullResultLlmOutput,
    )

    log_message = f"Summarizing {len(batch_full_result_llm_outputs)} batches..."
    logger.info(log_message)

    system_prompt = generate_interesting_urls_summarizer_system_prompt(
        batch_full_result_llm_outputs, custom_instructions
    )
    summarized_full_result_llm_output = await call_structured_llm(
        system_prompt,
        InterestingUrlsFullResultLlmOutput,
    )
    urls = [
        x_url_info.url for x_url_info in summarized_full_result_llm_output.url_infos
    ]

    log_message = f"Summarized {len(batch_full_result_llm_outputs)} batches and found {len(urls)} interesting URLs."
    logger.info(log_message)

    save_interesting_urls(urls)
    save_interesting_urls_reason(summarized_full_result_llm_output)

    log_message = f"Extracted {len(urls)} interesting URLs."
    logger.info(log_message)

    await download_interesting_urls()


In [None]:
# CSS selectors

## Prompts

CSS_SELECTORS_EXTRACTION_SYSTEM_PROMPT = """<Prozess>
- Du bist Teil des folgenden Prozesses:
    - Um die Ausschließ-CSS-Selektoren zu bestimmen, wird ein Sample Set an Unterseiten der Webseite analysiert.
    - Dabei werden die heruntergeladenen HTML-Seiten betrachtet und Ausschließ-CSS-Selektoren identifiziert, die "Verschmutzungen" entfernen (Header, Footer, Cookiebanner, Werbung, etc.).
    - Dabei soll alles entfernt werden, was nicht wirklichen Inhalt darstellt. Natürlich sollen Titel, Beschreibungen, Kontaktdaten, Öffnungszeiten, etc. alles bleiben.
    - Aber viel bei einer Webseite ist auch einfach um die eigentlichen Inhalte "drumherum", z. B. Nav, Footer, PopUps, Bedienungshilfen, Socialmedia Widgets, Breadcrumbs, etc. diese sollen alle entfernt werden.
    - Das Ziel ist es, CSS-Selektoren zu finden, welche auf allen Seiten anwendbar sind, da am Ende das gleiche Set für alle tausende Seiten der Webseite verwendet wird.
    - CSS-Selektoren sollten so generisch wie möglich gehalten werden und z. B. nicht auf bestimmte Titel auf bestimmten Seiten abzielen.
    - Es sollte auch immer das höchstmögliche Element targetiert werden, z. B. sollte natürlich nicht jeder Button einzelnd in einem Cookie-Banner entfernt werden, sondern direkt der ganze Banner oder wenn dieser im Footer ist, welcher auch weg soll, dann direkt der gesamte Footer. So minimieren wir die benötigte Anzahl an CSS-Selektoren.
    - Da mit etwa 20 Sample Seiten gearbeitet wird und das rohe HTML noch sehr lang ist, wird in Batches vorgegangen. Zuerst analysieren mehrere KI-Agenten immer 1 Seite und identifizieren perfekte CSS-Selektoren und begründen auch ihre Auswahl. Es ist wichtig eine gute Begründung zu geben, damit der zusammenfassende KI-Agent die Gedanken hinter den Auswahlen besser versteht. Zusätzlich wird zu jedem CSS-Selektor auch eine Beispiel-Zeilennummer aus dem HTML angegeben. Durch diese wird dann ein Beispiel-Code-Block aus dem HTML geschnitten und auch dem zusammenfassenden KI-Agenten gegeben, damit dieser die Entscheidungen besser nachvollziehen kann und auch beim Kombinieren noch selbst eine gute Entscheidungsgrundlage hat.
    - Dann nimmt ein zweiter KI-Agent die Batches und kombiniert diese zu einer finalen Liste an CSS-Selektoren, indem er versucht die CSS-Selektoren so generisch wie möglich zu kombinieren und versucht sicherzustellen, dass die CSS-Selektoren auf allen Seiten anwendbar sind. Dabei gibt es keine Limitierung für die Anzahl an CSS-Selektoren.
    - Diese Selektoren werden dann später in neuracrawl verwendet, um sauberes Markdown zu extrahieren.
</Prozess>"""

CSS_SELECTORS_EXTRACTION_BATCH_SYSTEM_PROMPT = """<Aufgabe>
- Genauer gesagt, bist du der KI-Agent, welcher die CSS-Selektoren für eine einzelne HTML-Seite identifiziert. Du bist also nicht der, welcher am Ende die ganzen Batches zusammenfasst.
</Aufgabe>

<HTML>
- Hier ist die HTML-Seite, welche du analysieren sollst (mit Zeilennummern):
{html_text}
</HTML>

<Zusatzanweisungen>
- Eventuell gibt der Nutzer dir ein paar Zusatzanweisungen, um dich etwas mehr zu leiten. Die ursprüngliche Aufgabe bleibt, aber die Zusatzanweisungen können dir helfen, eine Auswahl zu treffen, welche mehr den Vorstellungen des Nutzers entspricht.
{custom_instructions}
</Zusatzanweisungen>"""

CSS_SELECTORS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT = """<Aufgabe>
- Genauer gesagt, bist du der KI-Agent, welcher die ganzen Batches zu einer finalen Liste an CSS-Selektoren zusammenfasst.
</Aufgabe>

<Batch CSS Selectors>
- Hier sind die Batches, welche du zusammenfassen sollst:
{batch_llm_outputs_text}
</Batch CSS Selectors>

<Zusatzanweisungen>
- Eventuell gibt der Nutzer dir ein paar Zusatzanweisungen, um dich etwas mehr zu leiten. Die ursprüngliche Aufgabe bleibt, aber die Zusatzanweisungen können dir helfen, eine Auswahl zu treffen, welche mehr den Vorstellungen des Nutzers entspricht.
{custom_instructions}
</Zusatzanweisungen>"""

## LLM Output models


class CssSelectorSingleResultLlmOutput(BaseModel):
    css_selector: str = Field()
    reason: str = Field()
    example_line_number: int = Field()


class CssSelectorsFullResultLlmOutput(BaseModel):
    css_selector_infos: list[CssSelectorSingleResultLlmOutput] = Field()


## Models


class ExtendedCssSelectorSingleResult(CssSelectorSingleResultLlmOutput):
    example_html: str = Field()


class ExtendedCssSelectorsFullResult(BaseModel):
    url: str = Field()
    css_selector_infos: list[ExtendedCssSelectorSingleResult] = Field()


## System prompt methods


def generate_css_selectors_batch_system_prompt(
    html_content: str, custom_instructions: str
) -> str:
    parts: list[str] = []
    parts.append(GENERAL_SYSTEM_PROMPT)
    parts.append(CSS_SELECTORS_EXTRACTION_SYSTEM_PROMPT)

    # Add line numbers to HTML
    html_lines = html_content.split("\n")
    numbered_html_lines = [
        f"{x_index + 1:5d} | {x_line}" for x_index, x_line in enumerate(html_lines)
    ]
    html_text = "\n".join(numbered_html_lines)
    batch_system_prompt = CSS_SELECTORS_EXTRACTION_BATCH_SYSTEM_PROMPT.format(
        html_text=html_text, custom_instructions=custom_instructions
    )
    parts.append(batch_system_prompt)

    system_prompt = "\n\n".join(parts)
    return system_prompt


def generate_css_selectors_summarizer_system_prompt(
    batch_llm_outputs: list[ExtendedCssSelectorsFullResult],
    custom_instructions: str,
) -> str:
    parts: list[str] = []
    parts.append(GENERAL_SYSTEM_PROMPT)
    parts.append(CSS_SELECTORS_EXTRACTION_SYSTEM_PROMPT)

    batch_llm_outputs_text = "\n\n".join(
        [
            f"Batch {x_index + 1} (URL: {x_batch_llm_output.url}):\n"
            + "\n".join(
                [
                    f"- CSS Selector: {y_selector_output.css_selector}\n  Reason: {y_selector_output.reason}\n  Example HTML:\n{y_selector_output.example_html}"
                    for y_selector_output in x_batch_llm_output.css_selector_infos
                ]
            )
            for x_index, x_batch_llm_output in enumerate(batch_llm_outputs)
        ]
    )
    system_prompt = CSS_SELECTORS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT.format(
        batch_llm_outputs_text=batch_llm_outputs_text,
        custom_instructions=custom_instructions,
    )
    parts.append(system_prompt)

    system_prompt = "\n\n".join(parts)
    return system_prompt


## File helpers

### CSS selectors folder


def get_css_selectors_folder_path() -> Path:
    project_folder_path = get_project_dir_path()
    file_path = project_folder_path / "20_css_selectors"
    file_path.mkdir(exist_ok=True, parents=True)
    return file_path


### Results folder


def get_css_selectors_results_folder_path() -> Path:
    css_selectors_folder_path = get_css_selectors_folder_path()
    results_folder_path = css_selectors_folder_path / "results"
    results_folder_path.mkdir(exist_ok=True, parents=True)
    return results_folder_path


### CSS selectors file


def save_css_selectors(css_selectors: list[str]) -> None:
    css_selectors_results_folder_path = get_css_selectors_results_folder_path()
    file_path = css_selectors_results_folder_path / "css_selectors.txt"
    selectors_text = "\n".join(css_selectors)
    with open(file_path, "w") as file:
        file.write(selectors_text)


### CSS selectors reason file


def save_css_selectors_reason(
    llm_output: CssSelectorsFullResultLlmOutput,
) -> None:
    css_selectors_results_folder_path = get_css_selectors_results_folder_path()
    file_path = css_selectors_results_folder_path / "css_selectors_reasons.txt"
    reason = llm_output.model_dump_json(indent=2)
    with open(file_path, "w") as file:
        file.write(reason)


## Utilities


# AIGENERATED
def extract_example_html_lines(html_content: str, line_number: int) -> str:
    html_lines = html_content.split("\n")
    start_index = max(
        0, line_number - 4
    )  # -4 because line_number is 1-indexed and we want 3 lines before
    end_index = min(len(html_lines), line_number + 3)  # +3 to get 3 lines after

    example_lines = html_lines[start_index:end_index]
    numbered_example_lines = [
        f"{start_index + x_index + 1:5d} | {x_line}"
        for x_index, x_line in enumerate(example_lines)
    ]
    return "\n".join(numbered_example_lines)


def hydrate_full_results_to_extended_full_results(
    full_results: list[CssSelectorsFullResultLlmOutput],
    data_packs: list[SaveStateDataPack],
) -> list[ExtendedCssSelectorsFullResult]:
    extended_full_results: list[ExtendedCssSelectorsFullResult] = []

    for x_data_pack, x_llm_output in zip(data_packs, full_results):
        extended_single_results: list[ExtendedCssSelectorSingleResult] = []

        for x_css_selector_info in x_llm_output.css_selector_infos:
            example_html = extract_example_html_lines(
                x_data_pack.raw_html, x_css_selector_info.example_line_number
            )
            extended_single_result = ExtendedCssSelectorSingleResult(
                css_selector=x_css_selector_info.css_selector,
                reason=x_css_selector_info.reason,
                example_line_number=x_css_selector_info.example_line_number,
                example_html=example_html,
            )
            extended_single_results.append(extended_single_result)

        extended_full_result = ExtendedCssSelectorsFullResult(
            url=x_data_pack.url,
            css_selector_infos=extended_single_results,
        )
        extended_full_results.append(extended_full_result)

    return extended_full_results


## Methods


async def extract_css_selectors(
    custom_instructions: str, interesting_urls_save_state_version: str | None = None
) -> None:
    log_message = "Extracting CSS selectors..."
    logger.info(log_message)

    interesting_urls_downloads_folder_path = (
        get_interesting_urls_downloads_folder_path()
    )
    interesting_urls_save_state = get_save_state(
        interesting_urls_downloads_folder_path, interesting_urls_save_state_version
    )
    data_packs = interesting_urls_save_state.data_packs

    log_message = f"Found {len(data_packs)} pages."
    logger.info(log_message)

    batch_system_prompts = [
        generate_css_selectors_batch_system_prompt(
            x_data_pack.raw_html, custom_instructions
        )
        for x_data_pack in data_packs
    ]
    for x_prompt in batch_system_prompts:
        print(len(x_prompt))
    batch_full_results = await call_structured_llm_batch(
        batch_system_prompts,
        CssSelectorsFullResultLlmOutput,
    )
    extended_batch_full_results = hydrate_full_results_to_extended_full_results(
        batch_full_results, data_packs
    )

    log_message = f"Summarizing {len(extended_batch_full_results)} batches..."
    logger.info(log_message)

    system_prompt = generate_css_selectors_summarizer_system_prompt(
        extended_batch_full_results, custom_instructions
    )
    summarized_full_result = await call_structured_llm(
        system_prompt,
        CssSelectorsFullResultLlmOutput,
    )
    css_selectors = [
        x_selector_output.css_selector
        for x_selector_output in summarized_full_result.css_selector_infos
    ]

    log_message = f"Summarized {len(extended_batch_full_results)} batches and found {len(css_selectors)} CSS selectors."
    logger.info(log_message)

    save_css_selectors(css_selectors)
    save_css_selectors_reason(summarized_full_result)

    log_message = f"Extracted {len(css_selectors)} CSS selectors."
    logger.info(log_message)

In [149]:
raise Exception("Setup done.")

Exception: Setup done.

# neuracrawl Tuner


In [151]:
# Set your project name here

set_project("siegburg")

## Sitemap

In [152]:
extract_sitemap_urls()

In [153]:
extract_frequent_sitemap_urls(5)

Found 168 common URL areas with frequency >= 5


# Exclusion URL Regexes

In [None]:
await extract_url_regexes("")

## Interesting URLs

In [None]:
await extract_interesting_urls("", None)

INFO:__main__:Extracting interesting URLs...
INFO:__main__:Found 6883 URLs.
INFO:__main__:Processing prompt (1/14)...
INFO:__main__:Processing prompt (2/14)...
INFO:__main__:Processing prompt (3/14)...
INFO:__main__:Processing prompt (4/14)...
INFO:__main__:Processing prompt (5/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (4/14).
INFO:__main__:Processing prompt (6/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (5/14).
INFO:__main__:Processing prompt (7/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (1/14).
INFO:__main__:Processing prompt (8/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (2/14).
INFO:__main__:Processing prompt (9/14)...
INFO:httpx:HTTP Request: POST http://1

# CSS Selectors

In [163]:
await extract_css_selectors("", None)

INFO:__main__:Extracting CSS selectors...
INFO:__main__:Found 30 pages.
644374
441974
452629
440628
438164
496363
441218
469797
466240
427490
477674
479044
474692
457612
446291
450795
557665
533414
457424
458004
451562
442750
449574
444736
947116
5090
448890
525635
450429
467144
INFO:__main__:Processing prompt (1/30)...
INFO:__main__:Processing prompt (2/30)...
INFO:__main__:Processing prompt (3/30)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (2/30).
INFO:__main__:Processing prompt (4/30)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (1/30).
INFO:__main__:Processing prompt (5/30)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (3/30).
INFO:__main__:Processing prompt (6/30)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"

[] need funcs to create new project
[] viewer should use save_state.json, maybe only work with save_state.json instead of all the other files? then have a result object that gets plugged into the save_state.json that can include extra data?
- pre cleaning of images, scripts, links, styles etc
- then insta clean with css selectors and markdown conversion and markdown cleaning, all in here