In [119]:
# Common

import os
import re
import shutil
from collections import Counter
from pathlib import Path
from pydantic import BaseModel, Field
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
import logging
import sys
from logging import getLogger
import asyncio
import httpx
from bs4 import BeautifulSoup

# Logging

logger = getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Utilities


def convert_url_to_file_name(url: str) -> str:
    filename = url.replace("https://", "").replace("http://", "").replace("/", "_")
    return filename


def batch_items[T](
    items: list[T],
    max_item_count_per_batch: int,
) -> list[list[T]]:
    batches = [
        items[x_index : x_index + max_item_count_per_batch]
        for x_index in range(0, len(items), max_item_count_per_batch)
    ]
    return batches


## LLMs


async def call_structured_llm[T: BaseModel](
    system_prompt: str,
    output_model: type[T],
) -> T:
    system_message = SystemMessage(system_prompt)
    human_message = HumanMessage("Erledige die Aufgabe.")
    messages = [system_message, human_message]
    llm = ChatOpenAI(
        base_url="http://127.0.0.1:50025",
        api_key="litellm-api-key-1234",
        model="bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0",
        timeout=120,
        temperature=0.1,
    )
    structured_llm = llm.with_structured_output(
        output_model,
        method="json_schema",
    )
    llm_output = await structured_llm.ainvoke(messages)
    return llm_output


MAX_CONCURRENT_BATCH_COUNT = 5


async def call_structured_llm_batch[T: BaseModel](
    system_prompts: list[str],
    output_model: type[T],
) -> list[T]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_BATCH_COUNT)

    async def process_rate_limited(
        batch_index: int,
        system_prompt: str,
    ) -> T:
        async with semaphore:
            log_message = (
                f"Processing prompt ({batch_index + 1}/{len(system_prompts)})..."
            )
            logger.info(log_message)

            result = await call_structured_llm(system_prompt, output_model)

            log_message = f"Processed prompt ({batch_index + 1}/{len(system_prompts)})."
            logger.info(log_message)

            return result

    process_tasks = [
        process_rate_limited(x_index, x_system_prompt)
        for x_index, x_system_prompt in enumerate(system_prompts)
    ]

    all_results = await asyncio.gather(*process_tasks)
    return all_results


# App comaptible save state


class SaveStateDataPack(BaseModel):
    url: str = Field()
    raw_html: str = Field()
    cleaned_html: str = Field()
    raw_markdown: str = Field()
    cleaned_markdown: str = Field()
    feedback: str = Field()


class SaveState(BaseModel):
    versions_folder_path: Path = Field()
    cloneable_result_folder_path: Path = Field()
    data_packs: list[SaveStateDataPack] = Field()


# AIGENERATED
def get_current_versions(folder_path: Path) -> list[str]:
    """Get all existing version folders (e.g., ['v_000', 'v_001'])."""
    versions = []
    for item in folder_path.iterdir():
        if item.is_dir() and item.name.startswith("v_"):
            versions.append(item.name)

    return sorted(versions)


# AIGENERATED
def get_latest_version(folder_path: Path) -> str:
    """Get the latest version folder name."""
    versions = get_current_versions(folder_path)
    if not versions:
        return "v_000"
    return versions[-1]


# AIGENERATED
def get_next_version(folder_path: Path) -> str:
    """Get the next version folder name."""
    latest = get_latest_version(folder_path)
    if latest == "v_000" and not (folder_path / latest).exists():
        return "v_000"

    # Extract number from latest version (e.g., "v_000" -> 0)
    version_num = int(latest.split("_")[1])
    next_num = version_num + 1
    return f"v_{next_num:03d}"


# AIGENERATED
def save_save_state(save_state: SaveState) -> None:
    """Save the state to disk in the structured folder format."""
    # Create version folder
    version_folder_path = save_state.versions_folder_path / get_next_version(
        save_state.versions_folder_path
    )
    version_folder_path.mkdir(exist_ok=True, parents=True)

    # Create results folder
    results_folder_path = version_folder_path / "results"
    results_folder_path.mkdir(exist_ok=True, parents=True)

    # Copy files from cloneable_result_folder_path to version's results folder for version history
    if save_state.cloneable_result_folder_path.exists():
        for item in save_state.cloneable_result_folder_path.iterdir():
            if item.is_file():
                shutil.copy2(item, results_folder_path / item.name)
            elif item.is_dir():
                shutil.copytree(
                    item, results_folder_path / item.name, dirs_exist_ok=True
                )

    # Save each data pack
    for idx, data_pack in enumerate(save_state.data_packs):
        # Create sanitized folder name from URL using convert_url_to_file_name
        folder_name = f"{idx:03d}_{convert_url_to_file_name(data_pack.url)}"

        data_pack_folder_path = version_folder_path / folder_name
        data_pack_folder_path.mkdir(exist_ok=True, parents=True)

        # Save URL
        (data_pack_folder_path / "05_url.txt").write_text(
            data_pack.url, encoding="utf-8"
        )

        # Save raw HTML
        (data_pack_folder_path / "10_raw_html.html").write_text(
            data_pack.raw_html, encoding="utf-8"
        )

        # Save cleaned HTML
        (data_pack_folder_path / "20_cleaned_html.html").write_text(
            data_pack.cleaned_html, encoding="utf-8"
        )

        # Save raw markdown
        (data_pack_folder_path / "30_raw_markdown.md").write_text(
            data_pack.raw_markdown, encoding="utf-8"
        )

        # Save cleaned markdown
        (data_pack_folder_path / "40_cleaned_markdown.md").write_text(
            data_pack.cleaned_markdown, encoding="utf-8"
        )

        # Save feedback
        (data_pack_folder_path / "50_feedback.txt").write_text(
            data_pack.feedback, encoding="utf-8"
        )


In [120]:
# Project

project: str = ""


def set_project(project_name) -> None:
    global project
    project = project_name


def get_project() -> str:
    global project
    return project


def get_project_dir_path() -> Path:
    global project
    project_dir_path = Path(os.getcwd(), "projects", project)
    project_dir_path.mkdir(exist_ok=True, parents=True)
    return project_dir_path


In [121]:
# Sitemap

## Helpers

### Sitemap URLS folder


def get_sitemap_urls_folder_path() -> Path:
    project_folder_path = get_project_dir_path()
    folder_path = project_folder_path / "05_sitemap_urls"
    folder_path.mkdir(exist_ok=True, parents=True)
    return folder_path


### Sitemap URLs file


def get_sitemap_urls_txt_file_path() -> Path:
    sitemap_urls_folder_path = get_sitemap_urls_folder_path()
    file_path = sitemap_urls_folder_path / "sitemap_urls.txt"
    return file_path


def save_sitemap_urls(urls: list[str]) -> None:
    urls_txt_file_path = get_sitemap_urls_txt_file_path()
    urls_text = "\n".join(urls)
    with open(urls_txt_file_path, "w") as file:
        file.write(urls_text)


def get_saved_sitemap_urls() -> list[str]:
    urls_txt_file_path = get_sitemap_urls_txt_file_path()
    with open(urls_txt_file_path, "r") as file:
        urls = [
            x_line.strip() for x_line in file.readlines() if len(x_line.strip()) > 0
        ]
    return urls


### Frequent Sitemap URLs file


def get_frequent_sitemap_urls_txt_file_path() -> Path:
    sitemap_urls_folder_path = get_sitemap_urls_folder_path()
    file_path = sitemap_urls_folder_path / "frequent_sitemap_urls.txt"
    return file_path


def save_frequent_sitemap_urls(frequent_urls_text: str) -> None:
    frequent_urls_txt_file_path = get_frequent_sitemap_urls_txt_file_path()
    with open(frequent_urls_txt_file_path, "w") as file:
        file.write(frequent_urls_text)


## Methods


def extract_sitemap_urls() -> None:
    urls_txt_file_path = get_sitemap_urls_txt_file_path()
    url_regex = r"https?://[^\s<>\"']+"
    with open(urls_txt_file_path, "r") as urls_txt_file:
        text = urls_txt_file.read()
    urls = re.findall(url_regex, text)
    save_sitemap_urls(urls)


# AIGENERATED
def extract_frequent_sitemap_urls(min_frequency) -> None:
    """This function extracts common URL paths or path segments that appear frequently across URLs."""
    urls = get_saved_sitemap_urls()

    # Extract path segments from each URL
    path_segments = []

    for x_url in urls:
        # Remove protocol and domain
        if "://" in x_url:
            url_without_protocol = x_url.split("://", 1)[1]
            if "/" in url_without_protocol:
                domain_and_path = url_without_protocol.split("/", 1)
                if len(domain_and_path) > 1:
                    path = domain_and_path[1]
                    # Build up all parent paths
                    parts = path.rstrip("/").split("/")
                    for i in range(1, len(parts) + 1):
                        segment = "/".join(parts[:i]) + "/"
                        path_segments.append(segment)

    # Count frequency of each path segment
    segment_counter = Counter(path_segments)

    # Filter segments by minimum frequency
    common_segments = {
        segment: count
        for segment, count in segment_counter.items()
        if count >= min_frequency
    }

    # Sort by frequency (descending) and then alphabetically
    sorted_segments = sorted(common_segments.items(), key=lambda x: (-x[1], x[0]))

    # Write to output file
    frequent_sitemap_urls_text = "\n".join(
        [f"{x_count}\t{x_segment}" for x_segment, x_count in sorted_segments]
    )
    save_frequent_sitemap_urls(frequent_sitemap_urls_text)

    print(
        f"Found {len(sorted_segments)} common URL areas with frequency >= {min_frequency}"
    )

In [122]:
GENERAL_SYSTEM_PROMPT = """<Generell>
- neuracrawl
    - neuracrawl ist ein Webcrawler, welcher eine Ausgangsdomain bekommt und von dort dann aus deepcrawlt, also sich durch alle Links der Seite hangelt und immer weiter nach neuen Links sucht.
    - Er ist sehr gut darin, eine einzige Webseite sehr ausführlich zu crawlen.
    - URL Ausschließungen
        - Dabei schließt neuracrawl aber auch bestimmte URL Gruppen/Subpfade aus.
        - Zum Beispiel, kann es sein, dass wir bei einer Webseite alle Veranstaltungen oder Newsartikel ausschließen wollen, da wir die zum Beispiel nochmal getrennt über eine API strukturiert auslesen.
    - Markdown Extraktion
        - Dabei extrahiert er extrem sauberes Markdown, ohne Header, Footer, Cookiebanner, Werbeinhalten, etc.
        - Die Daten am Ende enthalten nur den reinen Inhalt der Webseite.
        - Dabei geht er subtraktiv vor, also entfernt alle Elemente, welche "Verschmutzungen" darstellen.
        - Dies ist immer ein Spiel zwischen "wir wollen alles entfernen, was nicht wirklicher Inhalt ist" und "wir wollen nichts entfernen, was zum wirklichen Inhalt gehört".
        - Unser Grundsatz ist, dass wir so nah wie möglich an den wirklichen Inhalt rankommen wollen, ohne dabei aber Informationen zu verlieren. Wir dürfen auf keinen Fall echte Informationen verlieren, egal, wo diese auf der Webseite stehen.
    - neuracrawl benötigt generell die folgenden Einstellungen:
        - Ausgangsdomain und erlaubt andere Domains, auf welche er kommen und crawlen darf.
        - URL-Ausschließ-Regexes, welche bestimmte URL Gruppen/Subpfade ausschließen
            - Zum Beispiel : "^.*/(aktuelles|amtsblatt)/.*$" oder "^.*\\.(?:ics|pdf).*$"
        - CSS-Ausschließ-Selektoren, welche bestimmte HTML-Elemente auf allen Seiten ausschließen
            - Zum Beispiel : "header", ".front-left" oder "#cc-size"

- neuracrawl tuner ist eine Sammlung an Funktionen, welche dabei helfen, die perfekten Werte für die obigen Einstellungen zu finden.
</Generell>"""

In [None]:
# Interesting URLs

## Prompts

INTERESTING_URLS_EXTRACTION_BATCH_SIZE = 500

INTERESTING_URLS_EXTRACTION_COMMON_SYSTEM_PROMPT = """<Prozess>
- Du bist Teil des folgenden Prozesses:
    - Um die CSS-Ausschließ-Selektoren zu bestimmen, müssen einige Sample Seiten der Webseite analysiert werden und auf diesen dann die CSS-Selektoren angewendet werden um zu schauen, ob sie den gewünschten Effekt haben.
    - Dazu werden zu erst aus der Sitemap einer Webseite interessante, diverse URLs ausgewählt, welche das Sample Set darstellen.
    - Dabei sollten diese Seiten besonders repräsentativ für die gesamte Webseite sein. Z. B. einmal die Startseite, dann eine Veranstaltungsseite, eine Newsseite, eine Übersichtsseite, eine Archivseite, eine Kontaktseite, eine Impressumseite, etc.
    - Also ein Set an Seiten, bei dem wir auch unterschiedliche Inhalte und Layoutstrukturen erwarten.
    - Natürlich können wir das nicht genau wissen, da wir nur die URLs sehen und aus diesen einfach von außen auswählen müssen. Tortzdem lässt sich an den URLs und Pfadsegmenten schon sehr gut ablesen, welche Seiten unterschiedliche Inhalte enthalten sollten.
    - Da eine Webseite tausende Seiten enthalten kann, gehen wir hierbei in Batches vor. Zuerst extrahieren KI-Agenten aus jeweils 500 URLs ein Sample Set und begründen ihre Auswahlen.
    - Dann nimmt ein zweiter KI-Agent die Batches und kombiniert diese zu einem finalen Sample Set.
    - Das Sample Set wird dann später heruntergeladen und vom Nutzer analysiert.
</Prozess>"""

INTERESTING_URLS_EXTRACTION_BATCH_SYSTEM_PROMPT = """<Aufgabe>
- Genauer gesagt, bist du der KI-Agent, welcher die Auswahl der URLs für das Sample Set vornimmt und dabei einen Batch von maximal 500 URLs bearbeitet. Du bist also nicht der, welcher am Ende die ganzen Batches zusammenfasst.
</Aufgabe>

<URLs>
- Hier sind die URLs, aus welchen du ein Sample Set auswählen sollst:
{urls_text}
</URLs>

<Zusatzanweisungen>
- Eventuell gibt der Nutzer die ein paar Zusatzanweisungen, um dich etwas mehr zu leiten. Die ursprüngliche Aufgabe bleibt, aber die Zusatzanweisungen können dir helfen, eine Auswahl zu treffen, welche mehr den Vorstellungen des Nutzers entspricht.
{custom_instructions}
</Zusatzanweisungen>"""

INTERESTING_URLS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT = """<Aufgabe>
- Genauer gesagt, bist du der KI-Agent, welche die ganzen Batches zu einem finalen Sample Set zusammenfasst.
- Versuche maximal 20 URLs auszuwählen.
</Aufgabe>

<Batch Sample Sets>
- Hier sind die Batches, welche du zusammenfassen sollst:
{batch_llm_outputs_text}
</Batch Sample Sets>

<Zusatzanweisungen>
- Eventuell gibt der Nutzer die ein paar Zusatzanweisungen, um dich etwas mehr zu leiten. Die ursprüngliche Aufgabe bleibt, aber die Zusatzanweisungen können dir helfen, eine Auswahl zu treffen, welche mehr den Vorstellungen des Nutzers entspricht.
{custom_instructions}
</Zusatzanweisungen>"""

## LLM Output models


class InterestingUrlsExtractorUrlLlmOutput(BaseModel):
    url: str = Field()
    reason: str = Field()


class InterestingUrlsExtractorLlmOutput(BaseModel):
    urls: list[InterestingUrlsExtractorUrlLlmOutput] = Field()


## System prompt methods


def generate_interesting_urls_batch_system_prompt(
    urls: list[str], custom_instructions: str
) -> str:
    parts: list[str] = []
    parts.append(GENERAL_SYSTEM_PROMPT)
    parts.append(INTERESTING_URLS_EXTRACTION_COMMON_SYSTEM_PROMPT)

    urls_text = "\n".join(
        [f"{x_index + 1}. {x_url}" for x_index, x_url in enumerate(urls)]
    )
    batch_system_prompt = INTERESTING_URLS_EXTRACTION_BATCH_SYSTEM_PROMPT.format(
        urls_text=urls_text, custom_instructions=custom_instructions
    )
    parts.append(batch_system_prompt)

    system_prompt = "\n\n".join(parts)
    return system_prompt


def generate_interesting_urls_summarizer_system_prompt(
    batch_llm_outputs: list[InterestingUrlsExtractorLlmOutput],
    custom_instructions: str,
) -> str:
    parts: list[str] = []
    parts.append(GENERAL_SYSTEM_PROMPT)
    parts.append(INTERESTING_URLS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT)

    batch_llm_outputs_text = "\n\n".join(
        [
            f"Batch {x_index + 1}:\n"
            + "\n".join(
                [
                    f"- {url_output.url}\n  Reason: {url_output.reason}"
                    for url_output in x_batch_llm_output.urls
                ]
            )
            for x_index, x_batch_llm_output in enumerate(batch_llm_outputs)
        ]
    )
    system_prompt = INTERESTING_URLS_EXTRACTION_SUMMARIZER_SYSTEM_PROMPT.format(
        batch_llm_outputs_text=batch_llm_outputs_text,
        custom_instructions=custom_instructions,
    )
    return system_prompt


## Helpers

### Interesting URLs folder


def get_interesting_urls_folder_path() -> Path:
    project_folder_path = get_project_dir_path()
    file_path = project_folder_path / "10_interesting_urls"
    file_path.mkdir(exist_ok=True, parents=True)
    return file_path


### Results folder


def get_interesting_urls_results_folder_path() -> Path:
    interesting_urls_folder_path = get_interesting_urls_folder_path()
    results_folder_path = interesting_urls_folder_path / "results"
    results_folder_path.mkdir(exist_ok=True, parents=True)
    return results_folder_path


### Interesting URLs file


def save_interesting_urls(urls: list[str]) -> None:
    interesting_urls_results_folder_path = get_interesting_urls_results_folder_path()
    file_path = interesting_urls_results_folder_path / "interesting_urls.txt"
    urls_text = "\n".join(urls)
    with open(file_path, "w") as file:
        file.write(urls_text)


def get_interesting_urls() -> list[str]:
    interesting_urls_results_folder_path = get_interesting_urls_results_folder_path()
    interesting_urls_file_path = (
        interesting_urls_results_folder_path / "interesting_urls.txt"
    )
    with open(interesting_urls_file_path, "r") as file:
        urls = [
            x_line.strip() for x_line in file.readlines() if len(x_line.strip()) > 0
        ]
    return urls


def save_interesting_urls_reason(
    llm_output: InterestingUrlsExtractorLlmOutput,
) -> None:
    interesting_urls_results_folder_path = get_interesting_urls_results_folder_path()
    file_path = interesting_urls_results_folder_path / "interesting_urls_reasons.txt"
    reason = llm_output.model_dump_json(indent=2)
    with open(file_path, "w") as file:
        file.write(reason)


### Interesting URLs downloads folder


def get_interesting_urls_downloads_folder_path() -> Path:
    interesting_urls_folder_path = get_interesting_urls_folder_path()
    downloaded_folder_path = interesting_urls_folder_path / "downloads"
    downloaded_folder_path.mkdir(exist_ok=True)
    return downloaded_folder_path


### Interesting URLs downloads file


def save_downloaded_url(index: int, url: str, html_content: str) -> None:
    downloads_folder_path = get_interesting_urls_downloads_folder_path()
    index_file_name_part = f"{index:03d}"
    url_file_name_part = convert_url_to_file_name(url)
    full_file_name = f"{index_file_name_part}_{url_file_name_part}.html"
    file_path = downloads_folder_path / full_file_name
    with open(file_path) as file:
        file.write(html_content)


## Methods


async def download_interesting_urls() -> None:
    log_message = "Downloading interesting URLs..."
    logger.info(log_message)

    urls = get_interesting_urls()

    log_message = f"Found {len(urls)} URLs."
    logger.info(log_message)

    async with httpx.AsyncClient(timeout=60) as http_client:
        download_tasks = [http_client.get(x_url) for x_url in urls]
        responses = await asyncio.gather(*download_tasks)

    data_packs: list[SaveStateDataPack] = []

    for x_url, x_response in zip(urls, responses):
        soup = BeautifulSoup(x_response.text, "html.parser")
        prettified_html = soup.prettify()

        data_pack = SaveStateDataPack(
            url=x_url,
            raw_html=prettified_html,
            cleaned_html="",
            raw_markdown="",
            cleaned_markdown="",
            feedback="",
        )
        data_packs.append(data_pack)

    results_folder_path = get_interesting_urls_results_folder_path()
    downloads_folder_path = get_interesting_urls_downloads_folder_path()
    save_state = SaveState(
        versions_folder_path=downloads_folder_path,
        cloneable_result_folder_path=results_folder_path,
        data_packs=data_packs,
    )
    save_save_state(save_state)

    log_message = f"Downloaded {len(urls)} URLs."
    logger.info(log_message)


async def extract_interesting_urls(custom_instructions: str) -> None:
    log_message = "Extracting interesting URLs..."
    logger.info(log_message)

    urls = get_saved_sitemap_urls()

    log_message = f"Found {len(urls)} URLs."
    logger.info(log_message)

    url_batches = batch_items(urls, INTERESTING_URLS_EXTRACTION_BATCH_SIZE)
    batch_llm_outputs: list[InterestingUrlsExtractorLlmOutput] = []

    batch_system_prompts = [
        generate_interesting_urls_batch_system_prompt(x_url_batch, custom_instructions)
        for x_url_batch in url_batches
    ]

    batch_llm_outputs = await call_structured_llm_batch(
        batch_system_prompts,
        InterestingUrlsExtractorLlmOutput,
    )

    log_message = f"Summarizing {len(batch_llm_outputs)} batches..."
    logger.info(log_message)

    system_prompt = generate_interesting_urls_summarizer_system_prompt(
        batch_llm_outputs, custom_instructions
    )
    llm_output = await call_structured_llm(
        system_prompt,
        InterestingUrlsExtractorLlmOutput,
    )
    urls = [x_url_output.url for x_url_output in llm_output.urls]

    log_message = f"Summarized {len(batch_llm_outputs)} batches and found {len(urls)} interesting URLs."
    logger.info(log_message)

    save_interesting_urls(urls)
    save_interesting_urls_reason(llm_output)

    log_message = f"Extracted {len(urls)} interesting URLs."
    logger.info(log_message)

    await download_interesting_urls()


In [124]:
raise Exception("Setup done.")

Exception: Setup done.

# neuracrawl Tuner


In [125]:
# Set your project name here

set_project("siegburg")

## Sitemap

In [126]:
extract_sitemap_urls()

In [127]:
extract_frequent_sitemap_urls(5)

Found 168 common URL areas with frequency >= 5


## Interesting URLs

In [128]:
await extract_interesting_urls("")

INFO:__main__:Extracting interesting URLs...
INFO:__main__:Found 6883 URLs.
INFO:__main__:Processing prompt (1/14)...
INFO:__main__:Processing prompt (2/14)...
INFO:__main__:Processing prompt (3/14)...
INFO:__main__:Processing prompt (4/14)...
INFO:__main__:Processing prompt (5/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (4/14).
INFO:__main__:Processing prompt (6/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (2/14).
INFO:__main__:Processing prompt (7/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (3/14).
INFO:__main__:Processing prompt (8/14)...
INFO:httpx:HTTP Request: POST http://127.0.0.1:50025/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processed prompt (5/14).
INFO:__main__:Processing prompt (9/14)...
INFO:httpx:HTTP Request: POST http://1

## Exclusion CSS Selectors

In [None]:
extract_exclusion_css_selectors("custom instructions", based on what version?)
# insta apply
# insta html to markdown
# insta markdown clean
# insta autofeedback, per document give markdown, html, cleaned html
# insta file with css selectors

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1106298633.py, line 1)

my viewer needs to be ok with urls file, and a folder with one folder with subfolders that are numbers, then /markdown.md, cleaned-html, raw-html, url.txt with one line beeing the url, + one file feeedback.txt. feedback txt should also be editable inside the swift app.

need funcs to create new project