In [52]:
## Export to other training material or knowledge bases
## The code below exports the getting-started guide as a single concatenated markdown file, referencing all images from one flat folder.
## Some component and layout replacements are specific to ELIXIR's FAIR Cookbook (https://faircookbook.elixir-europe.org/)

import re

# Source documents (MD/MDX) of the getting-started guide, as paths relative to
# the repository root. The list order is the chapter order of the export.
files = [
    f"src/content/docs/start-here/{page}"
    for page in (
        "index.mdx",
        "create-arc-scaffold.mdx",
        "investigation.mdx",
        "study.mdx",
        "check-point1.mdx",
        "assays.mdx",
        "share.mdx",
        "check-point2.mdx",
        "sops.mdx",
        "data-analysis/index.mdx",
        "data-analysis/option1-virtual-assay.mdx",
        "data-analysis/option2-cwl.mdx",
        "datamap.mdx",
        "check-point3.mdx",
        "validate-arc.mdx",
        "publish-arc.mdx",
        "check-point4.mdx",
    )
]

# HTML wrapper that replaces the <ViolaSays> component: an avatar image followed
# by a speech-bubble <span>. Both values are raw strings, so the `\n` / `\t`
# sequences stay literal backslash escapes here and are only expanded when the
# strings are used as `re.sub` replacement templates.
viola_opening = (
    r'<p style="display:inline-flex; align-items:center; margin:0.5em 0;">\n'
    r'\t<img src="https://raw.githubusercontent.com/nfdi4plants/nfdi4plants.knowledgebase/'
    r'5c0b130090e1b8ad4ecf60b168ba6070aeca0d7f/src/assets/images/viola/viola-avatar.jpg"'
    r' alt="Viola" style="width:80px; height:80px; border-radius:50%; margin-right:0.5em;">\n'
    r'\t<span style="background:#f1f1f1; padding:0.5em 0.8em; border-radius:1em;">\n'
)
viola_closing = r'\t</span>\n</p>\n\n'


# Replacements

# Ordered (regex pattern, replacement template) pairs. They are applied one
# after another, so order matters: the specific `:::tip` / `:::note` rules must
# run before the generic `:::` rule, and later rules also see text produced by
# earlier ones. The output uses directive syntax ({admonition}, {note},
# {dropdown}) -- presumably MyST markdown for the FAIR Cookbook build; confirm.
replacements = [
    # Drop every line starting with "import " together with the line breaks after it.
    (r"^import .*[\r\n]*", ""),  
    # Container asides -> fenced directives.
    (r":::tip", r"```{admonition} Tip\n:class: tip "),
    (r":::note", r"```{note}"),
    # Any remaining `:::` (e.g. the closing marker) becomes a plain fence.
    (r":::", "```"),
    
    # Known <Card ...> openers, matched literally (attribute order included),
    # become four-backtick directives -- presumably so that three-backtick
    # blocks can be nested inside them.
    (r'<Card icon="pen" title="ARCitect">', "````{dropdown} ARCitect\n:open:"),
    (r'<Card title="ARCitect installation" icon="setting">', "````{dropdown} ARCitect installation\n:open:"),
    (r'<Card title="DataPLANT Account" icon="setting">', "````{dropdown} DataPLANT Account\n:open:"),
    (r'<Card icon="gitlab" title="PLANTDataHUB">', "````{dropdown} PLANTDataHUB\n:open:"),
    (r'<Card title="Your ARC\'s name" icon="approve-check" >', r"````{note}"),
    
    # Closing tag of any card -> closing four-backtick fence.
    (r'</Card>.*[\r]*', "````"),
    # <Steps> wrappers are removed entirely, including their line break.
    (r'<Steps>.*[\r\n]*', ""),
    (r'</Steps>.*[\r\n]*', ""),
    # <ViolaSays> blocks are wrapped in the avatar / speech-bubble HTML.
    (r'<ViolaSays>.*[\r\n]*', viola_opening),
    (r'</ViolaSays>.*[\r]*', viola_closing),
    # Collapse blank line(s) directly before a closing </span>.
    (r"\n\s*\n(?=\s*</span>)", "\n"),
    # Remove leading indentation in front of numbered list items ("1.", "2.", ...).
    (r"^[ \t]+(?=\d+\.)", "",),
    # Remove leading indentation in front of code fences.
    (r"^[ \t]+(?=```)", ""),
    # Point remaining image-alias paths at the flat ./assets/ folder.
    (r"@images/start-here/", "./assets/"),
    
]



def extract_title_and_content(content):
    """Strip the YAML frontmatter from a document and extract its title.

    Args:
        content: Full text of an MD/MDX file, optionally starting with a
            ``---`` delimited frontmatter block.

    Returns:
        A ``(title, body)`` tuple. ``title`` is the value of the frontmatter
        ``title:`` key with one pair of surrounding quotes removed, or
        ``"Untitled"`` when there is no frontmatter, no ``title:`` key, or the
        value is empty. ``body`` is the text after the frontmatter with
        leading/trailing whitespace stripped.
    """
    title = "Untitled"
    # Accept LF and CRLF line endings, trailing blanks after the closing
    # fence, and a closing fence that is the last line of the file.
    yaml_match = re.match(
        r"^---\r?\n(.*?)\r?\n---[ \t]*(?:\r?\n|$)", content, re.DOTALL
    )
    if yaml_match:
        content = content[yaml_match.end():]
        # `[ \t]*` instead of `\s*`: an empty `title:` must not swallow the
        # newline and pick up the following key as the title.
        title_match = re.search(
            r"^title:[ \t]*(.+)$", yaml_match.group(1), re.MULTILINE
        )
        if title_match:
            value = title_match.group(1).strip()
            # YAML allows quoted scalars; drop one matching pair of quotes so
            # the heading does not show them.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1].strip()
            if value:
                title = value
    return title, content.strip()

def substitute_image_imports(text):
    """Inline imported image variables into the ``<img>`` tags that use them.

    Collects every ``import Name from "@images/..."`` line, then rewrites each
    ``<img ... src={Name.src} ...>`` so that ``src`` is a quoted path. The
    ``@images/start-here/`` prefix is mapped to ``./assets/``; other
    ``@images/`` paths are inserted unchanged. The import lines themselves are
    left in the text.
    """
    import_re = re.compile(
        r'^import\s+(\w+)\s+from\s+[\'"](@images/[^\'"]+)[\'"]',
        re.MULTILINE,
    )
    # alias -> path to place in the src attribute (a later import of the same
    # alias overrides an earlier one).
    asset_by_alias = {
        found.group(1): found.group(2).replace("@images/start-here/", "./assets/")
        for found in import_re.finditer(text)
    }

    for alias, asset in asset_by_alias.items():
        # Attributes before and after `src` are captured and written back as-is.
        tag_re = re.compile(
            rf'<img\s+([^>]*?)src=\{{\s*{alias}\.src\s*\}}([^>]*?)>'
        )
        text = tag_re.sub(rf'<img \1src="{asset}"\2>', text)
    return text

def apply_replacements(text, replacements):
    """Apply ``(pattern, replacement)`` pairs to *text*, one after another.

    Every pattern is used with ``re.MULTILINE`` so that ``^`` and ``$`` anchor
    at each line. The pairs run in list order, and each one operates on the
    output of the previous one.
    """
    result = text
    for rule in replacements:
        regex, substitute = rule
        result = re.compile(regex, re.MULTILINE).sub(substitute, result)
    return result

def substitute_file_tree(text):
    """Convert ``<FileTree>...</FileTree>`` blocks into fenced text trees.

    The first entry of a block becomes the root line. Every following entry is
    listed as a direct child (``├──``, with ``└──`` for the last one). Nesting
    in the source list is not preserved: all entries end up on one level.

    A block with a single entry renders just that entry, and an empty block
    renders an empty fence, rather than duplicating the root or raising
    ``IndexError``.

    Args:
        text: Document text that may contain any number of FileTree blocks.

    Returns:
        The text with each FileTree block replaced by a ``` fenced tree.
    """
    def repl(match):
        # Remove indentation (spaces or tabs) and the list dash from each
        # line; lines with nothing left (blank or a bare "-") are skipped.
        cleaned = (
            line.strip().strip(" \t-") for line in match.group(1).splitlines()
        )
        names = [name for name in cleaned if name]

        tree = ["```"]
        if names:
            root, *children = names
            tree.append(root)
            tree.extend(f"├── {child}" for child in children[:-1])
            # Only emit the closing branch when there is a child to show.
            if children:
                tree.append(f"└── {children[-1]}")
        tree.append("```")
        return "\n".join(tree)

    return re.sub(r"<FileTree>(.*?)</FileTree>", repl, text, flags=re.DOTALL)

def concatenate_files(files, replacements):
    """Read, convert and join the given documents into one markdown string.

    Each file is read as UTF-8, its frontmatter title becomes a level-1
    heading, and its body goes through the conversion steps below. The
    resulting sections are joined with a blank line between them, in the
    order of *files*.
    """
    sections = []
    for path in files:
        with open(path, "r", encoding="utf-8") as handle:
            raw = handle.read()

        title, body = extract_title_and_content(raw)

        # Image variables are inlined first, while the import lines are still
        # present; the regex replacements run last.
        for transform in (substitute_image_imports, substitute_file_tree):
            body = transform(body)
        body = apply_replacements(body, replacements)

        sections.append("# " + title + "\n\n" + body)
    return "\n\n".join(sections)

if __name__ == "__main__":
    # Build the complete guide in memory first; the output file is opened only
    # after every source file has been read and converted.
    combined = concatenate_files(files, replacements)
    # "combined.md" is resolved against the current working directory and is
    # overwritten if it already exists.
    with open("combined.md", "w", encoding="utf-8") as out:
        out.write(combined)