In [53]:
import os
import sys
from pathlib import Path

from marko import Markdown
from marko.md_renderer import MarkdownRenderer
from marko.inline import RawText, Link, CodeSpan
from marko.block import FencedCode, CodeBlock

import re

# Make the helper modules under ./resources importable (needed for `utils`).
p = os.getcwd()+'/resources/'
# Guard against stacking duplicate entries when this cell is re-run.
if p not in sys.path:
    sys.path.append(p)

from utils import *


In [54]:
# # save mn5 version of config
# save_mn5_config()

In [57]:
# add links to subfolders for the analysis and processing folders
def get_branch_url():
    """
    Build the GitHub URL of the branch that is currently checked out.

    The branch name is the whitespace-stripped output of
    ``git branch --show-current`` (executed through ``run_cmd``). The base
    repository URL is read from the ``'gh_url'`` entry of the mapping
    returned by ``load_resources()``.

    Returns
    -------
    str
        URL of the form ``<gh_url>tree/<branch_name>/``.

    Notes
    -----
    - ``gh_url`` is concatenated as-is, so it presumably already ends with
      a ``/`` (verify against the resources file).
    - Example output for the ``main`` branch:
      ``https://github.com/user/repo/tree/main/``.
    """
    resources = load_resources()
    branch_name = run_cmd('git branch --show-current').strip()
    return f"{resources['gh_url']}tree/{branch_name}/"

def find_missing_subdirs(d):
    """
    Identify subdirectories of `d` that are not referenced in its README.

    Every immediate subdirectory of `d` is looked up in ``<d>/README.md`` as
    the literal text ``[<name>]``. Subdirectories on the per-parent ignore
    list are skipped. Anything that is not found is returned.

    Parameters
    ----------
    d : str
        Name of the parent directory to scan (e.g. 'processing' or
        'analysis'), relative to the current working directory.

    Returns
    -------
    list of pathlib.Path
        Subdirectory paths that are missing from the README. Empty when every
        subdirectory is listed or when `d` has no eligible subdirectories.

    Raises
    ------
    FileNotFoundError
        If there is at least one eligible subdirectory but ``<d>/README.md``
        does not exist.

    Notes
    -----
    - Ignored for 'processing': 'rules', 'template_snakemake',
      '.ipynb_checkpoints'. Ignored for 'analysis': '.ipynb_checkpoints'.
      Any other parent has no ignore list.
    - Only immediate subdirectories of `d` are checked, not nested ones.
    - NOTE(review): names are compared via ``Path.stem``, so a directory
      such as 'v1.2' is looked up as '[v1]'. This matches how
      ``add_missing_subdirs_to_readme`` writes its entries; change both
      together if dotted directory names need to be supported.
    """
    # directories that are never expected to show up in the README
    ignored_by_parent = {
        'processing': ['rules', 'template_snakemake', '.ipynb_checkpoints'],
        'analysis': ['.ipynb_checkpoints'],
    }
    perm_dirs = ignored_by_parent.get(d, [])

    readme = f'{d}/README.md'

    # Depending on the Python version, glob('*/') may also yield plain files,
    # hence the explicit is_dir() filter.
    candidates = [
        sub_d for sub_d in Path(f'{d}/').glob('*/')
        if sub_d.stem not in perm_dirs and sub_d.is_dir()
    ]
    if not candidates:
        # nothing to look up, so the README is not needed at all
        return []

    # Read the README once (and close it) instead of once per subdirectory.
    with open(readme) as infile:
        readme_lines = infile.readlines()

    return [
        sub_d for sub_d in candidates
        if not any(f'[{sub_d.stem}]' in line for line in readme_lines)
    ]

def add_missing_subdirs_to_readme(d, missing_dirs):
    """
    Add bullet entries for `missing_dirs` to ``<d>/README.md``.

    One bullet per entry of `missing_dirs` is written, each on its own line.
    When the README contains the section '## Subfolder descriptions', the
    bullets are placed directly after the existing bullet list of that
    section (or at the end of the section if it has no bullets yet).
    When the section header is absent, the bullets are appended at the end
    of the file.

    Parameters
    ----------
    d : str
        Top-level directory whose README is updated (e.g. 'processing' or
        'analysis'), relative to the current working directory.
    missing_dirs : list of pathlib.Path
        Subdirectory paths that should be added to the README.

    Returns
    -------
    None
        The README is modified in place and a summary is printed.

    Raises
    ------
    FileNotFoundError
        If `missing_dirs` is non-empty and ``<d>/README.md`` does not exist.

    Notes
    -----
    - Each bullet is formatted as a relative link:
        * [<subdirectory_name>](<subdirectory_name>/): # TODO!!
    - All other content of the README is preserved.
    - With an empty `missing_dirs` the README is left untouched.
    """
    header = "## Subfolder descriptions"
    readme = f'{d}/README.md'

    # Nothing to add: do not rewrite the file (rewriting used to add a blank
    # line to the section on every run).
    if not missing_dirs:
        return

    with open(readme, 'r') as infile:
        lines = infile.readlines()

    # Every bullet carries its own newline so several entries never end up
    # glued together on one line.
    new_bullets = [
        f"* [{sub_d.stem}]({sub_d.stem}/): # TODO!! \n" for sub_d in missing_dirs
    ]

    insert_at = _find_bullet_insert_index(lines, header)

    # The line before the insertion point may be the last line of a file that
    # does not end with a newline.
    if insert_at > 0 and not lines[insert_at - 1].endswith('\n'):
        lines[insert_at - 1] += '\n'

    # Keep a blank line between the list and whatever follows it directly.
    if insert_at < len(lines) and lines[insert_at].strip():
        new_bullets.append('\n')

    lines[insert_at:insert_at] = new_bullets

    # Write back updated README
    with open(readme, 'w') as outfile:
        outfile.writelines(lines)

    # tell the user where README entries have been written
    print(f"Added README entries to {readme} for ")
    for sub_d in missing_dirs:
        print(f'- {sub_d.stem}')
    print()


def _find_bullet_insert_index(lines, header):
    """Return the index in `lines` at which new bullets must be inserted."""
    header_idx = next(
        (i for i, line in enumerate(lines) if header in line), None
    )
    if header_idx is None:
        # no such section: fall back to the end of the file
        return len(lines)

    # the section ends at the next level-2 header or at the end of the file
    section_end = next(
        (i for i in range(header_idx + 1, len(lines))
         if lines[i].startswith("## ")),
        len(lines),
    )

    bullet_idxs = [
        i for i in range(header_idx + 1, section_end) if "* [" in lines[i]
    ]
    if not bullet_idxs:
        return section_end

    # step over continuation lines that still belong to the last bullet
    insert_at = bullet_idxs[-1] + 1
    while insert_at < section_end and lines[insert_at].strip():
        insert_at += 1
    return insert_at
        
# collects every missing dir so a note can eventually be shown to the user
all_missing_dirs = []

# run the same find-then-append steps for each documented top-level folder
for d in ('processing', 'analysis'):
    missing_dirs = find_missing_subdirs(d)
    add_missing_subdirs_to_readme(d, missing_dirs)
    all_missing_dirs.extend(missing_dirs)

In [146]:
# add links to all relevant unlinked files and directories in READMEs
# anchored "normal" pattern
_pat_normal = re.compile(
    r'^([\W\s]*?)'              # leading non-word/space
    r'((?:\.\./)*\.?[\w./-]+?)' # the path-ish thing (lazy)
    r'([^\w./-]*)$'             # trailing punctuation
)

# fallback: find candidate path-like substrings
_candidate_re = re.compile(r'(?:\.\./)*\.?[\w./-]+')

_PUNCT_TO_TRIM = set('.,:;)]}\'"')

def _trim_trailing_punct(core: str):
    trailing = []
    while core and core[-1] in _PUNCT_TO_TRIM:
        trailing.append(core[-1])
        core = core[:-1]
    return core, ''.join(reversed(trailing))

def extract_parts(s: str):
    """
    Extract (leading, core_path, trailing) from a token string.
    Returns None if no path-like core is found.
    """
    m = _pat_normal.match(s)
    if m:
        p1, core, p2 = m.groups()
        core, extra = _trim_trailing_punct(core)
        p2 = extra + p2
        return p1, core, p2

    candidates = list(_candidate_re.finditer(s))
    if not candidates:
        return None

    best = max(candidates, key=lambda mm: mm.end() - mm.start())
    core = best.group(0)
    start, end = best.start(), best.end()
    core_trimmed, extra = _trim_trailing_punct(core)
    leading = s[:start]
    trailing = extra + s[end:]
    return leading, core_trimmed, trailing


def format_link(word):
    """
    Return a markdown link whose text and target are both `word`.

    If `word` is an existing directory (resolved by ``Path(word).is_dir()``),
    the link text is plain; otherwise the link text is wrapped in backticks.
    """
    label = word if Path(word).is_dir() else f'`{word}`'
    return f'[{label}]({word})'


# ---- main transformer ----

def link_files(node, files):
    """
    Walk a parsed Markdown tree and link unlinked file/directory names.

    Text nodes (``RawText``) are split into whitespace and non-whitespace
    tokens. A token whose path-like core (see ``extract_parts``) is contained
    in `files` is rewritten to the markdown link produced by ``format_link``.
    Code blocks, inline code spans and existing links are left untouched.

    Parameters
    ----------
    node : marko element
        Node to transform (typically the parsed document).
    files : collection of str
        Known file/directory paths to link. Only ``in`` is used, so a set
        gives the fastest lookups.

    Returns
    -------
    node or list
        For a ``RawText`` node, a list of new ``RawText`` nodes (one per
        token) that replaces it. For every other node, the node itself,
        whose ``children`` list is updated in place.

    Notes
    -----
    - The link is inserted as markdown source text inside a ``RawText``
      node, not as a ``Link`` node.
    """
    # leave code (blocks and inline spans) and existing links untouched
    if isinstance(node, (FencedCode, CodeBlock, CodeSpan, Link)):
        return node

    if isinstance(node, RawText):
        new_nodes = []
        for tok in re.findall(r'\S+|\s+', node.children):
            text = tok
            if not tok.isspace():
                parts = extract_parts(tok)
                if parts:
                    p1, core, p2 = parts
                    if core in files:
                        text = p1 + format_link(core) + p2
            new_nodes.append(RawText(text))
        return new_nodes

    # Recurse only into real child collections. Like RawText, other node
    # types may keep a plain string in `children`; iterating such a string
    # would replace it with a list of single characters.
    children = getattr(node, "children", None)
    if children is None or isinstance(children, str):
        return node

    new_children = []
    for child in children:
        replaced = link_files(child, files)
        if isinstance(replaced, list):
            new_children.extend(replaced)
        else:
            new_children.append(replaced)
    node.children = new_children

    return node



In [155]:
# Collect every git-tracked file once, plus the immediate parent directory of
# each file ('.' stands for the repository root and is not a link target).
cmd = 'git ls-files'
tracked_files = run_cmd(cmd).splitlines()
parent_dirs = {str(Path(f).parent) for f in tracked_files} - {'.'}

# Markdown files to rewrite: tracked files only, never directories.
md_files = sorted(f for f in set(tracked_files) if f.endswith('.md'))

# Everything that may be linked: files, directories, and the trailing-"/"
# spelling of each directory.
files = sorted(set(tracked_files) | parent_dirs)
files += [f'{f}/' for f in files if Path(f).is_dir()]

# NOTE(review): format_link() decides directory vs. file with
# Path(word).is_dir(), i.e. relative to the current working directory, while
# the paths below are relative to each markdown file's folder. For markdown
# files outside the repository root a directory may therefore be formatted
# like a file -- confirm whether this matters.
for md_file in md_files:
    md_dir = Path(md_file).parent

    # paths as they have to be written from inside md_file's directory
    rel_files = set()
    for file in files:
        rel_path = os.path.relpath(file, md_dir)
        if file.endswith('/'):
            # relpath() normalises the trailing slash away; restore it so the
            # "dir/" spelling can still be matched (skip the folder itself).
            if rel_path == '.':
                continue
            rel_path += '/'
        rel_files.add(rel_path)

    md = Markdown()
    with open(md_file, 'r') as f:
        content = f.read()
    doc = link_files(md.parse(content), rel_files)

    md = Markdown(renderer=MarkdownRenderer)
    rendered = md.render(doc)

    # write back in place, but only when something actually changed
    if rendered != content:
        with open(md_file, 'w') as ofile:
            ofile.write(rendered)

In [None]:
# TODO: add a GitHub Action that checks the READMEs for broken links