In [76]:
import os
import sys
from pathlib import Path

# Append resources dir to path
p = os.path.dirname(os.getcwd())+'/resources/'
sys.path.append(p)

from utils import *


In [77]:
# # save mn5 version of config
# save_mn5_config()

In [91]:
def get_branch_url():
    """
    Construct the GitHub URL for the current branch of the repository.

    This function determines the name of the current Git branch using
    ``git branch --show-current`` and constructs a URL pointing to the
    root of that branch on GitHub. The base GitHub repository URL must
    be provided by ``load_resources()`` as ``m['gh_url']``.

    Returns
    -------
    str
        A URL string pointing to the root of the current branch on GitHub,
        formatted as: ``<gh_url>tree/<branch_name>/``.

    Notes
    -----
    - Example output for the ``main`` branch:
      ``https://github.com/user/repo/tree/main/``.
    """
    m = load_resources()
    cmd = 'git branch --show-current'
    b = run_cmd(cmd)
    b = b.strip()
    branch_url = f"{m['gh_url']}tree/{b}/"
    return branch_url

def find_missing_subdirs(d):
    """
    Identify subdirectories within a given directory that are not referenced in its README.

    This function checks each subdirectory of the specified directory `d` and compares
    its name against entries in the corresponding `README.md` file. Certain permanent
    directories (e.g., 'rules', 'template_snakemake') are ignored. Subdirectories not
    found in the README are collected and returned.

    Parameters
    ----------
    d : str
        Name of the parent directory to scan. Typically 'processing'.

    Returns
    -------
    list of pathlib.Path
        List of subdirectory paths that are missing from the README. If all subdirectories
        are listed in the README, the list will be empty.

    Notes
    -----
    - Permanent directories defined for 'processing' are ['rules', 'template_snakemake'].
    - The function assumes the README is located at '../<d>/README.md'.
    - Only immediate subdirectories of `d` are checked, not nested ones.
    """

    to_update = []

    if d == 'processing':
        perm_dirs = ['rules', 'template_snakemake', '.ipynb_checkpoints']
    elif d == 'analysis':
        perm_dirs = ['.ipynb_checkpoints']
    else: perm_dirs = []

    readme = f'../{d}/README.md'

    # loop through each valid subdir
    for sub_d in Path(f'../{d}/').glob('*/'):
        stem_sub_d = sub_d.stem
        if stem_sub_d in perm_dirs: continue
        if not sub_d.is_dir(): continue

        fmt_sub_d = f'[{stem_sub_d}]'
        if any(fmt_sub_d in line for line in open(readme)): continue
        to_update.append(sub_d)
        
    return to_update

def add_missing_subdirs_to_readme(d, missing_dirs):
    
    """
    Append missing subdirectory entries to the appropriate section of a README.

    This function reads the README file for a given top-level directory (`d`) and
    appends bullet points for any subdirectories listed in `missing_dirs` that
    are not already present. The new bullets are inserted just before the next
    section header in the README.

    Parameters
    ----------
    d : str
        Top-level directory name. Currently, only 'processing' and 'analysis'
        are supported.
    missing_dirs : list of pathlib.Path
        List of subdirectory paths that should be added to the README as bullet points.

    Notes
    -----
    - The function assumes the README is located at '../<d>/README.md'.
    - For 'processing', bullets are added under the section titled
      '## Subfolder descriptions'.
    - Each bullet is formatted as:
        * [<subdirectory_name>](<repo_url>/<subdirectory_path>/): # TODO!!
    - The variable `m['gh_url']` must be defined externally to provide the repository URL.
    - The function preserves all other content and headers in the README.

    """
    
    # load resources to get GH URL
    m = load_resources()   
    inserted = False
    
    
    # if d == 'processing':
    header = "## Subfolder descriptions"
    # elif d == 'analysis':
        # raise ValueError('You havent made this yet')
    
    # Read the README
    readme = f'../{d}/README.md'
    with open(readme, 'r') as infile:
        lines = infile.readlines()

    output_lines = []
    inside_section = False

    for i, line in enumerate(lines):
        output_lines.append(line)

        # find first relevant bullet
        if header in line:
            inside_section = True
            continue

        # Detect end of bullet list (next header)
        if inside_section:
            if line.startswith("## "):
                # insert new bullets just before the break
                last_bullet_idx = max(i for i, line in enumerate(output_lines) if "* ["  in line.strip())
                for i, sub_d in enumerate(missing_dirs):
                    stem_sub_d = sub_d.stem
                    output_lines.insert(last_bullet_idx+i+1, f"* [{stem_sub_d}]({m['gh_url']}/{sub_d}/): # TODO!! \n")
                output_lines.insert(-2, '\n')
                inside_section = False
                inserted = True
    
    # if we had to wait for end of file
    if inserted == False:
        for i, sub_d in enumerate(missing_dirs):
            stem_sub_d = sub_d.stem
            output_lines.append(f"* [{stem_sub_d}]({stem_sub_d}/): # TODO!! ")
            output_lines.append('\n')

    # Write back updated README
    with open(readme, 'w') as outfile:
        outfile.writelines(output_lines)
                     
    # write to user where README entries have been written
    if len(missing_dirs) > 0:
        print(f"Added README entries to {readme.split('../')[1]} for ")
        for sub_d in missing_dirs:
            print(f'- {sub_d.stem}')
        print()

In [92]:
# all missing dirs to output note to user eventually
all_missing_dirs = []

# processing
d = 'processing'
missing_dirs = find_missing_subdirs(d)
add_missing_subdirs_to_readme(d, missing_dirs)
all_missing_dirs += missing_dirs

# analysis
d = 'analysis'
missing_dirs = find_missing_subdirs(d)
add_missing_subdirs_to_readme(d, missing_dirs)
all_missing_dirs += missing_dirs

In [84]:
print(get_branch_url())
print(load_resources()['gh_url'])

http://github.com/pclavell/project_template/tree/gh_actions/
http://github.com/pclavell/project_template/


In [40]:
# replace all README links using the GH url for correct branch, if neccessary
for readme in Path(f'../').rglob('README.md'):
       
    # repo_path = Path(str(readme).split('../')[1]).parents[0]
    repo_path = readme.parents[0]
    print(repo_path)
    
    files = [str(f).split('../')[1] for f in repo_path.rglob('*')]
    files = sorted(files, key=lambda f: len(Path(f).parts), reverse=True)
    print(files)
    
    # parse readme to see if any of these files are mentioned here
    with open(readme, 'r') as infile:
    #     for line in infile:
            
        
    break

## Going to try a different strategy 250910

In [98]:
# get all git files
cmd = 'git ls-files'
files = run_cmd(cmd).splitlines()
dirs = [str(Path(f).parent) for f in files]
files = [f for f in list(set(files)|set(dirs)) if f != '.']
files

['config.yml',
 'save_mn5_config.py',
 'utils.r',
 'utils.py',
 'resources.yml',
 'TEST.md',
 'git_push_companion.ipynb',
 'smk_utils.py',
 'init_gh_url.py']

In [119]:
# TODO 
files = ['analysis', 'analysis/template.R']

In [120]:
# markdown parsing one word at a time
from marko import Markdown
from marko.ast_renderer import ASTRenderer
md_files = ['TEST.md']
# md_files = ['TEST_2.md']



In [127]:
from pathlib import Path
from marko import Markdown
from marko.md_renderer import MarkdownRenderer
from marko.inline import RawText, Link, CodeSpan
from marko.block import FencedCode, CodeBlock

def format_link(word):
    """
    Format link in markdown depending on if it's a 
    directory or a file
    """
    if Path(word).is_dir():
        l = f'[{word}]({word})'
    else:
        l = f'[`{word}`]({word})'
    return l

def link_files(node, files):
    """

    """
    
    # skip entire code blocks
    if isinstance(node, (FencedCode, CodeBlock)):
        return node

    # skip inline code spans
    if isinstance(node, CodeSpan):
        return node
    
    # skip existing links
    if isinstance(node, Link):
        return node


    # If raw text, split and replace words
    if isinstance(node, RawText):
        words = []
        for word in node.children.split():
            add_period = False
            if word.endswith('.'):
                add_period = True
                word = word[:-1]
            if word in files:
                words.append(format_link(word))
            else:
                words.append(word)
            if add_period == True:
                words[-1] += '.'
                add_period = False
            
        words = ' '.join(words)
        print(words)
        md = Markdown()
        words = md.parse(words)
        print(words)
        
        return words

    # If it has children, recurse
    if hasattr(node, "children"):
        new_children = []
        for child in node.children:
            replaced = link_files(child, files)
            if isinstance(replaced, list):
                new_children.extend(replaced)
            else:
                new_children.append(replaced)
        node.children = new_children
    return node

md = Markdown()
with open(md_files[0], 'r') as f:
    content = f.read()
    doc = md.parse(content)
    
print(doc)

doc = link_files(doc, files)
    
md = Markdown(renderer=MarkdownRenderer)
doc = md.render(doc)

# Write to new md file
with open('TEST_revised.md', 'w') as ofile:
    ofile.write(doc)

<Document children=[<Paragraph children=[<RawText children='We want to look in the analysis directory.'>]>,
 <BlankLine children=[]>,
 <Paragraph children=[<RawText children='To see how to run R code, take a look at analysis/template.R.'>]>,
 <BlankLine children=[]>,
 <Paragraph children=[<RawText children='Hello, '>,
 <Link children=[<RawText children='This is a link'>]>]>,
 <BlankLine children=[]>,
 <FencedCode children=[<RawText children='this is code analysis\nthis is some more code analysis\n'>]>,
 <BlankLine children=[]>,
 <HTMLBlock children=[]>,
 <BlankLine children=[]>,
 <List children=[<ListItem children=[<Paragraph children=[<RawText children='This is a bullet point'>]>]>,
 <ListItem children=[<List children=[<ListItem children=[<Paragraph children=[<RawText children='this is a nested bullet point'>]>]>]>]>,
 <ListItem children=[<List children=[<ListItem children=[<Paragraph children=[<RawText children='this is a nested bullet point analysis.'>]>]>]>]>]>]>
We want to look in

In [None]:
# (ast['children'][0]['children'][0]).get('element')
# ast['children'][0]['children'][0]

{'element': 'raw_text',
 'children': 'We want to look in the analysis directory.',
 'escape': True}

## testing gh workflow 250909

In [74]:
from pathlib import Path
import re

files = [
    'analysis/template.R',
    'analysis'
]

files = [Path(f) for f in files]
repo_path = Path('.')  # adjust if needed

# Convert to relative POSIX paths
files_rel = [f.relative_to(repo_path).as_posix() for f in files]

# Build a separate step for each file
steps_yaml = ""

for f in files_rel:
    escaped_file = re.escape(f)
    f = Path(f)
    
    if f.is_dir():
        regex = rf"(^|\W)({escaped_file})(\W|$)"
        replacement = rf"\1[{f}]({get_branch_url()}{f})\3"
    else:
        regex = rf"(^|\W)({escaped_file})(\W|$)"
        replacement = rf"\1[`{f}`]({get_branch_url()}{f})\3"


#     if f.is_dir():
#         regex = rf"\b{escaped_file}\b(?!/)"
        
#         replacement = f"[{f}]({get_branch_url()}{f})"
#     else:
#         regex = rf"\b{escaped_file}\b"
#         replacement = f"[`{f}`]({get_branch_url()}{f})"


    step = f"""
      - name: Replace references to {f}
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: '{regex}'
          replace: '{replacement}'
          include: 'template_user/resources/TEST.md'
          regex: true
    """
    steps_yaml += step

# Full GitHub Action YAML
yaml_snippet = f"""
name: Link Files in README

on:
  push:
    paths:
      - 'template_user/resources/TEST.md'  # Change to '**/*.md' after testing

jobs:
  link-files:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
{steps_yaml}
"""

print(yaml_snippet)

# Save workflow
workflow_dir = Path('../../.github/workflows')  # relative to resources/
workflow_dir.mkdir(parents=True, exist_ok=True)

workflow_file = workflow_dir / 'link-files-test.yml'
workflow_file.write_text(yaml_snippet)

workflow_file



name: Link Files in README

on:
  push:
    paths:
      - 'template_user/resources/TEST.md'  # Change to '**/*.md' after testing

jobs:
  link-files:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Replace references to analysis/template.R
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: '(^|\W)(analysis/template\.R)(\W|$)'
          replace: '\1[`analysis/template.R`](http://github.com/pclavell/project_template/tree/gh_actions/analysis/template.R)\3'
          include: 'template_user/resources/TEST.md'
          regex: true
    
      - name: Replace references to analysis
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: '(^|\W)(analysis)(\W|$)'
          replace: '\1[`analysis`](http://github.com/pclavell/project_template/tree/gh_actions/analysis)\3'
          include: 'template_user/resources/TEST.md'
          regex: true
    



PosixPath('../../.github/workflows/link-files-test.yml')

In [75]:
from pathlib import Path
import re
import uuid

# -------------------------
# User inputs / setup
# -------------------------
files = [
    'analysis/template.R',
    'analysis'
]

repo_path = Path('.')  # adjust if needed
target_md = Path('template_user/resources/TEST.md')
workflow_dir = Path('../../.github/workflows')  # relative to resources/
workflow_file = workflow_dir / 'link-files-test.yml'

# assume get_branch_url() is defined elsewhere as you requested
# def get_branch_url(): return "https://github.com/me/repo/blob/main/"

# -------------------------
# Helper: YAML single-quote escape
# -------------------------
def yaml_single_quote(s: str) -> str:
    # YAML single-quoted string: single quotes are escaped by doubling them.
    return "'" + s.replace("'", "''") + "'"

# -------------------------
# Read current file and extract existing inline markdown links
# (we'll mask these and restore later)
# -------------------------
content = ''
if target_md.exists():
    content = target_md.read_text(encoding='utf-8')

# find inline markdown links like [text](url)
link_pattern = re.compile(r'\[[^\]]+\]\([^\)]+\)')
existing_links = [m.group(0) for m in link_pattern.finditer(content)]

# Build mapping original -> placeholder (unique)
masked_links = []
for i, orig in enumerate(existing_links, start=1):
    placeholder = f"__MDLINK_{i}__"
    masked_links.append((orig, placeholder))

# -------------------------
# Prepare file list (relative POSIX), sort by descending length to avoid recursion
# -------------------------
files = [Path(f) for f in files]
files_rel = [f.relative_to(repo_path).as_posix() for f in files]
# sort by length (longest first) to prevent recursive partial replacements
files_sorted = sorted(files_rel, key=lambda s: len(s), reverse=True)

# -------------------------
# Helper to create a unique placeholder for each file replacement
# -------------------------
def make_file_placeholder(i: int) -> str:
    # include short uuid so placeholders are extremely unlikely to collide with real text
    return f"__FILE_REPL_{i}_{uuid.uuid4().hex[:8].upper()}__"

# -------------------------
# Build workflow steps YAML
# -------------------------
steps_yaml = ""

# 1) checkout step (keep this first)
steps_yaml += """      - uses: actions/checkout@v4
"""

# 2) mask existing markdown links (literal replace, regex: false)
for orig, placeholder in masked_links:
    steps_yaml += f"""      - name: Mask existing MD link -> {placeholder}
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: {yaml_single_quote(orig)}
          replace: {yaml_single_quote(placeholder)}
          include: '{target_md.as_posix()}'
          regex: false
"""

# 3) replace each file path with a unique placeholder (regex-based find; replacement literal placeholder)
file_placeholders = []  # list of tuples (file_rel, placeholder, is_dir_bool)
for i, f_rel in enumerate(files_sorted, start=1):
    p = Path(f_rel)
    escaped_file = re.escape(f_rel)           # escape for regex
    # pattern: use \b boundary on both sides (RE2/Go accepts \b). This is simple and avoids lookarounds.
    pattern = rf"\b{escaped_file}\b"
    placeholder = make_file_placeholder(i)
    file_placeholders.append((f_rel, placeholder, p.is_dir()))

    steps_yaml += f"""      - name: Replace occurrences of {f_rel} with placeholder
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: {yaml_single_quote(pattern)}
          replace: {yaml_single_quote(placeholder)}
          include: '{target_md.as_posix()}'
          regex: true
"""

# 4) restore placeholders -> real Markdown links (literal replacements, regex: false)
#    (We use get_branch_url() here as you asked; do not redefine it.)
for f_rel, placeholder, is_dir in file_placeholders:
    # Build target replacement Markdown
    if is_dir:
        replacement = f"[{f_rel}]({get_branch_url()}{f_rel})"
    else:
        replacement = f"[`{f_rel}`]({get_branch_url()}{f_rel})"

    steps_yaml += f"""      - name: Restore placeholder {placeholder} -> link for {f_rel}
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: {yaml_single_quote(placeholder)}
          replace: {yaml_single_quote(replacement)}
          include: '{target_md.as_posix()}'
          regex: false
"""

# 5) finally, restore any original markdown links we masked earlier
for orig, placeholder in masked_links:
    steps_yaml += f"""      - name: Restore original MD link {placeholder} -> original
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: {yaml_single_quote(placeholder)}
          replace: {yaml_single_quote(orig)}
          include: '{target_md.as_posix()}'
          regex: false
"""

# -------------------------
# Wrap into a full workflow YAML
# -------------------------
yaml_snippet = f"""
name: Link Files in README

on:
  push:
    paths:
      - '{target_md.as_posix()}'  # Change to '**/*.md' after testing

jobs:
  link-files:
    runs-on: ubuntu-latest
    steps:
{steps_yaml}
"""

# ensure workflow dir exists and write
workflow_dir.mkdir(parents=True, exist_ok=True)
workflow_file.write_text(yaml_snippet, encoding='utf-8')

print(f"Wrote workflow to: {workflow_file.resolve()}")
print("Workflow preview:\n")
print(yaml_snippet)


Wrote workflow to: /Users/fairliereese/Documents/programming/mele_lab/projects/project_template/.github/workflows/link-files-test.yml
Workflow preview:


name: Link Files in README

on:
  push:
    paths:
      - 'template_user/resources/TEST.md'  # Change to '**/*.md' after testing

jobs:
  link-files:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Replace occurrences of analysis/template.R with placeholder
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: '\banalysis/template\.R\b'
          replace: '__FILE_REPL_1_1819D1B9__'
          include: 'template_user/resources/TEST.md'
          regex: true
      - name: Replace occurrences of analysis with placeholder
        uses: jacobtomlinson/gha-find-replace@v3
        with:
          find: '\banalysis\b'
          replace: '__FILE_REPL_2_889E0153__'
          include: 'template_user/resources/TEST.md'
          regex: true
      - name: Restore placeholder __FILE_REP

In [42]:
# # Build one regex that matches any file in backticks or brackets
# import re
# # files = ['analysis/template.R', 'resources/resources.yml', 'resources']
# files = ['analysis']
# escaped_files = [re.escape(f) for f in files]
# pattern = r'(`(' + '|'.join(escaped_files) + r')`|\[(' + '|'.join(escaped_files) + r')\])'
# pattern

In [43]:
# readme = 'TEST.md'

# files = [str(f).split('../')[1] for f in repo_path.rglob('*')]
# files = sorted(files, key=lambda f: len(Path(f).parts), reverse=True)
# print(files)

# for f in files:
#     lines = []
#     with open(readme, 'r') as infile:
#         for line in infile:
#             # if 

In [None]:
# replace already-there links with the one using the proper / updated branch

In [None]:
# gh action to check for broken links