[doctools] Extract code blocks from every doc into _tmp/code-blocks

For validating the documentation.
oilshell · May 18, 2021 · 0e51118 · 0e51118
1 parent fe9390e
commit 0e51118
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 9 deletions.
diff --git a/build/doc.sh b/build/doc.sh
@@ -149,8 +149,8 @@ split-and-render() {
   #head _tmp/doc/*
   #return
 
-  local code_output=_tmp/code-blocks/$name.txt
-  cmark --code-output $code_output ${prefix}_meta.json ${prefix}_content.md > $out
+  local code_out=_tmp/code-blocks/$name.txt
+  cmark --code-block-output $code_out ${prefix}_meta.json ${prefix}_content.md > $out
   log "$prefix -> (doctools/cmark) -> $out"
 }
 

diff --git a/doctools/cmark.py b/doctools/cmark.py
@@ -262,6 +262,11 @@ def Render(opts, meta, in_file, out_file, use_fastlex=True):
   html = md2html(in_file.read())
 
   if use_fastlex:
+    if opts.code_block_output:
+      with open(opts.code_block_output, 'w') as f:
+        f.write('# %s: code blocks extracted from Markdown/HTML\n\n' % opts.code_block_output)
+        text = oil_doc.ExtractCode(html, f)
+
     html = oil_doc.RemoveComments(html)
 
     # Hack for allowing tables without <p> in cells, which CommonMark seems to require?
@@ -273,11 +278,6 @@ def Render(opts, meta, in_file, out_file, use_fastlex=True):
 
     html = oil_doc.HighlightCode(html, meta.get('default_highlighter'))
 
-    if opts.code_output:
-      log('TODO: output to %s', opts.code_output)
-      with open(opts.code_output, 'w') as f:
-        f.write('TODO')
-
   # h2 is the title.  h1 is unused.
   if opts.toc_tags:
     toc_tags = opts.toc_tags
@@ -326,7 +326,7 @@ def Options():
       help='Hack for old blog posts')
 
   p.add_option(
-      '--code-output', dest='code_output',
+      '--code-block-output', dest='code_block_output',
       default=False,
       help='Extract and print code blocks to this file')
 

diff --git a/doctools/oil_doc.py b/doctools/oil_doc.py
@@ -406,7 +406,7 @@ def HighlightCode(s, default_highlighter):
 
             else:  # language-*: Use Pygments
 
-              # We REMOVIE the original <pre><code> because Pygments gives you a <pre> already
+              # We REMOVE the original <pre><code> because Pygments gives you a <pre> already
 
               # We just read closing </code>, and the next one should be </pre>.
               try:
@@ -434,6 +434,66 @@ def HighlightCode(s, default_highlighter):
   return f.getvalue()
 
 
+def ExtractCode(s, f):
+  """Print code blocks to a plain text file.
+
+  So we can at least validate the syntax.
+
+  Similar to the HighlightCode algorithm above:
+  
+  1. Collect what's inside <pre><code> ...
+  2. Decode &amp; -> &, etc. and print it
+  """
+  out = html.Output(s, f)
+  tag_lexer = html.TagLexer(s)
+
+  block_num = 0
+  pos = 0
+  it = html.ValidTokens(s)
+
+  while True:
+    try:
+      tok_id, end_pos = next(it)
+    except StopIteration:
+      break
+
+    if tok_id == html.StartTag:
+      tag_lexer.Reset(pos, end_pos)
+      if tag_lexer.TagName() == 'pre':
+        pre_start_pos = pos
+        pos = end_pos
+
+        try:
+          tok_id, end_pos = next(it)
+        except StopIteration:
+          break
+
+        tag_lexer.Reset(pos, end_pos)
+        if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
+
+          css_class = tag_lexer.GetAttr('class')
+          code_start_pos = end_pos
+
+          out.SkipTo(code_start_pos)
+          out.Print('# block %d' % block_num)
+          out.Print('\n')
+
+          slash_code_left, slash_code_right = \
+              html.ReadUntilEndTag(it, tag_lexer, 'code')
+
+          text = html.ToText(s, code_start_pos, slash_code_left)
+          out.SkipTo(slash_code_left)
+
+          out.Print(text)
+          out.Print('\n')
+
+          block_num += 1
+
+    pos = end_pos
+
+  #out.PrintTheRest()
+
+
 class ShellSession(object):
   """
   TODO: Pass this to HighlightCode as a plugin