Skip to content

Commit

Permalink
Strip all HTML when getting the title from the first H1 tag (#3564)
Browse files Browse the repository at this point in the history
Not stripping it was a bug, and also inconsistent with how ToC titles are extracted.
  • Loading branch information
oprypin committed Feb 8, 2024
1 parent d6fcc56 commit e755aae
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 67 deletions.
12 changes: 7 additions & 5 deletions mkdocs/structure/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from urllib.parse import urljoin, urlsplit, urlunsplit

import markdown
import markdown.extensions.toc
import markdown.htmlparser # type: ignore
import markdown.postprocessors
import markdown.treeprocessors
Expand Down Expand Up @@ -549,7 +550,7 @@ def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]) -> None:

class _ExtractTitleTreeprocessor(markdown.treeprocessors.Treeprocessor):
title: str | None = None
postprocessors: Sequence[markdown.postprocessors.Postprocessor] = ()
md: markdown.Markdown

def run(self, root: etree.Element) -> etree.Element:
for el in root:
Expand All @@ -561,14 +562,15 @@ def run(self, root: etree.Element) -> etree.Element:
# Extract the text only, recursively.
title = ''.join(el.itertext())
# Unescape per Markdown implementation details.
for pp in self.postprocessors:
title = pp.run(title)
self.title = title
title = markdown.extensions.toc.stashedHTML2text(
title, self.md, strip_entities=False
)
self.title = title.strip()
break
return root

def _register(self, md: markdown.Markdown) -> None:
self.postprocessors = tuple(md.postprocessors)
self.md = md
md.treeprocessors.register(self, "mkdocs_extract_title", priority=-1) # After the end.


Expand Down
2 changes: 1 addition & 1 deletion mkdocs/structure/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, title: str, id: str, level: int) -> None:
self.children = []

title: str
"""The text of the item."""
"""The text of the item, as HTML."""

@property
def url(self) -> str:
Expand Down
112 changes: 53 additions & 59 deletions mkdocs/tests/structure/page_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
import unittest
from unittest import mock

import markdown

from mkdocs.config.defaults import MkDocsConfig
from mkdocs.structure.files import File, Files
from mkdocs.structure.pages import Page, _RelativePathTreeprocessor
from mkdocs.structure.pages import Page, _ExtractTitleTreeprocessor, _RelativePathTreeprocessor
from mkdocs.tests.base import dedent, tempdir

DOCS_DIR = os.path.join(
Expand Down Expand Up @@ -315,9 +317,16 @@ def test_page_title_from_markdown(self):
self.assertEqual(pg.parent, None)
self.assertEqual(pg.previous_page, None)
self.assertEqual(pg.title, 'Welcome to MkDocs')
pg.render(cfg, fl)
pg.render(cfg, Files([fl]))
self.assertEqual(pg.title, 'Welcome to MkDocs')

def _test_extract_title(self, content, expected, extensions=None):
    """Convert `content` with Markdown and assert on the extracted title.

    A fresh `_ExtractTitleTreeprocessor` is registered on a new
    `markdown.Markdown` instance, `content` is converted, and the
    processor's `title` attribute is compared against `expected`.

    `extensions` maps extension names to their config dicts: the keys are
    passed as `extensions` and the whole mapping as `extension_configs`.
    When omitted, no extensions are enabled.
    """
    # `None` sentinel instead of a shared mutable `{}` default argument.
    extensions = {} if extensions is None else extensions
    md = markdown.Markdown(extensions=list(extensions), extension_configs=extensions)
    extract_title_ext = _ExtractTitleTreeprocessor()
    extract_title_ext._register(md)
    md.convert(content)
    self.assertEqual(extract_title_ext.title, expected)

_SETEXT_CONTENT = dedent(
'''
Welcome to MkDocs Setext
Expand All @@ -327,46 +336,37 @@ def test_page_title_from_markdown(self):
'''
)

@tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT})
def test_page_title_from_setext_markdown(self, docs_dir):
cfg = load_config()
fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True)
pg = Page(None, fl, cfg)
self.assertIsNone(pg.title)
pg.read_source(cfg)
self.assertEqual(pg.title, 'Testing setext title')
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Setext')

@tempdir(files={'testing_setext_title.md': _SETEXT_CONTENT})
def test_page_title_from_markdown_stripped_anchorlinks(self, docs_dir):
cfg = MkDocsConfig()
cfg.site_name = 'example'
cfg.markdown_extensions = {'toc': {'permalink': '&'}}
self.assertEqual(cfg.validate(), ([], []))
fl = File('testing_setext_title.md', docs_dir, docs_dir, use_directory_urls=True)
pg = Page(None, fl, cfg)
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Setext')
def test_page_title_from_setext_markdown(self):
self._test_extract_title(
self._SETEXT_CONTENT,
expected='Welcome to MkDocs Setext',
)

_FORMATTING_CONTENT = dedent(
'''
# \\*Hello --- *beautiful* `world`
def test_page_title_from_markdown_stripped_anchorlinks(self):
self._test_extract_title(
self._SETEXT_CONTENT,
extensions={'toc': {'permalink': '&'}},
expected='Welcome to MkDocs Setext',
)

Hi.
'''
)
def test_page_title_from_markdown_strip_formatting(self):
self._test_extract_title(
'''# \\*Hello --- *beautiful* `wor<dl>`''',
extensions={'smarty': {}},
expected='*Hello &mdash; beautiful wor&lt;dl&gt;',
)

@tempdir(files={'testing_formatting.md': _FORMATTING_CONTENT})
def test_page_title_from_markdown_strip_formatting(self, docs_dir):
cfg = load_config()
cfg.markdown_extensions.append('smarty')
fl = File('testing_formatting.md', docs_dir, docs_dir, use_directory_urls=True)
pg = Page(None, fl, cfg)
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, '*Hello &mdash; beautiful world')
def test_page_title_from_markdown_strip_raw_html(self):
self._test_extract_title(
'''# Hello <b>world</b>''',
expected='Hello world',
)

def test_page_title_from_markdown_strip_image(self):
self._test_extract_title(
'''# Hi ![😄](hah.png)''',
expected='Hi', # TODO: Should the alt text of the image be extracted?
)

_ATTRLIST_CONTENT = dedent(
'''
Expand All @@ -376,24 +376,18 @@ def test_page_title_from_markdown_strip_formatting(self, docs_dir):
'''
)

@tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT})
def test_page_title_from_markdown_stripped_attr_list(self, docs_dir):
cfg = load_config()
cfg.markdown_extensions.append('attr_list')
fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True)
pg = Page(None, fl, cfg)
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Attr')
def test_page_title_from_markdown_stripped_attr_list(self):
self._test_extract_title(
self._ATTRLIST_CONTENT,
extensions={'attr_list': {}},
expected='Welcome to MkDocs Attr',
)

@tempdir(files={'testing_attr_list.md': _ATTRLIST_CONTENT})
def test_page_title_from_markdown_preserved_attr_list(self, docs_dir):
cfg = load_config()
fl = File('testing_attr_list.md', docs_dir, docs_dir, use_directory_urls=True)
pg = Page(None, fl, cfg)
pg.read_source(cfg)
pg.render(cfg, fl)
self.assertEqual(pg.title, 'Welcome to MkDocs Attr { #welcome }')
def test_page_title_from_markdown_preserved_attr_list(self):
self._test_extract_title(
self._ATTRLIST_CONTENT,
expected='Welcome to MkDocs Attr { #welcome }',
)

def test_page_title_from_meta(self):
cfg = load_config(docs_dir=DOCS_DIR)
Expand All @@ -418,7 +412,7 @@ def test_page_title_from_meta(self):
self.assertEqual(pg.previous_page, None)
self.assertEqual(pg.title, 'A Page Title')
self.assertEqual(pg.toc, [])
pg.render(cfg, fl)
pg.render(cfg, Files([fl]))
self.assertEqual(pg.title, 'A Page Title')

def test_page_title_from_filename(self):
Expand All @@ -443,7 +437,7 @@ def test_page_title_from_filename(self):
self.assertEqual(pg.parent, None)
self.assertEqual(pg.previous_page, None)
self.assertEqual(pg.title, 'Page title')
pg.render(cfg, fl)
pg.render(cfg, Files([fl]))
self.assertEqual(pg.title, 'Page title')

def test_page_title_from_capitalized_filename(self):
Expand Down Expand Up @@ -704,7 +698,7 @@ def test_page_render(self):
pg.read_source(cfg)
self.assertEqual(pg.content, None)
self.assertEqual(pg.toc, [])
pg.render(cfg, [fl])
pg.render(cfg, Files([fl]))
self.assertTrue(
pg.content.startswith('<h1 id="welcome-to-mkdocs">Welcome to MkDocs</h1>\n')
)
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ dependencies = [
"click >=7.0",
"Jinja2 >=2.11.1",
"markupsafe >=2.0.1",
"Markdown >=3.3.6",
"Markdown >=3.4.1",
"PyYAML >=5.1",
"watchdog >=2.0",
"ghp-import >=1.0",
Expand All @@ -57,7 +57,7 @@ min-versions = [
"click ==7.0",
"Jinja2 ==2.11.1",
"markupsafe ==2.0.1",
"Markdown ==3.3.6",
"Markdown ==3.4.1",
"PyYAML ==5.1",
"watchdog ==2.0",
"ghp-import ==1.0",
Expand Down

0 comments on commit e755aae

Please sign in to comment.