From d5254ab087e21771561b684cf564eb778d476c00 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 31 May 2023 08:09:00 -0400 Subject: [PATCH] Add stubs package for lxml. --- changelog.d/15697.misc | 1 + mypy.ini | 3 -- poetry.lock | 25 ++++++++-- pyproject.toml | 1 + synapse/media/oembed.py | 29 ++++++----- synapse/media/preview_html.py | 74 +++++++++++++++++++--------- tests/media/test_html_preview.py | 18 ++++++- tests/media/test_oembed.py | 2 +- tests/media/test_url_previewer.py | 2 +- tests/rest/media/test_url_preview.py | 2 +- 10 files changed, 108 insertions(+), 49 deletions(-) create mode 100644 changelog.d/15697.misc diff --git a/changelog.d/15697.misc b/changelog.d/15697.misc new file mode 100644 index 000000000000..93ceaeafc9b9 --- /dev/null +++ b/changelog.d/15697.misc @@ -0,0 +1 @@ +Improve type hints. diff --git a/mypy.ini b/mypy.ini index 56cd1d560ea8..1038b7d8c7a4 100644 --- a/mypy.ini +++ b/mypy.ini @@ -60,9 +60,6 @@ ignore_missing_imports = True [mypy-ijson.*] ignore_missing_imports = True -[mypy-lxml] -ignore_missing_imports = True - # https://github.com/msgpack/msgpack-python/issues/448 [mypy-msgpack] ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index 0879e64cf176..d8964f5719de 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "alabaster" @@ -1215,6 +1215,21 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.7)"] +[[package]] +name = "lxml-stubs" +version = "0.4.0" +description = "Type annotations for the lxml package" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "lxml-stubs-0.4.0.tar.gz", hash = "sha256:184877b42127256abc2b932ba8bd0ab5ea80bd0b0fee618d16daa40e0b71abee"}, + {file = "lxml_stubs-0.4.0-py3-none-any.whl", hash = "sha256:3b381e9e82397c64ea3cc4d6f79d1255d015f7b114806d4826218805c10ec003"}, +] + +[package.extras] +test = ["coverage[toml] (==5.2)", "pytest (>=6.0.0)", "pytest-mypy-plugins (==1.9.3)"] + [[package]] name = "markdown-it-py" version = "2.2.0" @@ -3409,22 +3424,22 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"] +all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"] cache-memory = ["Pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] oidc = ["authlib"] opentracing = ["jaeger-client", "opentracing"] postgres = ["psycopg2", "psycopg2cffi", "psycopg2cffi-compat"] -redis = ["txredisapi", "hiredis"] +redis = ["hiredis", "txredisapi"] saml2 = ["pysaml2"] sentry = ["sentry-sdk"] systemd = ["systemd-python"] -test = ["parameterized", "idna"] +test = ["idna", "parameterized"] url-preview = ["lxml"] user-search = ["pyicu"] [metadata] lock-version = "2.0" python-versions = "^3.7.1" -content-hash = "ef3a16dd66177f7141239e1a2d3e07cc14c08f1e4e0c5127184d022bc062da52" +content-hash = "7ad11e62a675e09444cf33ca2de3216fc4efc5874a2575e54d95d577a52439d3" diff --git a/pyproject.toml b/pyproject.toml index 7227bc752316..4476f57ca74f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -314,6 +314,7 @@ black = ">=22.3.0" ruff = "0.0.265" # Typechecking +lxml-stubs = ">=0.4.0" mypy = "*" mypy-zope = "*" types-bleach = ">=4.1.0" diff --git a/synapse/media/oembed.py b/synapse/media/oembed.py index c0eaf04be544..7a9d2a82d431 100644 --- a/synapse/media/oembed.py +++ b/synapse/media/oembed.py @@ -14,7 +14,7 @@ import html import logging import urllib.parse -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List, Optional, cast import attr @@ -98,7 +98,7 @@ def get_oembed_url(self, url: str) -> Optional[str]: # No match. return None - def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: + def autodiscover_from_html(self, tree: "etree._Element") -> Optional[str]: """ Search an HTML document for oEmbed autodiscovery information. @@ -109,18 +109,20 @@ def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: The URL to use for oEmbed information, or None if no URL was found. """ # Search for link elements with the proper rel and type attributes. - for tag in tree.xpath( - "//link[@rel='alternate'][@type='application/json+oembed']" + for tag in cast( + List["etree._Element"], + tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"), ): if "href" in tag.attrib: - return tag.attrib["href"] + return cast(str, tag.attrib["href"]) # Some providers (e.g. Flickr) use alternative instead of alternate. - for tag in tree.xpath( - "//link[@rel='alternative'][@type='application/json+oembed']" + for tag in cast( + List["etree._Element"], + tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"), ): if "href" in tag.attrib: - return tag.attrib["href"] + return cast(str, tag.attrib["href"]) return None @@ -212,11 +214,11 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: return OEmbedResult(open_graph_response, author_name, cache_age) -def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: +def _fetch_urls(tree: "etree._Element", tag_name: str) -> List[str]: results = [] - for tag in tree.xpath("//*/" + tag_name): + for tag in cast(List["etree._Element"], tree.xpath("//*/" + tag_name)): if "src" in tag.attrib: - results.append(tag.attrib["src"]) + results.append(cast(str, tag.attrib["src"])) return results @@ -244,11 +246,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> parser = etree.HTMLParser(recover=True, encoding="utf-8") # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(html_body, parser) + # TODO Develop of lxml-stubs has this correct. + tree = etree.fromstring(html_body, parser) # type: ignore[arg-type] # The data was successfully parsed, but no tree was found. if tree is None: - return + return # type: ignore[unreachable] # Attempt to find interesting URLs (images, videos, embeds). if "og:image" not in open_graph_response: diff --git a/synapse/media/preview_html.py b/synapse/media/preview_html.py index 516d0434f095..cfb8d7535457 100644 --- a/synapse/media/preview_html.py +++ b/synapse/media/preview_html.py @@ -24,6 +24,7 @@ Optional, Set, Union, + cast, ) if TYPE_CHECKING: @@ -115,7 +116,7 @@ def _get_html_media_encodings( def decode_body( body: bytes, uri: str, content_type: Optional[str] = None -) -> Optional["etree.Element"]: +) -> Optional["etree._Element"]: """ This uses lxml to parse the HTML document. @@ -152,11 +153,12 @@ def decode_body( # Attempt to parse the body. Returns None if the body was successfully # parsed, but no tree was found. - return etree.fromstring(body, parser) + # TODO Develop of lxml-stubs has this correct. + return etree.fromstring(body, parser) # type: ignore[arg-type] def _get_meta_tags( - tree: "etree.Element", + tree: "etree._Element", property: str, prefix: str, property_mapper: Optional[Callable[[str], Optional[str]]] = None, @@ -175,9 +177,14 @@ def _get_meta_tags( Returns: A map of tag name to value. """ + # This actually returns Dict[str, str], but the caller sets this as a variable + # which is Dict[str, Optional[str]]. results: Dict[str, Optional[str]] = {} - for tag in tree.xpath( - f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" + for tag in cast( + List["etree._Element"], + tree.xpath( + f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]" + ), ): # if we've got more than 50 tags, someone is taking the piss if len(results) >= 50: @@ -187,14 +194,15 @@ def _get_meta_tags( ) return {} - key = tag.attrib[property] + key = cast(str, tag.attrib[property]) if property_mapper: - key = property_mapper(key) + new_key = property_mapper(key) # None is a special value used to ignore a value. - if key is None: + if new_key is None: continue + key = new_key - results[key] = tag.attrib["content"] + results[key] = cast(str, tag.attrib["content"]) return results @@ -219,7 +227,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]: return "og" + key[7:] -def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: +def parse_html_to_open_graph(tree: "etree._Element") -> Dict[str, Optional[str]]: """ Parse the HTML document into an Open Graph response. @@ -247,7 +255,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: # "og:video:height" : "720", # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - og = _get_meta_tags(tree, "property", "og") + og: Dict[str, Optional[str]] = _get_meta_tags(tree, "property", "og") # TODO: Search for properties specific to the different Open Graph types, # such as article: meta tags, e.g.: @@ -276,15 +284,21 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: if "og:title" not in og: # Attempt to find a title from the title tag, or the biggest header on the page. - title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()") + title = cast( + List["etree._ElementUnicodeResult"], + tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"), + ) if title: og["og:title"] = title[0].strip() else: og["og:title"] = None if "og:image" not in og: - meta_image = tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" + meta_image = cast( + List["etree._ElementUnicodeResult"], + tree.xpath( + "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" + ), ) # If a meta image is found, use it. if meta_image: @@ -293,7 +307,10 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: # Try to find images which are larger than 10px by 10px. # # TODO: consider inlined CSS styles as well as width & height attribs - images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") + images = cast( + List["etree._Element"], + tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"), + ) images = sorted( images, key=lambda i: ( @@ -302,20 +319,26 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: ) # If no images were found, try to find *any* images. if not images: - images = tree.xpath("//img[@src][1]") + images = cast(List["etree._Element"], tree.xpath("//img[@src][1]")) if images: - og["og:image"] = images[0].attrib["src"] + og["og:image"] = cast(str, images[0].attrib["src"]) # Finally, fallback to the favicon if nothing else. else: - favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]") + favicons = cast( + List["etree._ElementUnicodeResult"], + tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"), + ) if favicons: og["og:image"] = favicons[0] if "og:description" not in og: # Check the first meta description tag for content. - meta_description = tree.xpath( - "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" + meta_description = cast( + List["etree._ElementUnicodeResult"], + tree.xpath( + "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" + ), ) # If a meta description is found with content, use it. if meta_description: @@ -332,7 +355,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: return og -def parse_html_description(tree: "etree.Element") -> Optional[str]: +def parse_html_description(tree: "etree._Element") -> Optional[str]: """ Calculate a text description based on an HTML document. @@ -368,6 +391,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: "canvas", "img", "picture", + # etree.Comment is a function which creates an etree._Comment element. + # The "tag" attribute of an etree._Comment instance is confusingly the + # etree.Comment function instead of a string. etree.Comment, } @@ -381,8 +407,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]: def _iterate_over_text( - tree: Optional["etree.Element"], - tags_to_ignore: Set[Union[str, "etree.Comment"]], + tree: Optional["etree._Element"], + tags_to_ignore: Set[object], stack_limit: int = 1024, ) -> Generator[str, None, None]: """Iterate over the tree returning text nodes in a depth first fashion, @@ -402,7 +428,7 @@ def _iterate_over_text( # This is a stack whose items are elements to iterate over *or* strings # to be returned. - elements: List[Union[str, "etree.Element"]] = [tree] + elements: List[Union[str, "etree._Element"]] = [tree] while elements: el = elements.pop() diff --git a/tests/media/test_html_preview.py b/tests/media/test_html_preview.py index e7da75db3ee0..ea84bb3d3d53 100644 --- a/tests/media/test_html_preview.py +++ b/tests/media/test_html_preview.py @@ -24,7 +24,7 @@ try: import lxml except ImportError: - lxml = None + lxml = None # type: ignore[assignment] class SummarizeTestCase(unittest.TestCase): @@ -160,6 +160,7 @@ def test_simple(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -176,6 +177,7 @@ def test_comment(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -195,6 +197,7 @@ def test_comment2(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual( @@ -217,6 +220,7 @@ def test_script(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -231,6 +235,7 @@ def test_missing_title(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -246,6 +251,7 @@ def test_missing_title(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Title", "og:description": "Title"}) @@ -261,6 +267,7 @@ def test_h1_as_title(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) @@ -281,6 +288,7 @@ def test_empty_description(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"}) @@ -296,6 +304,7 @@ def test_missing_title_and_broken_h1(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -324,6 +333,7 @@ def test_xml(self) -> None: FooSome text. """.strip() tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -338,6 +348,7 @@ def test_invalid_encoding(self) -> None: """ tree = decode_body(html, "http://example.com/test.html", "invalid-encoding") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -353,6 +364,7 @@ def test_invalid_encoding2(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) @@ -367,6 +379,7 @@ def test_windows_1252(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) @@ -380,6 +393,7 @@ def test_twitter_tag(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual( og, @@ -401,6 +415,7 @@ def test_twitter_tag(self) -> None: """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual( og, @@ -419,6 +434,7 @@ def test_nested_nodes(self) -> None: with a cheeky SVG and some tail text """ tree = decode_body(html, "http://example.com/test.html") + assert tree is not None og = parse_html_to_open_graph(tree) self.assertEqual( og, diff --git a/tests/media/test_oembed.py b/tests/media/test_oembed.py index c8bf8421daf1..3bc19cb1ccd0 100644 --- a/tests/media/test_oembed.py +++ b/tests/media/test_oembed.py @@ -28,7 +28,7 @@ try: import lxml except ImportError: - lxml = None + lxml = None # type: ignore[assignment] class OEmbedTests(HomeserverTestCase): diff --git a/tests/media/test_url_previewer.py b/tests/media/test_url_previewer.py index 3c4c7d676520..46ecde534441 100644 --- a/tests/media/test_url_previewer.py +++ b/tests/media/test_url_previewer.py @@ -24,7 +24,7 @@ try: import lxml except ImportError: - lxml = None + lxml = None # type: ignore[assignment] class URLPreviewTests(unittest.HomeserverTestCase): diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index 170fb0534ad5..05d5e39cabd4 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -40,7 +40,7 @@ try: import lxml except ImportError: - lxml = None + lxml = None # type: ignore[assignment] class URLPreviewTests(unittest.HomeserverTestCase):