Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Add stubs package for lxml.
Browse files Browse the repository at this point in the history
  • Loading branch information
clokep committed May 31, 2023
1 parent c01343d commit d5254ab
Show file tree
Hide file tree
Showing 10 changed files with 108 additions and 49 deletions.
1 change: 1 addition & 0 deletions changelog.d/15697.misc
@@ -0,0 +1 @@
Improve type hints.
3 changes: 0 additions & 3 deletions mypy.ini
Expand Up @@ -60,9 +60,6 @@ ignore_missing_imports = True
[mypy-ijson.*]
ignore_missing_imports = True

[mypy-lxml]
ignore_missing_imports = True

# https://github.com/msgpack/msgpack-python/issues/448
[mypy-msgpack]
ignore_missing_imports = True
Expand Down
25 changes: 20 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Expand Up @@ -314,6 +314,7 @@ black = ">=22.3.0"
ruff = "0.0.265"

# Typechecking
lxml-stubs = ">=0.4.0"
mypy = "*"
mypy-zope = "*"
types-bleach = ">=4.1.0"
Expand Down
29 changes: 16 additions & 13 deletions synapse/media/oembed.py
Expand Up @@ -14,7 +14,7 @@
import html
import logging
import urllib.parse
from typing import TYPE_CHECKING, List, Optional
from typing import TYPE_CHECKING, List, Optional, cast

import attr

Expand Down Expand Up @@ -98,7 +98,7 @@ def get_oembed_url(self, url: str) -> Optional[str]:
# No match.
return None

def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
def autodiscover_from_html(self, tree: "etree._Element") -> Optional[str]:
"""
Search an HTML document for oEmbed autodiscovery information.
Expand All @@ -109,18 +109,20 @@ def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
The URL to use for oEmbed information, or None if no URL was found.
"""
# Search for link elements with the proper rel and type attributes.
for tag in tree.xpath(
"//link[@rel='alternate'][@type='application/json+oembed']"
for tag in cast(
List["etree._Element"],
tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return tag.attrib["href"]
return cast(str, tag.attrib["href"])

# Some providers (e.g. Flickr) use alternative instead of alternate.
for tag in tree.xpath(
"//link[@rel='alternative'][@type='application/json+oembed']"
for tag in cast(
List["etree._Element"],
tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"),
):
if "href" in tag.attrib:
return tag.attrib["href"]
return cast(str, tag.attrib["href"])

return None

Expand Down Expand Up @@ -212,11 +214,11 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, author_name, cache_age)


def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
def _fetch_urls(tree: "etree._Element", tag_name: str) -> List[str]:
results = []
for tag in tree.xpath("//*/" + tag_name):
for tag in cast(List["etree._Element"], tree.xpath("//*/" + tag_name)):
if "src" in tag.attrib:
results.append(tag.attrib["src"])
results.append(cast(str, tag.attrib["src"]))
return results


Expand Down Expand Up @@ -244,11 +246,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
parser = etree.HTMLParser(recover=True, encoding="utf-8")

# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(html_body, parser)
# TODO Develop of lxml-stubs has this correct.
tree = etree.fromstring(html_body, parser) # type: ignore[arg-type]

# The data was successfully parsed, but no tree was found.
if tree is None:
return
return # type: ignore[unreachable]

# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
Expand Down
74 changes: 50 additions & 24 deletions synapse/media/preview_html.py
Expand Up @@ -24,6 +24,7 @@
Optional,
Set,
Union,
cast,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -115,7 +116,7 @@ def _get_html_media_encodings(

def decode_body(
body: bytes, uri: str, content_type: Optional[str] = None
) -> Optional["etree.Element"]:
) -> Optional["etree._Element"]:
"""
This uses lxml to parse the HTML document.
Expand Down Expand Up @@ -152,11 +153,12 @@ def decode_body(

# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body, parser)
# TODO Develop of lxml-stubs has this correct.
return etree.fromstring(body, parser) # type: ignore[arg-type]


def _get_meta_tags(
tree: "etree.Element",
tree: "etree._Element",
property: str,
prefix: str,
property_mapper: Optional[Callable[[str], Optional[str]]] = None,
Expand All @@ -175,9 +177,14 @@ def _get_meta_tags(
Returns:
A map of tag name to value.
"""
# This actually returns Dict[str, str], but the caller sets this as a variable
# which is Dict[str, Optional[str]].
results: Dict[str, Optional[str]] = {}
for tag in tree.xpath(
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
for tag in cast(
List["etree._Element"],
tree.xpath(
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
),
):
# if we've got more than 50 tags, someone is taking the piss
if len(results) >= 50:
Expand All @@ -187,14 +194,15 @@ def _get_meta_tags(
)
return {}

key = tag.attrib[property]
key = cast(str, tag.attrib[property])
if property_mapper:
key = property_mapper(key)
new_key = property_mapper(key)
# None is a special value used to ignore a value.
if key is None:
if new_key is None:
continue
key = new_key

results[key] = tag.attrib["content"]
results[key] = cast(str, tag.attrib["content"])

return results

Expand All @@ -219,7 +227,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]:
return "og" + key[7:]


def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
def parse_html_to_open_graph(tree: "etree._Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
Expand Down Expand Up @@ -247,7 +255,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",

og = _get_meta_tags(tree, "property", "og")
og: Dict[str, Optional[str]] = _get_meta_tags(tree, "property", "og")

# TODO: Search for properties specific to the different Open Graph types,
# such as article: meta tags, e.g.:
Expand Down Expand Up @@ -276,15 +284,21 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:

if "og:title" not in og:
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
title = cast(
List["etree._ElementUnicodeResult"],
tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"),
)
if title:
og["og:title"] = title[0].strip()
else:
og["og:title"] = None

if "og:image" not in og:
meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
meta_image = cast(
List["etree._ElementUnicodeResult"],
tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
),
)
# If a meta image is found, use it.
if meta_image:
Expand All @@ -293,7 +307,10 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# Try to find images which are larger than 10px by 10px.
#
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
images = cast(
List["etree._Element"],
tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"),
)
images = sorted(
images,
key=lambda i: (
Expand All @@ -302,20 +319,26 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
)
# If no images were found, try to find *any* images.
if not images:
images = tree.xpath("//img[@src][1]")
images = cast(List["etree._Element"], tree.xpath("//img[@src][1]"))
if images:
og["og:image"] = images[0].attrib["src"]
og["og:image"] = cast(str, images[0].attrib["src"])

# Finally, fallback to the favicon if nothing else.
else:
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
favicons = cast(
List["etree._ElementUnicodeResult"],
tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"),
)
if favicons:
og["og:image"] = favicons[0]

if "og:description" not in og:
# Check the first meta description tag for content.
meta_description = tree.xpath(
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
meta_description = cast(
List["etree._ElementUnicodeResult"],
tree.xpath(
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
),
)
# If a meta description is found with content, use it.
if meta_description:
Expand All @@ -332,7 +355,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
return og


def parse_html_description(tree: "etree.Element") -> Optional[str]:
def parse_html_description(tree: "etree._Element") -> Optional[str]:
"""
Calculate a text description based on an HTML document.
Expand Down Expand Up @@ -368,6 +391,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"canvas",
"img",
"picture",
# etree.Comment is a function which creates an etree._Comment element.
# The "tag" attribute of an etree._Comment instance is confusingly the
# etree.Comment function instead of a string.
etree.Comment,
}

Expand All @@ -381,8 +407,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:


def _iterate_over_text(
tree: Optional["etree.Element"],
tags_to_ignore: Set[Union[str, "etree.Comment"]],
tree: Optional["etree._Element"],
tags_to_ignore: Set[object],
stack_limit: int = 1024,
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
Expand All @@ -402,7 +428,7 @@ def _iterate_over_text(

# This is a stack whose items are elements to iterate over *or* strings
# to be returned.
elements: List[Union[str, "etree.Element"]] = [tree]
elements: List[Union[str, "etree._Element"]] = [tree]
while elements:
el = elements.pop()

Expand Down

0 comments on commit d5254ab

Please sign in to comment.