Join GitHub today
GitHub is home to over 20 million developers working together to host and review code, manage projects, and build software together.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
Already on GitHub? Sign in to your account
Change the way we summarize URLs #973
Conversation
ara4n
commented on an outdated diff
Aug 2, 2016
synapse/rest/media/v1/preview_url_resource.py
| - text = re.sub(r'[\t ]+', ' ', text) | ||
| - text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text) | ||
| - text = text.strip()[:500] | ||
| - og['og:description'] = text if text else None | ||
| + | ||
| + description = description.strip() | ||
| + description = re.sub(r'[\t ]+', ' ', description) | ||
| + description = re.sub(r'[\t \r\n]*[\r\n]+', '\n', description) | ||
| + | ||
| + # If the concatenation of paragraphs to get above MIN_SIZE | ||
| + # took us over MAX_SIZE, then we need to truncate mid paragraph | ||
| + if len(description) > MAX_SIZE: | ||
| + new_desc = "" | ||
| + | ||
| + # This splits the paragraph into words, but keeping the | ||
| + # (preceding) whitespace intact so we can easily concat |
|
|
ara4n
commented on an outdated diff
Aug 2, 2016
synapse/rest/media/v1/preview_url_resource.py
| + # took us over MAX_SIZE, then we need to truncate mid paragraph | ||
| + if len(description) > MAX_SIZE: | ||
| + new_desc = "" | ||
| + | ||
| + # This splits the paragraph into words, but keeping the | ||
| + # (preceding) whitespace intact so we can easily concat | ||
| + # words back together. | ||
| + for match in re.finditer("\s*\S+", description): | ||
| + word = match.group() | ||
| + | ||
| + # Keep adding words while the total length is less than | ||
| + # MAX_SIZE. | ||
| + if len(word) + len(new_desc) < MAX_SIZE: | ||
| + new_desc += word | ||
| + else: | ||
| + # At this point the next word *will* take us over |
|
|
ara4n
commented on an outdated diff
Aug 2, 2016
synapse/rest/media/v1/preview_url_resource.py
| + new_desc += word | ||
| + else: | ||
| + # At this point the next word *will* take us over | ||
| + # MAX_SIZE, but we also want to ensure that it's not | ||
| + # a huge word. If it is, add it anyway and we'll | ||
| + # truncate later. | ||
| + if len(new_desc) < MIN_SIZE: | ||
| + new_desc += word | ||
| + break | ||
| + | ||
| + # Double check that we're not over the limit | ||
| + if len(new_desc) > MAX_SIZE: | ||
| + new_desc = new_desc[:MAX_SIZE] | ||
| + | ||
| + # We always add an ellipsis because at the very least | ||
| + # we chopped mid paragraph. |
|
|
ara4n
and 1 other
commented on an outdated diff
Aug 2, 2016
synapse/rest/media/v1/preview_url_resource.py
| + else: | ||
| + # At this point the next word *will* take us over | ||
| + # MAX_SIZE, but we also want to ensure that it's not | ||
| + # a huge word. If it is, add it anyway and we'll | ||
| + # truncate later. | ||
| + if len(new_desc) < MIN_SIZE: | ||
| + new_desc += word | ||
| + break | ||
| + | ||
| + # Double check that we're not over the limit | ||
| + if len(new_desc) > MAX_SIZE: | ||
| + new_desc = new_desc[:MAX_SIZE] | ||
| + | ||
| + # We always add an ellipsis because at the very least | ||
| + # we chopped mid paragraph. | ||
| + description = new_desc.strip() + "…" |
ara4n
Owner
|
ara4n
and 1 other
commented on an outdated diff
Aug 2, 2016
synapse/rest/media/v1/preview_url_resource.py
| @@ -329,20 +331,74 @@ def _calc_og(self, tree, media_info, requester): | ||
| # ...or if they are within a <script/> or <style/> tag. | ||
| # This is a very very very coarse approximation to a plain text | ||
| # render of the page. | ||
| - text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | " | ||
| - "ancestor::aside | ancestor::footer | " | ||
| - "ancestor::script | ancestor::style)]" + | ||
| - "[ancestor::body]") | ||
| - text = '' | ||
| + | ||
| + # We don't just use XPATH here as that is slow on some machines. | ||
| + | ||
| + cloned_tree = deepcopy(tree.find("body")) |
erikjohnston
Owner
|
|
lgtm modulo a test :) |
ara4n
assigned
erikjohnston
Aug 2, 2016
erikjohnston
added some commits
Aug 2, 2016
erikjohnston
merged commit a5d7968
into
develop
Aug 4, 2016
10 checks passed
Flake8 + Packaging (Commit)
Build #1365 origin/erikj/xpath_fix succeeded in 56 sec
Details
Flake8 + Packaging (Merged PR)
Build finished.
Details
Sytest Dendron (Commit)
Build #464 origin/erikj/xpath_fix succeeded in 9 min 3 sec
Details
Sytest Dendron (Merged PR)
Build finished.
Details
Sytest Postgres (Commit)
Build #1296 origin/erikj/xpath_fix succeeded in 5 min 42 sec
Details
Sytest Postgres (Merged PR)
Build finished.
Details
Sytest SQLite (Commit)
Build #1330 origin/erikj/xpath_fix succeeded in 4 min 3 sec
Details
Sytest SQLite (Merged PR)
Build finished.
Details
Unit Tests (Commit)
Build #1401 origin/erikj/xpath_fix succeeded in 2 min 8 sec
Details
Unit Tests (Merged PR)
Build finished.
Details
richvdh
deleted the
erikj/xpath_fix
branch
Dec 1, 2016
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
erikjohnston commented Aug 2, 2016
Using XPath is slow on some machines (for unknown reasons), so use a different approach to get a list of text nodes.
Try to generate a summary that respects paragraph and then word boundaries, adding ellipses when appropriate.