Skip to content

Commit

Permalink
favor_precision with Trafilatura and add first tests (all pass) #86
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulbot committed May 7, 2024
1 parent 3d0792a commit 58d480d
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
2 changes: 1 addition & 1 deletion mcmetadata/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ class TrafilaturaExtractor(AbstractExtractor):

def extract(self, url: str, html_text: str, include_metadata: bool = False):
results = trafilatura.bare_extraction(html_text, only_with_metadata=include_metadata, url=url,
include_images=include_metadata)
include_images=include_metadata, favor_precision=True)
image_urls = []
if include_metadata:
# pull out the images embedded in the markdown
Expand Down
15 changes: 15 additions & 0 deletions mcmetadata/test/test_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,21 @@ def test_too_short_content(self):
except BadContentError:
assert True

def test_no_related_links_1(self):
    """Check that almost nothing follows the AFP copyright line in the extracted text.

    Runs an archived IBTimes page through ``_fetch_and_validate`` with the
    Trafilatura method and asserts that fewer than 20 characters come after
    the copyright notice, i.e. that trailing links were not extracted.
    """
    url = 'https://web.archive.org/web/20240507150742/https://www.ibtimes.co.uk/falling-inflation-shifts-focus-when-ecb-could-cut-rates-1722106'
    results = self._fetch_and_validate(url, content.METHOD_TRAFILATURA)
    closing_str = 'Copyright AFP 2023. All rights reserved.'
    text = results['text']
    # fail as an assertion (not a ValueError from str.index) when the marker is missing
    assert closing_str in text
    # trailing links that shouldn't be included: everything after the first marker occurrence
    _, _, trailing_content = text.partition(closing_str)
    assert len(trailing_content) < 20

def test_no_related_links_3(self):
    """Check that the "most read" sidebar heading is absent from the extracted text.

    Runs an archived BFMTV page through ``_fetch_and_validate`` with the
    Trafilatura method and asserts the heading string does not appear in
    the resulting ``text`` field.
    """
    # heading of the visual sidebar listing the most read articles
    sidebar_heading = "Les plus lus"
    archived_url = 'https://web.archive.org/web/20240507151403/https://www.bfmtv.com/cote-d-azur/nice-25-personnes-expulsees-lors-d-operations-anti-squat-menees-dans-le-quartier-des-liserons_AN-202312150639.html'
    extracted = self._fetch_and_validate(archived_url, content.METHOD_TRAFILATURA)
    body_text = extracted['text']
    assert sidebar_heading not in body_text


# Run the tests through unittest's command-line runner when this file is executed directly.
if __name__ == "__main__":
unittest.main()

0 comments on commit 58d480d

Please sign in to comment.