Skip to content

Commit

Permalink
Add test for nbsp issue #15
Browse files Browse the repository at this point in the history
  • Loading branch information
lorey committed Jul 7, 2022
1 parent ee10ef7 commit 6ba0a3a
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions tests/test_html.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from mlscraper.html import get_relative_depth
from mlscraper.html import get_root_node
from mlscraper.html import HTMLExactTextMatch
Expand Down Expand Up @@ -47,6 +48,20 @@ def test_find_all(self, stackoverflow_samples):
nodes = page.find_all("/users/624900/jterrace")
assert nodes

def test_find_all_with_text_with_noise(self):
    """No match for "karl" may be exact when the word has surrounding text.

    The paragraph reads "bla karl bla", so none of the matches returned
    by ``find_all`` may be an ``HTMLExactTextMatch``.
    """
    noisy_markup = b"<html><body><p>bla karl bla</p></body></html>"
    matches = Page(noisy_markup).find_all("karl")
    assert not any(
        isinstance(candidate, HTMLExactTextMatch) for candidate in matches
    )

@pytest.mark.skip("no fuzzy matching yet")
def test_find_all_with_nbsp(self):
    """Regression test for issue #15 (currently skipped).

    The markup separates "123" and the euro sign with an ``&nbsp;``
    entity, while the search text uses a regular space; the search is
    expected to return at least one match.
    """
    # Built as str and encoded because the euro sign is not ASCII and
    # therefore cannot appear in a bytes literal.
    markup = "<html><body><p>123&nbsp;€</body></html>"
    page = Page(markup.encode())
    matches = page.find_all("123 €")
    assert len(matches) > 0


def test_equality():
# we want to make sure that equal html does not result in equality
Expand Down Expand Up @@ -88,15 +103,6 @@ def test_find_text_with_whitespace():
assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)


def test_find_text_with_noise():
    """Check that searching "karl" inside "bla karl bla" gives no exact match."""
    document = Page(b"<html><body><p>bla karl bla</p></body></html>")
    # Every returned match is checked individually; an exact text match
    # would be wrong because the paragraph contains extra text.
    for found in document.find_all("karl"):
        assert not isinstance(found, HTMLExactTextMatch)


def test_get_relative_depth():
html = b"<html><body><p>bla karl bla</p></body></html>"
page = Page(html)
Expand Down

0 comments on commit 6ba0a3a

Please sign in to comment.