From 244fc8665092133e6c6e0763f7948b03cef71e53 Mon Sep 17 00:00:00 2001 From: Lindsey Sawatzky Date: Sat, 24 Mar 2018 12:12:10 -0700 Subject: [PATCH 1/2] Handle erroneous wiki responses for HTML which include 'Edit' links. --- tests/extract_html_format_test.py | 16 ++++++++++++++++ tests/mock_data.py | 30 ++++++++++++++++++++++++++++++ wikipediaapi/wikipedia.py | 2 +- 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/extract_html_format_test.py b/tests/extract_html_format_test.py index 855981e..bd1eda0 100644 --- a/tests/extract_html_format_test.py +++ b/tests/extract_html_format_test.py @@ -108,3 +108,19 @@ def test_text(self): "

Text for section 5.1\n\n\n

" ) ) + + def test_with_erroneous_edit(self): + page = self.wiki.page('Test_Edit') + self.maxDiff = None + section = page.section_by_title('Section with Edit') + self.assertEqual(section.title, 'Section with Edit') + self.assertEqual( + page.text, + ( + "

Summary text\n\n

\n\n" + + "

Section 1

\n" + + "

Text for section 1

\n\n" + "

Section with Edit

\n" + + "

Text for section with edit\n\n\n

" + ) + ) diff --git a/tests/mock_data.py b/tests/mock_data.py index e13d134..822e8da 100644 --- a/tests/mock_data.py +++ b/tests/mock_data.py @@ -108,6 +108,36 @@ def wikipedia_api_request(page, params): } } }, + 'en:action=query&prop=extracts&titles=Test_Edit&': { + "batchcomplete": "", + "warnings": { + "extracts": { + "*": "\"exlimit\" was too large for a whole article extracts request, lowered to 1." + } + }, + "query": { + "normalized": [ + { + "from": "Test_Edit", + "to": "Test Edit" + } + ], + "pages": { + "4": { + "pageid": 4, + "ns": 0, + "title": "Test Edit", + "extract": ( + "

Summary text\n\n

\n" + + "

Section 1

\n" + + "

Text for section 1

\n\n\n" + + "

Section with EditEdit

\n" + + "

Text for section with edit\n\n\n

" + ) + } + } + } + }, 'en:action=query&inprop=protection|talkid|watched|watchers|visitingwatchers|notificationtimestamp|subjectid|url|readable|preload|displaytitle&prop=info&titles=Test_1&': { "batchcomplete": "", "query": { diff --git a/wikipediaapi/wikipedia.py b/wikipediaapi/wikipedia.py index 48eaa3d..a05021e 100644 --- a/wikipediaapi/wikipedia.py +++ b/wikipediaapi/wikipedia.py @@ -65,7 +65,7 @@ class Namespace(object): ExtractFormat.HTML: re.compile( r'\n? *]*?>(]*><\/span>)? *' + '(]*>)? *(]*><\/span>)? *(.*?) *' + - '(<\/span>)?<\/h\d>\n?' + '(<\/span>)?(Edit<\/span>)?<\/h\d>\n?' # Example page with 'Edit' erroneous links: https://bit.ly/2ui4FWs ), # ExtractFormat.PLAIN.value: re.compile(r'\n\n *(===*) (.*?) (===*) *\n'), } From 0e0c84a5d3ba18926a66ae9207b927719a18f400 Mon Sep 17 00:00:00 2001 From: Lindsey Sawatzky Date: Sat, 24 Mar 2018 12:40:20 -0700 Subject: [PATCH 2/2] Maintain < 80 line length standard. --- wikipediaapi/wikipedia.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wikipediaapi/wikipedia.py b/wikipediaapi/wikipedia.py index a05021e..029e493 100644 --- a/wikipediaapi/wikipedia.py +++ b/wikipediaapi/wikipedia.py @@ -65,7 +65,9 @@ class Namespace(object): ExtractFormat.HTML: re.compile( r'\n? *]*?>(]*><\/span>)? *' + '(]*>)? *(]*><\/span>)? *(.*?) *' + - '(<\/span>)?(Edit<\/span>)?<\/h\d>\n?' # Example page with 'Edit' erroneous links: https://bit.ly/2ui4FWs + '(<\/span>)?(Edit<\/span>)?<\/h\d>\n?' + # ^^^^ + # Example page with 'Edit' erroneous links: https://bit.ly/2ui4FWs ), # ExtractFormat.PLAIN.value: re.compile(r'\n\n *(===*) (.*?) (===*) *\n'), }