diff --git a/CHANGELOG.md b/CHANGELOG.md index db1b44a..d79ef2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Change log level of "Video at {url} has not yet been translated into {requested_lang_code}" messages from warning to debug (way too verbose) + +### Fixed + +- Restore functionality to resist temporary bad TED responses when parsing video pages (#209) + ## [3.0.2] - 2024-06-24 ### Changed diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..e3ae2cc --- /dev/null +++ b/codecov.yml @@ -0,0 +1,11 @@ +coverage: + status: + project: + default: + informational: true + patch: + default: + informational: true + changes: + default: + informational: true diff --git a/src/ted2zim/scraper.py b/src/ted2zim/scraper.py index 1f2df51..12ff7cb 100644 --- a/src/ted2zim/scraper.py +++ b/src/ted2zim/scraper.py @@ -12,7 +12,7 @@ import dateutil.parser import jinja2 import yt_dlp -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from kiwixstorage import KiwixStorage from pif import get_public_ip from slugify import slugify @@ -821,15 +821,32 @@ def extract_info_from_video_page( try: soup = BeautifulSoup(html_content, features="html.parser") - json_data = json.loads( - soup.find( - "script", attrs={"id": "__NEXT_DATA__"} - ).string # pyright: ignore - )["props"]["pageProps"]["videoData"] + next_data_tag = soup.find("script", attrs={"id": "__NEXT_DATA__"}) + + # TED is sometimes inconsistent in sending HTML content; it sometimes sends + # the HTML without the required script containing the talks data, so we + # retry after 5 seconds + if ( + not next_data_tag + or not isinstance(next_data_tag, Tag) + or not isinstance(next_data_tag.string, str) + ): + logger.debug( + "Insufficient data returned by server, __NEXT_DATA__ script not " + "found in HTML page. Retrying in 5 seconds..." 
+ ) + time.sleep(5) + return self.extract_info_from_video_page( + url, retry_count=retry_count + 1 + ) + + json_data = json.loads(next_data_tag.string)["props"]["pageProps"][ + "videoData" + ] requested_lang_code = self.get_lang_code_from_url(url) if requested_lang_code and json_data["language"] != requested_lang_code: - logger.warning( + logger.debug( f"Video at {url} has not yet been translated into " f"{requested_lang_code}" )