From ff37534a726dd81af5a8054d3709669332c78ec7 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Wed, 24 Jul 2024 15:18:23 -0400 Subject: [PATCH 1/3] Convert durations to ISO8601 format (podcast episodes) --- learning_resources/etl/podcast.py | 3 ++- learning_resources/etl/podcast_test.py | 4 +-- learning_resources/etl/utils.py | 34 ++++++++++++++++++++++++++ learning_resources/etl/utils_test.py | 26 ++++++++++++++++++++ 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index cd466443c0..bc01071c9a 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -12,6 +12,7 @@ from learning_resources.constants import LearningResourceType from learning_resources.etl.constants import ETLSource +from learning_resources.etl.utils import iso8601_duration from learning_resources.models import PodcastEpisode from main.utils import clean_data, frontend_absolute_url, now_in_utc @@ -174,7 +175,7 @@ def transform_episode(rss_data, offered_by, topics, parent_image): "podcast_episode": { "episode_link": rss_data.link.text if rss_data.link else None, "duration": ( - rss_data.find("itunes:duration").text + iso8601_duration(rss_data.find("itunes:duration").text) if rss_data.find("itunes:duration") else None ), diff --git a/learning_resources/etl/podcast_test.py b/learning_resources/etl/podcast_test.py index f48ec454c8..1cdbe6c3e3 100644 --- a/learning_resources/etl/podcast_test.py +++ b/learning_resources/etl/podcast_test.py @@ -162,7 +162,7 @@ def test_transform(mock_github_client, title, topics, offered_by): "published": True, "podcast_episode": { "episode_link": "https://soundcloud.com/podcast/episode1", - "duration": "00:17:16", + "duration": "PT17M16S", "rss": episodes_rss[0].prettify(), }, "resource_type": LearningResourceType.podcast_episode.name, @@ -185,7 +185,7 @@ def test_transform(mock_github_client, title, topics, offered_by): "published": True, "podcast_episode": { "episode_link": "https://soundcloud.com/podcast/episode2", - "duration": "00:17:16", + "duration": "PT17M16S", "rss": episodes_rss[1].prettify(), }, "resource_type": LearningResourceType.podcast_episode.name, diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index a5a69875d6..341793153b 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -21,6 +21,7 @@ import rapidjson import requests from django.conf import settings +from django.utils.dateparse import parse_duration from django.utils.functional import SimpleLazyObject from django.utils.text import slugify from tika import parser as tika_parser @@ -703,3 +704,36 @@ def parse_certification(offeror, runs_data): if (availability and availability != AvailabilityType.archived.value) ] ) + + +def iso8601_duration(duration_str: str) -> str or None: + """ + Parse the duration from a string and return it in ISO-8601 format + + Args: + duration_str (str): The duration as a string in one of various formats + + Returns: + str: the duration in ISO-8601 format + """ + if not duration_str: + return None + delta = parse_duration(duration_str) + if delta is None: + return None + + # Extract components + hours, remainder = divmod(delta.total_seconds(), 3600) + minutes, seconds = divmod(remainder, 60) + + # Build the duration string + if hours or minutes or seconds: + duration = "PT" + if hours: + duration += f"{int(hours)}H" + if minutes: + duration += f"{int(minutes)}M" + if seconds: + duration += f"{int(seconds or 0)}S" + return duration + return "PT0S" diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 363da4a74a..86b5845eb9 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -460,3 +460,29 @@ def test_calc_checksum(previous_archive, identical): def test_get_department_id_by_name(dept_name, dept_id): """Test that the correct department ID (if any) is returned""" assert utils.get_department_id_by_name(dept_name) == dept_id + + +@pytest.mark.parametrize( + ("duration_str", "expected"), + [ + ("1:00:00", "PT1H"), + ("1:30:04", "PT1H30M4S"), + ("00:00", "PT0S"), + ("00:00:00", "PT0S"), + ("00:01:00", "PT1M"), + ("01:00:00", "PT1H"), + ("00:00:01", "PT1S"), + ("02:59", "PT2M59S"), + ("72:59", "PT1H12M59S"), + ("3675", "PT1H1M15S"), + ("5", "PT5S"), + ("PT1H30M4S", "PT1H30M4S"), + ("", None), + (None, None), + ("bad_duration", None), + ("PTBarnum", None), + ], +) +def test_parse_duration(duration_str, expected): + """Test that parse_duration returns the expected duration""" + assert utils.iso8601_duration(duration_str) == expected From acd332494cd25b35e9545385e5b5c3fefb9efc4b Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Wed, 24 Jul 2024 15:58:06 -0400 Subject: [PATCH 2/3] Simplify the new function a bit --- learning_resources/etl/utils.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 341793153b..491e2549eb 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -722,18 +722,12 @@ def iso8601_duration(duration_str: str) -> str or None: if delta is None: return None - # Extract components hours, remainder = divmod(delta.total_seconds(), 3600) minutes, seconds = divmod(remainder, 60) - # Build the duration string if hours or minutes or seconds: - duration = "PT" - if hours: - duration += f"{int(hours)}H" - if minutes: - duration += f"{int(minutes)}M" - if seconds: - duration += f"{int(seconds or 0)}S" - return duration + hour_duration = f"{int(hours)}H" if hours else "" + minute_duration = f"{int(minutes)}M" if minutes else "" + second_duration = f"{int(seconds)}S" if seconds else "" + return f"PT{hour_duration}{minute_duration}{second_duration}" return "PT0S" From c86e4609db3939e9eb65bcecfa6aac72bf953d5c Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Thu, 25 Jul 2024 14:54:31 -0400 Subject: [PATCH 3/3] Log a warning if a non-empty/blank duration cannot be parsed into a timedelta --- learning_resources/etl/utils.py | 1 + learning_resources/etl/utils_test.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index 491e2549eb..6d62c70c7e 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -720,6 +720,7 @@ def iso8601_duration(duration_str: str) -> str or None: return None delta = parse_duration(duration_str) if delta is None: + log.warning("Could not parse duration string %s", duration_str) return None hours, remainder = divmod(delta.total_seconds(), 3600) diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py index 86b5845eb9..f0ba8e9f1e 100644 --- a/learning_resources/etl/utils_test.py +++ b/learning_resources/etl/utils_test.py @@ -483,6 +483,8 @@ def test_get_department_id_by_name(dept_name, dept_id): ("PTBarnum", None), ], ) -def test_parse_duration(duration_str, expected): +def test_parse_duration(mocker, duration_str, expected): """Test that parse_duration returns the expected duration""" + mock_warn = mocker.patch("learning_resources.etl.utils.log.warning") assert utils.iso8601_duration(duration_str) == expected + assert mock_warn.call_count == (1 if duration_str and expected is None else 0)