From a59ba09350b0a976f2b06e9eefab88fd9cf8a7df Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Mon, 8 Jul 2024 13:09:20 -0400 Subject: [PATCH 01/10] change podcast/episode readable_ids, make sure unpublished ones get deindexed --- learning_resources/etl/loaders.py | 23 +++++++++++++++---- learning_resources/etl/podcast.py | 6 ++--- ...0056_alter_learningresource_readable_id.py | 17 ++++++++++++++ learning_resources/models.py | 2 +- 4 files changed, 39 insertions(+), 9 deletions(-) create mode 100644 learning_resources/migrations/0056_alter_learningresource_readable_id.py diff --git a/learning_resources/etl/loaders.py b/learning_resources/etl/loaders.py index b2076c60a5..91020fedf1 100644 --- a/learning_resources/etl/loaders.py +++ b/learning_resources/etl/loaders.py @@ -682,6 +682,10 @@ def load_podcast(podcast_data: dict) -> LearningResource: LearningResource.objects.filter(id__in=unpublished_episode_ids).update( published=False ) + bulk_resources_unpublished_actions( + unpublished_episode_ids, + LearningResourceType.podcast_episode.name, + ) episode_ids.extend(unpublished_episode_ids) learning_resource.resources.set( episode_ids, @@ -719,13 +723,22 @@ def load_podcasts(podcasts_data: list[dict]) -> list[LearningResource]: # unpublish the podcasts and episodes we're no longer tracking ids = [podcast.id for podcast in podcast_resources] - LearningResource.objects.filter( + unpublished_podcasts = LearningResource.objects.filter( resource_type=LearningResourceType.podcast.name - ).exclude(id__in=ids).update(published=False) - LearningResource.objects.filter( + ).exclude(id__in=ids) + unpublished_podcasts.update(published=False) + bulk_resources_unpublished_actions( + unpublished_podcasts.values_list("id", flat=True), + LearningResourceType.podcast.name, + ) + unpublished_episodes = LearningResource.objects.filter( resource_type=LearningResourceType.podcast_episode.name - ).exclude(parents__parent__in=ids).update(published=False) - + ).exclude(parents__parent__in=ids) + unpublished_episodes.update(published=False) + bulk_resources_unpublished_actions( + unpublished_episodes.values_list("id", flat=True), + LearningResourceType.podcast_episode.name, + ) return podcast_resources diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 332dc944b0..73de6e8981 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -12,7 +12,7 @@ from learning_resources.constants import LearningResourceType from learning_resources.etl.constants import ETLSource -from learning_resources.etl.utils import clean_data, generate_readable_id +from learning_resources.etl.utils import clean_data from learning_resources.models import PodcastEpisode from main.utils import frontend_absolute_url, now_in_utc @@ -143,7 +143,7 @@ def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id): rss_data.guid.string = f"{podcast_id}: {rss_data.guid.text}" return { - "readable_id": generate_readable_id(rss_data.title.text[:95]), + "readable_id": rss_data.guid.text, "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast_episode.name, "title": rss_data.title.text, @@ -203,7 +203,7 @@ def transform(extracted_podcasts): apple_podcasts_url = config_data.get("apple_podcasts_url") google_podcasts_url = config_data.get("google_podcasts_url") title = config_data.get("podcast_title", rss_data.channel.title.text) - podcast_id = generate_readable_id(title[:95]) + podcast_id = rss_data.guid.text yield { "readable_id": podcast_id, diff --git a/learning_resources/migrations/0056_alter_learningresource_readable_id.py b/learning_resources/migrations/0056_alter_learningresource_readable_id.py new file mode 100644 index 0000000000..69ee099aeb --- /dev/null +++ b/learning_resources/migrations/0056_alter_learningresource_readable_id.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.13 on 2024-07-08 16:30 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("learning_resources", "0055_alter_relationship_options_ordering"), + ] + + operations = [ + migrations.AlterField( + model_name="learningresource", + name="readable_id", + field=models.CharField(max_length=512), + ), + ] diff --git a/learning_resources/models.py b/learning_resources/models.py index 5e1fa5a9ad..4de9909f74 100644 --- a/learning_resources/models.py +++ b/learning_resources/models.py @@ -185,7 +185,7 @@ class LearningResource(TimestampedModel): *([item.name for item in LearningResourceType]), ] - readable_id = models.CharField(max_length=128, null=False, blank=False) + readable_id = models.CharField(max_length=512, null=False, blank=False) title = models.CharField(max_length=256) description = models.TextField(null=True, blank=True) # noqa: DJ001 full_description = models.TextField(null=True, blank=True) # noqa: DJ001 From e45b6840e6a0c5005acc7b7311d4102ddf082217 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Mon, 8 Jul 2024 16:06:16 -0400 Subject: [PATCH 02/10] mgmt command for migrating learningpath/userlist relationships from obsolete unpublished resources to published replacements --- .../commands/migrate_list_resources.py | 63 +++++++++++++++++++ learning_resources/utils.py | 53 ++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 learning_resources/management/commands/migrate_list_resources.py diff --git a/learning_resources/management/commands/migrate_list_resources.py b/learning_resources/management/commands/migrate_list_resources.py new file mode 100644 index 0000000000..45b2cc142e --- /dev/null +++ b/learning_resources/management/commands/migrate_list_resources.py @@ -0,0 +1,63 @@ +"""Management command for migrating unpublished resource path/list relationships""" + +from django.core.management import BaseCommand + +from learning_resources.utils import migrate_list_resources +from main.utils import now_in_utc + + +class Command(BaseCommand): + """ + Migrate relationships in learningpaths and userlists from unpublished + resources to published replacement resources. + """ + + help = "Migrate relationships from unpublished resources to published resources." + + def add_arguments(self, parser): + parser.add_argument("resource_type", type=str, help="Resource type to migrate") + parser.add_argument( + "match_field", type=str, help="Resource field to match resources by" + ) + parser.add_argument( + "from_source", type=str, help="ETL Source for unpublished resources" + ) + parser.add_argument( + "to_source", type=str, help="ETL Source for published resources" + ) + parser.add_argument( + "--delete", + dest="delete", + action="store_true", + help="Delete unpublished resources after migrating relationships", + ) + super().add_arguments(parser) + + def handle(self, *args, **options): # noqa: ARG002 + """ + Migrate relationships in learningpaths and userlists from unpublished + resources to published replacement resources. + """ + resource_type = options["resource_type"] + match_field = options["match_field"] + from_source = options["from_source"] + to_source = options["to_source"] + delete = options["delete"] + + self.stdout.write( + f"Migrate {resource_type} relationships from " + f"{from_source} to {to_source}, matching on {match_field}" + ) + start = now_in_utc() + unpublished, matching = migrate_list_resources( + resource_type, + match_field, + from_source, + to_source, + delete_unpublished=delete, + ) + total_seconds = (now_in_utc() - start).total_seconds() + self.stdout.write( + f"Processed {unpublished} resources and found {matching} " + f"published matches, took {total_seconds} seconds" + ) diff --git a/learning_resources/utils.py b/learning_resources/utils.py index e22d43230d..a98fcf4e02 100644 --- a/learning_resources/utils.py +++ b/learning_resources/utils.py @@ -16,6 +16,7 @@ from learning_resources.constants import ( GROUP_STAFF_LISTS_EDITORS, + LearningResourceRelationTypes, semester_mapping, ) from learning_resources.hooks import get_plugin_manager @@ -24,9 +25,11 @@ LearningResourceDepartment, LearningResourceOfferor, LearningResourcePlatform, + LearningResourceRelationship, LearningResourceRun, LearningResourceSchool, LearningResourceTopic, + UserListRelationship, ) from main.utils import generate_filepath @@ -559,3 +562,53 @@ def add_parent_topics_to_learning_resource(resource): for topic in resource.topics.all(): if topic.parent: _walk_lr_topic_parents(resource, topic.parent) + + +def migrate_list_resources( + resource_type: str, + matching_field: str, + from_source: str, + to_source: str, + *, + delete_unpublished: bool = False, +) -> tuple[int, int]: + """ + Migrate learningpath/userlist resources that have been replaced with + a new resource object. + + Args: + resource_type (str): the resource type + matching_field (str): the unique field to match identical resources + from_source (str): the ETL source of unpublished resources + to_source (str): the ETL source of published resources + delete_unpublished (bool): whether to delete the unpublished resources + + Returns: + tuple[int, int]: the number of unpublished and matching published resources + """ + unpublished_resources = LearningResource.objects.filter( + resource_type=resource_type, published=False, etl_source=from_source + ) + unpublished_count = 0 + published_count = 0 + for resource in unpublished_resources: + unpublished_count += 1 + unique_value = getattr(resource, matching_field) + published_replacement = LearningResource.objects.filter( + resource_type=resource_type, + published=True, + etl_source=to_source, + **{matching_field: unique_value}, + ).first() + if published_replacement: + published_count += 1 + LearningResourceRelationship.objects.filter( + relation_type=LearningResourceRelationTypes.LEARNING_PATH_ITEMS.value, + child=resource, + ).update(child=published_replacement) + UserListRelationship.objects.filter(child=resource).update( + child=published_replacement + ) + if delete_unpublished: + unpublished_resources.delete() + return unpublished_count, published_count From b6afe7cfc1511fd226b104a8834b64108c400ea9 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Mon, 8 Jul 2024 16:15:24 -0400 Subject: [PATCH 03/10] update openapi spec --- .pre-commit-config.yaml | 2 +- openapi/specs/v1.yaml | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f50778607d..40cc1a88b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: check-toml - id: debug-statements - repo: https://github.com/pre-commit/mirrors-prettier - rev: v4.0.0-alpha.8 + rev: v3.1.0 hooks: - id: prettier types_or: diff --git a/openapi/specs/v1.yaml b/openapi/specs/v1.yaml index 1517ae72fc..630d0ed113 100644 --- a/openapi/specs/v1.yaml +++ b/openapi/specs/v1.yaml @@ -7531,7 +7531,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -7598,7 +7598,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -9815,7 +9815,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -9882,7 +9882,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -10065,7 +10065,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -10132,7 +10132,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -10452,7 +10452,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -10519,7 +10519,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -10951,7 +10951,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -11018,7 +11018,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -11191,7 +11191,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -11258,7 +11258,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 From 742af5ce302a66d268e7f8de39983d94a05a9f7e Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Tue, 9 Jul 2024 10:31:41 -0400 Subject: [PATCH 04/10] Fix some tests --- learning_resources/etl/podcast.py | 11 +++-------- learning_resources/etl/podcast_test.py | 14 +++----------- test_html/test_podcast.rss | 1 + 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 73de6e8981..9f21997b07 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -125,7 +125,7 @@ def extract(): log.exception("Invalid rss url %s", rss_url) -def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id): +def transform_episode(rss_data, offered_by, topics, parent_image): """ Transform a podcast episode into our normalized data @@ -140,8 +140,6 @@ def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id): normalized podcast episode data """ - rss_data.guid.string = f"{podcast_id}: {rss_data.guid.text}" - return { "readable_id": rss_data.guid.text, "etl_source": ETLSource.podcast.name, @@ -203,10 +201,9 @@ def transform(extracted_podcasts): apple_podcasts_url = config_data.get("apple_podcasts_url") google_podcasts_url = config_data.get("google_podcasts_url") title = config_data.get("podcast_title", rss_data.channel.title.text) - podcast_id = rss_data.guid.text yield { - "readable_id": podcast_id, + "readable_id": rss_data.guid.text, "title": title, "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast.name, @@ -217,9 +214,7 @@ def transform(extracted_podcasts): "url": config_data["website"], "topics": topics, "episodes": ( - transform_episode( - episode_rss, offered_by, topics, image, podcast_id - ) + transform_episode(episode_rss, offered_by, topics, image) for episode_rss in rss_data.find_all("item") ), "podcast": { diff --git a/learning_resources/etl/podcast_test.py b/learning_resources/etl/podcast_test.py index 3012c6de62..bd65fd959f 100644 --- a/learning_resources/etl/podcast_test.py +++ b/learning_resources/etl/podcast_test.py @@ -123,22 +123,14 @@ def test_transform(mock_github_client, title, topics, offered_by): ) expected_title = title if title else "A Podcast" - expected_readable_id = ( - "custom-titleb04b26d38dd63a2c829393e9e075927d" - if title - else "a-podcast7e3a1ebb0c4d3196ba4c7f8254af4d2d" - ) expected_offered_by = {"name": offered_by} if offered_by else None episodes_rss = list(bs(rss_content(), "xml").find_all("item")) - for episode in episodes_rss: - episode.guid.string = f"{expected_readable_id}: {episode.guid.text}" - expected_results = [ { - "readable_id": expected_readable_id, + "readable_id": "tag:soundcloud,2010:tracks/1857159426", "etl_source": ETLSource.podcast.name, "title": expected_title, "offered_by": expected_offered_by, @@ -155,7 +147,7 @@ def test_transform(mock_github_client, title, topics, offered_by): "topics": expected_topics, "episodes": [ { - "readable_id": "episode15ede89915db9342fb76bc91918d22016", + "readable_id": "tag:soundcloud,2010:tracks/numbers1", "etl_source": ETLSource.podcast.name, "title": "Episode1", "offered_by": expected_offered_by, @@ -178,7 +170,7 @@ def test_transform(mock_github_client, title, topics, offered_by): "topics": expected_topics, }, { - "readable_id": "episode205c066df9ed531e48c6414f6e72d3b96", + "readable_id": "tag:soundcloud,2010:tracks/numbers2", "etl_source": ETLSource.podcast.name, "title": "Episode2", "offered_by": expected_offered_by, diff --git a/test_html/test_podcast.rss b/test_html/test_podcast.rss index ef6e7f9a17..a712805392 100644 --- a/test_html/test_podcast.rss +++ b/test_html/test_podcast.rss @@ -26,6 +26,7 @@ https://awebsite.mit.edu/ + tag:soundcloud,2010:tracks/1857159426 tag:soundcloud,2010:tracks/numbers1 Episode1 From a4344e6fcd997b194c6cf1a29bc25f252c9d165c Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Tue, 9 Jul 2024 13:09:46 -0400 Subject: [PATCH 05/10] Addtional test --- learning_resources/utils.py | 4 +-- learning_resources/utils_test.py | 61 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/learning_resources/utils.py b/learning_resources/utils.py index a98fcf4e02..628d496746 100644 --- a/learning_resources/utils.py +++ b/learning_resources/utils.py @@ -595,12 +595,12 @@ def migrate_list_resources( unpublished_count += 1 unique_value = getattr(resource, matching_field) published_replacement = LearningResource.objects.filter( + **{matching_field: unique_value}, resource_type=resource_type, published=True, etl_source=to_source, - **{matching_field: unique_value}, ).first() - if published_replacement: + if published_replacement is not None: published_count += 1 LearningResourceRelationship.objects.filter( relation_type=LearningResourceRelationTypes.LEARNING_PATH_ITEMS.value, diff --git a/learning_resources/utils_test.py b/learning_resources/utils_test.py index 8e83946a5e..75cb016d90 100644 --- a/learning_resources/utils_test.py +++ b/learning_resources/utils_test.py @@ -12,20 +12,26 @@ CONTENT_TYPE_FILE, CONTENT_TYPE_PDF, CONTENT_TYPE_VIDEO, + LearningResourceRelationTypes, ) from learning_resources.etl.utils import get_content_type from learning_resources.factories import ( CourseFactory, + LearningPathFactory, + LearningResourceFactory, LearningResourceRunFactory, LearningResourceTopicFactory, + UserListFactory, ) from learning_resources.models import ( + LearningResource, LearningResourceOfferor, LearningResourcePlatform, LearningResourceTopic, ) from learning_resources.utils import ( add_parent_topics_to_learning_resource, + migrate_list_resources, upsert_topic_data, ) @@ -324,3 +330,58 @@ def test_upsert_offered_by(mocker): ) assert offeror.name == offered_by_data["fields"]["name"] mock_upsert.assert_any_call(offeror, overwrite=True) + + +@pytest.mark.parametrize( + ("matching_field", "from_source", "to_source", "matches", "delete_old"), + [ + ("url", "podcast", "podcast", True, False), + ("url", "podcast", "podcast", True, True), + ("url", "podcast", "xpro", False, False), + ("readable_id", "podcast", "podcast", False, False), + ], +) +def test_migrate_list_resources( + matching_field, from_source, to_source, matches, delete_old +): + """Test that the migrate_list_resources function works as expected.""" + original_podcasts = LearningResourceFactory.create_batch( + 5, is_podcast=True, etl_source=from_source, published=False + ) + podcast_path = LearningPathFactory.create().learning_resource + podcast_path.resources.set( + original_podcasts, + through_defaults={ + "relation_type": LearningResourceRelationTypes.LEARNING_PATH_ITEMS + }, + ) + podcast_list = UserListFactory.create() + podcast_list.resources.set(original_podcasts) + + new_podcasts = [ + LearningResourceFactory.create( + is_podcast=True, url=old_podcast.url, etl_source=from_source + ) + for old_podcast in original_podcasts + ] + + results = migrate_list_resources( + "podcast", matching_field, from_source, to_source, delete_unpublished=delete_old + ) + podcast_path.refresh_from_db() + podcast_list.refresh_from_db() + + assert results == ((5, 5) if matches else (5, 0)) + list_podcasts = ( + new_podcasts if matches else (original_podcasts if not delete_old else []) + ) + for podcast in list_podcasts: + assert podcast in podcast_path.resources.all() + assert podcast in podcast_list.resources.all() + if delete_old: + assert ( + LearningResource.objects.filter( + id__in=[podcast.id for podcast in original_podcasts] + ).count() + == 0 + ) From f35018edad5915ae85a2efc974692a71a6099032 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Tue, 9 Jul 2024 13:51:03 -0400 Subject: [PATCH 06/10] revert precommit config change --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 40cc1a88b9..f50778607d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: - id: check-toml - id: debug-statements - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.1.0 + rev: v4.0.0-alpha.8 hooks: - id: prettier types_or: From bd40353ca2585aaf49fcb555f77f9da9930f82a9 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Tue, 9 Jul 2024 14:09:48 -0400 Subject: [PATCH 07/10] Rename the new utility function --- ...grate_list_resources.py => transfer_list_resources.py} | 4 ++-- learning_resources/utils.py | 6 +++--- learning_resources/utils_test.py | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) rename learning_resources/management/commands/{migrate_list_resources.py => transfer_list_resources.py} (94%) diff --git a/learning_resources/management/commands/migrate_list_resources.py b/learning_resources/management/commands/transfer_list_resources.py similarity index 94% rename from learning_resources/management/commands/migrate_list_resources.py rename to learning_resources/management/commands/transfer_list_resources.py index 45b2cc142e..03b4ab4e36 100644 --- a/learning_resources/management/commands/migrate_list_resources.py +++ b/learning_resources/management/commands/transfer_list_resources.py @@ -2,7 +2,7 @@ from django.core.management import BaseCommand -from learning_resources.utils import migrate_list_resources +from learning_resources.utils import transfer_list_resources from main.utils import now_in_utc @@ -49,7 +49,7 @@ def handle(self, *args, **options): # noqa: ARG002 f"{from_source} to {to_source}, matching on {match_field}" ) start = now_in_utc() - unpublished, matching = migrate_list_resources( + unpublished, matching = transfer_list_resources( resource_type, match_field, from_source, diff --git a/learning_resources/utils.py b/learning_resources/utils.py index 628d496746..90b3795cab 100644 --- a/learning_resources/utils.py +++ b/learning_resources/utils.py @@ -564,7 +564,7 @@ def add_parent_topics_to_learning_resource(resource): _walk_lr_topic_parents(resource, topic.parent) -def migrate_list_resources( +def transfer_list_resources( resource_type: str, matching_field: str, from_source: str, @@ -573,8 +573,8 @@ def migrate_list_resources( delete_unpublished: bool = False, ) -> tuple[int, int]: """ - Migrate learningpath/userlist resources that have been replaced with - a new resource object. + Migrate unpublished learningpath/userlist resources that have + been replaced with new resource objects. Args: resource_type (str): the resource type diff --git a/learning_resources/utils_test.py b/learning_resources/utils_test.py index 75cb016d90..177d7a0c5f 100644 --- a/learning_resources/utils_test.py +++ b/learning_resources/utils_test.py @@ -31,7 +31,7 @@ ) from learning_resources.utils import ( add_parent_topics_to_learning_resource, - migrate_list_resources, + transfer_list_resources, upsert_topic_data, ) @@ -341,10 +341,10 @@ def test_upsert_offered_by(mocker): ("readable_id", "podcast", "podcast", False, False), ], ) -def test_migrate_list_resources( +def test_transfer_list_resources( matching_field, from_source, to_source, matches, delete_old ): - """Test that the migrate_list_resources function works as expected.""" + """Test that the transfer_list_resources function works as expected.""" original_podcasts = LearningResourceFactory.create_batch( 5, is_podcast=True, etl_source=from_source, published=False ) @@ -365,7 +365,7 @@ def test_migrate_list_resources( for old_podcast in original_podcasts ] - results = migrate_list_resources( + results = transfer_list_resources( "podcast", matching_field, from_source, to_source, delete_unpublished=delete_old ) podcast_path.refresh_from_db() From 28f03bb2d1dd3161319122c500666a4774ddf825 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Wed, 10 Jul 2024 09:48:14 -0400 Subject: [PATCH 08/10] Use image variable for transforming podcast --- learning_resources/etl/loaders.py | 2 ++ learning_resources/etl/podcast.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/learning_resources/etl/loaders.py b/learning_resources/etl/loaders.py index 91020fedf1..c09c5d4428 100644 --- a/learning_resources/etl/loaders.py +++ b/learning_resources/etl/loaders.py @@ -723,9 +723,11 @@ def load_podcasts(podcasts_data: list[dict]) -> list[LearningResource]: # unpublish the podcasts and episodes we're no longer tracking ids = [podcast.id for podcast in podcast_resources] + log.info("PODCAST ID COUNT: %d", len(ids)) unpublished_podcasts = LearningResource.objects.filter( resource_type=LearningResourceType.podcast.name ).exclude(id__in=ids) + log.info("UNPUBLISHED PODCAST ID COUNT: %d", len(unpublished_podcasts)) unpublished_podcasts.update(published=False) bulk_resources_unpublished_actions( unpublished_podcasts.values_list("id", flat=True), diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 9f21997b07..96a0d64942 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -184,7 +184,7 @@ def transform(extracted_podcasts): for rss_data, config_data in extracted_podcasts: try: image = ( - rss_data.channel.find("itunes:image")["href"] + {"url": rss_data.channel.find("itunes:image")["href"]} if rss_data.channel.find("itunes:image") else None ) @@ -209,7 +209,7 @@ def transform(extracted_podcasts): "resource_type": LearningResourceType.podcast.name, "offered_by": offered_by, "description": clean_data(rss_data.channel.description.text), - "image": {"url": rss_data.channel.find("itunes:image")["href"]}, + "image": image, "published": True, "url": config_data["website"], "topics": topics, From b0399da9b8c9cfb965fd5a06fb6f45190d4e3304 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Wed, 10 Jul 2024 10:57:30 -0400 Subject: [PATCH 09/10] Turns out plenty of rss feeds don't have guids except for episodes --- learning_resources/etl/loaders.py | 2 -- learning_resources/etl/podcast.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/learning_resources/etl/loaders.py b/learning_resources/etl/loaders.py index c09c5d4428..91020fedf1 100644 --- a/learning_resources/etl/loaders.py +++ b/learning_resources/etl/loaders.py @@ -723,11 +723,9 @@ def load_podcasts(podcasts_data: list[dict]) -> list[LearningResource]: # unpublish the podcasts and episodes we're no longer tracking ids = [podcast.id for podcast in podcast_resources] - log.info("PODCAST ID COUNT: %d", len(ids)) unpublished_podcasts = LearningResource.objects.filter( resource_type=LearningResourceType.podcast.name ).exclude(id__in=ids) - log.info("UNPUBLISHED PODCAST ID COUNT: %d", len(unpublished_podcasts)) unpublished_podcasts.update(published=False) bulk_resources_unpublished_actions( unpublished_podcasts.values_list("id", flat=True), diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 96a0d64942..42ac0ef34e 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -8,6 +8,7 @@ from bs4 import BeautifulSoup as bs # noqa: N813 from dateutil.parser import parse from django.conf import settings +from django.utils.text import slugify from requests.exceptions import HTTPError from learning_resources.constants import LearningResourceType @@ -141,7 +142,8 @@ def transform_episode(rss_data, offered_by, topics, parent_image): """ return { - "readable_id": rss_data.guid.text, + "readable_id": rss_data.guid.text + or slugify(rss_data.link.text if rss_data.link else rss_data.enclosure["url"]), "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast_episode.name, "title": rss_data.title.text, @@ -203,7 +205,7 @@ def transform(extracted_podcasts): title = config_data.get("podcast_title", rss_data.channel.title.text) yield { - "readable_id": rss_data.guid.text, + "readable_id": slugify(rss_data.channel.link or config_data["rss_url"]), "title": title, "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast.name, From 9fbb9de88e323a59c5d4f5cefa330410946649f2 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Wed, 10 Jul 2024 11:39:27 -0400 Subject: [PATCH 10/10] refactor readable ids --- learning_resources/etl/podcast.py | 30 ++++++++++++++++++-------- learning_resources/etl/podcast_test.py | 15 ++++++------- test_html/test_podcast.rss | 1 - 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 42ac0ef34e..a6e40379a8 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -8,7 +8,6 @@ from bs4 import BeautifulSoup as bs # noqa: N813 from dateutil.parser import parse from django.conf import settings -from django.utils.text import slugify from requests.exceptions import HTTPError from learning_resources.constants import LearningResourceType @@ -101,6 +100,19 @@ def get_podcast_configs(): return podcast_configs +def parse_readable_id_from_url(url): + """ + Parse readable id from podcast/episode url + + Args: + url (str): the podcast/episode url + + Returns: + str: the readable id + """ + return url.split("//")[-1] + + def extract(): """ Function for extracting podcast data @@ -143,7 +155,9 @@ def transform_episode(rss_data, offered_by, topics, parent_image): return { "readable_id": rss_data.guid.text - or slugify(rss_data.link.text if rss_data.link else rss_data.enclosure["url"]), + or parse_readable_id_from_url( + rss_data.link.text if rss_data.link else rss_data.enclosure["url"] + ), "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast_episode.name, "title": rss_data.title.text, @@ -151,12 +165,10 @@ def transform_episode(rss_data, offered_by, topics, parent_image): "description": clean_data(rss_data.description.text), "url": rss_data.enclosure["url"], "image": { - "url": ( - rss_data.find("image")["href"] - if rss_data.find("image") - else parent_image - ), - }, + "url": (rss_data.find("image")["href"]), + } + if rss_data.find("image") + else parent_image, "last_modified": parse(rss_data.pubDate.text), "published": True, "topics": topics, @@ -205,7 +217,7 @@ def transform(extracted_podcasts): title = config_data.get("podcast_title", rss_data.channel.title.text) yield { - "readable_id": slugify(rss_data.channel.link or config_data["rss_url"]), + "readable_id": parse_readable_id_from_url(config_data["rss_url"]), "title": title, "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast.name, diff --git a/learning_resources/etl/podcast_test.py b/learning_resources/etl/podcast_test.py index bd65fd959f..f48ec454c8 100644 --- a/learning_resources/etl/podcast_test.py +++ b/learning_resources/etl/podcast_test.py @@ -43,23 +43,22 @@ def rss_content(): def mock_podcast_file( # pylint: disable=too-many-arguments # noqa: PLR0913 podcast_title=None, topics=None, - website_url="website_url", + website_url="http://website.url/podcast", offered_by=None, google_podcasts_url="google_podcasts_url", apple_podcasts_url="apple_podcasts_url", - rss_url="rss_url", + rss_url="http://website.url/podcast/rss.xml", ): """Mock podcast github file""" content = f"""--- -rss_url: rss_url +rss_url: {rss_url} { "podcast_title: " + podcast_title if podcast_title else "" } { "topics: " + topics if topics else "" } { "offered_by: " + offered_by if offered_by else "" } website: {website_url} google_podcasts_url: {google_podcasts_url} apple_podcasts_url: {apple_podcasts_url} -rss_url: {rss_url} """ return Mock(decoded_content=content) @@ -130,7 +129,7 @@ def test_transform(mock_github_client, title, topics, offered_by): expected_results = [ { - "readable_id": "tag:soundcloud,2010:tracks/1857159426", + "readable_id": "website.url/podcast/rss.xml", "etl_source": ETLSource.podcast.name, "title": expected_title, "offered_by": expected_offered_by, @@ -141,7 +140,7 @@ def test_transform(mock_github_client, title, topics, offered_by): "podcast": { "google_podcasts_url": "google_podcasts_url", "apple_podcasts_url": "apple_podcasts_url", - "rss_url": "rss_url", + "rss_url": "http://website.url/podcast/rss.xml", }, "resource_type": LearningResourceType.podcast.name, "topics": expected_topics, @@ -221,11 +220,11 @@ def test_transform_with_error(mocker, mock_github_client): results = list(transform(extract_results)) mock_exception_log.assert_called_once_with( - "Error parsing podcast data from %s", "rss_url" + "Error parsing podcast data from %s", "http://website.url/podcast/rss.xml" ) assert len(results) == 1 - assert results[0]["url"] == "website_url" + assert results[0]["url"] == "http://website.url/podcast" @pytest.mark.django_db() diff --git a/test_html/test_podcast.rss b/test_html/test_podcast.rss index a712805392..ef6e7f9a17 100644 --- a/test_html/test_podcast.rss +++ b/test_html/test_podcast.rss @@ -26,7 +26,6 @@ https://awebsite.mit.edu/ - tag:soundcloud,2010:tracks/1857159426 tag:soundcloud,2010:tracks/numbers1 Episode1