diff --git a/learning_resources/etl/loaders.py b/learning_resources/etl/loaders.py index b2076c60a5..91020fedf1 100644 --- a/learning_resources/etl/loaders.py +++ b/learning_resources/etl/loaders.py @@ -682,6 +682,10 @@ def load_podcast(podcast_data: dict) -> LearningResource: LearningResource.objects.filter(id__in=unpublished_episode_ids).update( published=False ) + bulk_resources_unpublished_actions( + unpublished_episode_ids, + LearningResourceType.podcast_episode.name, + ) episode_ids.extend(unpublished_episode_ids) learning_resource.resources.set( episode_ids, @@ -719,13 +723,22 @@ def load_podcasts(podcasts_data: list[dict]) -> list[LearningResource]: # unpublish the podcasts and episodes we're no longer tracking ids = [podcast.id for podcast in podcast_resources] - LearningResource.objects.filter( + unpublished_podcasts = LearningResource.objects.filter( resource_type=LearningResourceType.podcast.name - ).exclude(id__in=ids).update(published=False) - LearningResource.objects.filter( + ).exclude(id__in=ids) + unpublished_podcasts.update(published=False) + bulk_resources_unpublished_actions( + unpublished_podcasts.values_list("id", flat=True), + LearningResourceType.podcast.name, + ) + unpublished_episodes = LearningResource.objects.filter( resource_type=LearningResourceType.podcast_episode.name - ).exclude(parents__parent__in=ids).update(published=False) - + ).exclude(parents__parent__in=ids) + unpublished_episodes.update(published=False) + bulk_resources_unpublished_actions( + unpublished_episodes.values_list("id", flat=True), + LearningResourceType.podcast_episode.name, + ) return podcast_resources diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py index 332dc944b0..a6e40379a8 100644 --- a/learning_resources/etl/podcast.py +++ b/learning_resources/etl/podcast.py @@ -12,7 +12,7 @@ from learning_resources.constants import LearningResourceType from learning_resources.etl.constants import ETLSource -from learning_resources.etl.utils import clean_data, generate_readable_id +from learning_resources.etl.utils import clean_data from learning_resources.models import PodcastEpisode from main.utils import frontend_absolute_url, now_in_utc @@ -100,6 +100,19 @@ def get_podcast_configs(): return podcast_configs +def parse_readable_id_from_url(url): + """ + Parse readable id from podcast/episode url + + Args: + url (str): the podcast/episode url + + Returns: + str: the readable id + """ + return url.split("//")[-1] + + def extract(): """ Function for extracting podcast data @@ -125,7 +138,7 @@ def extract(): log.exception("Invalid rss url %s", rss_url) -def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id): +def transform_episode(rss_data, offered_by, topics, parent_image): """ Transform a podcast episode into our normalized data @@ -140,10 +153,11 @@ def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id): normalized podcast episode data """ - rss_data.guid.string = f"{podcast_id}: {rss_data.guid.text}" - return { - "readable_id": generate_readable_id(rss_data.title.text[:95]), + "readable_id": rss_data.guid.text + or parse_readable_id_from_url( + rss_data.link.text if rss_data.link else rss_data.enclosure["url"] + ), "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast_episode.name, "title": rss_data.title.text, @@ -151,12 +165,10 @@ def transform_episode(rss_data, offered_by, topics, parent_image, podcast_id): "description": clean_data(rss_data.description.text), "url": rss_data.enclosure["url"], "image": { - "url": ( - rss_data.find("image")["href"] - if rss_data.find("image") - else parent_image - ), - }, + "url": (rss_data.find("image")["href"]), + } + if rss_data.find("image") + else parent_image, "last_modified": parse(rss_data.pubDate.text), "published": True, "topics": topics, @@ -186,7 +198,7 @@ def transform(extracted_podcasts): for rss_data, config_data in extracted_podcasts: try: image = ( - rss_data.channel.find("itunes:image")["href"] + {"url": rss_data.channel.find("itunes:image")["href"]} if rss_data.channel.find("itunes:image") else None ) @@ -203,23 +215,20 @@ def transform(extracted_podcasts): apple_podcasts_url = config_data.get("apple_podcasts_url") google_podcasts_url = config_data.get("google_podcasts_url") title = config_data.get("podcast_title", rss_data.channel.title.text) - podcast_id = generate_readable_id(title[:95]) yield { - "readable_id": podcast_id, + "readable_id": parse_readable_id_from_url(config_data["rss_url"]), "title": title, "etl_source": ETLSource.podcast.name, "resource_type": LearningResourceType.podcast.name, "offered_by": offered_by, "description": clean_data(rss_data.channel.description.text), - "image": {"url": rss_data.channel.find("itunes:image")["href"]}, + "image": image, "published": True, "url": config_data["website"], "topics": topics, "episodes": ( - transform_episode( - episode_rss, offered_by, topics, image, podcast_id - ) + transform_episode(episode_rss, offered_by, topics, image) for episode_rss in rss_data.find_all("item") ), "podcast": { diff --git a/learning_resources/etl/podcast_test.py b/learning_resources/etl/podcast_test.py index 3012c6de62..f48ec454c8 100644 --- a/learning_resources/etl/podcast_test.py +++ b/learning_resources/etl/podcast_test.py @@ -43,23 +43,22 @@ def rss_content(): def mock_podcast_file( # pylint: disable=too-many-arguments # noqa: PLR0913 podcast_title=None, topics=None, - website_url="website_url", + website_url="http://website.url/podcast", offered_by=None, google_podcasts_url="google_podcasts_url", apple_podcasts_url="apple_podcasts_url", - rss_url="rss_url", + rss_url="http://website.url/podcast/rss.xml", ): """Mock podcast github file""" content = f"""--- -rss_url: rss_url +rss_url: {rss_url} { "podcast_title: " + podcast_title if podcast_title else "" } { "topics: " + topics if topics else "" } { "offered_by: " + offered_by if offered_by else "" } website: {website_url} google_podcasts_url: {google_podcasts_url} apple_podcasts_url: {apple_podcasts_url} -rss_url: {rss_url} """ return Mock(decoded_content=content) @@ -123,22 +122,14 @@ def test_transform(mock_github_client, title, topics, offered_by): ) expected_title = title if title else "A Podcast" - expected_readable_id = ( - "custom-titleb04b26d38dd63a2c829393e9e075927d" - if title - else "a-podcast7e3a1ebb0c4d3196ba4c7f8254af4d2d" - ) expected_offered_by = {"name": offered_by} if offered_by else None episodes_rss = list(bs(rss_content(), "xml").find_all("item")) - for episode in episodes_rss: - episode.guid.string = f"{expected_readable_id}: {episode.guid.text}" - expected_results = [ { - "readable_id": expected_readable_id, + "readable_id": "website.url/podcast/rss.xml", "etl_source": ETLSource.podcast.name, "title": expected_title, "offered_by": expected_offered_by, @@ -149,13 +140,13 @@ def test_transform(mock_github_client, title, topics, offered_by): "podcast": { "google_podcasts_url": "google_podcasts_url", "apple_podcasts_url": "apple_podcasts_url", - "rss_url": "rss_url", + "rss_url": "http://website.url/podcast/rss.xml", }, "resource_type": LearningResourceType.podcast.name, "topics": expected_topics, "episodes": [ { - "readable_id": "episode15ede89915db9342fb76bc91918d22016", + "readable_id": "tag:soundcloud,2010:tracks/numbers1", "etl_source": ETLSource.podcast.name, "title": "Episode1", "offered_by": expected_offered_by, @@ -178,7 +169,7 @@ def test_transform(mock_github_client, title, topics, offered_by): "topics": expected_topics, }, { - "readable_id": "episode205c066df9ed531e48c6414f6e72d3b96", + "readable_id": "tag:soundcloud,2010:tracks/numbers2", "etl_source": ETLSource.podcast.name, "title": "Episode2", "offered_by": expected_offered_by, @@ -229,11 +220,11 @@ def test_transform_with_error(mocker, mock_github_client): results = list(transform(extract_results)) mock_exception_log.assert_called_once_with( - "Error parsing podcast data from %s", "rss_url" + "Error parsing podcast data from %s", "http://website.url/podcast/rss.xml" ) assert len(results) == 1 - assert results[0]["url"] == "website_url" + assert results[0]["url"] == "http://website.url/podcast" @pytest.mark.django_db() diff --git a/learning_resources/management/commands/transfer_list_resources.py b/learning_resources/management/commands/transfer_list_resources.py new file mode 100644 index 0000000000..03b4ab4e36 --- /dev/null +++ b/learning_resources/management/commands/transfer_list_resources.py @@ -0,0 +1,63 @@ +"""Management command for migrating unpublished resource path/list relationships""" + +from django.core.management import BaseCommand + +from learning_resources.utils import transfer_list_resources +from main.utils import now_in_utc + + +class Command(BaseCommand): + """ + Migrate relationships in learningpaths and userlists from unpublished + resources to published replacement resources. + """ + + help = "Migrate relationships from unpublished resources to published resources." + + def add_arguments(self, parser): + parser.add_argument("resource_type", type=str, help="Resource type to migrate") + parser.add_argument( + "match_field", type=str, help="Resource field to match resources by" + ) + parser.add_argument( + "from_source", type=str, help="ETL Source for unpublished resources" + ) + parser.add_argument( + "to_source", type=str, help="ETL Source for published resources" + ) + parser.add_argument( + "--delete", + dest="delete", + action="store_true", + help="Delete unpublished resources after migrating relationships", + ) + super().add_arguments(parser) + + def handle(self, *args, **options): # noqa: ARG002 + """ + Migrate relationships in learningpaths and userlists from unpublished + resources to published replacement resources. + """ + resource_type = options["resource_type"] + match_field = options["match_field"] + from_source = options["from_source"] + to_source = options["to_source"] + delete = options["delete"] + + self.stdout.write( + f"Migrate {resource_type} relationships from " + f"{from_source} to {to_source}, matching on {match_field}" + ) + start = now_in_utc() + unpublished, matching = transfer_list_resources( + resource_type, + match_field, + from_source, + to_source, + delete_unpublished=delete, + ) + total_seconds = (now_in_utc() - start).total_seconds() + self.stdout.write( + f"Processed {unpublished} resources and found {matching} " + f"published matches, took {total_seconds} seconds" + ) diff --git a/learning_resources/migrations/0056_alter_learningresource_readable_id.py b/learning_resources/migrations/0056_alter_learningresource_readable_id.py new file mode 100644 index 0000000000..69ee099aeb --- /dev/null +++ b/learning_resources/migrations/0056_alter_learningresource_readable_id.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.13 on 2024-07-08 16:30 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("learning_resources", "0055_alter_relationship_options_ordering"), + ] + + operations = [ + migrations.AlterField( + model_name="learningresource", + name="readable_id", + field=models.CharField(max_length=512), + ), + ] diff --git a/learning_resources/models.py b/learning_resources/models.py index 5e1fa5a9ad..4de9909f74 100644 --- a/learning_resources/models.py +++ b/learning_resources/models.py @@ -185,7 +185,7 @@ class LearningResource(TimestampedModel): *([item.name for item in LearningResourceType]), ] - readable_id = models.CharField(max_length=128, null=False, blank=False) + readable_id = models.CharField(max_length=512, null=False, blank=False) title = models.CharField(max_length=256) description = models.TextField(null=True, blank=True) # noqa: DJ001 full_description = models.TextField(null=True, blank=True) # noqa: DJ001 diff --git a/learning_resources/utils.py b/learning_resources/utils.py index e22d43230d..90b3795cab 100644 --- a/learning_resources/utils.py +++ b/learning_resources/utils.py @@ -16,6 +16,7 @@ from learning_resources.constants import ( GROUP_STAFF_LISTS_EDITORS, + LearningResourceRelationTypes, semester_mapping, ) from learning_resources.hooks import get_plugin_manager @@ -24,9 +25,11 @@ LearningResourceDepartment, LearningResourceOfferor, LearningResourcePlatform, + LearningResourceRelationship, LearningResourceRun, LearningResourceSchool, LearningResourceTopic, + UserListRelationship, ) from main.utils import generate_filepath @@ -559,3 +562,53 @@ def add_parent_topics_to_learning_resource(resource): for topic in resource.topics.all(): if topic.parent: _walk_lr_topic_parents(resource, topic.parent) + + +def transfer_list_resources( + resource_type: str, + matching_field: str, + from_source: str, + to_source: str, + *, + delete_unpublished: bool = False, +) -> tuple[int, int]: + """ + Migrate unpublished learningpath/userlist resources that have + been replaced with new resource objects. + + Args: + resource_type (str): the resource type + matching_field (str): the unique field to match identical resources + from_source (str): the ETL source of unpublished resources + to_source (str): the ETL source of published resources + delete_unpublished (bool): whether to delete the unpublished resources + + Returns: + tuple[int, int]: the number of unpublished and matching published resources + """ + unpublished_resources = LearningResource.objects.filter( + resource_type=resource_type, published=False, etl_source=from_source + ) + unpublished_count = 0 + published_count = 0 + for resource in unpublished_resources: + unpublished_count += 1 + unique_value = getattr(resource, matching_field) + published_replacement = LearningResource.objects.filter( + **{matching_field: unique_value}, + resource_type=resource_type, + published=True, + etl_source=to_source, + ).first() + if published_replacement is not None: + published_count += 1 + LearningResourceRelationship.objects.filter( + relation_type=LearningResourceRelationTypes.LEARNING_PATH_ITEMS.value, + child=resource, + ).update(child=published_replacement) + UserListRelationship.objects.filter(child=resource).update( + child=published_replacement + ) + if delete_unpublished: + unpublished_resources.delete() + return unpublished_count, published_count diff --git a/learning_resources/utils_test.py b/learning_resources/utils_test.py index 8e83946a5e..177d7a0c5f 100644 --- a/learning_resources/utils_test.py +++ b/learning_resources/utils_test.py @@ -12,20 +12,26 @@ CONTENT_TYPE_FILE, CONTENT_TYPE_PDF, CONTENT_TYPE_VIDEO, + LearningResourceRelationTypes, ) from learning_resources.etl.utils import get_content_type from learning_resources.factories import ( CourseFactory, + LearningPathFactory, + LearningResourceFactory, LearningResourceRunFactory, LearningResourceTopicFactory, + UserListFactory, ) from learning_resources.models import ( + LearningResource, LearningResourceOfferor, LearningResourcePlatform, LearningResourceTopic, ) from learning_resources.utils import ( add_parent_topics_to_learning_resource, + transfer_list_resources, upsert_topic_data, ) @@ -324,3 +330,58 @@ def test_upsert_offered_by(mocker): ) assert offeror.name == offered_by_data["fields"]["name"] mock_upsert.assert_any_call(offeror, overwrite=True) + + +@pytest.mark.parametrize( + ("matching_field", "from_source", "to_source", "matches", "delete_old"), + [ + ("url", "podcast", "podcast", True, False), + ("url", "podcast", "podcast", True, True), + ("url", "podcast", "xpro", False, False), + ("readable_id", "podcast", "podcast", False, False), + ], +) +def test_transfer_list_resources( + matching_field, from_source, to_source, matches, delete_old +): + """Test that the transfer_list_resources function works as expected.""" + original_podcasts = LearningResourceFactory.create_batch( + 5, is_podcast=True, etl_source=from_source, published=False + ) + podcast_path = LearningPathFactory.create().learning_resource + podcast_path.resources.set( + original_podcasts, + through_defaults={ + "relation_type": LearningResourceRelationTypes.LEARNING_PATH_ITEMS + }, + ) + podcast_list = UserListFactory.create() + podcast_list.resources.set(original_podcasts) + + new_podcasts = [ + LearningResourceFactory.create( + is_podcast=True, url=old_podcast.url, etl_source=from_source + ) + for old_podcast in original_podcasts + ] + + results = transfer_list_resources( + "podcast", matching_field, from_source, to_source, delete_unpublished=delete_old + ) + podcast_path.refresh_from_db() + podcast_list.refresh_from_db() + + assert results == ((5, 5) if matches else (5, 0)) + list_podcasts = ( + new_podcasts if matches else (original_podcasts if not delete_old else []) + ) + for podcast in list_podcasts: + assert podcast in podcast_path.resources.all() + assert podcast in podcast_list.resources.all() + if delete_old: + assert ( + LearningResource.objects.filter( + id__in=[podcast.id for podcast in original_podcasts] + ).count() + == 0 + ) diff --git a/openapi/specs/v1.yaml b/openapi/specs/v1.yaml index 1517ae72fc..630d0ed113 100644 --- a/openapi/specs/v1.yaml +++ b/openapi/specs/v1.yaml @@ -7531,7 +7531,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -7598,7 +7598,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -9815,7 +9815,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -9882,7 +9882,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -10065,7 +10065,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -10132,7 +10132,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -10452,7 +10452,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -10519,7 +10519,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -10951,7 +10951,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -11018,7 +11018,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1 @@ -11191,7 +11191,7 @@ components: readOnly: true readable_id: type: string - maxLength: 128 + maxLength: 512 title: type: string maxLength: 256 @@ -11258,7 +11258,7 @@ components: readable_id: type: string minLength: 1 - maxLength: 128 + maxLength: 512 title: type: string minLength: 1