From 236319599c3ac4614386d5f6487af2e291a185d8 Mon Sep 17 00:00:00 2001 From: Matt Bertrand Date: Mon, 7 Oct 2024 14:40:53 -0400 Subject: [PATCH] Always delete past events during ETL, and filter them out from api results too just in case --- news_events/etl/conftest.py | 6 ++--- news_events/etl/loaders.py | 10 +++++--- news_events/etl/loaders_test.py | 43 ++++++++++++++++++++++++--------- news_events/factories.py | 2 +- news_events/views.py | 9 +++++-- news_events/views_test.py | 7 ++++++ 6 files changed, 57 insertions(+), 20 deletions(-) diff --git a/news_events/etl/conftest.py b/news_events/etl/conftest.py index af351d1408..badc4352c9 100644 --- a/news_events/etl/conftest.py +++ b/news_events/etl/conftest.py @@ -65,7 +65,7 @@ def sources_data() -> SimpleNamespace: } if feed_type == FeedType.news.name else { - "event_datetime": "2024-03-15T13:42:36Z", + "event_datetime": "2124-03-15T13:42:36Z", **event_details, }, }, @@ -86,7 +86,7 @@ def sources_data() -> SimpleNamespace: } if feed_type == FeedType.news.name else { - "event_datetime": "2024-03-13T15:57:53Z", + "event_datetime": "2124-03-13T15:57:53Z", **event_details, }, }, @@ -111,7 +111,7 @@ def sources_data() -> SimpleNamespace: } if feed_type == FeedType.news.name else { - "event_datetime": "2024-02-15T13:42:36Z", + "event_datetime": "2124-02-15T13:42:36Z", **event_details, }, }, diff --git a/news_events/etl/loaders.py b/news_events/etl/loaders.py index 15ea7dcc08..4f01a50f88 100644 --- a/news_events/etl/loaders.py +++ b/news_events/etl/loaders.py @@ -2,6 +2,7 @@ import logging +from main.utils import now_in_utc from news_events.constants import FeedType from news_events.models import ( FeedEventDetail, @@ -153,10 +154,13 @@ def load_feed_source( FeedItem.objects.filter(source=source).exclude( pk__in=[item.pk for item in items if item] ).delete() - FeedImage.objects.filter( - feeditem__isnull=True, feedsource__isnull=True + # Always delete past events and orphaned images + FeedImage.objects.filter(feeditem__isnull=True, feedsource__isnull=True).delete() + if source.feed_type == FeedType.events.name: + FeedItem.objects.filter( + source=source, + event_details__event_datetime__lt=now_in_utc(), ).delete() - return source, items diff --git a/news_events/etl/loaders_test.py b/news_events/etl/loaders_test.py index 3903b97553..06eab486c1 100644 --- a/news_events/etl/loaders_test.py +++ b/news_events/etl/loaders_test.py @@ -7,7 +7,11 @@ from news_events.constants import FeedType from news_events.etl import loaders -from news_events.factories import FeedImageFactory, FeedItemFactory, FeedSourceFactory +from news_events.factories import ( + FeedImageFactory, + FeedItemFactory, + FeedSourceFactory, +) from news_events.models import FeedImage, FeedItem, FeedSource pytestmark = [pytest.mark.django_db] @@ -65,24 +69,41 @@ def load_feed_sources_bad_item(mocker, sources_data): ) -def test_load_feed_sources_delete_old_items(sources_data): +@pytest.mark.parametrize("empty_data", [True, False]) +def test_load_feed_sources_delete_old_items(sources_data, empty_data): """Tests that load_sources deletes old items and images""" - source_data = sources_data.news + source_data = sources_data.events source = FeedSourceFactory.create( - url=source_data[0]["url"], feed_type=FeedType.news.name + url=source_data[0]["url"], feed_type=FeedType.events.name ) - old_source_item = FeedItemFactory(source=source, is_news=True) - other_source_item = FeedItemFactory.create(is_news=True) + omitted_event_item = FeedItemFactory.create(is_event=True, source=source) + + expired_event_item = FeedItemFactory.create(is_event=True, source=source) + expired_event_item.event_details.event_datetime = "2000-01-01T00:00:00Z" + expired_event_item.event_details.save() + + other_source_item = FeedItemFactory.create(is_event=True) orphaned_image = FeedImageFactory.create() # no source or item - loaders.load_feed_sources(FeedType.news.name, source_data) + if empty_data: + source_data[0]["items"] = [] + loaders.load_feed_sources(FeedType.events.name, source_data) + + # Existing feed items with future dates should be removed only if source data exists + assert FeedItem.objects.filter(pk=omitted_event_item.pk).exists() is empty_data + assert ( + FeedImage.objects.filter(pk=omitted_event_item.image.pk).exists() is empty_data + ) - assert FeedItem.objects.filter(pk=old_source_item.pk).exists() is False - assert FeedImage.objects.filter(pk=old_source_item.image.pk).exists() is False + # Events from other sources should be unaffected assert FeedItem.objects.filter(pk=other_source_item.pk).exists() is True - assert FeedImage.objects.filter(pk=other_source_item.image.pk).exists() is True + + # Old events and orphaned images should always be removed + assert FeedItem.objects.filter(pk=expired_event_item.pk).exists() is False + assert FeedImage.objects.filter(pk=expired_event_item.image.pk).exists() is False assert FeedImage.objects.filter(pk=orphaned_image.pk).exists() is False - assert FeedItem.objects.filter(source=source).count() == 2 + + assert FeedItem.objects.filter(source=source).count() == 1 if empty_data else 2 def test_load_item_null_data(): diff --git a/news_events/factories.py b/news_events/factories.py index c6f7683dd3..6b70e39051 100644 --- a/news_events/factories.py +++ b/news_events/factories.py @@ -113,7 +113,7 @@ class FeedEventDetailFactory(factory.django.DjangoModelFactory): audience = factory.List(random.choices(["Faculty", "Public", "Students"])) # noqa: S311 location = factory.List(random.choices(["Online", "MIT Campus"])) # noqa: S311 event_type = factory.List(random.choices(["Webinar", "Concert", "Conference"])) # noqa: S311 - event_datetime = factory.Faker("date_time", tzinfo=UTC) + event_datetime = factory.Faker("future_datetime", tzinfo=UTC) class Meta: model = models.FeedEventDetail diff --git a/news_events/views.py b/news_events/views.py index ebe7882519..5ec5c27635 100644 --- a/news_events/views.py +++ b/news_events/views.py @@ -1,6 +1,7 @@ """Views for news_events""" from django.conf import settings +from django.db.models import Q from django.utils.decorators import method_decorator from drf_spectacular.utils import extend_schema, extend_schema_view from rest_framework import viewsets @@ -8,7 +9,8 @@ from main.filters import MultipleOptionsFilterBackend from main.permissions import AnonymousAccessReadonlyPermission -from main.utils import cache_page_for_all_users +from main.utils import cache_page_for_all_users, now_in_utc +from news_events.constants import FeedType from news_events.filters import FeedItemFilter, FeedSourceFilter from news_events.models import FeedItem, FeedSource from news_events.serializers import FeedItemSerializer, FeedSourceSerializer @@ -45,7 +47,10 @@ class FeedItemViewSet(viewsets.ReadOnlyModelViewSet): filterset_class = FeedItemFilter queryset = ( FeedItem.objects.select_related(*FeedItem.related_selects) - .all() + .filter( + Q(source__feed_type=FeedType.news.name) + | Q(event_details__event_datetime__gte=now_in_utc()) + ) .order_by( "-news_details__publish_date", "-event_details__event_datetime", diff --git a/news_events/views_test.py b/news_events/views_test.py index da79578a6e..0e6396dff4 100644 --- a/news_events/views_test.py +++ b/news_events/views_test.py @@ -1,11 +1,14 @@ """Tests for news_events views""" +from datetime import UTC, datetime + import pytest from django.urls import reverse from main.test_utils import assert_json_equal from news_events import factories, serializers from news_events.constants import FeedType +from news_events.factories import FeedEventDetailFactory def test_feed_source_viewset_list(client): @@ -71,6 +74,9 @@ def test_feed_item_viewset_list(client, is_news): else x.event_details.event_datetime, reverse=True, ) + past_event = FeedEventDetailFactory( + event_datetime=datetime(2020, 1, 1, tzinfo=UTC) + ).feed_item results = ( client.get(reverse("news_events:v0:news_events_items_api-list")) .json() @@ -79,6 +85,7 @@ def test_feed_item_viewset_list(client, is_news): assert_json_equal( [serializers.FeedItemSerializer(instance=item).data for item in items], results ) + assert past_event.id not in [item["id"] for item in results] @pytest.mark.parametrize("feed_type", FeedType.names())