From 3ba27bdc4edddfbac5f03450a70fa3eb650e7cd0 Mon Sep 17 00:00:00 2001 From: Shankar Ambady Date: Fri, 7 Nov 2025 16:40:57 -0500 Subject: [PATCH 1/7] embeddings healthcheck (#2676) * stashing changes * fixing output * adding sentry logging and docstring * adding safe getters * switching off of deprecated push_scope * added tests * adding normal log lines and extra context * trunacate run ids * adding periodic celery task * adding check for summaries * adding test --- main/settings_celery.py | 6 ++ vector_search/tasks.py | 141 ++++++++++++++++++++++++++++++++++++ vector_search/tasks_test.py | 91 +++++++++++++++++++++++ 3 files changed, 238 insertions(+) diff --git a/main/settings_celery.py b/main/settings_celery.py index cd909c801f..adbe644cb0 100644 --- a/main/settings_celery.py +++ b/main/settings_celery.py @@ -184,6 +184,12 @@ "task": "vector_search.tasks.sync_topics", "schedule": crontab(minute=0, hour="6,18,23"), # 2am 2pm and 7pm EST }, + "weekly_check_missing_embeddings": { + "task": "vector_search.tasks.embeddings_healthcheck", + "schedule": crontab( + minute=0, hour=6, day_of_week=6 + ), # 2:00am EST on Friday + }, } ) diff --git a/vector_search/tasks.py b/vector_search/tasks.py index 3e497a3d9e..b9e724e695 100644 --- a/vector_search/tasks.py +++ b/vector_search/tasks.py @@ -2,10 +2,12 @@ import logging import celery +import sentry_sdk from celery.exceptions import Ignore from django.conf import settings from django.db.models import Q +from learning_resources.content_summarizer import ContentSummarizer from learning_resources.models import ( ContentFile, Course, @@ -32,10 +34,16 @@ chunks, now_in_utc, ) +from vector_search.constants import ( + CONTENT_FILES_COLLECTION_NAME, + RESOURCES_COLLECTION_NAME, +) from vector_search.utils import ( embed_learning_resources, embed_topics, + filter_existing_qdrant_points_by_ids, remove_qdrant_records, + vector_point_id, ) log = logging.getLogger(__name__) @@ -369,6 +377,139 @@ def remove_run_content_files(run_id): @app.task +def embeddings_healthcheck(): + """ + Check for missing embeddings and summaries in Qdrant and log warnings to Sentry + """ + remaining_content_files = [] + remaining_resources = [] + resource_point_ids = {} + all_resources = LearningResource.objects.filter( + Q(published=True) | Q(test_mode=True) + ) + + for lr in all_resources: + run = ( + lr.best_run + if lr.best_run + else lr.runs.filter(published=True).order_by("-start_date").first() + ) + point_id = vector_point_id(lr.readable_id) + resource_point_ids[point_id] = {"resource_id": lr.readable_id, "id": lr.id} + content_file_point_ids = {} + if run: + for cf in run.content_files.filter(published=True): + if cf and cf.content: + point_id = vector_point_id( + f"{lr.readable_id}.{run.run_id}.{cf.key}.0" + ) + content_file_point_ids[point_id] = {"key": cf.key, "id": cf.id} + for batch in chunks(content_file_point_ids.keys(), chunk_size=200): + remaining_content_files.extend( + filter_existing_qdrant_points_by_ids( + batch, collection_name=CONTENT_FILES_COLLECTION_NAME + ) + ) + + for batch in chunks( + all_resources.values_list("readable_id", flat=True), + chunk_size=200, + ): + remaining_resources.extend( + filter_existing_qdrant_points_by_ids( + [vector_point_id(pid) for pid in batch], + collection_name=RESOURCES_COLLECTION_NAME, + ) + ) + + remaining_content_file_ids = [ + content_file_point_ids.get(p, {}).get("id") for p in remaining_content_files + ] + remaining_resource_ids = [ + resource_point_ids.get(p, {}).get("id") for p in remaining_resources + ] + missing_summaries = _missing_summaries() + log.info( + "Embeddings healthcheck found %d missing content file embeddings", + len(remaining_content_files), + ) + log.info( + "Embeddings healthcheck found %d missing resource embeddings", + len(remaining_resources), + ) + log.info( + "Embeddings healthcheck found %d missing summaries and flashcards", + len(missing_summaries), + ) + + if len(remaining_content_files) > 0: + _sentry_healthcheck_log( + "embeddings", + "missing_content_file_embeddings", + { + "count": len(remaining_content_files), + "ids": remaining_content_file_ids, + "run_ids": set( + ContentFile.objects.filter( + id__in=remaining_content_file_ids + ).values_list("run__run_id", flat=True)[:100] + ), + }, + f"Warning: {len(remaining_content_files)} missing content file " + "embeddings detected", + ) + + if len(remaining_resources) > 0: + _sentry_healthcheck_log( + "embeddings", + "missing_learning_resource_embeddings", + { + "count": len(remaining_resource_ids), + "ids": remaining_resource_ids, + "titles": list( + LearningResource.objects.filter( + id__in=remaining_resource_ids + ).values_list("title", flat=True) + ), + }, + f"Warning: {len(remaining_resource_ids)} missing learning resource " + "embeddings detected", + ) + if len(missing_summaries) > 0: + _sentry_healthcheck_log( + "embeddings", + "missing_content_file_summaries", + { + "count": len(missing_summaries), + "ids": missing_summaries, + "run_ids": set( + ContentFile.objects.filter(id__in=missing_summaries).values_list( + "run__run_id", flat=True + )[:100] + ), + }, + f"Warning: {len(missing_summaries)} missing content file summaries " + "detected", + ) + + +def _missing_summaries(): + summarizer = ContentSummarizer() + return summarizer.get_unprocessed_content_file_ids( + LearningResource.objects.filter(require_summaries=True) + .filter(Q(published=True) | Q(test_mode=True)) + .values_list("id", flat=True) + ) + + +def _sentry_healthcheck_log(healthcheck, alert_type, context, message): + with sentry_sdk.new_scope() as scope: + scope.set_tag("healthcheck", healthcheck) + scope.set_tag("alert_type", alert_type) + scope.set_context("missing_content_file_embeddings", context) + sentry_sdk.capture_message(message) + + def sync_topics(): """ Sync topics to the Qdrant collection diff --git a/vector_search/tasks_test.py b/vector_search/tasks_test.py index 86135bc07e..1da0c61f65 100644 --- a/vector_search/tasks_test.py +++ b/vector_search/tasks_test.py @@ -10,8 +10,10 @@ ) from learning_resources.factories import ( ContentFileFactory, + ContentSummarizerConfigurationFactory, CourseFactory, LearningResourceFactory, + LearningResourcePlatformFactory, LearningResourceRunFactory, ProgramFactory, ) @@ -24,8 +26,10 @@ embed_learning_resources_by_id, embed_new_content_files, embed_new_learning_resources, + embeddings_healthcheck, start_embed_resources, ) +from vector_search.utils import vector_point_id pytestmark = pytest.mark.django_db @@ -388,3 +392,90 @@ def test_embed_new_content_files_without_runs(mocker, mocked_celery): embedded_ids = generate_embeddings_mock.si.mock_calls[0].args[0] for contentfile_id in content_files_without_run: assert contentfile_id in embedded_ids + + +def test_embeddings_healthcheck_no_missing_embeddings(mocker): + """ + Test embeddings_healthcheck when there are no missing embeddings + """ + lr = LearningResourceFactory.create(published=True) + LearningResourceRunFactory.create(published=True, learning_resource=lr) + ContentFileFactory.create(run=lr.runs.first(), content="test", published=True) + mock_sentry = mocker.patch("vector_search.tasks.sentry_sdk", autospec=True) + mocker.patch( + "vector_search.tasks.filter_existing_qdrant_points_by_ids", return_value=[] + ) + + embeddings_healthcheck() + assert mock_sentry.capture_message.call_count == 0 + + +def test_embeddings_healthcheck_missing_both(mocker): + """ + Test embeddings_healthcheck when there are missing content files and learning resources + """ + lr = LearningResourceFactory.create(published=True) + LearningResourceRunFactory.create(published=True, learning_resource=lr) + cf = ContentFileFactory.create(run=lr.runs.first(), content="test", published=True) + mocker.patch( + "vector_search.tasks.filter_existing_qdrant_points_by_ids", + side_effect=[ + [vector_point_id(lr.readable_id)], + [ + vector_point_id( + f"{cf.run.learning_resource.id}.{cf.run.run_id}.{cf.key}.0" + ) + ], + ], + ) + mock_sentry = mocker.patch("vector_search.tasks.sentry_sdk.capture_message") + + embeddings_healthcheck() + + assert mock_sentry.call_count == 2 + + +def test_embeddings_healthcheck_missing_summaries(mocker): + """ + Test embeddings_healthcheck for missing contentfile summaries/flashcards + """ + content_extension = [".srt"] + content_type = ["file"] + platform = LearningResourcePlatformFactory.create() + ContentSummarizerConfigurationFactory.create( + allowed_extensions=content_extension, + allowed_content_types=content_type, + is_active=True, + llm_model="test", + platform__code=platform.code, + ) + resource = LearningResourceFactory.create( + published=True, require_summaries=True, platform=platform + ) + resource.runs.all().delete() + learning_resource_run = LearningResourceRunFactory.create( + published=True, + learning_resource=resource, + ) + learning_resource_run.learning_resource = resource + learning_resource_run.save() + + ContentFileFactory.create( + published=True, + content="test", + file_extension=content_extension[0], + summary="", + content_type=content_type[0], + run=learning_resource_run, + ) + mocker.patch( + "vector_search.tasks.filter_existing_qdrant_points_by_ids", + ) + mock_sentry = mocker.patch("vector_search.tasks.sentry_sdk.capture_message") + + embeddings_healthcheck() + assert mock_sentry.call_count == 1 + assert ( + mock_sentry.mock_calls[0].args[0] + == "Warning: 1 missing content file summaries detected" + ) From 5fc50e092507160d7c047227641895e35b027216 Mon Sep 17 00:00:00 2001 From: Ahtesham Quraish Date: Mon, 10 Nov 2025 19:29:43 +0500 Subject: [PATCH 2/7] feat: Implement the Article CRUD except listing (#2686) * feat: add article feature crud except listing Co-authored-by: Ahtesham Quraish --- ...003_remove_article_html_article_content.py | 21 +++ articles/models.py | 2 +- articles/serializers.py | 4 +- articles/validators.py | 6 +- articles/views_test.py | 4 +- frontends/api/src/generated/v1/api.ts | 12 +- .../api/src/test-utils/factories/articles.ts | 5 +- .../app-pages/Articles/ArticleDetailPage.tsx | 65 +++++++++ .../Articles/ArticleEditPage.test.tsx | 100 +++++++++++++ .../app-pages/Articles/ArticleEditPage.tsx | 138 ++++++++++++++++++ .../src/app-pages/Articles/ArticleNewPage.tsx | 122 ++++++++++++++++ .../Articles/NewArticlePage.test.tsx | 109 ++++++++++++++ .../main/src/app/articles/[id]/edit/page.tsx | 9 ++ frontends/main/src/app/articles/[id]/page.tsx | 15 ++ frontends/main/src/app/articles/new/page.tsx | 15 ++ openapi/specs/v1.yaml | 14 +- 16 files changed, 619 insertions(+), 22 deletions(-) create mode 100644 articles/migrations/0003_remove_article_html_article_content.py create mode 100644 frontends/main/src/app-pages/Articles/ArticleDetailPage.tsx create mode 100644 frontends/main/src/app-pages/Articles/ArticleEditPage.test.tsx create mode 100644 frontends/main/src/app-pages/Articles/ArticleEditPage.tsx create mode 100644 frontends/main/src/app-pages/Articles/ArticleNewPage.tsx create mode 100644 frontends/main/src/app-pages/Articles/NewArticlePage.test.tsx create mode 100644 frontends/main/src/app/articles/[id]/edit/page.tsx create mode 100644 frontends/main/src/app/articles/[id]/page.tsx create mode 100644 frontends/main/src/app/articles/new/page.tsx diff --git a/articles/migrations/0003_remove_article_html_article_content.py b/articles/migrations/0003_remove_article_html_article_content.py new file mode 100644 index 0000000000..54860c8f1e --- /dev/null +++ b/articles/migrations/0003_remove_article_html_article_content.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.25 on 2025-11-07 11:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("articles", "0002_alter_article_created_on"), + ] + + operations = [ + migrations.RemoveField( + model_name="article", + name="html", + ), + migrations.AddField( + model_name="article", + name="content", + field=models.JSONField(default={}), + ), + ] diff --git a/articles/models.py b/articles/models.py index 8534a7c898..abbdba7d6f 100644 --- a/articles/models.py +++ b/articles/models.py @@ -10,5 +10,5 @@ class Article(TimestampedModel): Stores rich-text content created by staff members. """ - html = models.TextField() + content = models.JSONField(default={}) title = models.CharField(max_length=255) diff --git a/articles/serializers.py b/articles/serializers.py index 1adf4f0aba..464faf039d 100644 --- a/articles/serializers.py +++ b/articles/serializers.py @@ -20,9 +20,9 @@ class RichTextArticleSerializer(serializers.ModelSerializer): Serializer for LearningResourceInstructor model """ - html = SanitizedHtmlField() + content = serializers.JSONField(default={}) title = serializers.CharField(max_length=255) class Meta: model = models.Article - fields = ["html", "id", "title"] + fields = ["content", "id", "title"] diff --git a/articles/validators.py b/articles/validators.py index 131f7d5e03..aa7997f340 100644 --- a/articles/validators.py +++ b/articles/validators.py @@ -32,10 +32,12 @@ # - On all tags: https://docs.rs/ammonia/latest/ammonia/struct.Builder.html#method.generic_attributes "attributes": { "a": {"href", "hreflang"}, - "img": {"alt", "height", "src", "width", "srcset", "sizes"}, - "figure": {"class"}, + "img": {"alt", "height", "src", "width", "srcset", "sizes", "style"}, + "figure": {"class", "style"}, "oembed": {"url"}, }, + # 👇 Allow data: URLs for src attributes + "url_schemes": {"data"}, } diff --git a/articles/views_test.py b/articles/views_test.py index e6bc5b63cd..748eda8a5e 100644 --- a/articles/views_test.py +++ b/articles/views_test.py @@ -13,12 +13,12 @@ def test_article_creation(staff_client, user): url = reverse("articles:v1:articles-list") data = { - "html": "

", + "content": {}, "title": "Some title", } resp = staff_client.post(url, data) json = resp.json() - assert json["html"] == "

" + assert json["content"] == {} assert json["title"] == "Some title" diff --git a/frontends/api/src/generated/v1/api.ts b/frontends/api/src/generated/v1/api.ts index ce9ce600b1..a19401073e 100644 --- a/frontends/api/src/generated/v1/api.ts +++ b/frontends/api/src/generated/v1/api.ts @@ -4772,10 +4772,10 @@ export interface PatchedLearningResourceRelationshipRequest { export interface PatchedRichTextArticleRequest { /** * - * @type {string} + * @type {any} * @memberof PatchedRichTextArticleRequest */ - html?: string + content?: any /** * * @type {string} @@ -7157,10 +7157,10 @@ export type ResourceTypeEnum = export interface RichTextArticle { /** * - * @type {string} + * @type {any} * @memberof RichTextArticle */ - html: string + content?: any /** * * @type {number} @@ -7182,10 +7182,10 @@ export interface RichTextArticle { export interface RichTextArticleRequest { /** * - * @type {string} + * @type {any} * @memberof RichTextArticleRequest */ - html: string + content?: any /** * * @type {string} diff --git a/frontends/api/src/test-utils/factories/articles.ts b/frontends/api/src/test-utils/factories/articles.ts index 46afe999ae..7abaeadeaf 100644 --- a/frontends/api/src/test-utils/factories/articles.ts +++ b/frontends/api/src/test-utils/factories/articles.ts @@ -6,7 +6,10 @@ import type { RichTextArticle } from "../../generated/v1" const article: Factory = (overrides = {}) => ({ id: faker.number.int(), title: faker.lorem.sentence(), - html: faker.lorem.paragraph(), + content: { + text: faker.lorem.paragraph(), + author: faker.person.fullName(), + }, ...overrides, }) diff --git a/frontends/main/src/app-pages/Articles/ArticleDetailPage.tsx b/frontends/main/src/app-pages/Articles/ArticleDetailPage.tsx new file mode 100644 index 0000000000..e8f73d53d8 --- /dev/null +++ b/frontends/main/src/app-pages/Articles/ArticleDetailPage.tsx @@ -0,0 +1,65 @@ +"use client" + +import React from "react" +import { useArticleDetail } from "api/hooks/articles" +import { Container, LoadingSpinner, styled, Typography } from "ol-components" +import { ButtonLink } from "@mitodl/smoot-design" +import { notFound } from "next/navigation" +import { Permission } from "api/hooks/user" +import RestrictedRoute from "@/components/RestrictedRoute/RestrictedRoute" +import { articlesEditView } from "@/common/urls" + +const Page = styled(Container)({ + marginTop: "40px", + marginBottom: "40px", +}) + +const ControlsContainer = styled.div({ + display: "flex", + justifyContent: "flex-end", + margin: "10px", +}) +const WrapperContainer = styled.div({ + borderBottom: "1px solid rgb(222, 208, 208)", + paddingBottom: "10px", +}) + +const PreTag = styled.pre({ + background: "#f6f6f6", + padding: "16px", + borderRadius: "8px", + fontSize: "14px", + overflowX: "auto", +}) + +export const ArticleDetailPage = ({ articleId }: { articleId: number }) => { + const id = Number(articleId) + const { data, isLoading } = useArticleDetail(id) + + const editUrl = articlesEditView(id) + + if (isLoading) { + return + } + if (!data) { + return notFound() + } + return ( + + + + + {data?.title} + + + + + Edit + + + + {JSON.stringify(data.content, null, 2)} + + + ) +} diff --git a/frontends/main/src/app-pages/Articles/ArticleEditPage.test.tsx b/frontends/main/src/app-pages/Articles/ArticleEditPage.test.tsx new file mode 100644 index 0000000000..2f1bf9b013 --- /dev/null +++ b/frontends/main/src/app-pages/Articles/ArticleEditPage.test.tsx @@ -0,0 +1,100 @@ +import React from "react" +import { screen, renderWithProviders, setMockResponse } from "@/test-utils" +import { waitFor, fireEvent } from "@testing-library/react" +import userEvent from "@testing-library/user-event" +import { factories, urls } from "api/test-utils" +import { ArticleEditPage } from "./ArticleEditPage" + +const pushMock = jest.fn() + +jest.mock("next-nprogress-bar", () => ({ + useRouter: () => ({ + push: pushMock, + }), +})) + +describe("ArticleEditPage", () => { + test("renders editor when user has ArticleEditor permission", async () => { + const user = factories.user.user({ + is_authenticated: true, + is_article_editor: true, + }) + setMockResponse.get(urls.userMe.get(), user) + + const article = factories.articles.article({ + id: 42, + title: "Existing Title", + content: { id: 1, content: "Existing content" }, + }) + setMockResponse.get(urls.articles.details(article.id), article) + + renderWithProviders() + + expect(await screen.findByText("Edit Article")).toBeInTheDocument() + expect(screen.getByTestId("editor")).toBeInTheDocument() + expect(screen.getByDisplayValue("Existing Title")).toBeInTheDocument() + }) + + test("submits article successfully and redirects", async () => { + const user = factories.user.user({ + is_authenticated: true, + is_article_editor: true, + }) + setMockResponse.get(urls.userMe.get(), user) + + const article = factories.articles.article({ + id: 123, + title: "Existing Title", + content: { id: 1, content: "Existing content" }, + }) + setMockResponse.get(urls.articles.details(article.id), article) + + // ✅ Mock successful update response + const updated = { ...article, title: "Updated Title" } + setMockResponse.patch(urls.articles.details(article.id), updated) + + renderWithProviders() + + const titleInput = await screen.findByPlaceholderText("Enter article title") + + fireEvent.change(titleInput, { target: { value: "Updated Title" } }) + + await waitFor(() => expect(titleInput).toHaveValue("Updated Title")) + + await userEvent.click(screen.getByText(/save article/i)) + + // ✅ Wait for redirect after update success + await waitFor(() => expect(pushMock).toHaveBeenCalledWith("/articles/123")) + }) + + test("shows error alert on failure", async () => { + const user = factories.user.user({ + is_authenticated: true, + is_article_editor: true, + }) + setMockResponse.get(urls.userMe.get(), user) + + const article = factories.articles.article({ + id: 7, + title: "Old Title", + content: { id: 1, content: "Bad content" }, + }) + setMockResponse.get(urls.articles.details(article.id), article) + + // ✅ Mock failed update (500) + setMockResponse.patch( + urls.articles.details(article.id), + { detail: "Server Error" }, + { code: 500 }, + ) + + renderWithProviders() + + const titleInput = await screen.findByPlaceholderText("Enter article title") + fireEvent.change(titleInput, { target: { value: "Bad Article" } }) + + await userEvent.click(screen.getByText(/save article/i)) + + expect(await screen.findByText(/Mock Error/i)).toBeInTheDocument() + }) +}) diff --git a/frontends/main/src/app-pages/Articles/ArticleEditPage.tsx b/frontends/main/src/app-pages/Articles/ArticleEditPage.tsx new file mode 100644 index 0000000000..dc4c8a5a35 --- /dev/null +++ b/frontends/main/src/app-pages/Articles/ArticleEditPage.tsx @@ -0,0 +1,138 @@ +"use client" +import React, { useEffect, useState, ChangeEvent } from "react" +import { Permission } from "api/hooks/user" +import { useRouter } from "next-nprogress-bar" +import { useArticleDetail, useArticlePartialUpdate } from "api/hooks/articles" +import { Button, Input, Alert } from "@mitodl/smoot-design" +import RestrictedRoute from "@/components/RestrictedRoute/RestrictedRoute" +import { Container, Typography, styled, LoadingSpinner } from "ol-components" +import { notFound } from "next/navigation" +import { articlesView } from "@/common/urls" + +const SaveButton = styled.div({ + textAlign: "right", + margin: "10px", +}) + +const ClientContainer = styled.div({ + width: "100%", + margin: "10px 0", +}) + +const TitleInput = styled(Input)({ + width: "100%", + margin: "10px 0", +}) + +const ArticleEditPage = ({ articleId }: { articleId: string }) => { + const router = useRouter() + + const id = Number(articleId) + const { data: article, isLoading } = useArticleDetail(id) + + const [title, setTitle] = useState("") + const [text, setText] = useState("") + const [json, setJson] = useState({}) + const [alertText, setAlertText] = useState("") + + const { mutate: updateArticle, isPending } = useArticlePartialUpdate() + + const handleSave = () => { + const payload = { + id: id, + title: title.trim(), + content: json, + } + + updateArticle(payload, { + onSuccess: (article) => { + router.push(articlesView(article.id)) + }, + onError: (error) => { + setAlertText(`❌ ${error.message}`) + }, + }) + } + + useEffect(() => { + if (article && !title) { + setTitle(article.title) + setText(article.content ? JSON.stringify(article.content, null, 2) : "") + setJson(article.content) + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [article]) + + if (isLoading) { + return + } + if (!article) { + return notFound() + } + + const handleChange = (e: ChangeEvent) => { + const value = e.target.value + setText(value) + + try { + const parsed = JSON.parse(value) + setJson(parsed) + } catch { + setJson({}) + } + } + return ( + + + + Edit Article + + {alertText && ( + + + {alertText} + + + )} + { + console.log("Title input changed:", e.target.value) + setTitle(e.target.value) + setAlertText("") + }} + placeholder="Enter article title" + className="input-field" + /> + + +