diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 49d82b6bac..121cffd28f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,12 +47,16 @@ jobs: with: python-version-file: "pyproject.toml" cache: "poetry" - + - name: Install poetry with pip + run: python -m pip install poetry - name: Validate lockfile run: poetry check --lock - + - name: Set Poetry Python + run: poetry env use python3.12 - name: Install dependencies - run: poetry install --no-interaction + run: | + source $(poetry env info --path)/bin/activate + poetry install --no-interaction - name: Create test local state run: ./scripts/test/stub-data.sh diff --git a/RELEASE.rst b/RELEASE.rst index 4c2e581c0b..4f8fc463b4 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -1,6 +1,14 @@ Release Notes ============= +Version 0.40.1 +-------------- + +- fix program collection org filter bug (#2435) +- Optimize memory footprint of pdf problem transcription task (#2433) +- Replace the social media image (#2434) +- Generate summaries for new video transcripts (#2428) + Version 0.40.0 (Released August 13, 2025) -------------- diff --git a/frontends/main/public/images/learn-og-image.jpg b/frontends/main/public/images/learn-og-image.jpg index cf57a0decd..40e48f1614 100644 Binary files a/frontends/main/public/images/learn-og-image.jpg and b/frontends/main/public/images/learn-og-image.jpg differ diff --git a/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx b/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx index 346e1d33e8..fb7283c24c 100644 --- a/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx +++ b/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx @@ -194,6 +194,7 @@ const OrgProgramCollectionDisplay: React.FC<{ key={item.programId} program={item.program} enrollments={enrollments} + orgId={orgId} /> ) : null, )} @@ -206,7 +207,7 @@ const OrgProgramDisplay: React.FC<{ program: DashboardProgram 
courseRunEnrollments?: CourseRunEnrollment[] programLoading: boolean - orgId?: number + orgId: number }> = ({ program, courseRunEnrollments, programLoading, orgId }) => { const courses = useQuery( coursesQueries.coursesList({ id: program.courseIds, org_id: orgId }), @@ -254,14 +255,17 @@ const OrgProgramDisplay: React.FC<{ const ProgramCollectionItem: React.FC<{ program: DashboardProgram enrollments?: CourseRunEnrollment[] -}> = ({ program, enrollments }) => { - return + orgId: number +}> = ({ program, enrollments, orgId }) => { + return ( + + ) } const ProgramCard: React.FC<{ program: DashboardProgram enrollments?: CourseRunEnrollment[] - orgId?: number + orgId: number }> = ({ program, enrollments, orgId }) => { const courses = useQuery( coursesQueries.coursesList({ diff --git a/frontends/main/src/common/metadata.ts b/frontends/main/src/common/metadata.ts index 30c9fedc2e..1a881cd827 100644 --- a/frontends/main/src/common/metadata.ts +++ b/frontends/main/src/common/metadata.ts @@ -98,8 +98,8 @@ export const standardizeMetadata = ({ images: [ { url: image, - width: image === DEFAULT_OG_IMAGE ? "" : 967, - height: image === DEFAULT_OG_IMAGE ? "" : 511, + width: image === DEFAULT_OG_IMAGE ? 967 : "", + height: image === DEFAULT_OG_IMAGE ? 
511 : "", alt: imageAlt, }, ], diff --git a/learning_resources/constants.py b/learning_resources/constants.py index 78d673da4e..ef2c835fdd 100644 --- a/learning_resources/constants.py +++ b/learning_resources/constants.py @@ -147,6 +147,7 @@ class LearningResourceRelationTypes(TextChoices): ".json", ".md", ".pdf", + ".tex", ".ppt", ".pptx", ".rtf", diff --git a/learning_resources/content_summarizer.py b/learning_resources/content_summarizer.py index 04952ab6ae..ebc6465b80 100644 --- a/learning_resources/content_summarizer.py +++ b/learning_resources/content_summarizer.py @@ -215,11 +215,15 @@ def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM: if not settings.LITELLM_CUSTOM_PROVIDER: raise ValueError("The 'LITELLM_CUSTOM_PROVIDER' setting must be set.") # noqa: EM101, TRY003 + if not settings.LITELLM_API_BASE: + raise ValueError("The 'LITELLM_API_BASE' setting must be set.") # noqa: EM101, TRY003 + return ChatLiteLLM( model=model, temperature=temperature, max_tokens=max_tokens, custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER, + api_base=settings.LITELLM_API_BASE, ) def _generate_summary(self, content: str, llm_model: str) -> str: diff --git a/learning_resources/content_summarizer_test.py b/learning_resources/content_summarizer_test.py index 6c81bb4fc9..c129f9c011 100644 --- a/learning_resources/content_summarizer_test.py +++ b/learning_resources/content_summarizer_test.py @@ -21,6 +21,11 @@ pytestmark = pytest.mark.django_db +@pytest.fixture(autouse=True) +def setup_settings(settings): + settings.LITELLM_API_BASE = "https://test/api/" + + @pytest.fixture def mock_summarize_single_content_file(mocker): """Fixture for mocking the process single file method""" diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 040e86a151..6e0ef24042 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -9,10 +9,10 @@ from pathlib import Path from tempfile import TemporaryDirectory 
+import pypdfium2 as pdfium from defusedxml import ElementTree from django.conf import settings from litellm import completion -from pdf2image import convert_from_path from PIL import Image from learning_resources.constants import ( @@ -71,7 +71,7 @@ def sync_canvas_archive(bucket, key: str, overwrite): run.checksum = checksum run.save() - return resource_readable_id, run + return resource_readable_id def _course_url(course_archive_path) -> str: @@ -323,7 +323,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict: return dict(resources_dict) -def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85): +def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): """ Convert a PDF file to a list of base64 encoded images (one per page). Resizes images to reduce file size while keeping good OCR quality. @@ -338,26 +338,24 @@ def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=8 Returns: list: List of base64 encoded strings (one per page) """ - images = convert_from_path(pdf_path, dpi=dpi) - base64_images = [] - for image in images: + pdf = pdfium.PdfDocument(pdf_path) + for page_index in range(len(pdf)): + page = pdf.get_page(page_index) + image = page.render(scale=2).to_pil() + page.close() # Resize the image if it's too large (preserving aspect ratio) if max(image.size) > max_size: image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS) - buffered = BytesIO() - # Save with optimized settings if fmt.upper() == "JPEG": image.save(buffered, format="JPEG", quality=quality, optimize=True) else: # PNG image.save(buffered, format="PNG", optimize=True) - img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") - base64_images.append(img_str) - - return base64_images + yield img_str + pdf.close() def _pdf_to_markdown(pdf_path): diff --git a/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py 
b/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py new file mode 100644 index 0000000000..20f381762c --- /dev/null +++ b/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py @@ -0,0 +1,67 @@ +# Generated by Django 4.2.23 on 2025-08-14 15:20 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("learning_resources", "0093_tutorproblem_view_group"), + ] + + operations = [ + migrations.AlterField( + model_name="contentsummarizerconfiguration", + name="allowed_extensions", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + (".csv", ".csv"), + (".doc", ".doc"), + (".docx", ".docx"), + (".htm", ".htm"), + (".html", ".html"), + (".json", ".json"), + (".m", ".m"), + (".mat", ".mat"), + (".md", ".md"), + (".pdf", ".pdf"), + (".ppt", ".ppt"), + (".pptx", ".pptx"), + (".ps", ".ps"), + (".py", ".py"), + (".r", ".r"), + (".rtf", ".rtf"), + (".sjson", ".sjson"), + (".srt", ".srt"), + (".txt", ".txt"), + (".vtt", ".vtt"), + (".xls", ".xls"), + (".xlsx", ".xlsx"), + (".xml", ".xml"), + (".doc", ".doc"), + (".docx", ".docx"), + (".htm", ".htm"), + (".html", ".html"), + (".json", ".json"), + (".md", ".md"), + (".pdf", ".pdf"), + (".tex", ".tex"), + (".ppt", ".ppt"), + (".pptx", ".pptx"), + (".rtf", ".rtf"), + (".sjson", ".sjson"), + (".srt", ".srt"), + (".txt", ".txt"), + (".vtt", ".vtt"), + (".xml", ".xml"), + ], + max_length=128, + ), + blank=True, + default=list, + null=True, + size=None, + ), + ), + ] diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index 51d3914b73..16db1722b0 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite): for archive in latest_archives.values(): key = archive.key log.info("Ingesting canvas course %s", key) - 
resource_readable_id, canvas_run = ingest_canvas_course( + resource_readable_id = ingest_canvas_course( key, overwrite=overwrite, ) diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py index ceeb2cf3bb..249cdb6cbd 100644 --- a/learning_resources/tasks_test.py +++ b/learning_resources/tasks_test.py @@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas # Patch ingest_canvas_course to return the readable_ids for the two non-stale courses mock_ingest_course = mocker.patch( "learning_resources.tasks.ingest_canvas_course", - side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())], + side_effect=["course1", "course2"], ) sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False) diff --git a/main/settings.py b/main/settings.py index ef2c14acca..a09ee22073 100644 --- a/main/settings.py +++ b/main/settings.py @@ -34,7 +34,7 @@ from main.settings_pluggy import * # noqa: F403 from openapi.settings_spectacular import open_spectacular_settings -VERSION = "0.40.0" +VERSION = "0.40.1" log = logging.getLogger() diff --git a/poetry.lock b/poetry.lock index 782f978038..85e9185648 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5512,21 +5512,6 @@ pygments = "*" [package.extras] testing = ["ipython", "pexpect", "pytest", "pytest-cov"] -[[package]] -name = "pdf2image" -version = "1.17.0" -description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list." 
-optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2"}, - {file = "pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57"}, -] - -[package.dependencies] -pillow = "*" - [[package]] name = "pexpect" version = "4.9.0" @@ -6389,6 +6374,29 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow (>=8.0.0)", "cryptography"] image = ["Pillow (>=8.0.0)"] +[[package]] +name = "pypdfium2" +version = "4.30.0" +description = "Python bindings to PDFium" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, + {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, + {file = 
"pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"}, + {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"}, + {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"}, + {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"}, + {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, +] + [[package]] name = "pyreadline3" version = "3.5.4" @@ -9105,4 +9113,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "~3.12" -content-hash = "63e448c31390942ead05c25627a91c91d5bfec50d2e6460432ef7868c5e0ffe8" +content-hash = "04bc62389781a0c453f7df3965a1b6bc2e728c5ed68377392776108ab0ea09b6" diff --git a/pyproject.toml b/pyproject.toml index 551a4983fc..d76f251f33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ django-cors-headers = "^4.0.0" django-filter = "^2.4.0" django-guardian = "^3.0.0" django-health-check = { git = "https://github.com/revsys/django-health-check", rev="b0500d14c338040984f02ee34ffbe6643b005084" } # pragma: allowlist secret + + django-imagekit = "^5.0.0" django-ipware = "^7.0.0" django-json-widget = "^2.0.0" @@ -111,7 +113,9 @@ uwsgi = "^2.0.29" uwsgitop = "^0.12" wrapt = "^1.14.1" youtube-transcript-api = "^1.0.0" -pdf2image = "^1.17.0" +pypdfium2 = "^4.30.0" + + [tool.poetry.group.dev.dependencies] bpython = "^0.25" diff --git a/vector_search/conftest.py b/vector_search/conftest.py index a95efb1552..0148a4c1d9 100644 --- a/vector_search/conftest.py +++ b/vector_search/conftest.py @@ -31,6 +31,7 @@ def _use_dummy_encoder(settings): def _use_test_qdrant_settings(settings, mocker): settings.QDRANT_HOST = "https://test" 
settings.QDRANT_BASE_COLLECTION_NAME = "test" + settings.LITELLM_API_BASE = "https://test/api/" settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP = 0 settings.CONTENT_FILE_EMBEDDING_SEMANTIC_CHUNKING_ENABLED = False mock_qdrant = mocker.patch("qdrant_client.QdrantClient") diff --git a/vector_search/utils.py b/vector_search/utils.py index eac2d415e4..fc3dae87a8 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -524,11 +524,15 @@ def embed_learning_resources(ids, resource_type, overwrite): else: serialized_resources = list(serialize_bulk_content_files(ids)) # TODO: Pass actual Ids when we want scheduled content file summarization # noqa: FIX002, TD002, TD003 E501 - # Currently we only want to summarize content that already has a summary + # Currently we only want to summarize content that either already has a summary + # OR is in a course where at least one other content file has a summary existing_summary_content_ids = [ resource["id"] for resource in serialized_resources if resource.get("summary") + or ContentFile.objects.filter(run__id=resource.get("run_id")) .exclude(summary="") .exists() ] ContentSummarizer().summarize_content_files_by_ids( existing_summary_content_ids, overwrite diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index 1bcf1da56e..26c4f5ecf6 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -13,6 +13,9 @@ ) from learning_resources.models import LearningResource from learning_resources.serializers import LearningResourceMetadataDisplaySerializer +from learning_resources_search.constants import ( + CONTENT_FILE_TYPE, +) from learning_resources_search.serializers import ( serialize_bulk_content_files, serialize_bulk_learning_resources, @@ -806,3 +809,44 @@ def test_update_content_file_payload_only_includes_existing_keys( ) else: mock_retrieve.assert_not_called() + + +@pytest.mark.django_db +def test_embed_learning_resources_contentfile_summarization_filter(mocker): + """ + Test that 
the summarizer runs for a content file if another content file + in the parent learning run also has a summary. + """ + settings.OPENAI_API_KEY = "test" + settings.QDRANT_ENABLE_INDEXING_PLUGIN_HOOKS = True + mock_content_summarizer = mocker.patch( + "learning_resources.content_summarizer.ContentSummarizer.summarize_content_files_by_ids" + ) + mock_chat_llm = mocker.patch( + "learning_resources.content_summarizer.ChatLiteLLM", autospec=True + ) + mock_instance = mock_chat_llm.return_value + mock_summary_response = mocker.MagicMock() + mock_summary_response.content = "mocked summary" + mock_instance.invoke.return_value = mock_summary_response + mock_instance.with_structured_output.return_value.invoke.return_value = { + "flashcards": [ + { + "question": "Generated Question", + "answer": "Generated Answer", + } + ] + } + + run = LearningResourceRunFactory.create(published=True) + ContentFileFactory.create_batch( + 2, content="test content", summary="summary text", run=run + ) + new_content_files = ContentFileFactory.create_batch( + 2, content="new content", summary="", run=run + ) + cf_ids = [cf.id for cf in new_content_files] + embed_learning_resources(cf_ids, resource_type=CONTENT_FILE_TYPE, overwrite=False) + + # Assert that the summarizer was called with the correct content file IDs + assert sorted(mock_content_summarizer.mock_calls[0].args[0]) == sorted(cf_ids)