diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 49d82b6bac..121cffd28f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -47,12 +47,16 @@ jobs:
with:
python-version-file: "pyproject.toml"
cache: "poetry"
-
+ - name: Install poetry with pip
+ run: python -m pip install poetry
- name: Validate lockfile
run: poetry check --lock
-
+ - name: Set Poetry Python
+ run: poetry env use python3.12
- name: Install dependencies
- run: poetry install --no-interaction
+ run: |
+ source $(poetry env info --path)/bin/activate
+ poetry install --no-interaction
- name: Create test local state
run: ./scripts/test/stub-data.sh
diff --git a/RELEASE.rst b/RELEASE.rst
index 4c2e581c0b..4f8fc463b4 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -1,6 +1,14 @@
Release Notes
=============
+Version 0.40.1
+--------------
+
+- fix program collection org filter bug (#2435)
+- Optimize memory footprint of pdf problem transcription task (#2433)
+- Replace the social media image (#2434)
+- Generate summaries for new video transcripts (#2428)
+
Version 0.40.0 (Released August 13, 2025)
--------------
diff --git a/frontends/main/public/images/learn-og-image.jpg b/frontends/main/public/images/learn-og-image.jpg
index cf57a0decd..40e48f1614 100644
Binary files a/frontends/main/public/images/learn-og-image.jpg and b/frontends/main/public/images/learn-og-image.jpg differ
diff --git a/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx b/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx
index 346e1d33e8..fb7283c24c 100644
--- a/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx
+++ b/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx
@@ -194,6 +194,7 @@ const OrgProgramCollectionDisplay: React.FC<{
key={item.programId}
program={item.program}
enrollments={enrollments}
+ orgId={orgId}
/>
) : null,
)}
@@ -206,7 +207,7 @@ const OrgProgramDisplay: React.FC<{
program: DashboardProgram
courseRunEnrollments?: CourseRunEnrollment[]
programLoading: boolean
- orgId?: number
+ orgId: number
}> = ({ program, courseRunEnrollments, programLoading, orgId }) => {
const courses = useQuery(
coursesQueries.coursesList({ id: program.courseIds, org_id: orgId }),
@@ -254,14 +255,17 @@ const OrgProgramDisplay: React.FC<{
const ProgramCollectionItem: React.FC<{
program: DashboardProgram
enrollments?: CourseRunEnrollment[]
-}> = ({ program, enrollments }) => {
-  return <ProgramCard program={program} enrollments={enrollments} />
+ orgId: number
+}> = ({ program, enrollments, orgId }) => {
+  return (
+    <ProgramCard program={program} enrollments={enrollments} orgId={orgId} />
+  )
}
const ProgramCard: React.FC<{
program: DashboardProgram
enrollments?: CourseRunEnrollment[]
- orgId?: number
+ orgId: number
}> = ({ program, enrollments, orgId }) => {
const courses = useQuery(
coursesQueries.coursesList({
diff --git a/frontends/main/src/common/metadata.ts b/frontends/main/src/common/metadata.ts
index 30c9fedc2e..1a881cd827 100644
--- a/frontends/main/src/common/metadata.ts
+++ b/frontends/main/src/common/metadata.ts
@@ -98,8 +98,8 @@ export const standardizeMetadata = ({
images: [
{
url: image,
- width: image === DEFAULT_OG_IMAGE ? "" : 967,
- height: image === DEFAULT_OG_IMAGE ? "" : 511,
+ width: image === DEFAULT_OG_IMAGE ? 967 : "",
+ height: image === DEFAULT_OG_IMAGE ? 511 : "",
alt: imageAlt,
},
],
diff --git a/learning_resources/constants.py b/learning_resources/constants.py
index 78d673da4e..ef2c835fdd 100644
--- a/learning_resources/constants.py
+++ b/learning_resources/constants.py
@@ -147,6 +147,7 @@ class LearningResourceRelationTypes(TextChoices):
".json",
".md",
".pdf",
+ ".tex",
".ppt",
".pptx",
".rtf",
diff --git a/learning_resources/content_summarizer.py b/learning_resources/content_summarizer.py
index 04952ab6ae..ebc6465b80 100644
--- a/learning_resources/content_summarizer.py
+++ b/learning_resources/content_summarizer.py
@@ -215,11 +215,15 @@ def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM:
if not settings.LITELLM_CUSTOM_PROVIDER:
raise ValueError("The 'LITELLM_CUSTOM_PROVIDER' setting must be set.") # noqa: EM101, TRY003
+ if not settings.LITELLM_API_BASE:
+ raise ValueError("The 'LITELLM_API_BASE' setting must be set.") # noqa: EM101, TRY003
+
return ChatLiteLLM(
model=model,
temperature=temperature,
max_tokens=max_tokens,
custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
+ api_base=settings.LITELLM_API_BASE,
)
def _generate_summary(self, content: str, llm_model: str) -> str:
diff --git a/learning_resources/content_summarizer_test.py b/learning_resources/content_summarizer_test.py
index 6c81bb4fc9..c129f9c011 100644
--- a/learning_resources/content_summarizer_test.py
+++ b/learning_resources/content_summarizer_test.py
@@ -21,6 +21,11 @@
pytestmark = pytest.mark.django_db
+@pytest.fixture(autouse=True)
+def setup_settings(settings):
+ settings.LITELLM_API_BASE = "https://test/api/"
+
+
@pytest.fixture
def mock_summarize_single_content_file(mocker):
"""Fixture for mocking the process single file method"""
diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 040e86a151..6e0ef24042 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -9,10 +9,10 @@
from pathlib import Path
from tempfile import TemporaryDirectory
+import pypdfium2 as pdfium
from defusedxml import ElementTree
from django.conf import settings
from litellm import completion
-from pdf2image import convert_from_path
from PIL import Image
from learning_resources.constants import (
@@ -71,7 +71,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
run.checksum = checksum
run.save()
- return resource_readable_id, run
+ return resource_readable_id
def _course_url(course_archive_path) -> str:
@@ -323,7 +323,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
return dict(resources_dict)
-def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85):
+def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
"""
Convert a PDF file to a list of base64 encoded images (one per page).
Resizes images to reduce file size while keeping good OCR quality.
@@ -338,26 +338,24 @@ def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=8
Returns:
list: List of base64 encoded strings (one per page)
"""
- images = convert_from_path(pdf_path, dpi=dpi)
- base64_images = []
- for image in images:
+ pdf = pdfium.PdfDocument(pdf_path)
+ for page_index in range(len(pdf)):
+ page = pdf.get_page(page_index)
+ image = page.render(scale=2).to_pil()
+ page.close()
# Resize the image if it's too large (preserving aspect ratio)
if max(image.size) > max_size:
image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
-
buffered = BytesIO()
-
# Save with optimized settings
if fmt.upper() == "JPEG":
image.save(buffered, format="JPEG", quality=quality, optimize=True)
else: # PNG
image.save(buffered, format="PNG", optimize=True)
-
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
- base64_images.append(img_str)
-
- return base64_images
+ yield img_str
+ pdf.close()
def _pdf_to_markdown(pdf_path):
diff --git a/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py b/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py
new file mode 100644
index 0000000000..20f381762c
--- /dev/null
+++ b/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py
@@ -0,0 +1,67 @@
+# Generated by Django 4.2.23 on 2025-08-14 15:20
+
+import django.contrib.postgres.fields
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("learning_resources", "0093_tutorproblem_view_group"),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name="contentsummarizerconfiguration",
+ name="allowed_extensions",
+ field=django.contrib.postgres.fields.ArrayField(
+ base_field=models.CharField(
+ choices=[
+ (".csv", ".csv"),
+ (".doc", ".doc"),
+ (".docx", ".docx"),
+ (".htm", ".htm"),
+ (".html", ".html"),
+ (".json", ".json"),
+ (".m", ".m"),
+ (".mat", ".mat"),
+ (".md", ".md"),
+ (".pdf", ".pdf"),
+ (".ppt", ".ppt"),
+ (".pptx", ".pptx"),
+ (".ps", ".ps"),
+ (".py", ".py"),
+ (".r", ".r"),
+ (".rtf", ".rtf"),
+ (".sjson", ".sjson"),
+ (".srt", ".srt"),
+ (".txt", ".txt"),
+ (".vtt", ".vtt"),
+ (".xls", ".xls"),
+ (".xlsx", ".xlsx"),
+ (".xml", ".xml"),
+ (".doc", ".doc"),
+ (".docx", ".docx"),
+ (".htm", ".htm"),
+ (".html", ".html"),
+ (".json", ".json"),
+ (".md", ".md"),
+ (".pdf", ".pdf"),
+ (".tex", ".tex"),
+ (".ppt", ".ppt"),
+ (".pptx", ".pptx"),
+ (".rtf", ".rtf"),
+ (".sjson", ".sjson"),
+ (".srt", ".srt"),
+ (".txt", ".txt"),
+ (".vtt", ".vtt"),
+ (".xml", ".xml"),
+ ],
+ max_length=128,
+ ),
+ blank=True,
+ default=list,
+ null=True,
+ size=None,
+ ),
+ ),
+ ]
diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py
index 51d3914b73..16db1722b0 100644
--- a/learning_resources/tasks.py
+++ b/learning_resources/tasks.py
@@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite):
for archive in latest_archives.values():
key = archive.key
log.info("Ingesting canvas course %s", key)
- resource_readable_id, canvas_run = ingest_canvas_course(
+ resource_readable_id = ingest_canvas_course(
key,
overwrite=overwrite,
)
diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py
index ceeb2cf3bb..249cdb6cbd 100644
--- a/learning_resources/tasks_test.py
+++ b/learning_resources/tasks_test.py
@@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas
# Patch ingest_canvas_course to return the readable_ids for the two non-stale courses
mock_ingest_course = mocker.patch(
"learning_resources.tasks.ingest_canvas_course",
- side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())],
+ side_effect=["course1", "course2"],
)
sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False)
diff --git a/main/settings.py b/main/settings.py
index ef2c14acca..a09ee22073 100644
--- a/main/settings.py
+++ b/main/settings.py
@@ -34,7 +34,7 @@
from main.settings_pluggy import * # noqa: F403
from openapi.settings_spectacular import open_spectacular_settings
-VERSION = "0.40.0"
+VERSION = "0.40.1"
log = logging.getLogger()
diff --git a/poetry.lock b/poetry.lock
index 782f978038..85e9185648 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5512,21 +5512,6 @@ pygments = "*"
[package.extras]
testing = ["ipython", "pexpect", "pytest", "pytest-cov"]
-[[package]]
-name = "pdf2image"
-version = "1.17.0"
-description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list."
-optional = false
-python-versions = "*"
-groups = ["main"]
-files = [
- {file = "pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2"},
- {file = "pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57"},
-]
-
-[package.dependencies]
-pillow = "*"
-
[[package]]
name = "pexpect"
version = "4.9.0"
@@ -6389,6 +6374,29 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"]
full = ["Pillow (>=8.0.0)", "cryptography"]
image = ["Pillow (>=8.0.0)"]
+[[package]]
+name = "pypdfium2"
+version = "4.30.0"
+description = "Python bindings to PDFium"
+optional = false
+python-versions = ">=3.6"
+groups = ["main"]
+files = [
+ {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"},
+ {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"},
+ {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"},
+ {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"},
+ {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"},
+ {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"},
+ {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"},
+ {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"},
+ {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"},
+ {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"},
+ {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"},
+ {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"},
+ {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"},
+]
+
[[package]]
name = "pyreadline3"
version = "3.5.4"
@@ -9105,4 +9113,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "~3.12"
-content-hash = "63e448c31390942ead05c25627a91c91d5bfec50d2e6460432ef7868c5e0ffe8"
+content-hash = "04bc62389781a0c453f7df3965a1b6bc2e728c5ed68377392776108ab0ea09b6"
diff --git a/pyproject.toml b/pyproject.toml
index 551a4983fc..d76f251f33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,8 @@ django-cors-headers = "^4.0.0"
django-filter = "^2.4.0"
django-guardian = "^3.0.0"
django-health-check = { git = "https://github.com/revsys/django-health-check", rev="b0500d14c338040984f02ee34ffbe6643b005084" } # pragma: allowlist secret
+
+
django-imagekit = "^5.0.0"
django-ipware = "^7.0.0"
django-json-widget = "^2.0.0"
@@ -111,7 +113,9 @@ uwsgi = "^2.0.29"
uwsgitop = "^0.12"
wrapt = "^1.14.1"
youtube-transcript-api = "^1.0.0"
-pdf2image = "^1.17.0"
+pypdfium2 = "^4.30.0"
+
+
[tool.poetry.group.dev.dependencies]
bpython = "^0.25"
diff --git a/vector_search/conftest.py b/vector_search/conftest.py
index a95efb1552..0148a4c1d9 100644
--- a/vector_search/conftest.py
+++ b/vector_search/conftest.py
@@ -31,6 +31,7 @@ def _use_dummy_encoder(settings):
def _use_test_qdrant_settings(settings, mocker):
settings.QDRANT_HOST = "https://test"
settings.QDRANT_BASE_COLLECTION_NAME = "test"
+ settings.LITELLM_API_BASE = "https://test/api/"
settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP = 0
settings.CONTENT_FILE_EMBEDDING_SEMANTIC_CHUNKING_ENABLED = False
mock_qdrant = mocker.patch("qdrant_client.QdrantClient")
diff --git a/vector_search/utils.py b/vector_search/utils.py
index eac2d415e4..fc3dae87a8 100644
--- a/vector_search/utils.py
+++ b/vector_search/utils.py
@@ -524,11 +524,15 @@ def embed_learning_resources(ids, resource_type, overwrite):
else:
serialized_resources = list(serialize_bulk_content_files(ids))
# TODO: Pass actual Ids when we want scheduled content file summarization # noqa: FIX002, TD002, TD003 E501
- # Currently we only want to summarize content that already has a summary
+ # Currently we only want to summarize content that either already has a summary
+    # OR is in a course where at least one other content file has a summary
existing_summary_content_ids = [
resource["id"]
for resource in serialized_resources
if resource.get("summary")
+ or ContentFile.objects.filter(run__id=resource.get("run_id"))
+ .exclude(summary="")
+ .exists()
]
ContentSummarizer().summarize_content_files_by_ids(
existing_summary_content_ids, overwrite
diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py
index 1bcf1da56e..26c4f5ecf6 100644
--- a/vector_search/utils_test.py
+++ b/vector_search/utils_test.py
@@ -13,6 +13,9 @@
)
from learning_resources.models import LearningResource
from learning_resources.serializers import LearningResourceMetadataDisplaySerializer
+from learning_resources_search.constants import (
+ CONTENT_FILE_TYPE,
+)
from learning_resources_search.serializers import (
serialize_bulk_content_files,
serialize_bulk_learning_resources,
@@ -806,3 +809,44 @@ def test_update_content_file_payload_only_includes_existing_keys(
)
else:
mock_retrieve.assert_not_called()
+
+
+@pytest.mark.django_db
+def test_embed_learning_resources_contentfile_summarization_filter(mocker, settings):
+ """
+ Test that the summarizer runs for a content file if another content file
+ in the parent learning run also has a summary.
+ """
+ settings.OPENAI_API_KEY = "test"
+ settings.QDRANT_ENABLE_INDEXING_PLUGIN_HOOKS = True
+ mock_content_summarizer = mocker.patch(
+ "learning_resources.content_summarizer.ContentSummarizer.summarize_content_files_by_ids"
+ )
+ mock_chat_llm = mocker.patch(
+ "learning_resources.content_summarizer.ChatLiteLLM", autospec=True
+ )
+ mock_instance = mock_chat_llm.return_value
+ mock_summary_response = mocker.MagicMock()
+ mock_summary_response.content = "mocked summary"
+ mock_instance.invoke.return_value = mock_summary_response
+ mock_instance.with_structured_output.return_value.invoke.return_value = {
+ "flashcards": [
+ {
+ "question": "Generated Question",
+ "answer": "Generated Answer",
+ }
+ ]
+ }
+
+ run = LearningResourceRunFactory.create(published=True)
+ ContentFileFactory.create_batch(
+ 2, content="test content", summary="summary text", run=run
+ )
+ new_content_files = ContentFileFactory.create_batch(
+ 2, content="new content", summary="", run=run
+ )
+ cf_ids = [cf.id for cf in new_content_files]
+ embed_learning_resources(cf_ids, resource_type=CONTENT_FILE_TYPE, overwrite=False)
+
+ # Assert that the summarizer was called with the correct content file IDs
+ assert sorted(mock_content_summarizer.mock_calls[0].args[0]) == sorted(cf_ids)