Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,16 @@ jobs:
with:
python-version-file: "pyproject.toml"
cache: "poetry"

- name: Install poetry with pip
run: python -m pip install poetry
- name: Validate lockfile
run: poetry check --lock

- name: Set Poetry Python
run: poetry env use python3.12
- name: Install dependencies
run: poetry install --no-interaction
run: |
source $(poetry env info --path)/bin/activate
poetry install --no-interaction

- name: Create test local state
run: ./scripts/test/stub-data.sh
Expand Down
8 changes: 8 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
Release Notes
=============

Version 0.40.1
--------------

- Fix program collection org filter bug (#2435)
- Optimize memory footprint of pdf problem transcription task (#2433)
- Replace the social media image (#2434)
- Generate summaries for new video transcripts (#2428)

Version 0.40.0 (Released August 13, 2025)
-----------------------------------------

Expand Down
Binary file modified frontends/main/public/images/learn-og-image.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ const OrgProgramCollectionDisplay: React.FC<{
key={item.programId}
program={item.program}
enrollments={enrollments}
orgId={orgId}
/>
) : null,
)}
Expand All @@ -206,7 +207,7 @@ const OrgProgramDisplay: React.FC<{
program: DashboardProgram
courseRunEnrollments?: CourseRunEnrollment[]
programLoading: boolean
orgId?: number
orgId: number
}> = ({ program, courseRunEnrollments, programLoading, orgId }) => {
const courses = useQuery(
coursesQueries.coursesList({ id: program.courseIds, org_id: orgId }),
Expand Down Expand Up @@ -254,14 +255,17 @@ const OrgProgramDisplay: React.FC<{
const ProgramCollectionItem: React.FC<{
program: DashboardProgram
enrollments?: CourseRunEnrollment[]
}> = ({ program, enrollments }) => {
return <ProgramCard program={program} enrollments={enrollments} />
orgId: number
}> = ({ program, enrollments, orgId }) => {
return (
<ProgramCard program={program} enrollments={enrollments} orgId={orgId} />
)
}

const ProgramCard: React.FC<{
program: DashboardProgram
enrollments?: CourseRunEnrollment[]
orgId?: number
orgId: number
}> = ({ program, enrollments, orgId }) => {
const courses = useQuery(
coursesQueries.coursesList({
Expand Down
4 changes: 2 additions & 2 deletions frontends/main/src/common/metadata.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ export const standardizeMetadata = ({
images: [
{
url: image,
width: image === DEFAULT_OG_IMAGE ? "" : 967,
height: image === DEFAULT_OG_IMAGE ? "" : 511,
width: image === DEFAULT_OG_IMAGE ? 967 : "",
height: image === DEFAULT_OG_IMAGE ? 511 : "",
alt: imageAlt,
},
],
Expand Down
1 change: 1 addition & 0 deletions learning_resources/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ class LearningResourceRelationTypes(TextChoices):
".json",
".md",
".pdf",
".tex",
".ppt",
".pptx",
".rtf",
Expand Down
4 changes: 4 additions & 0 deletions learning_resources/content_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,15 @@ def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM:
if not settings.LITELLM_CUSTOM_PROVIDER:
raise ValueError("The 'LITELLM_CUSTOM_PROVIDER' setting must be set.") # noqa: EM101, TRY003

if not settings.LITELLM_API_BASE:
raise ValueError("The 'LITELLM_API_BASE' setting must be set.") # noqa: EM101, TRY003

return ChatLiteLLM(
model=model,
temperature=temperature,
max_tokens=max_tokens,
custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
api_base=settings.LITELLM_API_BASE,
)

def _generate_summary(self, content: str, llm_model: str) -> str:
Expand Down
5 changes: 5 additions & 0 deletions learning_resources/content_summarizer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
pytestmark = pytest.mark.django_db


@pytest.fixture(autouse=True)
def setup_settings(settings):
    """Point LITELLM_API_BASE at a dummy URL; applied automatically (autouse)"""
    # NOTE(review): presumably needed because the summarizer's _get_llm raises
    # ValueError when LITELLM_API_BASE is unset - confirm against that module.
    settings.LITELLM_API_BASE = "https://test/api/"


@pytest.fixture
def mock_summarize_single_content_file(mocker):
"""Fixture for mocking the process single file method"""
Expand Down
22 changes: 10 additions & 12 deletions learning_resources/etl/canvas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
from pathlib import Path
from tempfile import TemporaryDirectory

import pypdfium2 as pdfium
from defusedxml import ElementTree
from django.conf import settings
from litellm import completion
from pdf2image import convert_from_path
from PIL import Image

from learning_resources.constants import (
Expand Down Expand Up @@ -71,7 +71,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
run.checksum = checksum
run.save()

return resource_readable_id, run
return resource_readable_id


def _course_url(course_archive_path) -> str:
Expand Down Expand Up @@ -323,7 +323,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
return dict(resources_dict)


def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85):
def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
"""
Convert a PDF file to a list of base64 encoded images (one per page).
Resizes images to reduce file size while keeping good OCR quality.
Expand All @@ -338,26 +338,24 @@ def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=8
Returns:
list: List of base64 encoded strings (one per page)
"""
images = convert_from_path(pdf_path, dpi=dpi)
base64_images = []

for image in images:
pdf = pdfium.PdfDocument(pdf_path)
for page_index in range(len(pdf)):
page = pdf.get_page(page_index)
image = page.render(scale=2).to_pil()
page.close()
# Resize the image if it's too large (preserving aspect ratio)
if max(image.size) > max_size:
image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

buffered = BytesIO()

# Save with optimized settings
if fmt.upper() == "JPEG":
image.save(buffered, format="JPEG", quality=quality, optimize=True)
else: # PNG
image.save(buffered, format="PNG", optimize=True)

img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
base64_images.append(img_str)

return base64_images
yield img_str
pdf.close()


def _pdf_to_markdown(pdf_path):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Generated by Django 4.2.23 on 2025-08-14 15:20

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):
    """
    Alter the ``allowed_extensions`` field of ``contentsummarizerconfiguration``.

    The field is redefined as a nullable, blank-able ``ArrayField`` of
    ``CharField`` values (max length 128) limited to the file-extension
    choices listed below, which include ``.tex``.
    """

    # Must be applied after the preceding learning_resources migration.
    dependencies = [
        ("learning_resources", "0093_tutorproblem_view_group"),
    ]

    operations = [
        migrations.AlterField(
            model_name="contentsummarizerconfiguration",
            name="allowed_extensions",
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(
                    # NOTE(review): several entries (e.g. ".doc", ".pdf",
                    # ".xml") appear twice. They are kept exactly as
                    # generated - presumably the model builds its choices
                    # from two overlapping lists; confirm before deduplicating.
                    choices=[
                        (".csv", ".csv"),
                        (".doc", ".doc"),
                        (".docx", ".docx"),
                        (".htm", ".htm"),
                        (".html", ".html"),
                        (".json", ".json"),
                        (".m", ".m"),
                        (".mat", ".mat"),
                        (".md", ".md"),
                        (".pdf", ".pdf"),
                        (".ppt", ".ppt"),
                        (".pptx", ".pptx"),
                        (".ps", ".ps"),
                        (".py", ".py"),
                        (".r", ".r"),
                        (".rtf", ".rtf"),
                        (".sjson", ".sjson"),
                        (".srt", ".srt"),
                        (".txt", ".txt"),
                        (".vtt", ".vtt"),
                        (".xls", ".xls"),
                        (".xlsx", ".xlsx"),
                        (".xml", ".xml"),
                        (".doc", ".doc"),
                        (".docx", ".docx"),
                        (".htm", ".htm"),
                        (".html", ".html"),
                        (".json", ".json"),
                        (".md", ".md"),
                        (".pdf", ".pdf"),
                        (".tex", ".tex"),
                        (".ppt", ".ppt"),
                        (".pptx", ".pptx"),
                        (".rtf", ".rtf"),
                        (".sjson", ".sjson"),
                        (".srt", ".srt"),
                        (".txt", ".txt"),
                        (".vtt", ".vtt"),
                        (".xml", ".xml"),
                    ],
                    max_length=128,
                ),
                blank=True,
                # Callable default, so a new list is created on each call.
                default=list,
                null=True,
                size=None,
            ),
        ),
    ]
2 changes: 1 addition & 1 deletion learning_resources/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite):
for archive in latest_archives.values():
key = archive.key
log.info("Ingesting canvas course %s", key)
resource_readable_id, canvas_run = ingest_canvas_course(
resource_readable_id = ingest_canvas_course(
key,
overwrite=overwrite,
)
Expand Down
2 changes: 1 addition & 1 deletion learning_resources/tasks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas
# Patch ingest_canvas_course to return the readable_ids for the two non-stale courses
mock_ingest_course = mocker.patch(
"learning_resources.tasks.ingest_canvas_course",
side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())],
side_effect=["course1", "course2"],
)
sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False)

Expand Down
2 changes: 1 addition & 1 deletion main/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from main.settings_pluggy import * # noqa: F403
from openapi.settings_spectacular import open_spectacular_settings

VERSION = "0.40.0"
VERSION = "0.40.1"

log = logging.getLogger()

Expand Down
40 changes: 24 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ django-cors-headers = "^4.0.0"
django-filter = "^2.4.0"
django-guardian = "^3.0.0"
django-health-check = { git = "https://github.com/revsys/django-health-check", rev="b0500d14c338040984f02ee34ffbe6643b005084" } # pragma: allowlist secret


django-imagekit = "^5.0.0"
django-ipware = "^7.0.0"
django-json-widget = "^2.0.0"
Expand Down Expand Up @@ -111,7 +113,9 @@ uwsgi = "^2.0.29"
uwsgitop = "^0.12"
wrapt = "^1.14.1"
youtube-transcript-api = "^1.0.0"
pdf2image = "^1.17.0"
pypdfium2 = "^4.30.0"



[tool.poetry.group.dev.dependencies]
bpython = "^0.25"
Expand Down
1 change: 1 addition & 0 deletions vector_search/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def _use_dummy_encoder(settings):
def _use_test_qdrant_settings(settings, mocker):
settings.QDRANT_HOST = "https://test"
settings.QDRANT_BASE_COLLECTION_NAME = "test"
settings.LITELLM_API_BASE = "https://test/api/"
settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP = 0
settings.CONTENT_FILE_EMBEDDING_SEMANTIC_CHUNKING_ENABLED = False
mock_qdrant = mocker.patch("qdrant_client.QdrantClient")
Expand Down
6 changes: 5 additions & 1 deletion vector_search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,11 +524,15 @@ def embed_learning_resources(ids, resource_type, overwrite):
else:
serialized_resources = list(serialize_bulk_content_files(ids))
# TODO: Pass actual Ids when we want scheduled content file summarization # noqa: FIX002, TD002, TD003 E501
# Currently we only want to summarize content that already has a summary
# Currently we only want to summarize content that either already has a summary
# OR is in a course where at least one other content file has a summary
existing_summary_content_ids = [
resource["id"]
for resource in serialized_resources
if resource.get("summary")
or ContentFile.objects.filter(run__id=resource.get("run_id"))
.exclude(summary="")
.exists()
]
ContentSummarizer().summarize_content_files_by_ids(
existing_summary_content_ids, overwrite
Expand Down
Loading
Loading