Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,16 @@ jobs:
with:
python-version-file: "pyproject.toml"
cache: "poetry"

- name: Install poetry with pip
run: python -m pip install poetry
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems unrelated to the rest of the PR. Is it an intentional change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be unnecessary, but it was introduced to fix a strange issue that adding pypdfium2 uncovered with the Python version our CI runner uses (it had been using 3.10 all along, not the pinned 3.12). I will see if I can take this step out.

- name: Validate lockfile
run: poetry check --lock

- name: Set Poetry Python
run: poetry env use python3.12
- name: Install dependencies
run: poetry install --no-interaction
run: |
source $(poetry env info --path)/bin/activate
poetry install --no-interaction

- name: Create test local state
run: ./scripts/test/stub-data.sh
Expand Down
1 change: 1 addition & 0 deletions learning_resources/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ class LearningResourceRelationTypes(TextChoices):
".json",
".md",
".pdf",
".tex",
".ppt",
".pptx",
".rtf",
Expand Down
22 changes: 10 additions & 12 deletions learning_resources/etl/canvas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
from pathlib import Path
from tempfile import TemporaryDirectory

import pypdfium2 as pdfium
from defusedxml import ElementTree
from django.conf import settings
from litellm import completion
from pdf2image import convert_from_path
from PIL import Image

from learning_resources.constants import (
Expand Down Expand Up @@ -71,7 +71,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
run.checksum = checksum
run.save()

return resource_readable_id, run
return resource_readable_id


def _course_url(course_archive_path) -> str:
Expand Down Expand Up @@ -323,7 +323,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
return dict(resources_dict)


def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85):
def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
"""
Convert a PDF file to a list of base64 encoded images (one per page).
Resizes images to reduce file size while keeping good OCR quality.
Expand All @@ -338,26 +338,24 @@ def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=8
Returns:
list: List of base64 encoded strings (one per page)
"""
images = convert_from_path(pdf_path, dpi=dpi)
base64_images = []

for image in images:
pdf = pdfium.PdfDocument(pdf_path)
for page_index in range(len(pdf)):
page = pdf.get_page(page_index)
image = page.render(scale=2).to_pil()
page.close()
# Resize the image if it's too large (preserving aspect ratio)
if max(image.size) > max_size:
image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

buffered = BytesIO()

# Save with optimized settings
if fmt.upper() == "JPEG":
image.save(buffered, format="JPEG", quality=quality, optimize=True)
else: # PNG
image.save(buffered, format="PNG", optimize=True)

img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
base64_images.append(img_str)

return base64_images
yield img_str
pdf.close()


def _pdf_to_markdown(pdf_path):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Generated by Django 4.2.23 on 2025-08-14 15:20

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):
# Schema migration for the "learning_resources" app: it redefines the
# `allowed_extensions` field of the `contentsummarizerconfiguration` model.
# This file is generated by Django; the values below should keep mirroring
# the model definition rather than be edited by hand.

# Must be applied after migration 0093 of the same app.
dependencies = [
("learning_resources", "0093_tutorproblem_view_group"),
]

# A single operation: alter the field to a Postgres ArrayField whose elements
# are CharFields (max_length=128) restricted to the choices listed below.
operations = [
migrations.AlterField(
model_name="contentsummarizerconfiguration",
name="allowed_extensions",
field=django.contrib.postgres.fields.ArrayField(
base_field=models.CharField(
# Each choice is a (stored value, display label) pair; here both
# members are the same file-extension string.
# NOTE(review): every entry from the second ".doc" onward, except
# ".tex", repeats an entry that already appears earlier in this list.
# Presumably the model builds its choices by concatenating two
# overlapping extension lists -- confirm against the model/constants
# before de-duplicating there (not here).
choices=[
(".csv", ".csv"),
(".doc", ".doc"),
(".docx", ".docx"),
(".htm", ".htm"),
(".html", ".html"),
(".json", ".json"),
(".m", ".m"),
(".mat", ".mat"),
(".md", ".md"),
(".pdf", ".pdf"),
(".ppt", ".ppt"),
(".pptx", ".pptx"),
(".ps", ".ps"),
(".py", ".py"),
(".r", ".r"),
(".rtf", ".rtf"),
(".sjson", ".sjson"),
(".srt", ".srt"),
(".txt", ".txt"),
(".vtt", ".vtt"),
(".xls", ".xls"),
(".xlsx", ".xlsx"),
(".xml", ".xml"),
# Second run of extensions starts here (see NOTE above).
(".doc", ".doc"),
(".docx", ".docx"),
(".htm", ".htm"),
(".html", ".html"),
(".json", ".json"),
(".md", ".md"),
(".pdf", ".pdf"),
# ".tex" is the only value in this second run that does not also
# appear in the first run.
(".tex", ".tex"),
(".ppt", ".ppt"),
(".pptx", ".pptx"),
(".rtf", ".rtf"),
(".sjson", ".sjson"),
(".srt", ".srt"),
(".txt", ".txt"),
(".vtt", ".vtt"),
(".xml", ".xml"),
],
max_length=128,
),
# The field may be left blank in forms and may be NULL in the database;
# when no value is given, the `list` callable supplies a fresh empty list.
blank=True,
default=list,
null=True,
# size=None: no maximum array length is declared.
size=None,
),
),
]
2 changes: 1 addition & 1 deletion learning_resources/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite):
for archive in latest_archives.values():
key = archive.key
log.info("Ingesting canvas course %s", key)
resource_readable_id, canvas_run = ingest_canvas_course(
resource_readable_id = ingest_canvas_course(
key,
overwrite=overwrite,
)
Expand Down
2 changes: 1 addition & 1 deletion learning_resources/tasks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas
# Patch ingest_canvas_course to return the readable_ids for the two non-stale courses
mock_ingest_course = mocker.patch(
"learning_resources.tasks.ingest_canvas_course",
side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())],
side_effect=["course1", "course2"],
)
sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False)

Expand Down
40 changes: 24 additions & 16 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ django-cors-headers = "^4.0.0"
django-filter = "^2.4.0"
django-guardian = "^3.0.0"
django-health-check = { git = "https://github.com/revsys/django-health-check", rev="b0500d14c338040984f02ee34ffbe6643b005084" } # pragma: allowlist secret


django-imagekit = "^5.0.0"
django-ipware = "^7.0.0"
django-json-widget = "^2.0.0"
Expand Down Expand Up @@ -111,7 +113,9 @@ uwsgi = "^2.0.29"
uwsgitop = "^0.12"
wrapt = "^1.14.1"
youtube-transcript-api = "^1.0.0"
pdf2image = "^1.17.0"
pypdfium2 = "^4.30.0"



[tool.poetry.group.dev.dependencies]
bpython = "^0.25"
Expand Down
Loading