diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 49d82b6bac..121cffd28f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,12 +47,16 @@ jobs: with: python-version-file: "pyproject.toml" cache: "poetry" - + - name: Install poetry with pip + run: python -m pip install poetry - name: Validate lockfile run: poetry check --lock - + - name: Set Poetry Python + run: poetry env use python3.12 - name: Install dependencies - run: poetry install --no-interaction + run: | + source $(poetry env info --path)/bin/activate + poetry install --no-interaction - name: Create test local state run: ./scripts/test/stub-data.sh diff --git a/learning_resources/constants.py b/learning_resources/constants.py index 78d673da4e..ef2c835fdd 100644 --- a/learning_resources/constants.py +++ b/learning_resources/constants.py @@ -147,6 +147,7 @@ class LearningResourceRelationTypes(TextChoices): ".json", ".md", ".pdf", + ".tex", ".ppt", ".pptx", ".rtf", diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 040e86a151..6e0ef24042 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -9,10 +9,10 @@ from pathlib import Path from tempfile import TemporaryDirectory +import pypdfium2 as pdfium from defusedxml import ElementTree from django.conf import settings from litellm import completion -from pdf2image import convert_from_path from PIL import Image from learning_resources.constants import ( @@ -71,7 +71,7 @@ def sync_canvas_archive(bucket, key: str, overwrite): run.checksum = checksum run.save() - return resource_readable_id, run + return resource_readable_id def _course_url(course_archive_path) -> str: @@ -323,7 +323,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict: return dict(resources_dict) -def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85): +def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): """ Convert a PDF file to a list of base64 encoded images (one per page). Resizes images to reduce file size while keeping good OCR quality. @@ -338,26 +338,24 @@ def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=8 Returns: list: List of base64 encoded strings (one per page) """ - images = convert_from_path(pdf_path, dpi=dpi) - base64_images = [] - for image in images: + pdf = pdfium.PdfDocument(pdf_path) + for page_index in range(len(pdf)): + page = pdf.get_page(page_index) + image = page.render(scale=2).to_pil() + page.close() # Resize the image if it's too large (preserving aspect ratio) if max(image.size) > max_size: image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS) - buffered = BytesIO() - # Save with optimized settings if fmt.upper() == "JPEG": image.save(buffered, format="JPEG", quality=quality, optimize=True) else: # PNG image.save(buffered, format="PNG", optimize=True) - img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") - base64_images.append(img_str) - - return base64_images + yield img_str + pdf.close() def _pdf_to_markdown(pdf_path): diff --git a/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py b/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py new file mode 100644 index 0000000000..20f381762c --- /dev/null +++ b/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py @@ -0,0 +1,67 @@ +# Generated by Django 4.2.23 on 2025-08-14 15:20 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("learning_resources", "0093_tutorproblem_view_group"), + ] + + operations = [ + migrations.AlterField( + model_name="contentsummarizerconfiguration", + name="allowed_extensions", + field=django.contrib.postgres.fields.ArrayField( + base_field=models.CharField( + choices=[ + (".csv", ".csv"), + (".doc", ".doc"), + (".docx", ".docx"), + (".htm", ".htm"), + (".html", ".html"), + (".json", ".json"), + (".m", ".m"), + (".mat", ".mat"), + (".md", ".md"), + (".pdf", ".pdf"), + (".ppt", ".ppt"), + (".pptx", ".pptx"), + (".ps", ".ps"), + (".py", ".py"), + (".r", ".r"), + (".rtf", ".rtf"), + (".sjson", ".sjson"), + (".srt", ".srt"), + (".txt", ".txt"), + (".vtt", ".vtt"), + (".xls", ".xls"), + (".xlsx", ".xlsx"), + (".xml", ".xml"), + (".doc", ".doc"), + (".docx", ".docx"), + (".htm", ".htm"), + (".html", ".html"), + (".json", ".json"), + (".md", ".md"), + (".pdf", ".pdf"), + (".tex", ".tex"), + (".ppt", ".ppt"), + (".pptx", ".pptx"), + (".rtf", ".rtf"), + (".sjson", ".sjson"), + (".srt", ".srt"), + (".txt", ".txt"), + (".vtt", ".vtt"), + (".xml", ".xml"), + ], + max_length=128, + ), + blank=True, + default=list, + null=True, + size=None, + ), + ), + ] diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py index 51d3914b73..16db1722b0 100644 --- a/learning_resources/tasks.py +++ b/learning_resources/tasks.py @@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite): for archive in latest_archives.values(): key = archive.key log.info("Ingesting canvas course %s", key) - resource_readable_id, canvas_run = ingest_canvas_course( + resource_readable_id = ingest_canvas_course( key, overwrite=overwrite, ) diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py index ceeb2cf3bb..249cdb6cbd 100644 --- a/learning_resources/tasks_test.py +++ b/learning_resources/tasks_test.py @@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas # Patch ingest_canvas_course to return the readable_ids for the two non-stale courses mock_ingest_course = mocker.patch( "learning_resources.tasks.ingest_canvas_course", - side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())], + side_effect=["course1", "course2"], ) sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False) diff --git a/poetry.lock b/poetry.lock index 782f978038..85e9185648 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5512,21 +5512,6 @@ pygments = "*" [package.extras] testing = ["ipython", "pexpect", "pytest", "pytest-cov"] -[[package]] -name = "pdf2image" -version = "1.17.0" -description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list." -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2"}, - {file = "pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57"}, -] - -[package.dependencies] -pillow = "*" - [[package]] name = "pexpect" version = "4.9.0" @@ -6389,6 +6374,29 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow (>=8.0.0)", "cryptography"] image = ["Pillow (>=8.0.0)"] +[[package]] +name = "pypdfium2" +version = "4.30.0" +description = "Python bindings to PDFium" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, + {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"}, + {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"}, + {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"}, + {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"}, + {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, +] + [[package]] name = "pyreadline3" version = "3.5.4" @@ -9105,4 +9113,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "~3.12" -content-hash = "63e448c31390942ead05c25627a91c91d5bfec50d2e6460432ef7868c5e0ffe8" +content-hash = "04bc62389781a0c453f7df3965a1b6bc2e728c5ed68377392776108ab0ea09b6" diff --git a/pyproject.toml b/pyproject.toml index 551a4983fc..d76f251f33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ django-cors-headers = "^4.0.0" django-filter = "^2.4.0" django-guardian = "^3.0.0" django-health-check = { git = "https://github.com/revsys/django-health-check", rev="b0500d14c338040984f02ee34ffbe6643b005084" } # pragma: allowlist secret + + django-imagekit = "^5.0.0" django-ipware = "^7.0.0" django-json-widget = "^2.0.0" @@ -111,7 +113,9 @@ uwsgi = "^2.0.29" uwsgitop = "^0.12" wrapt = "^1.14.1" youtube-transcript-api = "^1.0.0" -pdf2image = "^1.17.0" +pypdfium2 = "^4.30.0" + + [tool.poetry.group.dev.dependencies] bpython = "^0.25"