From ca6ad9158971588e6c5248531544c25e62208e32 Mon Sep 17 00:00:00 2001 From: Anastasia Beglova Date: Wed, 8 Oct 2025 15:47:30 -0400 Subject: [PATCH] fix canvas problem file import --- learning_resources/etl/canvas.py | 91 +---------------- learning_resources/etl/canvas_test.py | 141 ++++++++++++++------------ learning_resources/etl/utils.py | 109 +++++++++++++++++++- 3 files changed, 179 insertions(+), 162 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 1a95959e88..e1fb19e511 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -1,16 +1,11 @@ -import base64 import logging import zipfile from collections.abc import Generator from datetime import datetime -from io import BytesIO from pathlib import Path from tempfile import TemporaryDirectory -import pypdfium2 as pdfium from django.conf import settings -from litellm import completion -from PIL import Image from learning_resources.constants import ( TUTOR_PROBLEM_TYPE, @@ -213,6 +208,7 @@ def transform_canvas_problem_files( run, overwrite=overwrite, valid_file_types=VALID_TUTOR_PROBLEM_FILE_TYPES, + is_tutor_problem_file_import=True, ): keys_to_keep = [ "run", @@ -249,89 +245,4 @@ def transform_canvas_problem_files( problem_file_data["content"] ) - if ( - problem_file_data["file_extension"].lower() == ".pdf" - and settings.CANVAS_PDF_TRANSCRIPTION_MODEL - and not run.problem_files.filter( - checksum=problem_file_data["checksum"] - ).exists() - ): - markdown_content = _pdf_to_markdown( - Path(olx_path) / Path(problem_file_data["source_path"]) - ) - if markdown_content: - problem_file_data["content"] = markdown_content yield problem_file_data - - -def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): - """ - Convert a PDF file to a list of base64 encoded images (one per page). - Resizes images to reduce file size while keeping good OCR quality. 
- - Args: - pdf_path (str): Path to the PDF file - dpi (int): DPI for the output images (default: 200) - fmt (str): Output format ('JPEG' or 'PNG') (default: 'JPEG') - max_size (int): Maximum width/height in pixels (default: 2000) - quality (int): JPEG quality (1-100, default: 85) - - Returns: - list: List of base64 encoded strings (one per page) - """ - - pdf = pdfium.PdfDocument(pdf_path) - for page_index in range(len(pdf)): - page = pdf.get_page(page_index) - image = page.render(scale=2).to_pil() - page.close() - # Resize the image if it's too large (preserving aspect ratio) - if max(image.size) > max_size: - image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS) - buffered = BytesIO() - # Save with optimized settings - if fmt.upper() == "JPEG": - image.save(buffered, format="JPEG", quality=quality, optimize=True) - else: # PNG - image.save(buffered, format="PNG", optimize=True) - img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") - yield img_str - pdf.close() - - -def _pdf_to_markdown(pdf_path): - """ - Convert a PDF file to markdown using an llm - """ - markdown = "" - for im in pdf_to_base64_images(pdf_path): - response = completion( - api_base=settings.LITELLM_API_BASE, - custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER, - model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL, - messages=[ - { - "role": "user", - "content": [ - { - "type": "text", - "text": settings.CANVAS_TRANSCRIPTION_PROMPT, - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{im}", - }, - }, - ], - } - ], - ) - markdown_snippet = ( - response.json()["choices"][0]["message"]["content"] - .removeprefix("```markdown\n") - .removesuffix("\n```") - ) - - markdown += markdown_snippet - return markdown diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 4a63745a8a..adcda294f6 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -34,7 +34,7 @@ ) from 
learning_resources.models import LearningResource from learning_resources_search.constants import CONTENT_FILE_TYPE -from main.utils import checksum_for_content, now_in_utc +from main.utils import now_in_utc pytestmark = pytest.mark.django_db @@ -438,11 +438,14 @@ def test_transform_canvas_content_files_removes_unpublished_content(mocker, tmp_ bulk_unpub.assert_called_once_with([unpublished_cf.id], CONTENT_FILE_TYPE) +@pytest.mark.parametrize("overwrite", [True, False]) +@pytest.mark.parametrize("existing_file", [True, False]) def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown( - tmp_path, mocker, settings + tmp_path, mocker, settings, overwrite, existing_file ): """ Test that transform_canvas_problem_files calls _pdf_to_markdown for PDF files. + if overwrite is True or there is no existing file. Tika should not be called """ settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/" settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model" pdf_filename = "problemset1/problem.pdf" pdf_content = b"%PDF-1.4 fake pdf content" zip_path = make_canvas_zip( tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)] ) # Patch process_olx_path to yield a fake file data dict, # return a file with pdf extension fake_file_data = { "run": "run", - "content": "original pdf content", + "content_type": "application/pdf", "archive_checksum": "checksum", "source_path": f"tutorbot/{pdf_filename}", "file_extension": ".pdf", } mocker.patch( - "learning_resources.etl.canvas.process_olx_path", - return_value=iter([fake_file_data]), + "learning_resources.etl.utils.documents_from_olx", + return_value=iter([[mocker.Mock(), fake_file_data]]), ) # Patch _pdf_to_markdown to return a known value pdf_to_md = mocker.patch( - "learning_resources.etl.canvas._pdf_to_markdown", + "learning_resources.etl.utils._pdf_to_markdown", return_value="markdown content from pdf", ) + tika = mocker.patch( + "learning_resources.etl.utils.extract_text_metadata", + ) + run = LearningResourceRunFactory.create() - results = list(transform_canvas_problem_files(zip_path, run, overwrite=True)) + if existing_file: + TutorProblemFileFactory.create( + run=run, + type="problem", + archive_checksum="checksum", + 
source_path=f"tutorbot/{pdf_filename}", + content="existing content", + file_name="problem1.pdf", + ) + + results = list(transform_canvas_problem_files(zip_path, run, overwrite=overwrite)) - pdf_to_md.assert_called_once() - assert results[0]["content"] == "markdown content from pdf" + if overwrite or not existing_file: + pdf_to_md.assert_called_once() + else: + pdf_to_md.assert_not_called() + + tika.assert_not_called() + + assert ( + results[0]["content"] == "markdown content from pdf" + if overwrite or not existing_file + else "existing content" + ) assert results[0]["problem_title"] == "problemset1" +@pytest.mark.django_db +@pytest.mark.parametrize("overwrite", [True, False]) +@pytest.mark.parametrize("existing_file", [True, False]) def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown( - tmp_path, mocker, settings + tmp_path, mocker, settings, overwrite, existing_file ): """ - Test that transform_canvas_problem_files does not call _pdf_to_markdown for non-PDF files. + Test that transform_canvas_problem_files does not call _pdf_to_markdown but calls tika for + non-PDF files. Neither tika nor _pdf_to_markdown should be called if overwrite is false and + there is an existing file. 
""" settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/" settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model" @@ -498,24 +530,48 @@ def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown( fake_file_data = { "run": "run", - "content": csv_content, + "content_type": "application/csv", "archive_checksum": "checksum", "source_path": f"tutorbot/{csv_filename}", "file_extension": ".csv", } + mocker.patch( - "learning_resources.etl.canvas.process_olx_path", - return_value=iter([fake_file_data]), + "learning_resources.etl.utils.documents_from_olx", + return_value=iter([[mocker.Mock(), fake_file_data]]), ) - pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown") + pdf_to_md = mocker.patch("learning_resources.etl.utils._pdf_to_markdown") - run = mocker.Mock() + tika = mocker.patch( + "learning_resources.etl.utils.extract_text_metadata", + return_value={"content": csv_content}, + ) + + run = LearningResourceRunFactory.create() + + if existing_file: + TutorProblemFileFactory.create( + run=run, + type="problem", + archive_checksum="checksum", + source_path=f"tutorbot/{csv_filename}", + content="existing content", + file_name="problem2.csv", + ) - results = list(transform_canvas_problem_files(zip_path, run, overwrite=True)) + results = list(transform_canvas_problem_files(zip_path, run, overwrite=overwrite)) pdf_to_md.assert_not_called() - assert results[0]["content"] == csv_content + if overwrite or not existing_file: + tika.assert_called_once() + else: + tika.assert_not_called() + assert ( + results[0]["content"] == csv_content + if overwrite or not existing_file + else "existing content" + ) assert results[0]["problem_title"] == "problemset2" @@ -1535,52 +1591,3 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path): } published = get_published_items(zip_path, url_config) assert Path("web_resources/file1.pdf").resolve() in published - - -def 
test_transform_canvas_problem_files_skips_pdf_to_markdown_if_checksum_exists( - tmp_path, mocker, settings -): - """ - Test that transform_canvas_problem_files does not call _pdf_to_markdown if the checksum already exists. - """ - settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/" - settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model" - pdf_filename = "problemset3/problem.pdf" - pdf_content = b"%PDF-1.4 fake pdf content" - zip_path = make_canvas_zip( - tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)] - ) - - original_pdf_content = "original pdf content" - existing_checksum = checksum_for_content(original_pdf_content) - - mock_run = LearningResourceRunFactory.create() - TutorProblemFileFactory.create( - run=mock_run, - problem_title="Problem Set 1", - type="problem", - checksum=existing_checksum, - ) - - fake_file_data = { - "run": mock_run, - "content": original_pdf_content, - "archive_checksum": "checksum", - "source_path": f"tutorbot/{pdf_filename}", - "file_extension": ".pdf", - } - - mocker.patch( - "learning_resources.etl.canvas.process_olx_path", - return_value=iter([fake_file_data]), - ) - - pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown") - - results = list(transform_canvas_problem_files(zip_path, mock_run, overwrite=True)) - - pdf_to_md.assert_not_called() - - assert len(results) == 1 - assert results[0]["content"] == "original pdf content" - assert results[0]["source_path"] == f"tutorbot/{pdf_filename}" diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index e402b7a78f..304aa6b2cc 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -1,5 +1,6 @@ """Helper functions for ETL""" +import base64 import glob import json import logging @@ -15,18 +16,22 @@ from datetime import UTC, datetime from decimal import Decimal from hashlib import md5 +from io import BytesIO from pathlib import Path from subprocess import check_call from tempfile import TemporaryDirectory from 
typing import Optional import boto3 +import pypdfium2 as pdfium import rapidjson import requests from defusedxml import ElementTree from django.conf import settings from django.utils.dateparse import parse_duration from django.utils.text import slugify +from litellm import completion +from PIL import Image from pycountry import currencies from tika import parser as tika_parser @@ -57,6 +62,7 @@ LearningResourceRun, LearningResourceTopic, LearningResourceTopicMapping, + TutorProblemFile, ) log = logging.getLogger(__name__) @@ -535,6 +541,7 @@ def process_olx_path( *, overwrite, valid_file_types=VALID_TEXT_FILE_TYPES, + is_tutor_problem_file_import=False, ) -> Generator[dict, None, None]: video_srt_metadata = get_video_metadata(olx_path, run) for document, metadata in documents_from_olx( @@ -546,16 +553,33 @@ def process_olx_path( content_type = metadata["content_type"] mime_type = metadata.get("mime_type") file_extension = metadata.get("file_extension") - existing_content = ContentFile.objects.filter(key=key, run=run).first() + + if is_tutor_problem_file_import: + existing_record = TutorProblemFile.objects.filter( + source_path=source_path, run=run + ).first() + else: + existing_record = ContentFile.objects.filter(key=key, run=run).first() + if ( - not existing_content - or existing_content.archive_checksum != metadata.get("archive_checksum") + not existing_record + or existing_record.archive_checksum != metadata.get("archive_checksum") ) or overwrite: if settings.SKIP_TIKA and settings.ENVIRONMENT != "production": content_dict = { "content": "", "content_title": "", } + elif ( + file_extension == ".pdf" + and is_tutor_problem_file_import + and settings.CANVAS_PDF_TRANSCRIPTION_MODEL + ): + markdown_content = _pdf_to_markdown(Path(olx_path) / Path(source_path)) + content_dict = { + "content": markdown_content, + "content_title": "", + } else: tika_output = extract_text_metadata( document, @@ -584,8 +608,10 @@ def process_olx_path( } else: content_dict = { - 
"content": existing_content.content, - "content_title": existing_content.content_title, + "content": existing_record.content, + "content_title": "" + if is_tutor_problem_file_import + else existing_record.content_title, } yield ( { @@ -1041,3 +1067,76 @@ def parse_resource_commitment(commitment_str: str) -> CommitmentConfig: else: log.warning("Invalid commitment: %s", commitment_str) return CommitmentConfig(commitment=commitment_str or "") + + +def _pdf_to_markdown(pdf_path): + """ + Convert a PDF file to markdown using an llm + """ + markdown = "" + for im in pdf_to_base64_images(pdf_path): + response = completion( + api_base=settings.LITELLM_API_BASE, + custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER, + model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": settings.CANVAS_TRANSCRIPTION_PROMPT, + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{im}", + }, + }, + ], + } + ], + ) + markdown_snippet = ( + response.json()["choices"][0]["message"]["content"] + .removeprefix("```markdown\n") + .removesuffix("\n```") + ) + + markdown += markdown_snippet + return markdown + + +def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): + """ + Convert a PDF file to a list of base64 encoded images (one per page). + Resizes images to reduce file size while keeping good OCR quality. 
+ + Args: + pdf_path (str): Path to the PDF file + dpi (int): DPI for the output images (default: 200) + fmt (str): Output format ('JPEG' or 'PNG') (default: 'JPEG') + max_size (int): Maximum width/height in pixels (default: 2000) + quality (int): JPEG quality (1-100, default: 85) + + Returns: + list: List of base64 encoded strings (one per page) + """ + + pdf = pdfium.PdfDocument(pdf_path) + for page_index in range(len(pdf)): + page = pdf.get_page(page_index) + image = page.render(scale=2).to_pil() + page.close() + # Resize the image if it's too large (preserving aspect ratio) + if max(image.size) > max_size: + image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS) + buffered = BytesIO() + # Save with optimized settings + if fmt.upper() == "JPEG": + image.save(buffered, format="JPEG", quality=quality, optimize=True) + else: # PNG + image.save(buffered, format="PNG", optimize=True) + img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") + yield img_str + pdf.close()