From 25fa2ea25c51f6b5a732eeae3f44846d45280f77 Mon Sep 17 00:00:00 2001
From: shankar ambady
Date: Wed, 1 Oct 2025 09:45:59 -0400
Subject: [PATCH 1/2] adding migration

---
Review notes (commentary only; not part of the commit message):

  * Adds migration 0097, which depends on 0096_tutorproblemfile_file_name
    and adds a nullable, blank-allowed CharField(max_length=32) named
    `checksum` to `tutorproblemfile`. models.py gains the matching field
    directly below `archive_checksum`.
  * NOTE(review): max_length=32 mirrors `archive_checksum`; presumably
    checksum_for_content() returns a 32-character digest - confirm.

 .../0097_tutorproblemfile_checksum.py         | 17 +++++++++++++++++
 learning_resources/models.py                  |  1 +
 2 files changed, 18 insertions(+)
 create mode 100644 learning_resources/migrations/0097_tutorproblemfile_checksum.py

diff --git a/learning_resources/migrations/0097_tutorproblemfile_checksum.py b/learning_resources/migrations/0097_tutorproblemfile_checksum.py
new file mode 100644
index 0000000000..d9a95d97ad
--- /dev/null
+++ b/learning_resources/migrations/0097_tutorproblemfile_checksum.py
@@ -0,0 +1,17 @@
+# Generated by Django 4.2.24 on 2025-10-01 13:30
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("learning_resources", "0096_tutorproblemfile_file_name"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="tutorproblemfile",
+            name="checksum",
+            field=models.CharField(blank=True, max_length=32, null=True),
+        ),
+    ]
diff --git a/learning_resources/models.py b/learning_resources/models.py
index 1d897e8c56..1dcc1e3f07 100644
--- a/learning_resources/models.py
+++ b/learning_resources/models.py
@@ -916,6 +916,7 @@ class TutorProblemFile(TimestampedModel):
 
     content = models.TextField(null=True, blank=True)  # noqa: DJ001
     archive_checksum = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
+    checksum = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
     source_path = models.CharField(max_length=1024, null=True, blank=True)  # noqa: DJ001
     file_extension = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
     file_name = models.CharField(max_length=256, null=True, blank=True)  # noqa: DJ001

From 05226fcf28b68954345a5985657b4aacbec1a9ce Mon Sep 17 00:00:00 2001
From: shankar ambady
Date: Wed, 1 Oct 2025 10:52:44 -0400
Subject: [PATCH 2/2] adding test

---
Review notes (commentary only; not part of the commit message):

  * canvas.py: transform_canvas_problem_files now stores
    checksum_for_content(problem_file_data["content"]) under "checksum",
    and calls _pdf_to_markdown for a .pdf only when run.problem_files has
    no row with that checksum. The subject says "adding test", but this
    patch changes runtime behavior as well.
  * canvas_test.py: the existing pdf test now builds its run with
    LearningResourceRunFactory.create() instead of mocker.Mock(), and a new
    test asserts that _pdf_to_markdown is not called when a
    TutorProblemFile with the same checksum already exists for the run.
  * NOTE(review): problem_file_data is built with `if key in file_data`,
    so problem_file_data["content"] raises KeyError if "content" is ever
    absent - confirm _process_olx_path always supplies it.
  * NOTE(review): on the skip path the yielded "content" is the
    untranscribed value (the new test asserts exactly that) - confirm the
    downstream loader does not replace previously stored markdown with it.

 learning_resources/etl/canvas.py      |  9 ++++-
 learning_resources/etl/canvas_test.py | 55 +++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 226a084da5..25860c2034 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -39,6 +39,7 @@
 from learning_resources_search.constants import (
     CONTENT_FILE_TYPE,
 )
+from main.utils import checksum_for_content
 
 log = logging.getLogger(__name__)
 
@@ -217,7 +218,6 @@ def transform_canvas_problem_files(
             problem_file_data = {
                 key: file_data[key] for key in keys_to_keep if key in file_data
             }
-
             path = file_data["source_path"]
             path = path[len(settings.CANVAS_TUTORBOT_FOLDER) :]
             path_parts = path.split("/", 1)
@@ -239,9 +239,16 @@ def transform_canvas_problem_files(
             else:
                 problem_file_data["type"] = TUTOR_PROBLEM_TYPE
 
+            problem_file_data["checksum"] = checksum_for_content(
+                problem_file_data["content"]
+            )
+
             if (
                 problem_file_data["file_extension"].lower() == ".pdf"
                 and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
+                and not run.problem_files.filter(
+                    checksum=problem_file_data["checksum"]
+                ).exists()
             ):
                 markdown_content = _pdf_to_markdown(
                     Path(olx_path) / Path(problem_file_data["source_path"])
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index b756dd5897..addff5affa 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -30,10 +30,11 @@
     LearningResourceFactory,
     LearningResourcePlatformFactory,
     LearningResourceRunFactory,
+    TutorProblemFileFactory,
 )
 from learning_resources.models import LearningResource
 from learning_resources_search.constants import CONTENT_FILE_TYPE
-from main.utils import now_in_utc
+from main.utils import checksum_for_content, now_in_utc
 
 pytestmark = pytest.mark.django_db
 
@@ -471,8 +472,7 @@ def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
         return_value="markdown content from pdf",
     )
 
-    # Patch Path(olx_path) / Path(problem_file_data["source_path"]) to exist
-    run = mocker.Mock()
+    run = LearningResourceRunFactory.create()
 
     results = list(transform_canvas_problem_files(zip_path, run, overwrite=True))
 
@@ -1534,3 +1534,52 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path):
     }
     published = get_published_items(zip_path, url_config)
     assert Path("web_resources/file1.pdf").resolve() in published
+
+
+def test_transform_canvas_problem_files_skips_pdf_to_markdown_if_checksum_exists(
+    tmp_path, mocker, settings
+):
+    """
+    Test that transform_canvas_problem_files does not call _pdf_to_markdown if the checksum already exists.
+    """
+    settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
+    settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
+    pdf_filename = "problemset3/problem.pdf"
+    pdf_content = b"%PDF-1.4 fake pdf content"
+    zip_path = make_canvas_zip(
+        tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)]
+    )
+
+    original_pdf_content = "original pdf content"
+    existing_checksum = checksum_for_content(original_pdf_content)
+
+    mock_run = LearningResourceRunFactory.create()
+    TutorProblemFileFactory.create(
+        run=mock_run,
+        problem_title="Problem Set 1",
+        type="problem",
+        checksum=existing_checksum,
+    )
+
+    fake_file_data = {
+        "run": mock_run,
+        "content": original_pdf_content,
+        "archive_checksum": "checksum",
+        "source_path": f"tutorbot/{pdf_filename}",
+        "file_extension": ".pdf",
+    }
+
+    mocker.patch(
+        "learning_resources.etl.canvas._process_olx_path",
+        return_value=iter([fake_file_data]),
+    )
+
+    pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")
+
+    results = list(transform_canvas_problem_files(zip_path, mock_run, overwrite=True))
+
+    pdf_to_md.assert_not_called()
+
+    assert len(results) == 1
+    assert results[0]["content"] == "original pdf content"
+    assert results[0]["source_path"] == f"tutorbot/{pdf_filename}"