From 25fa2ea25c51f6b5a732eeae3f44846d45280f77 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Wed, 1 Oct 2025 09:45:59 -0400
Subject: [PATCH 1/2] Add checksum field to TutorProblemFile with migration

---
 .../0097_tutorproblemfile_checksum.py           | 17 +++++++++++++++++
 learning_resources/models.py                    |  1 +
 2 files changed, 18 insertions(+)
 create mode 100644 learning_resources/migrations/0097_tutorproblemfile_checksum.py

diff --git a/learning_resources/migrations/0097_tutorproblemfile_checksum.py b/learning_resources/migrations/0097_tutorproblemfile_checksum.py
new file mode 100644
index 0000000000..d9a95d97ad
--- /dev/null
+++ b/learning_resources/migrations/0097_tutorproblemfile_checksum.py
@@ -0,0 +1,17 @@
+# Generated by Django 4.2.24 on 2025-10-01 13:30
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # adds TutorProblemFile.checksum
+    dependencies = [
+        ("learning_resources", "0096_tutorproblemfile_file_name"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="tutorproblemfile",
+            name="checksum",
+            field=models.CharField(blank=True, max_length=32, null=True),
+        ),
+    ]
diff --git a/learning_resources/models.py b/learning_resources/models.py
index 1d897e8c56..1dcc1e3f07 100644
--- a/learning_resources/models.py
+++ b/learning_resources/models.py
@@ -916,6 +916,7 @@ class TutorProblemFile(TimestampedModel):
     content = models.TextField(null=True, blank=True)  # noqa: DJ001
 
     archive_checksum = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
+    checksum = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
     source_path = models.CharField(max_length=1024, null=True, blank=True)  # noqa: DJ001
     file_extension = models.CharField(max_length=32, null=True, blank=True)  # noqa: DJ001
     file_name = models.CharField(max_length=256, null=True, blank=True)  # noqa: DJ001

From 05226fcf28b68954345a5985657b4aacbec1a9ce Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Wed, 1 Oct 2025 10:52:44 -0400
Subject: [PATCH 2/2] Skip PDF transcription when file checksum already exists; add test

---
 learning_resources/etl/canvas.py      |  9 ++++-
 learning_resources/etl/canvas_test.py | 55 +++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 226a084da5..25860c2034 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -39,6 +39,7 @@
 from learning_resources_search.constants import (
     CONTENT_FILE_TYPE,
 )
+from main.utils import checksum_for_content
 
 log = logging.getLogger(__name__)
 
@@ -217,7 +218,6 @@ def transform_canvas_problem_files(
             problem_file_data = {
                 key: file_data[key] for key in keys_to_keep if key in file_data
             }
-
             path = file_data["source_path"]
             path = path[len(settings.CANVAS_TUTORBOT_FOLDER) :]
             path_parts = path.split("/", 1)
@@ -239,9 +239,16 @@ def transform_canvas_problem_files(
             else:
                 problem_file_data["type"] = TUTOR_PROBLEM_TYPE
 
+            problem_file_data["checksum"] = checksum_for_content(
+                problem_file_data["content"]
+            )
+
             if (
                 problem_file_data["file_extension"].lower() == ".pdf"
                 and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
+                and not run.problem_files.filter(
+                    checksum=problem_file_data["checksum"]
+                ).exists()
             ):
                 markdown_content = _pdf_to_markdown(
                     Path(olx_path) / Path(problem_file_data["source_path"])
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index b756dd5897..addff5affa 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -30,10 +30,11 @@
     LearningResourceFactory,
     LearningResourcePlatformFactory,
     LearningResourceRunFactory,
+    TutorProblemFileFactory,
 )
 from learning_resources.models import LearningResource
 from learning_resources_search.constants import CONTENT_FILE_TYPE
-from main.utils import now_in_utc
+from main.utils import checksum_for_content, now_in_utc
 
 pytestmark = pytest.mark.django_db
 
@@ -471,8 +472,7 @@ def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
         return_value="markdown content from pdf",
     )
 
-    # Patch Path(olx_path) / Path(problem_file_data["source_path"]) to exist
-    run = mocker.Mock()
+    run = LearningResourceRunFactory.create()
 
     results = list(transform_canvas_problem_files(zip_path, run, overwrite=True))
 
@@ -1534,3 +1534,52 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path):
     }
     published = get_published_items(zip_path, url_config)
     assert Path("web_resources/file1.pdf").resolve() in published
+
+
+def test_transform_canvas_problem_files_skips_pdf_to_markdown_if_checksum_exists(
+    tmp_path, mocker, settings
+):
+    """
+    A PDF whose checksum already exists on the run's problem files must not be passed to _pdf_to_markdown.
+    """
+    settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
+    settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"  # truthy: PDF branch on
+    pdf_filename = "problemset3/problem.pdf"
+    pdf_content = b"%PDF-1.4 fake pdf content"
+    zip_path = make_canvas_zip(
+        tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)]
+    )
+
+    original_pdf_content = "original pdf content"
+    existing_checksum = checksum_for_content(original_pdf_content)
+
+    mock_run = LearningResourceRunFactory.create()  # real DB row, not a mock
+    TutorProblemFileFactory.create(
+        run=mock_run,
+        problem_title="Problem Set 1",
+        type="problem",
+        checksum=existing_checksum,  # already stored for this run
+    )
+
+    fake_file_data = {
+        "run": mock_run,
+        "content": original_pdf_content,  # hashes to existing_checksum
+        "archive_checksum": "checksum",
+        "source_path": f"tutorbot/{pdf_filename}",
+        "file_extension": ".pdf",
+    }
+
+    mocker.patch(
+        "learning_resources.etl.canvas._process_olx_path",
+        return_value=iter([fake_file_data]),
+    )
+
+    pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")
+
+    results = list(transform_canvas_problem_files(zip_path, mock_run, overwrite=True))
+
+    pdf_to_md.assert_not_called()
+
+    assert len(results) == 1
+    assert results[0]["content"] == "original pdf content"
+    assert results[0]["source_path"] == f"tutorbot/{pdf_filename}"