Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion learning_resources/etl/canvas.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from learning_resources_search.constants import (
CONTENT_FILE_TYPE,
)
from main.utils import checksum_for_content

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -217,7 +218,6 @@ def transform_canvas_problem_files(
problem_file_data = {
key: file_data[key] for key in keys_to_keep if key in file_data
}

path = file_data["source_path"]
path = path[len(settings.CANVAS_TUTORBOT_FOLDER) :]
path_parts = path.split("/", 1)
Expand All @@ -239,9 +239,16 @@ def transform_canvas_problem_files(
else:
problem_file_data["type"] = TUTOR_PROBLEM_TYPE

problem_file_data["checksum"] = checksum_for_content(
problem_file_data["content"]
)

if (
problem_file_data["file_extension"].lower() == ".pdf"
and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
and not run.problem_files.filter(
checksum=problem_file_data["checksum"]
).exists()
):
markdown_content = _pdf_to_markdown(
Path(olx_path) / Path(problem_file_data["source_path"])
Expand Down
55 changes: 52 additions & 3 deletions learning_resources/etl/canvas_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@
LearningResourceFactory,
LearningResourcePlatformFactory,
LearningResourceRunFactory,
TutorProblemFileFactory,
)
from learning_resources.models import LearningResource
from learning_resources_search.constants import CONTENT_FILE_TYPE
from main.utils import now_in_utc
from main.utils import checksum_for_content, now_in_utc

pytestmark = pytest.mark.django_db

Expand Down Expand Up @@ -471,8 +472,7 @@ def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
return_value="markdown content from pdf",
)

# Patch Path(olx_path) / Path(problem_file_data["source_path"]) to exist
run = mocker.Mock()
run = LearningResourceRunFactory.create()

results = list(transform_canvas_problem_files(zip_path, run, overwrite=True))

Expand Down Expand Up @@ -1534,3 +1534,52 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path):
}
published = get_published_items(zip_path, url_config)
assert Path("web_resources/file1.pdf").resolve() in published


def test_transform_canvas_problem_files_skips_pdf_to_markdown_if_checksum_exists(
    tmp_path, mocker, settings
):
    """
    A PDF whose content checksum is already stored on one of the run's
    problem files must not be passed to _pdf_to_markdown again.
    """
    settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
    settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"

    # Build the archive containing a single PDF under the tutorbot folder.
    pdf_name = "problemset3/problem.pdf"
    source_path = f"tutorbot/{pdf_name}"
    archive_path = make_canvas_zip(
        tmp_path, files=[(source_path, b"%PDF-1.4 fake pdf content")]
    )

    # Persist a problem file for the run that already carries the checksum
    # of the content the (patched) OLX processing step is going to yield.
    stored_content = "original pdf content"
    run = LearningResourceRunFactory.create()
    TutorProblemFileFactory.create(
        run=run,
        problem_title="Problem Set 1",
        type="problem",
        checksum=checksum_for_content(stored_content),
    )

    processed_record = {
        "run": run,
        "content": stored_content,
        "archive_checksum": "checksum",
        "source_path": source_path,
        "file_extension": ".pdf",
    }
    mocker.patch(
        "learning_resources.etl.canvas._process_olx_path",
        return_value=iter([processed_record]),
    )
    transcribe_mock = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")

    transformed = list(
        transform_canvas_problem_files(archive_path, run, overwrite=True)
    )

    # The checksum matched, so no transcription may have been attempted.
    transcribe_mock.assert_not_called()

    assert len(transformed) == 1
    only_result = transformed[0]
    assert only_result["content"] == "original pdf content"
    assert only_result["source_path"] == f"tutorbot/{pdf_name}"
17 changes: 17 additions & 0 deletions learning_resources/migrations/0097_tutorproblemfile_checksum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 4.2.24 on 2025-10-01 13:30

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add an optional ``checksum`` column to the ``tutorproblemfile`` model."""

    # Must be applied after the migration that introduced ``file_name``.
    dependencies = [
        ("learning_resources", "0096_tutorproblemfile_file_name"),
    ]

    operations = [
        # Nullable and blank-able, so rows that exist before this migration
        # need no backfill and simply carry NULL.
        # NOTE(review): max_length=32 matches ``archive_checksum`` on the same
        # model; presumably sized for the digest produced by
        # ``checksum_for_content`` - confirm against that helper.
        migrations.AddField(
            model_name="tutorproblemfile",
            name="checksum",
            field=models.CharField(blank=True, max_length=32, null=True),
        ),
    ]
1 change: 1 addition & 0 deletions learning_resources/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,7 @@ class TutorProblemFile(TimestampedModel):
content = models.TextField(null=True, blank=True) # noqa: DJ001

archive_checksum = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
checksum = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
source_path = models.CharField(max_length=1024, null=True, blank=True) # noqa: DJ001
file_extension = models.CharField(max_length=32, null=True, blank=True) # noqa: DJ001
file_name = models.CharField(max_length=256, null=True, blank=True) # noqa: DJ001
Expand Down
Loading