From ca6ad9158971588e6c5248531544c25e62208e32 Mon Sep 17 00:00:00 2001 From: Anastasia Beglova Date: Wed, 8 Oct 2025 15:47:30 -0400 Subject: [PATCH] fix canvas problem file import --- learning_resources/etl/canvas.py | 91 +---------------- learning_resources/etl/canvas_test.py | 141 ++++++++++++++------------ learning_resources/etl/utils.py | 109 +++++++++++++++++++- 3 files changed, 179 insertions(+), 162 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 1a95959e88..e1fb19e511 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -1,16 +1,11 @@ -import base64 import logging import zipfile from collections.abc import Generator from datetime import datetime -from io import BytesIO from pathlib import Path from tempfile import TemporaryDirectory -import pypdfium2 as pdfium from django.conf import settings -from litellm import completion -from PIL import Image from learning_resources.constants import ( TUTOR_PROBLEM_TYPE, @@ -213,6 +208,7 @@ def transform_canvas_problem_files( run, overwrite=overwrite, valid_file_types=VALID_TUTOR_PROBLEM_FILE_TYPES, + is_tutor_problem_file_import=True, ): keys_to_keep = [ "run", @@ -249,89 +245,4 @@ def transform_canvas_problem_files( problem_file_data["content"] ) - if ( - problem_file_data["file_extension"].lower() == ".pdf" - and settings.CANVAS_PDF_TRANSCRIPTION_MODEL - and not run.problem_files.filter( - checksum=problem_file_data["checksum"] - ).exists() - ): - markdown_content = _pdf_to_markdown( - Path(olx_path) / Path(problem_file_data["source_path"]) - ) - if markdown_content: - problem_file_data["content"] = markdown_content yield problem_file_data - - -def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): - """ - Convert a PDF file to a list of base64 encoded images (one per page). - Resizes images to reduce file size while keeping good OCR quality. 
- - Args: - pdf_path (str): Path to the PDF file - dpi (int): DPI for the output images (default: 200) - fmt (str): Output format ('JPEG' or 'PNG') (default: 'JPEG') - max_size (int): Maximum width/height in pixels (default: 2000) - quality (int): JPEG quality (1-100, default: 85) - - Returns: - list: List of base64 encoded strings (one per page) - """ - - pdf = pdfium.PdfDocument(pdf_path) - for page_index in range(len(pdf)): - page = pdf.get_page(page_index) - image = page.render(scale=2).to_pil() - page.close() - # Resize the image if it's too large (preserving aspect ratio) - if max(image.size) > max_size: - image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS) - buffered = BytesIO() - # Save with optimized settings - if fmt.upper() == "JPEG": - image.save(buffered, format="JPEG", quality=quality, optimize=True) - else: # PNG - image.save(buffered, format="PNG", optimize=True) - img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") - yield img_str - pdf.close() - - -def _pdf_to_markdown(pdf_path): - """ - Convert a PDF file to markdown using an llm - """ - markdown = "" - for im in pdf_to_base64_images(pdf_path): - response = completion( - api_base=settings.LITELLM_API_BASE, - custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER, - model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL, - messages=[ - { - "role": "user", - "content": [ - { - "type": "text", - "text": settings.CANVAS_TRANSCRIPTION_PROMPT, - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{im}", - }, - }, - ], - } - ], - ) - markdown_snippet = ( - response.json()["choices"][0]["message"]["content"] - .removeprefix("```markdown\n") - .removesuffix("\n```") - ) - - markdown += markdown_snippet - return markdown diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 4a63745a8a..adcda294f6 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -34,7 +34,7 @@ ) from 
learning_resources.models import LearningResource from learning_resources_search.constants import CONTENT_FILE_TYPE -from main.utils import checksum_for_content, now_in_utc +from main.utils import now_in_utc pytestmark = pytest.mark.django_db @@ -438,11 +438,14 @@ def test_transform_canvas_content_files_removes_unpublished_content(mocker, tmp_ bulk_unpub.assert_called_once_with([unpublished_cf.id], CONTENT_FILE_TYPE) +@pytest.mark.parametrize("overwrite", [True, False]) +@pytest.mark.parametrize("existing_file", [True, False]) def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown( - tmp_path, mocker, settings + tmp_path, mocker, settings, overwrite, existing_file ): """ Test that transform_canvas_problem_files calls _pdf_to_markdown for PDF files. + if overwrite is True or there is no existing file. Tika should not be called """ settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/" settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model" pdf_filename = "problemset1/problem.pdf" pdf_content = b"%PDF-1.4 fake pdf content" zip_path = make_canvas_zip( tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)] ) # Patch process_olx_path to yield a fake file data dict, # return a file with pdf extension fake_file_data = { "run": "run", - "content": "original pdf content", + "content_type": "application/pdf", "archive_checksum": "checksum", "source_path": f"tutorbot/{pdf_filename}", "file_extension": ".pdf", } mocker.patch( - "learning_resources.etl.canvas.process_olx_path", - return_value=iter([fake_file_data]), + "learning_resources.etl.utils.documents_from_olx", + return_value=iter([[mocker.Mock(), fake_file_data]]), ) # Patch _pdf_to_markdown to return a known value pdf_to_md = mocker.patch( - "learning_resources.etl.canvas._pdf_to_markdown", + "learning_resources.etl.utils._pdf_to_markdown", return_value="markdown content from pdf", ) + tika = mocker.patch( + "learning_resources.etl.utils.extract_text_metadata", + ) + run = LearningResourceRunFactory.create() - results = list(transform_canvas_problem_files(zip_path, run, overwrite=True)) + if existing_file: + TutorProblemFileFactory.create( + run=run, + type="problem", + archive_checksum="checksum", + 
source_path=f"tutorbot/{pdf_filename}", + content="existing content", + file_name="problem1.pdf", + ) + + results = list(transform_canvas_problem_files(zip_path, run, overwrite=overwrite)) - pdf_to_md.assert_called_once() - assert results[0]["content"] == "markdown content from pdf" + if overwrite or not existing_file: + pdf_to_md.assert_called_once() + else: + pdf_to_md.assert_not_called() + + tika.assert_not_called() + + assert ( + results[0]["content"] == "markdown content from pdf" + if overwrite or not existing_file + else "existing content" + ) assert results[0]["problem_title"] == "problemset1" +@pytest.mark.django_db +@pytest.mark.parametrize("overwrite", [True, False]) +@pytest.mark.parametrize("existing_file", [True, False]) def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown( - tmp_path, mocker, settings + tmp_path, mocker, settings, overwrite, existing_file ): """ - Test that transform_canvas_problem_files does not call _pdf_to_markdown for non-PDF files. + Test that transform_canvas_problem_files does not call _pdf_to_markdown but calls tika for + non-PDF files. Neither tika nor _pdf_to_markdown should be called if overwrite is false and + there is an existing file. 
""" settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/" settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model" @@ -498,24 +530,48 @@ def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown( fake_file_data = { "run": "run", - "content": csv_content, + "content_type": "application/csv", "archive_checksum": "checksum", "source_path": f"tutorbot/{csv_filename}", "file_extension": ".csv", } + mocker.patch( - "learning_resources.etl.canvas.process_olx_path", - return_value=iter([fake_file_data]), + "learning_resources.etl.utils.documents_from_olx", + return_value=iter([[mocker.Mock(), fake_file_data]]), ) - pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown") + pdf_to_md = mocker.patch("learning_resources.etl.utils._pdf_to_markdown") - run = mocker.Mock() + tika = mocker.patch( + "learning_resources.etl.utils.extract_text_metadata", + return_value={"content": csv_content}, + ) + + run = LearningResourceRunFactory.create() + + if existing_file: + TutorProblemFileFactory.create( + run=run, + type="problem", + archive_checksum="checksum", + source_path=f"tutorbot/{csv_filename}", + content="existing content", + file_name="problem2.csv", + ) - results = list(transform_canvas_problem_files(zip_path, run, overwrite=True)) + results = list(transform_canvas_problem_files(zip_path, run, overwrite=overwrite)) pdf_to_md.assert_not_called() - assert results[0]["content"] == csv_content + if overwrite or not existing_file: + tika.assert_called_once() + else: + tika.assert_not_called() + assert ( + results[0]["content"] == csv_content + if overwrite or not existing_file + else "existing content" + ) assert results[0]["problem_title"] == "problemset2" @@ -1535,52 +1591,3 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path): } published = get_published_items(zip_path, url_config) assert Path("web_resources/file1.pdf").resolve() in published - - -def 
test_transform_canvas_problem_files_skips_pdf_to_markdown_if_checksum_exists( - tmp_path, mocker, settings -): - """ - Test that transform_canvas_problem_files does not call _pdf_to_markdown if the checksum already exists. - """ - settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/" - settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model" - pdf_filename = "problemset3/problem.pdf" - pdf_content = b"%PDF-1.4 fake pdf content" - zip_path = make_canvas_zip( - tmp_path, files=[(f"tutorbot/{pdf_filename}", pdf_content)] - ) - - original_pdf_content = "original pdf content" - existing_checksum = checksum_for_content(original_pdf_content) - - mock_run = LearningResourceRunFactory.create() - TutorProblemFileFactory.create( - run=mock_run, - problem_title="Problem Set 1", - type="problem", - checksum=existing_checksum, - ) - - fake_file_data = { - "run": mock_run, - "content": original_pdf_content, - "archive_checksum": "checksum", - "source_path": f"tutorbot/{pdf_filename}", - "file_extension": ".pdf", - } - - mocker.patch( - "learning_resources.etl.canvas.process_olx_path", - return_value=iter([fake_file_data]), - ) - - pdf_to_md = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown") - - results = list(transform_canvas_problem_files(zip_path, mock_run, overwrite=True)) - - pdf_to_md.assert_not_called() - - assert len(results) == 1 - assert results[0]["content"] == "original pdf content" - assert results[0]["source_path"] == f"tutorbot/{pdf_filename}" diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py index e402b7a78f..304aa6b2cc 100644 --- a/learning_resources/etl/utils.py +++ b/learning_resources/etl/utils.py @@ -1,5 +1,6 @@ """Helper functions for ETL""" +import base64 import glob import json import logging @@ -15,18 +16,22 @@ from datetime import UTC, datetime from decimal import Decimal from hashlib import md5 +from io import BytesIO from pathlib import Path from subprocess import check_call from tempfile import TemporaryDirectory from 
typing import Optional import boto3 +import pypdfium2 as pdfium import rapidjson import requests from defusedxml import ElementTree from django.conf import settings from django.utils.dateparse import parse_duration from django.utils.text import slugify +from litellm import completion +from PIL import Image from pycountry import currencies from tika import parser as tika_parser @@ -57,6 +62,7 @@ LearningResourceRun, LearningResourceTopic, LearningResourceTopicMapping, + TutorProblemFile, ) log = logging.getLogger(__name__) @@ -535,6 +541,7 @@ def process_olx_path( *, overwrite, valid_file_types=VALID_TEXT_FILE_TYPES, + is_tutor_problem_file_import=False, ) -> Generator[dict, None, None]: video_srt_metadata = get_video_metadata(olx_path, run) for document, metadata in documents_from_olx( @@ -546,16 +553,33 @@ def process_olx_path( content_type = metadata["content_type"] mime_type = metadata.get("mime_type") file_extension = metadata.get("file_extension") - existing_content = ContentFile.objects.filter(key=key, run=run).first() + + if is_tutor_problem_file_import: + existing_record = TutorProblemFile.objects.filter( + source_path=source_path, run=run + ).first() + else: + existing_record = ContentFile.objects.filter(key=key, run=run).first() + if ( - not existing_content - or existing_content.archive_checksum != metadata.get("archive_checksum") + not existing_record + or existing_record.archive_checksum != metadata.get("archive_checksum") ) or overwrite: if settings.SKIP_TIKA and settings.ENVIRONMENT != "production": content_dict = { "content": "", "content_title": "", } + elif ( + file_extension == ".pdf" + and is_tutor_problem_file_import + and settings.CANVAS_PDF_TRANSCRIPTION_MODEL + ): + markdown_content = _pdf_to_markdown(Path(olx_path) / Path(source_path)) + content_dict = { + "content": markdown_content, + "content_title": "", + } else: tika_output = extract_text_metadata( document, @@ -584,8 +608,10 @@ def process_olx_path( } else: content_dict = { - 
"content": existing_content.content, - "content_title": existing_content.content_title, + "content": existing_record.content, + "content_title": "" + if is_tutor_problem_file_import + else existing_record.content_title, } yield ( { @@ -1041,3 +1067,76 @@ def parse_resource_commitment(commitment_str: str) -> CommitmentConfig: else: log.warning("Invalid commitment: %s", commitment_str) return CommitmentConfig(commitment=commitment_str or "") + + +def _pdf_to_markdown(pdf_path): + """ + Convert a PDF file to markdown using an llm + """ + markdown = "" + for im in pdf_to_base64_images(pdf_path): + response = completion( + api_base=settings.LITELLM_API_BASE, + custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER, + model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": settings.CANVAS_TRANSCRIPTION_PROMPT, + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{im}", + }, + }, + ], + } + ], + ) + markdown_snippet = ( + response.json()["choices"][0]["message"]["content"] + .removeprefix("```markdown\n") + .removesuffix("\n```") + ) + + markdown += markdown_snippet + return markdown + + +def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): + """ + Convert a PDF file to a list of base64 encoded images (one per page). + Resizes images to reduce file size while keeping good OCR quality. 
+ + Args: + pdf_path (str): Path to the PDF file + dpi (int): DPI for the output images (default: 200) + fmt (str): Output format ('JPEG' or 'PNG') (default: 'JPEG') + max_size (int): Maximum width/height in pixels (default: 2000) + quality (int): JPEG quality (1-100, default: 85) + + Returns: + list: List of base64 encoded strings (one per page) + """ + + pdf = pdfium.PdfDocument(pdf_path) + for page_index in range(len(pdf)): + page = pdf.get_page(page_index) + image = page.render(scale=2).to_pil() + page.close() + # Resize the image if it's too large (preserving aspect ratio) + if max(image.size) > max_size: + image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS) + buffered = BytesIO() + # Save with optimized settings + if fmt.upper() == "JPEG": + image.save(buffered, format="JPEG", quality=quality, optimize=True) + else: # PNG + image.save(buffered, format="PNG", optimize=True) + img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") + yield img_str + pdf.close()