From 6e0815fdcf20ed4ffe0429cfda01e1eba02c340d Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 08:52:04 -0400 Subject: [PATCH 1/9] initial working version of gathering assignments and pages --- learning_resources/etl/canvas.py | 88 +++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index a6646042d8..7bd4967e1b 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -12,6 +12,7 @@ from urllib.parse import unquote_plus import pypdfium2 as pdfium +from bs4 import BeautifulSoup from defusedxml import ElementTree from django.conf import settings from litellm import completion @@ -180,13 +181,20 @@ def transform_canvas_content_files( basedir = course_zipfile.name.split(".")[0] zipfile_path = course_zipfile.absolute() # grab published module and file items - published_items = [ - Path(item["path"]).resolve() - for item in parse_module_meta(zipfile_path)["active"] - ] + [ - Path(item["path"]).resolve() - for item in parse_files_meta(zipfile_path)["active"] - ] + published_items = ( + [ + Path(item["path"]).resolve() + for item in parse_module_meta(zipfile_path)["active"] + ] + + [ + Path(item["path"]).resolve() + for item in parse_files_meta(zipfile_path)["active"] + ] + + [ + Path(item["path"]).resolve() + for item in parse_web_content(zipfile_path)["active"] + ] + ) def _generate_content(): """Inner generator for yielding content data""" @@ -419,6 +427,72 @@ def parse_module_meta(course_archive_path: str) -> dict: return publish_status +def _workflow_state_from_html(html): + soup = BeautifulSoup(html, "html.parser") + + meta = soup.find("meta", attrs={"name": "workflow_state"}) + return meta.get("content") if meta else None + + +def _workflow_state_from_xml(xml): + try: + root = ElementTree.fromstring(xml) + namespace = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"} + workflow_element = root.find("ns:workflow_state", namespace) + return workflow_element.text.strip() if workflow_element is not None else None + + except Exception: + log.exception("Error parsing XML: %s", sys.stderr) + + +def _title_from_html(html): + soup = BeautifulSoup(html, "html.parser") + title = soup.find("title") + return title.get_text().strip() if title else "" + + +def parse_web_content(course_archive_path: str) -> dict: + """ + Parse course_settings/files_meta.xml and return publish/active status of resources. + """ + + publish_status = {"active": [], "unpublished": []} + + with zipfile.ZipFile(course_archive_path, "r") as course_archive: + manifest_xml = course_archive.read("imsmanifest.xml") + resource_map = extract_resources_by_identifier(manifest_xml) + for item in resource_map: + item_link = resource_map[item].get("href") + assignment_settings = None + for file in resource_map[item].get("files", []): + if file.endswith("assignment_settings.xml"): + assignment_settings = file + if item_link and item_link.endswith(".html"): + file_path = resource_map[item]["href"] + html_content = course_archive.read(file_path) + if assignment_settings: + xml_content = course_archive.read(assignment_settings) + workflow_state = _workflow_state_from_xml(xml_content) + else: + workflow_state = _workflow_state_from_html(html_content) + title = _title_from_html(html_content) + if workflow_state in ["active", "published"]: + publish_status["active"].append( + { + "title": title, + "path": file_path, + } + ) + else: + publish_status["unpublished"].append( + { + "title": title, + "path": file_path, + } + ) + return publish_status + + def extract_resources_by_identifierref(manifest_xml: str) -> dict: """ Extract resources from an IMS manifest file and From 20064af8f33af0878cb3d064b3f0a60cee954fde Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 09:42:55 -0400 Subject: [PATCH 2/9] test fix --- learning_resources/etl/canvas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 7bd4967e1b..09e5eb8c7a 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -459,7 +459,10 @@ def parse_web_content(course_archive_path: str) -> dict: publish_status = {"active": [], "unpublished": []} with zipfile.ZipFile(course_archive_path, "r") as course_archive: - manifest_xml = course_archive.read("imsmanifest.xml") + manifest_path = "imsmanifest.xml" + if manifest_path not in course_archive.namelist(): + return publish_status + manifest_xml = course_archive.read(manifest_path) resource_map = extract_resources_by_identifier(manifest_xml) for item in resource_map: item_link = resource_map[item].get("href") From 3ada4b0562b8d488ac581778cd53914885a9a647 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 11:05:09 -0400 Subject: [PATCH 3/9] check metadata for visibility to students vs authors --- learning_resources/etl/canvas.py | 55 ++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 09e5eb8c7a..098df4f2cc 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -280,6 +280,9 @@ def transform_canvas_problem_files( def parse_context_xml(course_archive_path: str) -> dict: + """ + Parse course_settings/context.xml and return context info + """ with zipfile.ZipFile(course_archive_path, "r") as course_archive: context = course_archive.read("course_settings/context.xml") root = ElementTree.fromstring(context) @@ -427,14 +430,31 @@ def parse_module_meta(course_archive_path: str) -> dict: return publish_status +def _compact_element(element): + """Recursively compact an element into a nested dictionary""" + if len(element) == 0: # No children, return text + return element.text.strip() if element.text else None + return { + child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element( + child + ) + for child in element + } + + def _workflow_state_from_html(html): + """ + Extract the workflow_state meta tag from html + """ soup = BeautifulSoup(html, "html.parser") - meta = soup.find("meta", attrs={"name": "workflow_state"}) return meta.get("content") if meta else None def _workflow_state_from_xml(xml): + """ + Extract the workflow_state element from xml + """ try: root = ElementTree.fromstring(xml) namespace = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"} @@ -446,6 +466,9 @@ def _workflow_state_from_xml(xml): def _title_from_html(html): + """ + Extract the title element from HTML content + """ soup = BeautifulSoup(html, "html.parser") title = soup.find("title") return title.get_text().strip() if title else "" @@ -453,7 +476,7 @@ def _title_from_html(html): def parse_web_content(course_archive_path: str) -> dict: """ - Parse course_settings/files_meta.xml and return publish/active status of resources. + Parse html pages and assignments and return publish/active status of resources """ publish_status = {"active": [], "unpublished": []} @@ -465,13 +488,14 @@ def parse_web_content(course_archive_path: str) -> dict: manifest_xml = course_archive.read(manifest_path) resource_map = extract_resources_by_identifier(manifest_xml) for item in resource_map: - item_link = resource_map[item].get("href") + resource_map_item = resource_map[item] + item_link = resource_map_item.get("href") assignment_settings = None - for file in resource_map[item].get("files", []): + for file in resource_map_item.get("files", []): if file.endswith("assignment_settings.xml"): assignment_settings = file if item_link and item_link.endswith(".html"): - file_path = resource_map[item]["href"] + file_path = resource_map_item["href"] html_content = course_archive.read(file_path) if assignment_settings: xml_content = course_archive.read(assignment_settings) @@ -479,7 +503,17 @@ def parse_web_content(course_archive_path: str) -> dict: else: workflow_state = _workflow_state_from_html(html_content) title = _title_from_html(html_content) - if workflow_state in ["active", "published"]: + + lom_elem = ( + resource_map_item.get("metadata", {}) + .get("lom", {}) + .get("educational", {}) + ) + # Determine if the content is intended for authors or instructors only + intended_role = lom_elem.get("intendedEndUserRole", {}).get("value") + authors_only = intended_role and intended_role.lower() != "student" + + if workflow_state in ["active", "published"] and not authors_only: publish_status["active"].append( { "title": title, @@ -540,15 +574,12 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict: file and return a map keyed by identifier. """ root = ElementTree.fromstring(manifest_xml) - namespaces = { "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1", "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource", "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest", } - resources_dict = {} - # Find all resource elements for resource in root.findall(".//imscp:resource[@identifier]", namespaces): identifier = resource.get("identifier") @@ -560,14 +591,11 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict: file_elem.get("href") for file_elem in resource.findall("imscp:file", namespaces) ] - # Extract metadata if present metadata = {} metadata_elem = resource.find("imscp:metadata", namespaces) if metadata_elem is not None: - # You can expand this to extract specific metadata fields as needed - metadata["has_metadata"] = True - + metadata.update(_compact_element(metadata_elem)) resources_dict[identifier] = { "identifier": identifier, "type": resource_type, @@ -575,7 +603,6 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict: "files": files, "metadata": metadata, } - return resources_dict From aca1bc5d4a97ce0e20a528fff5ee63e1f0a1f473 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 11:06:38 -0400 Subject: [PATCH 4/9] type hints --- learning_resources/etl/canvas.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 098df4f2cc..50261fa96b 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -430,7 +430,7 @@ def parse_module_meta(course_archive_path: str) -> dict: return publish_status -def _compact_element(element): +def _compact_element(element: ElementTree.Element) -> dict | str | None: """Recursively compact an element into a nested dictionary""" if len(element) == 0: # No children, return text return element.text.strip() if element.text else None @@ -442,7 +442,7 @@ def _compact_element(element): } -def _workflow_state_from_html(html): +def _workflow_state_from_html(html: str) -> str: """ Extract the workflow_state meta tag from html """ @@ -465,7 +465,7 @@ def _workflow_state_from_xml(xml): log.exception("Error parsing XML: %s", sys.stderr) -def _title_from_html(html): +def _title_from_html(html: str) -> str: """ Extract the title element from HTML content """ @@ -642,6 +642,9 @@ def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): def _pdf_to_markdown(pdf_path): + """ + Convert a PDF file to markdown using an llm + """ markdown = "" for im in pdf_to_base64_images(pdf_path): response = completion( From 09c2ddaa7ce397e6e391cf6cbf42b5ad387c4a81 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 11:13:02 -0400 Subject: [PATCH 5/9] remove typehint --- learning_resources/etl/canvas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 50261fa96b..9398a4ee8f 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -430,7 +430,7 @@ def parse_module_meta(course_archive_path: str) -> dict: return publish_status -def _compact_element(element: ElementTree.Element) -> dict | str | None: +def _compact_element(element) -> dict | str | None: """Recursively compact an element into a nested dictionary""" if len(element) == 0: # No children, return text return element.text.strip() if element.text else None From d2f277e01e6c716cbf1d517d0491dcd8bf83cef2 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 12:55:02 -0400 Subject: [PATCH 6/9] consolidation of namespaces --- learning_resources/etl/canvas.py | 65 ++++++++++++++------------------ 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 9398a4ee8f..4ee74b762d 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -43,6 +43,14 @@ log = logging.getLogger(__name__) +NAMESPACES = { + "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0", + "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1", + "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource", + "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest", +} + + def sync_canvas_archive(bucket, key: str, overwrite): """ Sync a Canvas course archive from S3 @@ -286,11 +294,10 @@ def parse_context_xml(course_archive_path: str) -> dict: with zipfile.ZipFile(course_archive_path, "r") as course_archive: context = course_archive.read("course_settings/context.xml") root = ElementTree.fromstring(context) - namespaces = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"} context_info = {} item_keys = ["course_id", "root_account_id", "canvas_domain", "root_account_name"] for key in item_keys: - element = root.find(f"ns:{key}", namespaces) + element = root.find(f"cccv1p0:{key}", NAMESPACES) if element is not None: context_info[key] = element.text @@ -310,12 +317,9 @@ def is_file_published(file_meta: dict) -> bool: hidden = str(file_meta.get("hidden", "false")).lower() == "true" locked = str(file_meta.get("locked", "false")).lower() == "true" - unlock_at = file_meta.get("unlock_at") lock_at = file_meta.get("lock_at") - visibility = file_meta.get("visibility", "inherit") - # If explicitly hidden or locked → unpublished if hidden or locked: return False @@ -355,11 +359,9 @@ def parse_files_meta(course_archive_path: str) -> dict: files_xml = course_archive.read(files_meta_path) manifest_xml = course_archive.read("imsmanifest.xml") resource_map = extract_resources_by_identifier(manifest_xml) - root = ElementTree.fromstring(files_xml) - namespaces = {"c": "http://canvas.instructure.com/xsd/cccv1p0"} try: - for file_elem in root.findall(".//c:file", namespaces): + for file_elem in root.findall(".//cccv1p0:file", NAMESPACES): meta = dict(file_elem.attrib) for child in file_elem: tag = child.tag @@ -398,19 +400,18 @@ def parse_module_meta(course_archive_path: str) -> dict: resource_map = extract_resources_by_identifierref(manifest_xml) publish_status = {"active": [], "unpublished": []} try: - namespaces = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"} root = ElementTree.fromstring(module_xml) - for module in root.findall(".//ns:module", namespaces): - module_title = module.find("ns:title", namespaces).text - for item in module.findall("ns:items/ns:item", namespaces): - item_state = item.find("ns:workflow_state", namespaces).text - item_title = item.find("ns:title", namespaces).text + for module in root.findall(".//cccv1p0:module", NAMESPACES): + module_title = module.find("cccv1p0:title", NAMESPACES).text + for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES): + item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text + item_title = item.find("cccv1p0:title", NAMESPACES).text identifierref = ( - item.find("ns:identifierref", namespaces).text - if item.find("ns:identifierref", namespaces) is not None + item.find("cccv1p0:identifierref", NAMESPACES).text + if item.find("cccv1p0:identifierref", NAMESPACES) is not None else None ) - content_type = item.find("ns:content_type", namespaces).text + content_type = item.find("cccv1p0:content_type", NAMESPACES).text items = resource_map.get(identifierref, {}) for item_info in items: for file in item_info.get("files", []): @@ -457,8 +458,8 @@ def _workflow_state_from_xml(xml): """ try: root = ElementTree.fromstring(xml) - namespace = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"} - workflow_element = root.find("ns:workflow_state", namespace) + + workflow_element = root.find("cccv1p0:workflow_state", NAMESPACES) return workflow_element.text.strip() if workflow_element is not None else None except Exception: @@ -537,29 +538,24 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict: """ root = ElementTree.fromstring(manifest_xml) - namespaces = { - "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1", - "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource", - "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest", - } # Dictionary to hold resources keyed by identifierref resources_dict = defaultdict(list) # Find all item elements with identifierref attributes - for item in root.findall(".//imscp:item[@identifierref]", namespaces): + for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES): identifierref = item.get("identifierref") title = ( - item.find("imscp:title", namespaces).text - if item.find("imscp:title", namespaces) is not None + item.find("imscp:title", NAMESPACES).text + if item.find("imscp:title", NAMESPACES) is not None else "No Title" ) resource = root.find( - f'.//imscp:resource[@identifier="{identifierref}"]', namespaces + f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES ) if resource is not None: # Get all file elements within the resource files = [ file_elem.get("href") - for file_elem in resource.findall("imscp:file", namespaces) + for file_elem in resource.findall("imscp:file", NAMESPACES) ] resources_dict[identifierref].append( @@ -574,14 +570,9 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict: file and return a map keyed by identifier. """ root = ElementTree.fromstring(manifest_xml) - namespaces = { - "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1", - "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource", - "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest", - } resources_dict = {} # Find all resource elements - for resource in root.findall(".//imscp:resource[@identifier]", namespaces): + for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES): identifier = resource.get("identifier") resource_type = resource.get("type") href = resource.get("href") @@ -589,11 +580,11 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict: # Get all file elements within the resource files = [ file_elem.get("href") - for file_elem in resource.findall("imscp:file", namespaces) + for file_elem in resource.findall("imscp:file", NAMESPACES) ] # Extract metadata if present metadata = {} - metadata_elem = resource.find("imscp:metadata", namespaces) + metadata_elem = resource.find("imscp:metadata", NAMESPACES) if metadata_elem is not None: metadata.update(_compact_element(metadata_elem)) resources_dict[identifier] = { From 9f128d45846630b3bb5f95df0c68ece12e161b9f Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 13:42:31 -0400 Subject: [PATCH 7/9] explicitely exclude tutor folder no matter what and added tests --- learning_resources/etl/canvas.py | 6 +- learning_resources/etl/canvas_test.py | 167 +++++++++++++++++++++++++- 2 files changed, 170 insertions(+), 3 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 4ee74b762d..5a431a6f22 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -314,7 +314,6 @@ def is_file_published(file_meta: dict) -> bool: bool: True if file is published/visible, False otherwise. """ now = now_in_utc() - hidden = str(file_meta.get("hidden", "false")).lower() == "true" locked = str(file_meta.get("locked", "false")).lower() == "true" unlock_at = file_meta.get("unlock_at") @@ -380,7 +379,10 @@ def parse_files_meta(course_archive_path: str) -> dict: file_path = Path(file) file_data["path"] = file_path file_data["title"] = file_data.get("display_name") - if file_data["published"]: + # explicitly exclude files in web_resources/ai/tutor + if file_data["published"] and not file.startswith( + settings.CANVAS_TUTORBOT_FOLDER + ): publish_status["active"].append(file_data) else: publish_status["unpublished"].append(file_data) diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 0f9928e2ae..05e3dc3fc1 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -6,12 +6,15 @@ from unittest.mock import MagicMock import pytest +from defusedxml import ElementTree from learning_resources.constants import LearningResourceType, PlatformType from learning_resources.etl.canvas import ( + _compact_element, is_file_published, parse_canvas_settings, parse_module_meta, + parse_web_content, run_for_canvas_archive, transform_canvas_content_files, transform_canvas_problem_files, @@ -591,7 +594,7 @@ def test_is_file_published(file_meta, expected): def test_published_module_and_files_meta_content_ingestion(mocker, tmp_path): """ - Test published files from files_meta and module_meta are retained, + Test published files from files_meta and module_meta are retained """ module_xml = b""" @@ -692,3 +695,165 @@ def test_published_module_and_files_meta_content_ingestion(mocker, tmp_path): assert "/file1.html" in result_paths assert "/file3.html" in result_paths assert bulk_unpub.mock_calls[0].args[0] == [stale_contentfile.id] + + +@pytest.mark.parametrize( + ("element", "expected"), + [ + # Test case: Element with no children, only text + ( + ElementTree.fromstring("""Sample Text"""), + "Sample Text", + ), + # Test case: Element with children, nested structure + ( + ElementTree.fromstring( + """ + Child 1 Text + + Subchild Text + + + """ + ), + { + "child1": "Child 1 Text", + "child2": {"subchild": "Subchild Text"}, + }, + ), + # Test case: Element with mixed text and children + ( + ElementTree.fromstring( + """Parent TextChild Text""" + ), + { + "child": "Child Text", + }, + ), + ], +) +def test_compact_element(element, expected): + """ + Test _compact_element function with nested tags. + """ + result = _compact_element(element) + assert result == expected + + +def test_parse_web_content_returns_active_and_unpublished(tmp_path): + """ + Test that parse_web_content returns active and unpublished items + """ + manifest_xml = b""" + + + + + + + + + + + """ + html_content_active = b""" + Active Content + Content + + """ + html_content_unpublished = b""" + Unpublished Content + Content + + """ + zip_path = tmp_path / "canvas_course.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + zf.writestr("imsmanifest.xml", manifest_xml) + zf.writestr("file1.html", html_content_active) + zf.writestr("file2.html", html_content_unpublished) + + result = parse_web_content(zip_path) + assert "active" in result + assert "unpublished" in result + + assert any( + item["title"] == "Active Content" and item["path"] == "file1.html" + for item in result["active"] + ) + assert any( + item["title"] == "Unpublished Content" and item["path"] == "file2.html" + for item in result["unpublished"] + ) + + +def test_parse_web_content_handles_missing_workflow_state(tmp_path): + """ + Test that parse_web_content handles missing workflow_state gracefully + """ + manifest_xml = b""" + + + + + + + + """ + html_content = b""" + No Workflow State + Content + + """ + zip_path = tmp_path / "canvas_course.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + zf.writestr("imsmanifest.xml", manifest_xml) + zf.writestr("file1.html", html_content) + + result = parse_web_content(zip_path) + assert "active" in result + assert "unpublished" in result + assert len(result["active"]) == 0 + assert len(result["unpublished"]) == 1 + assert result["unpublished"][0]["title"] == "No Workflow State" + assert result["unpublished"][0]["path"] == "file1.html" + + +def test_parse_web_content_excludes_instructor_only_content(tmp_path): + """ + Test that parse_web_content excludes content intended for Instructors only + """ + manifest_xml = b""" + + + + + + + + + Instructor + + + + + + + + """ + html_content = b""" + Instructor Only Content + Content + + """ + zip_path = tmp_path / "canvas_course.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + zf.writestr("imsmanifest.xml", manifest_xml) + zf.writestr("file1.html", html_content) + + result = parse_web_content(zip_path) + assert "active" in result + assert "unpublished" in result + assert len(result["active"]) == 0 + assert len(result["unpublished"]) == 1 + assert result["unpublished"][0]["title"] == "Instructor Only Content" + assert result["unpublished"][0]["path"] == "file1.html" From f65ce82145433310e88d505d724fc1838fc5dd89 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 13:48:13 -0400 Subject: [PATCH 8/9] fix test --- learning_resources/etl/canvas_test.py | 49 +++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 05e3dc3fc1..5f9e5536e8 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -13,6 +13,7 @@ _compact_element, is_file_published, parse_canvas_settings, + parse_files_meta, parse_module_meta, parse_web_content, run_for_canvas_archive, @@ -857,3 +858,51 @@ def test_parse_web_content_excludes_instructor_only_content(tmp_path): assert len(result["unpublished"]) == 1 assert result["unpublished"][0]["title"] == "Instructor Only Content" assert result["unpublished"][0]["path"] == "file1.html" + + +def test_parse_files_meta_excludes_tutorbot_folder(tmp_path, settings): + """ + Test that parse_files_meta explicitly excludes files in the tutorbot folder + even if they are marked as published in the files_meta.xml. + """ + settings.CANVAS_TUTORBOT_FOLDER = "web_resources/ai/tutor/" + files_meta_xml = b""" + + + Published File + false + false + web_resources/ai/tutor/tutorfile.html + + + Regular File + false + false + regular_folder/file2.html + + + """ + manifest_xml = b""" + + + + + + + + + + + """ + zip_path = tmp_path / "canvas_course.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + zf.writestr("course_settings/files_meta.xml", files_meta_xml) + zf.writestr("imsmanifest.xml", manifest_xml) + + result = parse_files_meta(zip_path) + + # Ensure the file in the tutorbot folder is excluded + assert len(result["active"]) == 1 + assert result["active"][0]["path"].name == "file2.html" + assert len(result["unpublished"]) == 1 + assert result["unpublished"][0]["path"].name == "tutorfile.html" From 48b7027d8f8123cf931191c578f099ecb42bd146 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 9 Sep 2025 20:16:28 -0400 Subject: [PATCH 9/9] more conditionals for assignment workflow state --- learning_resources/etl/canvas.py | 85 +++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 22 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index 5a431a6f22..f9fa6c7aff 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -304,6 +304,35 @@ def parse_context_xml(course_archive_path: str) -> dict: return context_info +def is_date_locked(lock_at: str, unlock_at: str) -> bool: + """ + Determine if a resource is currently date-locked based + on lock_at and unlock_at strings. + Args: + lock_at (str): ISO 8601 date string when the resource locks + unlock_at (str): ISO 8601 date string when the resource unlocks + Returns: + bool: True if the resource is currently locked, False otherwise + """ + now = now_in_utc() + if unlock_at and unlock_at.lower() != "nil": + try: + unlock_dt = datetime.fromisoformat(unlock_at.replace("Z", "+00:00")) + if now < unlock_dt: + return True + except Exception: + log.exception("Error parsing unlock_at date: %s", unlock_at) + + if lock_at and lock_at.lower() != "nil": + try: + lock_dt = datetime.fromisoformat(lock_at.replace("Z", "+00:00")) + if now > lock_dt: + return True + except Exception: + log.exception("Error parsing lock_at date: %s", lock_at) + return False + + def is_file_published(file_meta: dict) -> bool: """ Determine if a Canvas file (from files_meta.xml) is published/visible to students. @@ -313,7 +342,7 @@ def is_file_published(file_meta: dict) -> bool: Returns: bool: True if file is published/visible, False otherwise. """ - now = now_in_utc() + hidden = str(file_meta.get("hidden", "false")).lower() == "true" locked = str(file_meta.get("locked", "false")).lower() == "true" unlock_at = file_meta.get("unlock_at") @@ -323,21 +352,8 @@ def is_file_published(file_meta: dict) -> bool: if hidden or locked: return False - if unlock_at and unlock_at.lower() != "nil": - try: - unlock_dt = datetime.fromisoformat(unlock_at.replace("Z", "+00:00")) - if now < unlock_dt: - return False - except Exception: - log.exception("Error parsing date: %s", unlock_at) - - if lock_at and lock_at.lower() != "nil": - try: - lock_dt = datetime.fromisoformat(lock_at.replace("Z", "+00:00")) - if now > lock_dt: - return False - except Exception: - log.exception("Error parsing date: %s", lock_at) + if is_date_locked(lock_at, unlock_at): + return False # Visibility rules if visibility in ("course", "inherit"): return True @@ -454,18 +470,43 @@ def _workflow_state_from_html(html: str) -> str: return meta.get("content") if meta else None -def _workflow_state_from_xml(xml): +def _workflow_state_from_xml(xml_string: str) -> bool: """ - Extract the workflow_state element from xml + Determine the workflow_state (published/unpublished) from assignment_settings.xml """ - try: - root = ElementTree.fromstring(xml) - workflow_element = root.find("cccv1p0:workflow_state", NAMESPACES) - return workflow_element.text.strip() if workflow_element is not None else None + def _get_text(tag): + el = root.find(f"cccv1p0:{tag}", NAMESPACES) + return el.text.strip() if el is not None and el.text else "" + try: + root = ElementTree.fromstring(xml_string) except Exception: log.exception("Error parsing XML: %s", sys.stderr) + return "unpublished" + + if ( + ( + # workflow_state must be published + _get_text("workflow_state") != "published" + ) + or ( + # only_visible_to_overrides must not be true + _get_text("only_visible_to_overrides") == "true" + ) + or ( + # hide_in_gradebook must not be true (hidden from gradebook) + _get_text("hide_in_gradebook") == "true" + ) + ): + return "unpublished" + + lock_at = _get_text("lock_at") + unlock_at = _get_text("unlock_at") + if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at): + return "unpublished" + + return "published" def _title_from_html(html: str) -> str: