From 80aeab0d19b047e837b6d0ca84cac18a4bc5b9d4 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 23 Sep 2025 09:52:39 -0400 Subject: [PATCH 1/7] stashing changes --- learning_resources/etl/canvas.py | 506 +------------------------- learning_resources/etl/canvas_test.py | 14 +- 2 files changed, 21 insertions(+), 499 deletions(-) diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index e8b5049204..c01ea81504 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -1,21 +1,13 @@ import base64 -import json import logging -import sys import zipfile -from collections import defaultdict from collections.abc import Generator -from datetime import UTC, datetime +from datetime import datetime from io import BytesIO from pathlib import Path from tempfile import TemporaryDirectory -from urllib.parse import unquote, unquote_plus -from zoneinfo import ZoneInfo -import dateutil import pypdfium2 as pdfium -from bs4 import BeautifulSoup -from defusedxml import ElementTree from django.conf import settings from litellm import completion from PIL import Image @@ -25,6 +17,12 @@ LearningResourceType, PlatformType, ) +from learning_resources.etl.canvas_utils import ( + canvas_course_url, + canvas_url_config, + get_published_items, + parse_canvas_settings, +) from learning_resources.etl.constants import ETLSource from learning_resources.etl.utils import ( _process_olx_path, @@ -40,27 +38,9 @@ from learning_resources_search.constants import ( CONTENT_FILE_TYPE, ) -from main.utils import now_in_utc log = logging.getLogger(__name__) -# list of file regexes we should ignore -IGNORE_FILES = [ - "course_settings.xml", - "context.xml", - "files_meta.xml", - "module_meta.xml", - "imsmanifest.xml", - "assignment_settings.xml", -] - -NAMESPACES = { - "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0", - "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1", - "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource", - "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest", -} - def sync_canvas_archive(bucket, key: str, overwrite): """ @@ -73,7 +53,7 @@ def sync_canvas_archive(bucket, key: str, overwrite): with TemporaryDirectory() as export_tempdir: course_archive_path = Path(export_tempdir, key.split("/")[-1]) bucket.download_file(key, course_archive_path) - url_config = _get_url_config(bucket, export_tempdir, url_config_file) + url_config = canvas_url_config(bucket, export_tempdir, url_config_file) resource_readable_id, run = run_for_canvas_archive( course_archive_path, course_folder=course_folder, overwrite=overwrite ) @@ -101,31 +81,6 @@ def sync_canvas_archive(bucket, key: str, overwrite): return resource_readable_id -def _get_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict: - """ - Get URL (citation) config from the metadata JSON file - """ - url_config_path = Path(export_tempdir, url_config_file.split("/")[-1]) - # download the url config file - bucket.download_file(url_config_file, url_config_path) - url_config = {} - with Path.open(url_config_path, "rb") as f: - url_json = json.loads(f.read().decode("utf-8")) - for url_item in url_json.get("course_files", []): - url_key = url_item["file_path"] - url_key = unquote_plus(url_key.lstrip(url_key.split("/")[0])) - url_config[url_key] = url_item["url"] - for url_item in url_json.get("assignments", []) + url_json.get("pages", []): - url_key = url_item.get("name", url_item.get("title")) - url_config[url_key] = url_item.get("html_url") - return url_config - - -def _course_url(course_archive_path) -> str: - context_info = parse_context_xml(course_archive_path) - return f"https://{context_info.get('canvas_domain')}/courses/{context_info.get('course_id')}/" - - def run_for_canvas_archive(course_archive_path, course_folder, overwrite): """ Generate and return a LearningResourceRun for a Canvas course @@ -133,7 +88,7 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite): checksum = calc_checksum(course_archive_path) course_info = parse_canvas_settings(course_archive_path) course_title = course_info.get("title") - url = _course_url(course_archive_path) + url = canvas_course_url(course_archive_path) start_at = course_info.get("start_at") end_at = course_info.get("conclude_at") if start_at: @@ -181,20 +136,6 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite): return resource_readable_id, run -def parse_canvas_settings(course_archive_path): - """ - Get course attributes from a Canvas course archive - """ - with zipfile.ZipFile(course_archive_path, "r") as course_archive: - xml_string = course_archive.read("course_settings/course_settings.xml") - tree = ElementTree.fromstring(xml_string) - attributes = {} - for node in tree.iter(): - tag = node.tag.split("}")[1] if "}" in node.tag else node.tag - attributes[tag] = node.text - return attributes - - def transform_canvas_content_files( course_zipfile: Path, run: LearningResourceRun, url_config: dict, *, overwrite ) -> Generator[dict, None, None]: @@ -203,23 +144,7 @@ def transform_canvas_content_files( """ basedir = course_zipfile.name.split(".")[0] zipfile_path = course_zipfile.absolute() - all_published_items = ( - parse_module_meta(zipfile_path)["active"] - + parse_files_meta(zipfile_path)["active"] - + parse_web_content(zipfile_path)["active"] - ) - published_items = {} - for item in all_published_items: - path = Path(item["path"]).resolve() - published_items[path] = item - for embedded_file in item.get("embedded_files", []): - embedded_path = Path(embedded_file).resolve() - if embedded_path in all_published_items: - continue - published_items[embedded_path] = { - "path": embedded_path, - "title": "", - } + published_items = get_published_items(zipfile_path, url_config) def _generate_content(): """Inner generator for yielding content data""" @@ -242,10 +167,10 @@ def _generate_content(): item_meta = published_items.get( Path(content_data["source_path"]).resolve(), {} ) - - content_url = url_config.get(url_path) or url_config.get( - item_meta.get("title") + item_url_config = url_config.get(url_path, {}) or url_config.get( + item_meta.get("title"), {} ) + content_url = item_url_config.get("url") content_data["content_title"] = item_meta.get("title") if content_url: content_data["url"] = content_url @@ -312,411 +237,6 @@ def transform_canvas_problem_files( yield problem_file_data -def parse_context_xml(course_archive_path: str) -> dict: - """ - Parse course_settings/context.xml and return context info - """ - with zipfile.ZipFile(course_archive_path, "r") as course_archive: - context = course_archive.read("course_settings/context.xml") - root = ElementTree.fromstring(context) - context_info = {} - item_keys = ["course_id", "root_account_id", "canvas_domain", "root_account_name"] - for key in item_keys: - element = root.find(f"cccv1p0:{key}", NAMESPACES) - if element is not None: - context_info[key] = element.text - - return context_info - - -def is_date_locked(lock_at: str, unlock_at: str) -> bool: - """ - Determine if a resource is currently date-locked based - on lock_at and unlock_at strings. - Args: - lock_at (str): ISO 8601 date string when the resource locks - unlock_at (str): ISO 8601 date string when the resource unlocks - Returns: - bool: True if the resource is currently locked, False otherwise - """ - now = now_in_utc() - if unlock_at and unlock_at.lower() != "nil": - try: - unlock_dt = ( - dateutil.parser.parse(unlock_at) - .replace(tzinfo=ZoneInfo("US/Eastern")) - .astimezone(UTC) - ) - - if now < unlock_dt: - return True - except Exception: - log.exception("Error parsing unlock_at date: %s", unlock_at) - - if lock_at and lock_at.lower() != "nil": - try: - lock_dt = ( - dateutil.parser.parse(lock_at) - .replace(tzinfo=ZoneInfo("US/Eastern")) - .astimezone(UTC) - ) - if now > lock_dt: - return True - except Exception: - log.exception("Error parsing lock_at date: %s", lock_at) - return False - - -def is_file_published(file_meta: dict) -> bool: - """ - Determine if a Canvas file (from files_meta.xml) is published/visible to students. - - Args: - file_meta (dict): Parsed metadata for a file. - Returns: - bool: True if file is published/visible, False otherwise. - """ - - hidden = str(file_meta.get("hidden", "false")).lower() == "true" - locked = str(file_meta.get("locked", "false")).lower() == "true" - unlock_at = file_meta.get("unlock_at") - lock_at = file_meta.get("lock_at") - visibility = file_meta.get("visibility", "inherit") - # If explicitly hidden or locked → unpublished - if hidden or locked: - return False - - if is_date_locked(lock_at, unlock_at): - return False - # Visibility rules - if visibility in ("course", "inherit"): - return True - elif visibility in ("institution", "public"): - return True # technically more visible - return False - - -def parse_files_meta(course_archive_path: str) -> dict: - """ - Parse course_settings/files_meta.xml and return publish/active status of resources. - """ - publish_status = {"active": [], "unpublished": []} - with zipfile.ZipFile(course_archive_path, "r") as course_archive: - files_meta_path = "course_settings/files_meta.xml" - if files_meta_path not in course_archive.namelist(): - return publish_status - files_xml = course_archive.read(files_meta_path) - manifest_xml = course_archive.read("imsmanifest.xml") - resource_map = extract_resources_by_identifier(manifest_xml) - root = ElementTree.fromstring(files_xml) - try: - for file_elem in root.findall(".//cccv1p0:file", NAMESPACES): - meta = dict(file_elem.attrib) - for child in file_elem: - tag = child.tag - # strip namespace - if "}" in tag: - tag = tag.split("}", 1)[1] - if child.attrib.get("nil") == "true": - value = None - else: - value = (child.text or "").strip() - meta[tag] = value - item_info = resource_map.get(meta.get("identifier"), {}) - meta["published"] = is_file_published(meta) - for file in item_info.get("files", []): - file_data = meta.copy() - file_path = Path(file) - file_data["path"] = file_path - file_data["title"] = file_data.get("display_name") - # explicitly exclude files in web_resources/ai/tutor - if file_data["published"] and not file.startswith( - settings.CANVAS_TUTORBOT_FOLDER - ): - publish_status["active"].append(file_data) - else: - publish_status["unpublished"].append(file_data) - except Exception: - log.exception("Error parsing XML: %s", sys.stderr) - return None - return publish_status - - -def parse_module_meta(course_archive_path: str) -> dict: - """ - Parse module_meta.xml and return publish/active status of resources. - """ - with zipfile.ZipFile(course_archive_path, "r") as course_archive: - module_xml = course_archive.read("course_settings/module_meta.xml") - manifest_xml = course_archive.read("imsmanifest.xml") - resource_map = extract_resources_by_identifierref(manifest_xml) - publish_status = {"active": [], "unpublished": []} - try: - root = ElementTree.fromstring(module_xml) - for module in root.findall(".//cccv1p0:module", NAMESPACES): - module_title = module.find("cccv1p0:title", NAMESPACES).text - for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES): - item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text - item_title = item.find("cccv1p0:title", NAMESPACES).text - identifierref = ( - item.find("cccv1p0:identifierref", NAMESPACES).text - if item.find("cccv1p0:identifierref", NAMESPACES) is not None - else None - ) - content_type = item.find("cccv1p0:content_type", NAMESPACES).text - items = resource_map.get(identifierref, {}) - for item_info in items: - for file in item_info.get("files", []): - file_path = Path(file) - status = "active" if item_state == "active" else "unpublished" - publish_status[status].append( - { - "title": item_title, - "type": content_type, - "path": file_path, - "module": module_title, - } - ) - except Exception: - log.exception("Error parsing XML: %s", sys.stderr) - return None - return publish_status - - -def _compact_element(element) -> dict | str | None: - """Recursively compact an element into a nested dictionary""" - if len(element) == 0: # No children, return text - return element.text.strip() if element.text else None - return { - child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element( - child - ) - for child in element - } - - -def _workflow_state_from_html(html: str) -> str: - """ - Extract the workflow_state meta tag from html - """ - soup = BeautifulSoup(html, "html.parser") - meta = soup.find("meta", attrs={"name": "workflow_state"}) - return meta.get("content") if meta else None - - -def _embedded_files_from_html(html: str) -> list[str]: - """ - Extract Canvas file links from HTML, replacing $IMS-CC-FILEBASE$ with web_resources - and returning URL-decoded paths without query params. - """ - soup = BeautifulSoup(html, "html.parser") - links = [] - - for a in soup.find_all("a", href=True): - href = a["href"] - if href.startswith("$IMS-CC-FILEBASE$"): - # Remove query parameters if present - clean_href = href.split("?")[0] - # Replace $IMS-CC-FILEBASE$ with "web_resources" - clean_href = clean_href.replace("$IMS-CC-FILEBASE$", "web_resources") - # URL decode - decoded = unquote(clean_href) - links.append(decoded) - - return links - - -def _workflow_state_from_xml(xml_string: str) -> bool: - """ - Determine the workflow_state (published/unpublished) from assignment_settings.xml - """ - - def _get_text(tag): - el = root.find(f"cccv1p0:{tag}", NAMESPACES) - return el.text.strip() if el is not None and el.text else "" - - try: - root = ElementTree.fromstring(xml_string) - except Exception: - log.exception("Error parsing XML: %s", sys.stderr) - return "unpublished" - - if ( - ( - # workflow_state must be published - _get_text("workflow_state") != "published" - ) - or ( - # only_visible_to_overrides must not be true - _get_text("only_visible_to_overrides") == "true" - ) - or ( - # hide_in_gradebook must not be true (hidden from gradebook) - _get_text("hide_in_gradebook") == "true" - ) - ): - return "unpublished" - - lock_at = _get_text("lock_at") - unlock_at = _get_text("unlock_at") - if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at): - return "unpublished" - - return "published" - - -def _title_from_html(html: str) -> str: - """ - Extract the title element from HTML content - """ - soup = BeautifulSoup(html, "html.parser") - title = soup.find("title") - return title.get_text().strip() if title else "" - - -def _title_from_assignment_settings(xml_string: str) -> str: - """ - Extract the title from assignment_settings.xml - """ - try: - root = ElementTree.fromstring(xml_string) - except Exception: - log.exception("Error parsing XML: %s", sys.stderr) - return "" - title_elem = root.find("cccv1p0:title", NAMESPACES) - return title_elem.text.strip() if title_elem is not None and title_elem.text else "" - - -def parse_web_content(course_archive_path: str) -> dict: - """ - Parse html pages and assignments and return publish/active status of resources - """ - - publish_status = {"active": [], "unpublished": []} - - with zipfile.ZipFile(course_archive_path, "r") as course_archive: - manifest_path = "imsmanifest.xml" - if manifest_path not in course_archive.namelist(): - return publish_status - manifest_xml = course_archive.read(manifest_path) - resource_map = extract_resources_by_identifier(manifest_xml) - for item in resource_map: - resource_map_item = resource_map[item] - item_link = resource_map_item.get("href") - assignment_settings = None - for file in resource_map_item.get("files", []): - if file.endswith("assignment_settings.xml"): - assignment_settings = file - if item_link and item_link.endswith(".html"): - file_path = resource_map_item["href"] - html_content = course_archive.read(file_path) - embedded_files = _embedded_files_from_html(html_content) - if assignment_settings: - xml_content = course_archive.read(assignment_settings) - workflow_state = _workflow_state_from_xml(xml_content) - title = _title_from_assignment_settings(xml_content) - canvas_type = "assignment" - else: - workflow_state = _workflow_state_from_html(html_content) - title = _title_from_html(html_content) - canvas_type = "page" - - lom_elem = ( - resource_map_item.get("metadata", {}) - .get("lom", {}) - .get("educational", {}) - ) - # Determine if the content is intended for authors or instructors only - intended_role = lom_elem.get("intendedEndUserRole", {}).get("value") - authors_only = intended_role and intended_role.lower() != "student" - - if workflow_state in ["active", "published"] and not authors_only: - publish_status["active"].append( - { - "title": title, - "path": file_path, - "canvas_type": canvas_type, - "embedded_files": embedded_files, - } - ) - else: - publish_status["unpublished"].append( - { - "title": title, - "path": file_path, - "canvas_type": canvas_type, - "embedded_files": embedded_files, - } - ) - return publish_status - - -def extract_resources_by_identifierref(manifest_xml: str) -> dict: - """ - Extract resources from an IMS manifest file and - return a map keyed by identifierref. - """ - root = ElementTree.fromstring(manifest_xml) - - # Dictionary to hold resources keyed by identifierref - resources_dict = defaultdict(list) - # Find all item elements with identifierref attributes - for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES): - identifierref = item.get("identifierref") - title = ( - item.find("imscp:title", NAMESPACES).text - if item.find("imscp:title", NAMESPACES) is not None - else "" - ) - resource = root.find( - f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES - ) - if resource is not None: - # Get all file elements within the resource - files = [ - file_elem.get("href") - for file_elem in resource.findall("imscp:file", NAMESPACES) - ] - - resources_dict[identifierref].append( - {"title": title, "files": files, "type": resource.get("type")} - ) - return dict(resources_dict) - - -def extract_resources_by_identifier(manifest_xml: str) -> dict: - """ - Extract resources from an IMS manifest - file and return a map keyed by identifier. - """ - root = ElementTree.fromstring(manifest_xml) - resources_dict = {} - # Find all resource elements - for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES): - identifier = resource.get("identifier") - resource_type = resource.get("type") - href = resource.get("href") - - # Get all file elements within the resource - files = [ - file_elem.get("href") - for file_elem in resource.findall("imscp:file", NAMESPACES) - ] - # Extract metadata if present - metadata = {} - metadata_elem = resource.find("imscp:metadata", NAMESPACES) - if metadata_elem is not None: - metadata.update(_compact_element(metadata_elem)) - resources_dict[identifier] = { - "identifier": identifier, - "type": resource_type, - "href": href, - "files": files, - "metadata": metadata, - } - return resources_dict - - def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85): """ Convert a PDF file to a list of base64 encoded images (one per page). diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 2c1c244022..4ca9fe7b17 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -10,15 +10,17 @@ from learning_resources.constants import LearningResourceType, PlatformType from learning_resources.etl.canvas import ( + run_for_canvas_archive, + transform_canvas_content_files, + transform_canvas_problem_files, +) +from learning_resources.etl.canvas_utils import ( _compact_element, is_file_published, parse_canvas_settings, parse_files_meta, parse_module_meta, parse_web_content, - run_for_canvas_archive, - transform_canvas_content_files, - transform_canvas_problem_files, ) from learning_resources.etl.constants import ETLSource from learning_resources.etl.utils import get_edx_module_id @@ -120,7 +122,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker): return_value={"title": "Test Course", "course_code": "TEST101"}, ) mocker.patch( - "learning_resources.etl.canvas.parse_context_xml", + "learning_resources.etl.canvas_utils.parse_context_xml", return_value={"course_id": "123", "canvas_domain": "mit.edu"}, ) @@ -152,7 +154,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker): return_value={"title": "Test Course", "course_code": "TEST104"}, ) mocker.patch( - "learning_resources.etl.canvas.parse_context_xml", + "learning_resources.etl.canvas_utils.parse_context_xml", return_value={"course_id": "123", "canvas_domain": "mit.edu"}, ) mocker.patch( @@ -472,7 +474,7 @@ def test_transform_canvas_content_files_url_assignment(mocker, tmp_path): return_value=mock_content_data, ) mocker.patch( - "learning_resources.etl.canvas.parse_module_meta", + "learning_resources.etl.canvas_utils.parse_module_meta", return_value={"active": [], "unpublished": []}, ) # Use a real zip file From 1013ec57957ca05b9bd15b36f704844c1fe3b9d9 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 24 Sep 2025 10:21:16 -0400 Subject: [PATCH 2/7] adding more permission/hidden checks --- learning_resources/etl/canvas_utils.py | 553 +++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 learning_resources/etl/canvas_utils.py diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py new file mode 100644 index 0000000000..a62f62c377 --- /dev/null +++ b/learning_resources/etl/canvas_utils.py @@ -0,0 +1,553 @@ +import json +import logging +import sys +import zipfile +from collections import defaultdict +from datetime import UTC +from pathlib import Path +from urllib.parse import unquote, unquote_plus +from zoneinfo import ZoneInfo + +import dateutil +from bs4 import BeautifulSoup +from defusedxml import ElementTree +from django.conf import settings + +from main.utils import now_in_utc + +log = logging.getLogger(__name__) + +# list of file regexes we should ignore +IGNORE_FILES = [ + "course_settings.xml", + "context.xml", + "files_meta.xml", + "module_meta.xml", + "imsmanifest.xml", + "assignment_settings.xml", +] + +NAMESPACES = { + "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0", + "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1", + "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource", + "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest", +} + + +def is_file_published(file_meta: dict) -> bool: + """ + Determine if a Canvas file (from files_meta.xml) is published/visible to students. + + Args: + file_meta (dict): Parsed metadata for a file. + Returns: + bool: True if file is published/visible, False otherwise. + """ + + hidden = str(file_meta.get("hidden", "false")).lower() == "true" + locked = str(file_meta.get("locked", "false")).lower() == "true" + unlock_at = file_meta.get("unlock_at") + lock_at = file_meta.get("lock_at") + visibility = file_meta.get("visibility", "inherit") + # If explicitly hidden or locked → unpublished + if hidden or locked: + return False + + if is_date_locked(lock_at, unlock_at): + return False + # Visibility rules + if visibility in ("course", "inherit"): + return True + elif visibility in ("institution", "public"): + return True # technically more visible + return False + + +def parse_files_meta(course_archive_path: str) -> dict: + """ + Parse course_settings/files_meta.xml and return publish/active status of resources. + """ + publish_status = {"active": [], "unpublished": []} + with zipfile.ZipFile(course_archive_path, "r") as course_archive: + files_meta_path = "course_settings/files_meta.xml" + if files_meta_path not in course_archive.namelist(): + return publish_status + files_xml = course_archive.read(files_meta_path) + manifest_xml = course_archive.read("imsmanifest.xml") + resource_map = extract_resources_by_identifier(manifest_xml) + root = ElementTree.fromstring(files_xml) + try: + for file_elem in root.findall(".//cccv1p0:file", NAMESPACES): + meta = dict(file_elem.attrib) + for child in file_elem: + tag = child.tag + # strip namespace + if "}" in tag: + tag = tag.split("}", 1)[1] + if child.attrib.get("nil") == "true": + value = None + else: + value = (child.text or "").strip() + meta[tag] = value + item_info = resource_map.get(meta.get("identifier"), {}) + meta["published"] = is_file_published(meta) + for file in item_info.get("files", []): + file_data = meta.copy() + file_path = Path(file) + file_data["path"] = file_path + file_data["title"] = file_data.get("display_name") + # explicitly exclude files in web_resources/ai/tutor + if file_data["published"] and not file.startswith( + settings.CANVAS_TUTORBOT_FOLDER + ): + publish_status["active"].append(file_data) + else: + publish_status["unpublished"].append(file_data) + except Exception: + log.exception("Error parsing XML: %s", sys.stderr) + return None + return publish_status + + +def parse_module_meta(course_archive_path: str) -> dict: + """ + Parse module_meta.xml and return publish/active status of resources. + """ + with zipfile.ZipFile(course_archive_path, "r") as course_archive: + module_xml = course_archive.read("course_settings/module_meta.xml") + manifest_xml = course_archive.read("imsmanifest.xml") + resource_map = extract_resources_by_identifierref(manifest_xml) + publish_status = {"active": [], "unpublished": []} + try: + root = ElementTree.fromstring(module_xml) + for module in root.findall(".//cccv1p0:module", NAMESPACES): + module_title = module.find("cccv1p0:title", NAMESPACES).text + for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES): + item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text + item_title = item.find("cccv1p0:title", NAMESPACES).text + identifierref = ( + item.find("cccv1p0:identifierref", NAMESPACES).text + if item.find("cccv1p0:identifierref", NAMESPACES) is not None + else None + ) + content_type = item.find("cccv1p0:content_type", NAMESPACES).text + items = resource_map.get(identifierref, {}) + for item_info in items: + for file in item_info.get("files", []): + file_path = Path(file) + status = "active" if item_state == "active" else "unpublished" + publish_status[status].append( + { + "title": item_title, + "type": content_type, + "path": file_path, + "module": module_title, + } + ) + except Exception: + log.exception("Error parsing XML: %s", sys.stderr) + return None + return publish_status + + +def _compact_element(element) -> dict | str | None: + """Recursively compact an element into a nested dictionary""" + if len(element) == 0: # No children, return text + return element.text.strip() if element.text else None + return { + child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element( + child + ) + for child in element + } + + +def _workflow_state_from_html(html: str) -> str: + """ + Extract the workflow_state meta tag from html + """ + soup = BeautifulSoup(html, "html.parser") + meta = soup.find("meta", attrs={"name": "workflow_state"}) + return meta.get("content") if meta else None + + +def _embedded_files_from_html(html: str) -> list[str]: + """ + Extract Canvas file links from HTML, replacing $IMS-CC-FILEBASE$ with web_resources + and returning URL-decoded paths without query params. + """ + soup = BeautifulSoup(html, "html.parser") + links = [] + + for a in soup.find_all("a", href=True): + href = a["href"] + if href.startswith("$IMS-CC-FILEBASE$"): + # Remove query parameters if present + clean_href = href.split("?")[0] + # Replace $IMS-CC-FILEBASE$ with "web_resources" + clean_href = clean_href.replace("$IMS-CC-FILEBASE$", "web_resources") + # URL decode + decoded = unquote(clean_href) + links.append(decoded) + + return links + + +def _workflow_state_from_xml(xml_string: str) -> bool: + """ + Determine the workflow_state (published/unpublished) from assignment_settings.xml + """ + + def _get_text(tag): + el = root.find(f"cccv1p0:{tag}", NAMESPACES) + return el.text.strip() if el is not None and el.text else "" + + try: + root = ElementTree.fromstring(xml_string) + except Exception: + log.exception("Error parsing XML: %s", sys.stderr) + return "unpublished" + + if ( + ( + # workflow_state must be published + _get_text("workflow_state") != "published" + ) + or ( + # only_visible_to_overrides must not be true + _get_text("only_visible_to_overrides") == "true" + ) + or ( + # hide_in_gradebook must not be true (hidden from gradebook) + _get_text("hide_in_gradebook") == "true" + ) + ): + return "unpublished" + + lock_at = _get_text("lock_at") + unlock_at = _get_text("unlock_at") + if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at): + return "unpublished" + + return "published" + + +def _title_from_html(html: str) -> str: + """ + Extract the title element from HTML content + """ + soup = BeautifulSoup(html, "html.parser") + title = soup.find("title") + return title.get_text().strip() if title else "" + + +def _title_from_assignment_settings(xml_string: str) -> str: + """ + Extract the title from assignment_settings.xml + """ + try: + root = ElementTree.fromstring(xml_string) + except Exception: + log.exception("Error parsing XML: %s", sys.stderr) + return "" + title_elem = root.find("cccv1p0:title", NAMESPACES) + return title_elem.text.strip() if title_elem is not None and title_elem.text else "" + + +def parse_web_content(course_archive_path: str) -> dict: + """ + Parse html pages and assignments and return publish/active status of resources + """ + + publish_status = {"active": [], "unpublished": []} + + with zipfile.ZipFile(course_archive_path, "r") as course_archive: + manifest_path = "imsmanifest.xml" + if manifest_path not in course_archive.namelist(): + return publish_status + manifest_xml = course_archive.read(manifest_path) + resource_map = extract_resources_by_identifier(manifest_xml) + for item in resource_map: + resource_map_item = resource_map[item] + item_link = resource_map_item.get("href") + assignment_settings = None + for file in resource_map_item.get("files", []): + if file.endswith("assignment_settings.xml"): + assignment_settings = file + if item_link and item_link.endswith(".html"): + file_path = resource_map_item["href"] + html_content = course_archive.read(file_path) + embedded_files = _embedded_files_from_html(html_content) + if assignment_settings: + xml_content = course_archive.read(assignment_settings) + workflow_state = _workflow_state_from_xml(xml_content) + title = _title_from_assignment_settings(xml_content) + canvas_type = "assignment" + else: + workflow_state = _workflow_state_from_html(html_content) + title = _title_from_html(html_content) + canvas_type = "page" + + lom_elem = ( + resource_map_item.get("metadata", {}) + .get("lom", {}) + .get("educational", {}) + ) + # Determine if the content is intended for authors or instructors only + intended_role = lom_elem.get("intendedEndUserRole", {}).get("value") + authors_only = intended_role and intended_role.lower() != "student" + + if workflow_state in ["active", "published"] and not authors_only: + publish_status["active"].append( + { + "title": title, + "path": file_path, + "canvas_type": canvas_type, + "embedded_files": embedded_files, + } + ) + else: + publish_status["unpublished"].append( + { + "title": title, + "path": file_path, + "canvas_type": canvas_type, + "embedded_files": embedded_files, + } + ) + return publish_status + + +def extract_resources_by_identifierref(manifest_xml: str) -> dict: + """ + Extract resources from an IMS manifest file and + return a map keyed by identifierref. + """ + root = ElementTree.fromstring(manifest_xml) + + # Dictionary to hold resources keyed by identifierref + resources_dict = defaultdict(list) + # Find all item elements with identifierref attributes + for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES): + identifierref = item.get("identifierref") + title = ( + item.find("imscp:title", NAMESPACES).text + if item.find("imscp:title", NAMESPACES) is not None + else "" + ) + resource = root.find( + f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES + ) + if resource is not None: + # Get all file elements within the resource + files = [ + file_elem.get("href") + for file_elem in resource.findall("imscp:file", NAMESPACES) + ] + + resources_dict[identifierref].append( + {"title": title, "files": files, "type": resource.get("type")} + ) + return dict(resources_dict) + + +def extract_resources_by_identifier(manifest_xml: str) -> dict: + """ + Extract resources from an IMS manifest + file and return a map keyed by identifier. + """ + root = ElementTree.fromstring(manifest_xml) + resources_dict = {} + # Find all resource elements + for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES): + identifier = resource.get("identifier") + resource_type = resource.get("type") + href = resource.get("href") + + # Get all file elements within the resource + files = [ + file_elem.get("href") + for file_elem in resource.findall("imscp:file", NAMESPACES) + ] + # Extract metadata if present + metadata = {} + metadata_elem = resource.find("imscp:metadata", NAMESPACES) + if metadata_elem is not None: + metadata.update(_compact_element(metadata_elem)) + resources_dict[identifier] = { + "identifier": identifier, + "type": resource_type, + "href": href, + "files": files, + "metadata": metadata, + } + return resources_dict + + +def parse_context_xml(course_archive_path: str) -> dict: + """ + Parse course_settings/context.xml and return context info + """ + with zipfile.ZipFile(course_archive_path, "r") as course_archive: + context = course_archive.read("course_settings/context.xml") + root = ElementTree.fromstring(context) + context_info = {} + item_keys = ["course_id", "root_account_id", "canvas_domain", "root_account_name"] + for key in item_keys: + element = root.find(f"cccv1p0:{key}", NAMESPACES) + if element is not None: + context_info[key] = element.text + + return context_info + + +def is_date_locked(lock_at: str, unlock_at: str) -> bool: + """ + Determine if a resource is currently date-locked based + on lock_at and unlock_at strings. + Args: + lock_at (str): ISO 8601 date string when the resource locks + unlock_at (str): ISO 8601 date string when the resource unlocks + Returns: + bool: True if the resource is currently locked, False otherwise + """ + now = now_in_utc() + if unlock_at and unlock_at.lower() != "nil": + try: + unlock_dt = ( + dateutil.parser.parse(unlock_at) + .replace(tzinfo=ZoneInfo("US/Eastern")) + .astimezone(UTC) + ) + + if now < unlock_dt: + return True + except Exception: + log.exception("Error parsing unlock_at date: %s", unlock_at) + + if lock_at and lock_at.lower() != "nil": + try: + lock_dt = ( + dateutil.parser.parse(lock_at) + .replace(tzinfo=ZoneInfo("US/Eastern")) + .astimezone(UTC) + ) + if now > lock_dt: + return True + except Exception: + log.exception("Error parsing lock_at date: %s", lock_at) + return False + + +def parse_canvas_settings(course_archive_path): + """ + Get course attributes from a Canvas course archive + """ + with zipfile.ZipFile(course_archive_path, "r") as course_archive: + xml_string = course_archive.read("course_settings/course_settings.xml") + tree = ElementTree.fromstring(xml_string) + attributes = {} + for node in tree.iter(): + tag = node.tag.split("}")[1] if "}" in node.tag else node.tag + node_value = node.text + if tag == "tab_configuration": + tab_config = json.loads(node.text) + + node_value = dict(zip([tc["id"] for tc in tab_config], tab_config)) + attributes[tag] = node_value + return attributes + + +def canvas_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict: + """ + Get URL (citation) config from the metadata JSON file + """ + url_config_path = Path(export_tempdir, url_config_file.split("/")[-1]) + # download the url config file + bucket.download_file(url_config_file, url_config_path) + url_config = {} + with Path.open(url_config_path, "rb") as f: + url_json = json.loads(f.read().decode("utf-8")) + for url_item in url_json.get("course_files", []): + url_key = url_item["file_path"] + url_key = unquote_plus(url_key.lstrip(url_key.split("/")[0])) + url_config[url_key] = url_item + for url_item in url_json.get("assignments", []) + url_json.get("pages", []): + url_key = url_item.get("name", url_item.get("title")) + # normalize url field + url_item["url"] = url_item.get("html_url") + url_config[url_key] = url_item + return url_config + + +def canvas_course_url(course_archive_path) -> str: + context_info = parse_context_xml(course_archive_path) + return f"https://{context_info.get('canvas_domain')}/courses/{context_info.get('course_id')}/" + + +def _url_config_key(item): + if "web_resources" in str(item["path"]): + return str(item["path"]).split("web_resources")[-1] + return item.get("title") + + +def _url_config_item_visible(item_configuration): + """ + Determine if an item is visible based on its configuration + from the metadata json file + """ + if item_configuration: + # check if explicitely unpublished + unpublished = not item_configuration.get("published", True) + return not any( + [ + unpublished, + item_configuration.get("hidden"), # file hidden + item_configuration.get("locked"), # file locked + item_configuration.get("folder", {}).get( + "hidden" + ), # parent folder hidden + item_configuration.get("folder", {}).get( + "locked" + ), # parent folder locked + ] + ) + return True + + +def get_published_items(zipfile_path, url_config): + published_items = {} + course_settings = parse_canvas_settings(zipfile_path) + tab_configuration = course_settings.get("tab_configuration", {}) + files_section_is_visible = not tab_configuration.get(11, {}).get("hidden", False) + all_published_items = ( + parse_module_meta(zipfile_path)["active"] + + parse_files_meta(zipfile_path)["active"] + + parse_web_content(zipfile_path)["active"] + ) + + all_embedded_items = [] + for item in all_published_items: + path = Path(item["path"]).resolve() + item_configuration = url_config.get(_url_config_key(item)) + item_visible = _url_config_item_visible(item_configuration) + + # if the item is not explicitely hidden and global files section is visible + if item_visible and ( + str(Path(item["path"]).parent) != "web_resources" + or files_section_is_visible + ): + published_items[path] = item + for embedded_file in item.get("embedded_files", []): + embedded_path = Path(embedded_file).resolve() + embedded = { + "path": embedded_path, + "title": "", + } + all_embedded_items.append(embedded) + if embedded_path in all_published_items: + continue + published_items[embedded_path] = embedded + + return published_items From 4f85aeaf12ba38402878fb33e00ced7f92b3f05b Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 24 Sep 2025 10:56:02 -0400 Subject: [PATCH 3/7] adding more permission/hidden checks --- learning_resources/etl/canvas_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py index a62f62c377..012871b444 100644 --- a/learning_resources/etl/canvas_utils.py +++ b/learning_resources/etl/canvas_utils.py @@ -500,9 +500,12 @@ def _url_config_item_visible(item_configuration): if item_configuration: # check if explicitely unpublished unpublished = not item_configuration.get("published", True) + lock_at = item_configuration.get("lock_at") + unlock_at = item_configuration.get("unlock_at") return not any( [ unpublished, + is_date_locked(lock_at, unlock_at), item_configuration.get("hidden"), # file hidden item_configuration.get("locked"), # file locked item_configuration.get("folder", {}).get( @@ -520,6 +523,10 @@ def get_published_items(zipfile_path, url_config): published_items = {} course_settings = parse_canvas_settings(zipfile_path) tab_configuration = course_settings.get("tab_configuration", {}) + """ + mappings for ids: + # https://developerdocs.instructure.com/services/dap/dataset/dataset-additional-notes + """ files_section_is_visible = not tab_configuration.get(11, {}).get("hidden", False) all_published_items = ( parse_module_meta(zipfile_path)["active"] From e9da34ba353e361c6f056b4b9b848f7c9e3a7096 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 26 Sep 2025 10:05:45 -0400 Subject: [PATCH 4/7] test fix --- learning_resources/etl/canvas_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 00201cbef3..cba2138063 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -524,7 +524,7 @@ def test_transform_canvas_content_files_url_assignment(mocker, tmp_path): """ run = MagicMock() run.id = 1 - url_config = {"/folder/file1.html": "https://cdn.example.com/file1.html"} + url_config = {"/folder/file1.html": {"url": "https://cdn.example.com/file1.html"}} # Patch _process_olx_path to yield content_data with source_path mock_content_data = [ {"source_path": "data/folder/file1.html", "key": "file1"}, @@ -1140,8 +1140,8 @@ def test_get_url_config_assignments_and_pages(mocker, tmp_path): hmtl_page_title = "html page" url_config = { - hmtl_page_title: "https://example.com/htmlpage", - "/file1.html": "https://example.com/file1", + hmtl_page_title: {"url": "https://example.com/htmlpage"}, + "/file1.html": {"url": "https://example.com/file1"}, } run = LearningResourceRunFactory.create() From bf1a15793ebe20e266330d2243150bc5c6efbad8 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 26 Sep 2025 13:58:24 -0400 Subject: [PATCH 5/7] adding tests --- learning_resources/etl/canvas_test.py | 301 ++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index cba2138063..1e96f49910 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -16,6 +16,7 @@ ) from learning_resources.etl.canvas_utils import ( _compact_element, + get_published_items, is_file_published, parse_canvas_settings, parse_files_meta, @@ -1235,3 +1236,303 @@ def test_get_url_config_assignments_and_pages(mocker, tmp_path): ) assert results["html page"] == "https://example.com/htmlpage" assert results["Item 1"] == "https://example.com/file1" + + +def test_get_published_items_for_unpublshed_file(tmp_path): + html_content = """ + + + + Embedded File 1 + + + """ + module_xml = b""" + + + + Module 1 + + + active + Item 1 + false + false + RES3 + resource + + + + + """ + manifest_xml = b""" + + + + + + + + + + + + + + """ + zip_path = make_canvas_zip( + tmp_path, + module_xml=module_xml, + manifest_xml=manifest_xml, + files=[ + ("web_resources/file1.pdf", "content of file1"), + ("web_resources/file2.html", "content of file2"), + ("web_resources/html_page.html", html_content), + ], + ) + url_config = { + "/html_page.html": { + "url": "https://cdn.example.com/", + "locked": True, + }, + } + published = [item.name for item in get_published_items(zip_path, url_config)] + url_config = { + "/html_page.html": { + "url": "https://cdn.example.com/", + "hidden": True, + }, + } + published.extend([item.name for item in get_published_items(zip_path, url_config)]) + assert "html_page.html" not in published + + +def test_get_published_items_for_unpublshed_parent_folder(mocker, tmp_path): + html_content = """ + + + + Embedded File 1 + + + """ + module_xml = b""" + + + + Module 1 + + + active + Item 1 + false + false + RES3 + resource + + + + + """ + manifest_xml = b""" + + + + + + + + + + + + + + """ + zip_path = make_canvas_zip( + tmp_path, + module_xml=module_xml, + manifest_xml=manifest_xml, + files=[ + ("web_resources/file1.pdf", "content of file1"), + ("web_resources/file2.html", "content of file2"), + ("web_resources/html_page.html", html_content), + ], + ) + + url_config = { + "/html_page.html": { + "url": "https://cdn.example.com/", + "published": True, + "folder": { + "hidden": True, + }, + }, + } + published = [item.name for item in get_published_items(zip_path, url_config)] + assert "html_page.html" not in published + + +def test_get_published_items_with_hidden_file_section(mocker, tmp_path): + """ + Test that if the files section in the navbar is hidden, + no files are considered published even if they are marked as published + """ + manifest_xml = b""" + + + + + + + + + + + + + + """ + files_xml = b""" + + + + uncategorized + + + uncategorized + + + uncategorized + + + + """ + zip_path = make_canvas_zip( + tmp_path, + manifest_xml=manifest_xml, + files=[ + ("course_settings/files_meta.xml", files_xml), + ("web_resources/file1.pdf", "content of file1"), + ("web_resources/file2.html", "content of file2"), + ("web_resources/html_page.html", ""), + ], + ) + + url_config = { + "/html_page.html": { + "url": "https://cdn.example.com/", + "published": True, + }, + "/file2.html": { + "url": "https://cdn.example.com/file2.html", + "published": True, + }, + "/file3.html": { + "url": "https://cdn.example.com/file2.html", + "published": True, + }, + } + published = [item.name for item in get_published_items(zip_path, url_config)] + assert sorted(published) == sorted(["html_page.html", "file2.html", "file1.pdf"]) + + # hide the files section in the navbar + settings_xml = b""" + + Test Course Title + [{"id":0},{"id":11, "hidden":true}] + TEST-101 + Other Value + + """ + + zip_path = make_canvas_zip( + tmp_path, + settings_xml=settings_xml, + manifest_xml=manifest_xml, + files=[ + ("course_settings/files_meta.xml", files_xml), + ("web_resources/file1.pdf", "content of file1"), + ("web_resources/file2.html", "content of file2"), + ("web_resources/html_page.html", ""), + ], + ) + published = [item.name for item in get_published_items(zip_path, url_config)] + assert len(published) == 0 + + +def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path): + html_content = """ + + + + Embedded File 1 + + + """ + module_xml = b""" + + + + Module 1 + + + active + Item 1 + false + false + RES3 + resource + + + + + """ + manifest_xml = b""" + + + + + + + + + + + + + + """ + zip_path = make_canvas_zip( + tmp_path, + module_xml=module_xml, + manifest_xml=manifest_xml, + files=[ + ("web_resources/file1.pdf", "content of file1"), + ("web_resources/file2.html", "content of file2"), + ("web_resources/html_page.html", html_content), + ], + ) + url_config = { + "/file1.pdf": { + "url": "https://cdn.example.com/file1.pdf", + "locked": True, + }, + } + published = get_published_items(zip_path, url_config) + assert ( + published[Path("/src/web_resources/html_page.html")]["embedded_files"][0] + == "web_resources/file1.pdf" + ) From c95cd67e53a1b84f343b17090160f135dad57842 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 26 Sep 2025 15:13:20 -0400 Subject: [PATCH 6/7] fix test --- learning_resources/etl/canvas_test.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 1e96f49910..b756dd5897 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -1529,10 +1529,8 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path): "/file1.pdf": { "url": "https://cdn.example.com/file1.pdf", "locked": True, + "hidden": True, }, } published = get_published_items(zip_path, url_config) - assert ( - published[Path("/src/web_resources/html_page.html")]["embedded_files"][0] - == "web_resources/file1.pdf" - ) + assert Path("web_resources/file1.pdf").resolve() in published From e0ab8afd7dcafa344d5314ceac56be9df61c20c3 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Mon, 29 Sep 2025 10:08:44 -0400 Subject: [PATCH 7/7] missing docstrings --- learning_resources/etl/canvas_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py index 2b4bb7718b..6001a3d03f 100644 --- a/learning_resources/etl/canvas_utils.py +++ b/learning_resources/etl/canvas_utils.py @@ -511,6 +511,9 @@ def canvas_course_url(course_archive_path) -> str: def _url_config_key(item): + """ + Get the key to look up an item from the url_config dictionary + """ if "web_resources" in str(item["path"]): return str(item["path"]).split("web_resources")[-1] return item.get("title") @@ -544,6 +547,9 @@ def _url_config_item_visible(item_configuration): def get_published_items(zipfile_path, url_config): + """ + Get all published items from a Canvas course archive + """ published_items = {} course_settings = parse_canvas_settings(zipfile_path) tab_configuration = course_settings.get("tab_configuration", {})