diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 11534fc0aa..c01ea81504 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -1,21 +1,13 @@
import base64
-import json
import logging
-import sys
import zipfile
-from collections import defaultdict
from collections.abc import Generator
-from datetime import UTC, datetime
+from datetime import datetime
from io import BytesIO
from pathlib import Path
from tempfile import TemporaryDirectory
-from urllib.parse import unquote, unquote_plus
-from zoneinfo import ZoneInfo
-import dateutil
import pypdfium2 as pdfium
-from bs4 import BeautifulSoup
-from defusedxml import ElementTree
from django.conf import settings
from litellm import completion
from PIL import Image
@@ -25,6 +17,12 @@
LearningResourceType,
PlatformType,
)
+from learning_resources.etl.canvas_utils import (
+ canvas_course_url,
+ canvas_url_config,
+ get_published_items,
+ parse_canvas_settings,
+)
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import (
_process_olx_path,
@@ -40,27 +38,9 @@
from learning_resources_search.constants import (
CONTENT_FILE_TYPE,
)
-from main.utils import now_in_utc
log = logging.getLogger(__name__)
-# list of file regexes we should ignore
-IGNORE_FILES = [
- "course_settings.xml",
- "context.xml",
- "files_meta.xml",
- "module_meta.xml",
- "imsmanifest.xml",
- "assignment_settings.xml",
-]
-
-NAMESPACES = {
- "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0",
- "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
- "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
- "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
-}
-
def sync_canvas_archive(bucket, key: str, overwrite):
"""
@@ -73,7 +53,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
with TemporaryDirectory() as export_tempdir:
course_archive_path = Path(export_tempdir, key.split("/")[-1])
bucket.download_file(key, course_archive_path)
- url_config = _get_url_config(bucket, export_tempdir, url_config_file)
+ url_config = canvas_url_config(bucket, export_tempdir, url_config_file)
resource_readable_id, run = run_for_canvas_archive(
course_archive_path, course_folder=course_folder, overwrite=overwrite
)
@@ -101,31 +81,6 @@ def sync_canvas_archive(bucket, key: str, overwrite):
return resource_readable_id
-def _get_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict:
- """
- Get URL (citation) config from the metadata JSON file
- """
- url_config_path = Path(export_tempdir, url_config_file.split("/")[-1])
- # download the url config file
- bucket.download_file(url_config_file, url_config_path)
- url_config = {}
- with Path.open(url_config_path, "rb") as f:
- url_json = json.loads(f.read().decode("utf-8"))
- for url_item in url_json.get("course_files", []):
- url_key = url_item["file_path"]
- url_key = unquote_plus(url_key.lstrip(url_key.split("/")[0]))
- url_config[url_key] = url_item["url"]
- for url_item in url_json.get("assignments", []) + url_json.get("pages", []):
- url_key = url_item.get("name", url_item.get("title"))
- url_config[url_key] = url_item.get("html_url")
- return url_config
-
-
-def _course_url(course_archive_path) -> str:
- context_info = parse_context_xml(course_archive_path)
- return f"https://{context_info.get('canvas_domain')}/courses/{context_info.get('course_id')}/"
-
-
def run_for_canvas_archive(course_archive_path, course_folder, overwrite):
"""
Generate and return a LearningResourceRun for a Canvas course
@@ -133,7 +88,7 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite):
checksum = calc_checksum(course_archive_path)
course_info = parse_canvas_settings(course_archive_path)
course_title = course_info.get("title")
- url = _course_url(course_archive_path)
+ url = canvas_course_url(course_archive_path)
start_at = course_info.get("start_at")
end_at = course_info.get("conclude_at")
if start_at:
@@ -181,23 +136,6 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite):
return resource_readable_id, run
-def parse_canvas_settings(course_archive_path):
- """
- Get course attributes from a Canvas course archive
- """
- with zipfile.ZipFile(course_archive_path, "r") as course_archive:
- settings_path = "course_settings/course_settings.xml"
- if settings_path not in course_archive.namelist():
- return {}
- xml_string = course_archive.read(settings_path)
- tree = ElementTree.fromstring(xml_string)
- attributes = {}
- for node in tree.iter():
- tag = node.tag.split("}")[1] if "}" in node.tag else node.tag
- attributes[tag] = node.text
- return attributes
-
-
def transform_canvas_content_files(
course_zipfile: Path, run: LearningResourceRun, url_config: dict, *, overwrite
) -> Generator[dict, None, None]:
@@ -206,23 +144,7 @@ def transform_canvas_content_files(
"""
basedir = course_zipfile.name.split(".")[0]
zipfile_path = course_zipfile.absolute()
- all_published_items = (
- parse_module_meta(zipfile_path)["active"]
- + parse_files_meta(zipfile_path)["active"]
- + parse_web_content(zipfile_path)["active"]
- )
- published_items = {}
- for item in all_published_items:
- path = Path(item["path"]).resolve()
- published_items[path] = item
- for embedded_file in item.get("embedded_files", []):
- embedded_path = Path(embedded_file).resolve()
- if embedded_path in all_published_items:
- continue
- published_items[embedded_path] = {
- "path": embedded_path,
- "title": "",
- }
+ published_items = get_published_items(zipfile_path, url_config)
def _generate_content():
"""Inner generator for yielding content data"""
@@ -245,10 +167,10 @@ def _generate_content():
item_meta = published_items.get(
Path(content_data["source_path"]).resolve(), {}
)
-
- content_url = url_config.get(url_path) or url_config.get(
- item_meta.get("title")
+ item_url_config = url_config.get(url_path, {}) or url_config.get(
+ item_meta.get("title"), {}
)
+ content_url = item_url_config.get("url")
content_data["content_title"] = item_meta.get("title")
if content_url:
content_data["url"] = content_url
@@ -315,432 +237,6 @@ def transform_canvas_problem_files(
yield problem_file_data
-def parse_context_xml(course_archive_path: str) -> dict:
- """
- Parse course_settings/context.xml and return context info
- """
- with zipfile.ZipFile(course_archive_path, "r") as course_archive:
- context = course_archive.read("course_settings/context.xml")
- root = ElementTree.fromstring(context)
- context_info = {}
- item_keys = ["course_id", "root_account_id", "canvas_domain", "root_account_name"]
- for key in item_keys:
- element = root.find(f"cccv1p0:{key}", NAMESPACES)
- if element is not None:
- context_info[key] = element.text
-
- return context_info
-
-
-def is_date_locked(lock_at: str, unlock_at: str) -> bool:
- """
- Determine if a resource is currently date-locked based
- on lock_at and unlock_at strings.
- Args:
- lock_at (str): ISO 8601 date string when the resource locks
- unlock_at (str): ISO 8601 date string when the resource unlocks
- Returns:
- bool: True if the resource is currently locked, False otherwise
- """
- now = now_in_utc()
- if unlock_at and unlock_at.lower() != "nil":
- try:
- unlock_dt = (
- dateutil.parser.parse(unlock_at)
- .replace(tzinfo=ZoneInfo("US/Eastern"))
- .astimezone(UTC)
- )
-
- if now < unlock_dt:
- return True
- except Exception:
- log.exception("Error parsing unlock_at date: %s", unlock_at)
-
- if lock_at and lock_at.lower() != "nil":
- try:
- lock_dt = (
- dateutil.parser.parse(lock_at)
- .replace(tzinfo=ZoneInfo("US/Eastern"))
- .astimezone(UTC)
- )
- if now > lock_dt:
- return True
- except Exception:
- log.exception("Error parsing lock_at date: %s", lock_at)
- return False
-
-
-def is_file_published(file_meta: dict) -> bool:
- """
- Determine if a Canvas file (from files_meta.xml) is published/visible to students.
-
- Args:
- file_meta (dict): Parsed metadata for a file.
- Returns:
- bool: True if file is published/visible, False otherwise.
- """
-
- hidden = str(file_meta.get("hidden", "false")).lower() == "true"
- locked = str(file_meta.get("locked", "false")).lower() == "true"
- unlock_at = file_meta.get("unlock_at")
- lock_at = file_meta.get("lock_at")
- visibility = file_meta.get("visibility", "inherit")
- # If explicitly hidden or locked → unpublished
- if hidden or locked:
- return False
-
- if is_date_locked(lock_at, unlock_at):
- return False
- # Visibility rules
- if visibility in ("course", "inherit"):
- return True
- elif visibility in ("institution", "public"):
- return True # technically more visible
- return False
-
-
-def parse_files_meta(course_archive_path: str) -> dict:
- """
- Parse course_settings/files_meta.xml and return publish/active status of resources.
- """
- publish_status = {"active": [], "unpublished": []}
- with zipfile.ZipFile(course_archive_path, "r") as course_archive:
- files_meta_path = "course_settings/files_meta.xml"
- if files_meta_path not in course_archive.namelist():
- return publish_status
- files_xml = course_archive.read(files_meta_path)
- manifest_xml = course_archive.read("imsmanifest.xml")
- resource_map = extract_resources_by_identifier(manifest_xml)
- root = ElementTree.fromstring(files_xml)
- try:
- for file_elem in root.findall(".//cccv1p0:file", NAMESPACES):
- meta = dict(file_elem.attrib)
- for child in file_elem:
- tag = child.tag
- # strip namespace
- if "}" in tag:
- tag = tag.split("}", 1)[1]
- if child.attrib.get("nil") == "true":
- value = None
- else:
- value = (child.text or "").strip()
- meta[tag] = value
- item_info = resource_map.get(meta.get("identifier"), {})
- meta["published"] = is_file_published(meta)
- for file in item_info.get("files", []):
- file_data = meta.copy()
- file_path = Path(file)
- file_data["path"] = file_path
- file_data["title"] = file_data.get("display_name")
- # explicitly exclude files in web_resources/ai/tutor
- if file_data["published"] and not file.startswith(
- settings.CANVAS_TUTORBOT_FOLDER
- ):
- publish_status["active"].append(file_data)
- else:
- publish_status["unpublished"].append(file_data)
- except Exception:
- log.exception("Error parsing XML: %s", sys.stderr)
- return None
- return publish_status
-
-
-def parse_module_meta(course_archive_path: str) -> dict:
- """
- Parse module_meta.xml and return publish/active status of resources.
- """
- with zipfile.ZipFile(course_archive_path, "r") as course_archive:
- module_xml = course_archive.read("course_settings/module_meta.xml")
- manifest_xml = course_archive.read("imsmanifest.xml")
- resource_map = extract_resources_by_identifierref(manifest_xml)
- publish_status = {"active": [], "unpublished": []}
- try:
- root = ElementTree.fromstring(module_xml)
- for module in root.findall(".//cccv1p0:module", NAMESPACES):
- module_title = module.find("cccv1p0:title", NAMESPACES).text
- for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES):
- item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text
- item_title = item.find("cccv1p0:title", NAMESPACES).text
- identifierref = (
- item.find("cccv1p0:identifierref", NAMESPACES).text
- if item.find("cccv1p0:identifierref", NAMESPACES) is not None
- else None
- )
- content_type = item.find("cccv1p0:content_type", NAMESPACES).text
- items = resource_map.get(identifierref, {})
- for item_info in items:
- for file in item_info.get("files", []):
- file_path = Path(file)
- status = "active" if item_state == "active" else "unpublished"
- publish_status[status].append(
- {
- "title": item_title,
- "type": content_type,
- "path": file_path,
- "module": module_title,
- }
- )
- except Exception:
- log.exception("Error parsing XML: %s", sys.stderr)
- return None
- return publish_status
-
-
-def _compact_element(element) -> dict | str | None:
- """Recursively compact an element into a nested dictionary"""
- if len(element) == 0: # No children, return text
- return element.text.strip() if element.text else None
- return {
- child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element(
- child
- )
- for child in element
- }
-
-
-def _workflow_state_from_html(html: str) -> str:
- """
- Extract the workflow_state meta tag from html
- """
- soup = BeautifulSoup(html, "html.parser")
- meta = soup.find("meta", attrs={"name": "workflow_state"})
- return meta.get("content") if meta else None
-
-
-def _embedded_files_from_html(html: str) -> list[str]:
- """
- Extract Canvas file links from HTML, replacing $IMS-CC-FILEBASE$ with web_resources
- and returning URL-decoded paths without query params.
- """
- soup = BeautifulSoup(html, "html.parser")
- links = []
-
- for a in soup.find_all("a", href=True):
- href = a["href"]
- if href.startswith("$IMS-CC-FILEBASE$"):
- # Remove query parameters if present
- clean_href = href.split("?")[0]
- # Replace $IMS-CC-FILEBASE$ with "web_resources"
- clean_href = clean_href.replace("$IMS-CC-FILEBASE$", "web_resources")
- # URL decode
- decoded = unquote(clean_href)
- links.append(decoded)
-
- return links
-
-
-def _workflow_state_from_xml(xml_string: str) -> bool:
- """
- Determine the workflow_state (published/unpublished) from assignment_settings.xml
- """
-
- def _get_text(tag):
- el = root.find(f"cccv1p0:{tag}", NAMESPACES)
- return el.text.strip() if el is not None and el.text else ""
-
- try:
- root = ElementTree.fromstring(xml_string)
- except Exception:
- log.exception("Error parsing XML: %s", sys.stderr)
- return "unpublished"
-
- if (
- (
- # workflow_state must be published
- _get_text("workflow_state") != "published"
- )
- or (
- # only_visible_to_overrides must not be true
- _get_text("only_visible_to_overrides") == "true"
- )
- or (
- # hide_in_gradebook must not be true (hidden from gradebook)
- _get_text("hide_in_gradebook") == "true"
- )
- ):
- return "unpublished"
-
- lock_at = _get_text("lock_at")
- unlock_at = _get_text("unlock_at")
- if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at):
- return "unpublished"
-
- return "published"
-
-
-def _title_from_html(html: str) -> str:
- """
- Extract the title element from HTML content
- """
- soup = BeautifulSoup(html, "html.parser")
- title = soup.find("title")
- return title.get_text().strip() if title else ""
-
-
-def _title_from_assignment_settings(xml_string: str) -> str:
- """
- Extract the title from assignment_settings.xml
- """
- try:
- root = ElementTree.fromstring(xml_string)
- except Exception:
- log.exception("Error parsing XML: %s", sys.stderr)
- return ""
- title_elem = root.find("cccv1p0:title", NAMESPACES)
- return title_elem.text.strip() if title_elem is not None and title_elem.text else ""
-
-
-def parse_web_content(course_archive_path: str) -> dict:
- """
- Parse html pages and assignments and return publish/active status of resources
- """
-
- publish_status = {"active": [], "unpublished": []}
- course_settings = parse_canvas_settings(course_archive_path)
- public_syllabus_setting = course_settings.get("public_syllabus", "true").lower()
- public_syllabus_to_auth_setting = course_settings.get(
- "public_syllabus_to_auth", "true"
- ).lower()
- ingest_syllabus = True
- if (
- public_syllabus_setting == "false"
- and public_syllabus_to_auth_setting == "false"
- ):
- ingest_syllabus = False
- with zipfile.ZipFile(course_archive_path, "r") as course_archive:
- manifest_path = "imsmanifest.xml"
- if manifest_path not in course_archive.namelist():
- return publish_status
- manifest_xml = course_archive.read(manifest_path)
- resource_map = extract_resources_by_identifier(manifest_xml)
-
- for item in resource_map:
- resource_map_item = resource_map[item]
- item_link = resource_map_item.get("href")
- assignment_settings = None
- for file in resource_map_item.get("files", []):
- if file.endswith("assignment_settings.xml"):
- assignment_settings = file
- if item_link and item_link.endswith(".html"):
- file_path = resource_map_item["href"]
- html_content = course_archive.read(file_path)
- embedded_files = _embedded_files_from_html(html_content)
- if assignment_settings:
- xml_content = course_archive.read(assignment_settings)
- workflow_state = _workflow_state_from_xml(xml_content)
- title = _title_from_assignment_settings(xml_content)
- canvas_type = "assignment"
- else:
- workflow_state = _workflow_state_from_html(html_content)
- title = _title_from_html(html_content)
- canvas_type = "page"
-
- lom_elem = (
- resource_map_item.get("metadata", {})
- .get("lom", {})
- .get("educational", {})
- )
- # Determine if the content is intended for authors or instructors only
- intended_role = lom_elem.get("intendedEndUserRole", {}).get("value")
- authors_only = intended_role and intended_role.lower() != "student"
- intended_use = resource_map_item.get("intendeduse", "")
- if (
- workflow_state in ["active", "published"]
- and not authors_only
- and intended_use != "syllabus"
- ) or (ingest_syllabus and intended_use == "syllabus"):
- publish_status["active"].append(
- {
- "title": title,
- "path": file_path,
- "canvas_type": canvas_type,
- "embedded_files": embedded_files,
- }
- )
- else:
- publish_status["unpublished"].append(
- {
- "title": title,
- "path": file_path,
- "canvas_type": canvas_type,
- "embedded_files": embedded_files,
- }
- )
- return publish_status
-
-
-def extract_resources_by_identifierref(manifest_xml: str) -> dict:
- """
- Extract resources from an IMS manifest file and
- return a map keyed by identifierref.
- """
- root = ElementTree.fromstring(manifest_xml)
-
- # Dictionary to hold resources keyed by identifierref
- resources_dict = defaultdict(list)
- # Find all item elements with identifierref attributes
- for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES):
- identifierref = item.get("identifierref")
- title = (
- item.find("imscp:title", NAMESPACES).text
- if item.find("imscp:title", NAMESPACES) is not None
- else ""
- )
- resource = root.find(
- f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES
- )
- if resource is not None:
- # Get all file elements within the resource
- files = [
- file_elem.get("href")
- for file_elem in resource.findall("imscp:file", NAMESPACES)
- ]
-
- resources_dict[identifierref].append(
- {
- "title": title,
- "files": files,
- "type": resource.get("type"),
- "intendeduse": resource.get("intendeduse"),
- }
- )
- return dict(resources_dict)
-
-
-def extract_resources_by_identifier(manifest_xml: str) -> dict:
- """
- Extract resources from an IMS manifest
- file and return a map keyed by identifier.
- """
- root = ElementTree.fromstring(manifest_xml)
- resources_dict = {}
- # Find all resource elements
- for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES):
- identifier = resource.get("identifier")
- resource_type = resource.get("type")
- href = resource.get("href")
-
- # Get all file elements within the resource
- files = [
- file_elem.get("href")
- for file_elem in resource.findall("imscp:file", NAMESPACES)
- ]
- # Extract metadata if present
- metadata = {}
- metadata_elem = resource.find("imscp:metadata", NAMESPACES)
- if metadata_elem is not None:
- metadata.update(_compact_element(metadata_elem))
- resources_dict[identifier] = {
- "identifier": identifier,
- "type": resource_type,
- "href": href,
- "files": files,
- "metadata": metadata,
- "intendeduse": resource.get("intendeduse"),
- }
- return resources_dict
-
-
def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
"""
Convert a PDF file to a list of base64 encoded images (one per page).
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 0ca3a6d97e..b756dd5897 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -10,15 +10,18 @@
from learning_resources.constants import LearningResourceType, PlatformType
from learning_resources.etl.canvas import (
+ run_for_canvas_archive,
+ transform_canvas_content_files,
+ transform_canvas_problem_files,
+)
+from learning_resources.etl.canvas_utils import (
_compact_element,
+ get_published_items,
is_file_published,
parse_canvas_settings,
parse_files_meta,
parse_module_meta,
parse_web_content,
- run_for_canvas_archive,
- transform_canvas_content_files,
- transform_canvas_problem_files,
)
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import get_edx_module_id
@@ -165,7 +168,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker):
return_value={"title": "Test Course", "course_code": "TEST101"},
)
mocker.patch(
- "learning_resources.etl.canvas.parse_context_xml",
+ "learning_resources.etl.canvas_utils.parse_context_xml",
return_value={"course_id": "123", "canvas_domain": "mit.edu"},
)
@@ -197,7 +200,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker):
return_value={"title": "Test Course", "course_code": "TEST104"},
)
mocker.patch(
- "learning_resources.etl.canvas.parse_context_xml",
+ "learning_resources.etl.canvas_utils.parse_context_xml",
return_value={"course_id": "123", "canvas_domain": "mit.edu"},
)
mocker.patch(
@@ -522,7 +525,7 @@ def test_transform_canvas_content_files_url_assignment(mocker, tmp_path):
"""
run = MagicMock()
run.id = 1
- url_config = {"/folder/file1.html": "https://cdn.example.com/file1.html"}
+ url_config = {"/folder/file1.html": {"url": "https://cdn.example.com/file1.html"}}
# Patch _process_olx_path to yield content_data with source_path
mock_content_data = [
{"source_path": "data/folder/file1.html", "key": "file1"},
@@ -532,7 +535,7 @@ def test_transform_canvas_content_files_url_assignment(mocker, tmp_path):
return_value=mock_content_data,
)
mocker.patch(
- "learning_resources.etl.canvas.parse_module_meta",
+ "learning_resources.etl.canvas_utils.parse_module_meta",
return_value={"active": [], "unpublished": []},
)
# Use a real zip file
@@ -1138,8 +1141,8 @@ def test_get_url_config_assignments_and_pages(mocker, tmp_path):
hmtl_page_title = "html page"
url_config = {
- hmtl_page_title: "https://example.com/htmlpage",
- "/file1.html": "https://example.com/file1",
+ hmtl_page_title: {"url": "https://example.com/htmlpage"},
+ "/file1.html": {"url": "https://example.com/file1"},
}
run = LearningResourceRunFactory.create()
@@ -1233,3 +1236,301 @@ def test_get_url_config_assignments_and_pages(mocker, tmp_path):
)
assert results["html page"] == "https://example.com/htmlpage"
assert results["Item 1"] == "https://example.com/file1"
+
+
+def test_get_published_items_for_unpublished_file(tmp_path):
+ html_content = """
+
+
+
+ Embedded File 1
+
+
+ """
+ module_xml = b"""
+
+
+
+ Module 1
+
+ -
+ active
+ Item 1
+ false
+ false
+ RES3
+ resource
+
+
+
+
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """
+ zip_path = make_canvas_zip(
+ tmp_path,
+ module_xml=module_xml,
+ manifest_xml=manifest_xml,
+ files=[
+ ("web_resources/file1.pdf", "content of file1"),
+ ("web_resources/file2.html", "content of file2"),
+ ("web_resources/html_page.html", html_content),
+ ],
+ )
+ url_config = {
+ "/html_page.html": {
+ "url": "https://cdn.example.com/",
+ "locked": True,
+ },
+ }
+ published = [item.name for item in get_published_items(zip_path, url_config)]
+ url_config = {
+ "/html_page.html": {
+ "url": "https://cdn.example.com/",
+ "hidden": True,
+ },
+ }
+ published.extend([item.name for item in get_published_items(zip_path, url_config)])
+ assert "html_page.html" not in published
+
+
+def test_get_published_items_for_unpublished_parent_folder(mocker, tmp_path):
+ html_content = """
+
+
+
+ Embedded File 1
+
+
+ """
+ module_xml = b"""
+
+
+
+ Module 1
+
+ -
+ active
+ Item 1
+ false
+ false
+ RES3
+ resource
+
+
+
+
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """
+ zip_path = make_canvas_zip(
+ tmp_path,
+ module_xml=module_xml,
+ manifest_xml=manifest_xml,
+ files=[
+ ("web_resources/file1.pdf", "content of file1"),
+ ("web_resources/file2.html", "content of file2"),
+ ("web_resources/html_page.html", html_content),
+ ],
+ )
+
+ url_config = {
+ "/html_page.html": {
+ "url": "https://cdn.example.com/",
+ "published": True,
+ "folder": {
+ "hidden": True,
+ },
+ },
+ }
+ published = [item.name for item in get_published_items(zip_path, url_config)]
+ assert "html_page.html" not in published
+
+
+def test_get_published_items_with_hidden_file_section(mocker, tmp_path):
+ """
+ Test that if the files section in the navbar is hidden,
+ no files are considered published even if they are marked as published
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """
+ files_xml = b"""
+
+
+
+ uncategorized
+
+
+ uncategorized
+
+
+ uncategorized
+
+
+
+ """
+ zip_path = make_canvas_zip(
+ tmp_path,
+ manifest_xml=manifest_xml,
+ files=[
+ ("course_settings/files_meta.xml", files_xml),
+ ("web_resources/file1.pdf", "content of file1"),
+ ("web_resources/file2.html", "content of file2"),
+ ("web_resources/html_page.html", ""),
+ ],
+ )
+
+ url_config = {
+ "/html_page.html": {
+ "url": "https://cdn.example.com/",
+ "published": True,
+ },
+ "/file2.html": {
+ "url": "https://cdn.example.com/file2.html",
+ "published": True,
+ },
+ "/file3.html": {
+ "url": "https://cdn.example.com/file2.html",
+ "published": True,
+ },
+ }
+ published = [item.name for item in get_published_items(zip_path, url_config)]
+ assert sorted(published) == sorted(["html_page.html", "file2.html", "file1.pdf"])
+
+ # hide the files section in the navbar
+ settings_xml = b"""
+
+ Test Course Title
+ [{"id":0},{"id":11, "hidden":true}]
+ TEST-101
+ Other Value
+
+ """
+
+ zip_path = make_canvas_zip(
+ tmp_path,
+ settings_xml=settings_xml,
+ manifest_xml=manifest_xml,
+ files=[
+ ("course_settings/files_meta.xml", files_xml),
+ ("web_resources/file1.pdf", "content of file1"),
+ ("web_resources/file2.html", "content of file2"),
+ ("web_resources/html_page.html", ""),
+ ],
+ )
+ published = [item.name for item in get_published_items(zip_path, url_config)]
+ assert len(published) == 0
+
+
+def test_get_published_items_for_unpublished_but_embedded(mocker, tmp_path):
+ html_content = """
+
+
+
+ Embedded File 1
+
+
+ """
+ module_xml = b"""
+
+
+
+ Module 1
+
+ -
+ active
+ Item 1
+ false
+ false
+ RES3
+ resource
+
+
+
+
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """
+ zip_path = make_canvas_zip(
+ tmp_path,
+ module_xml=module_xml,
+ manifest_xml=manifest_xml,
+ files=[
+ ("web_resources/file1.pdf", "content of file1"),
+ ("web_resources/file2.html", "content of file2"),
+ ("web_resources/html_page.html", html_content),
+ ],
+ )
+ url_config = {
+ "/file1.pdf": {
+ "url": "https://cdn.example.com/file1.pdf",
+ "locked": True,
+ "hidden": True,
+ },
+ }
+ published = get_published_items(zip_path, url_config)
+ assert Path("web_resources/file1.pdf").resolve() in published
diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py
new file mode 100644
index 0000000000..6001a3d03f
--- /dev/null
+++ b/learning_resources/etl/canvas_utils.py
@@ -0,0 +1,590 @@
+import json
+import logging
+import sys
+import zipfile
+from collections import defaultdict
+from datetime import UTC
+from pathlib import Path
+from urllib.parse import unquote, unquote_plus
+from zoneinfo import ZoneInfo
+
+import dateutil
+from bs4 import BeautifulSoup
+from defusedxml import ElementTree
+from django.conf import settings
+
+from main.utils import now_in_utc
+
+log = logging.getLogger(__name__)
+
# list of file regexes we should ignore
# NOTE(review): entries look like literal file names, not regex patterns --
# confirm how callers match against this list.
IGNORE_FILES = [
    "course_settings.xml",
    "context.xml",
    "files_meta.xml",
    "module_meta.xml",
    "imsmanifest.xml",
    "assignment_settings.xml",
]

# XML namespace prefixes used by the ElementTree lookups in this module
# (Canvas cartridge schema + IMS Common Cartridge schemas).
NAMESPACES = {
    "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0",
    "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
    "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
    "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
}
+
+
def is_file_published(file_meta: dict) -> bool:
    """
    Determine if a Canvas file (from files_meta.xml) is published/visible to students.

    Args:
        file_meta (dict): Parsed metadata for a file.
    Returns:
        bool: True if file is published/visible, False otherwise.
    """

    def _flag(name):
        # Canvas serializes booleans as the strings "true"/"false".
        return str(file_meta.get(name, "false")).lower() == "true"

    # Explicitly hidden or locked files are never published.
    if _flag("hidden") or _flag("locked"):
        return False

    # A lock_at/unlock_at window can make the file unavailable right now.
    if is_date_locked(file_meta.get("lock_at"), file_meta.get("unlock_at")):
        return False

    # Any recognized visibility level counts as published; unknown values
    # are treated as unpublished.
    return file_meta.get("visibility", "inherit") in (
        "course",
        "inherit",
        "institution",
        "public",
    )
+
+
def parse_files_meta(course_archive_path: str) -> dict:
    """
    Parse course_settings/files_meta.xml and return publish/active status of resources.

    Args:
        course_archive_path (str): Path to the Canvas course export zip.
    Returns:
        dict: {"active": [...], "unpublished": [...]} per-file metadata dicts
        (each with "path", "title" and "published" added), or None when the
        XML cannot be parsed.
        NOTE(review): callers that immediately index ["active"] will fail on
        the None error path -- confirm the intended error contract.
    """
    publish_status = {"active": [], "unpublished": []}
    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
        files_meta_path = "course_settings/files_meta.xml"
        if files_meta_path not in course_archive.namelist():
            return publish_status
        files_xml = course_archive.read(files_meta_path)
        manifest_xml = course_archive.read("imsmanifest.xml")
        resource_map = extract_resources_by_identifier(manifest_xml)
    root = ElementTree.fromstring(files_xml)
    try:
        for file_elem in root.findall(".//cccv1p0:file", NAMESPACES):
            meta = dict(file_elem.attrib)
            for child in file_elem:
                tag = child.tag
                # strip namespace
                if "}" in tag:
                    tag = tag.split("}", 1)[1]
                # xsi-style nil="true" children mean "no value".
                if child.attrib.get("nil") == "true":
                    value = None
                else:
                    value = (child.text or "").strip()
                meta[tag] = value
            item_info = resource_map.get(meta.get("identifier"), {})
            meta["published"] = is_file_published(meta)
            for file in item_info.get("files", []):
                file_data = meta.copy()
                file_path = Path(file)
                file_data["path"] = file_path
                file_data["title"] = file_data.get("display_name")
                # explicitly exclude files in web_resources/ai/tutor
                if file_data["published"] and not file.startswith(
                    settings.CANVAS_TUTORBOT_FOLDER
                ):
                    publish_status["active"].append(file_data)
                else:
                    publish_status["unpublished"].append(file_data)
    except Exception:
        # FIX: was log.exception("Error parsing XML: %s", sys.stderr), which
        # formatted the stderr stream object instead of useful context.
        log.exception("Error parsing files_meta.xml in %s", course_archive_path)
        return None
    return publish_status
+
+
def parse_module_meta(course_archive_path: str) -> dict:
    """
    Parse module_meta.xml and return publish/active status of resources.

    Args:
        course_archive_path (str): Path to the Canvas course export zip.
    Returns:
        dict: {"active": [...], "unpublished": [...]} module item dicts, or
        None when the XML cannot be parsed.
        NOTE(review): callers that immediately index ["active"] will fail on
        the None error path -- confirm the intended error contract.
    """
    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
        module_xml = course_archive.read("course_settings/module_meta.xml")
        manifest_xml = course_archive.read("imsmanifest.xml")
    resource_map = extract_resources_by_identifierref(manifest_xml)
    publish_status = {"active": [], "unpublished": []}
    try:
        root = ElementTree.fromstring(module_xml)
        for module in root.findall(".//cccv1p0:module", NAMESPACES):
            module_title = module.find("cccv1p0:title", NAMESPACES).text
            for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES):
                item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text
                item_title = item.find("cccv1p0:title", NAMESPACES).text
                identifierref = (
                    item.find("cccv1p0:identifierref", NAMESPACES).text
                    if item.find("cccv1p0:identifierref", NAMESPACES) is not None
                    else None
                )
                content_type = item.find("cccv1p0:content_type", NAMESPACES).text
                # Map the module item back to its manifest resource(s)/file(s).
                items = resource_map.get(identifierref, {})
                for item_info in items:
                    for file in item_info.get("files", []):
                        file_path = Path(file)
                        # Only "active" module items count as published.
                        status = "active" if item_state == "active" else "unpublished"
                        publish_status[status].append(
                            {
                                "title": item_title,
                                "type": content_type,
                                "path": file_path,
                                "module": module_title,
                            }
                        )
    except Exception:
        # FIX: was log.exception("Error parsing XML: %s", sys.stderr), which
        # formatted the stderr stream object instead of useful context.
        log.exception("Error parsing module_meta.xml in %s", course_archive_path)
        return None
    return publish_status
+
+
+def _compact_element(element) -> dict | str | None:
+ """Recursively compact an element into a nested dictionary"""
+ if len(element) == 0: # No children, return text
+ return element.text.strip() if element.text else None
+ return {
+ child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element(
+ child
+ )
+ for child in element
+ }
+
+
def _workflow_state_from_html(html: str) -> str | None:
    """
    Extract the workflow_state meta tag from html.

    Args:
        html (str): HTML document text.
    Returns:
        str | None: The meta tag's content attribute, or None when the page
        has no workflow_state meta tag. (FIX: annotation was "-> str", but
        the None branch is reachable and callers compare against it.)
    """
    soup = BeautifulSoup(html, "html.parser")
    meta = soup.find("meta", attrs={"name": "workflow_state"})
    return meta.get("content") if meta else None
+
+
def _embedded_files_from_html(html: str) -> list[str]:
    """
    Extract Canvas file links from HTML, replacing $IMS-CC-FILEBASE$ with web_resources
    and returning URL-decoded paths without query params.
    """
    soup = BeautifulSoup(html, "html.parser")
    file_links = []

    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if not href.startswith("$IMS-CC-FILEBASE$"):
            continue
        # Drop any query string, map the IMS token onto the web_resources
        # directory, then URL-decode the remaining path.
        path, _, _ = href.partition("?")
        path = path.replace("$IMS-CC-FILEBASE$", "web_resources")
        file_links.append(unquote(path))

    return file_links
+
+
def _workflow_state_from_xml(xml_string: str) -> str:
    """
    Determine the workflow_state ("published"/"unpublished") from
    assignment_settings.xml.

    Args:
        xml_string (str): Raw XML of assignment_settings.xml.
    Returns:
        str: "published" or "unpublished". (FIX: annotation was "-> bool",
        but the function returns these strings.)
    """

    def _get_text(tag):
        el = root.find(f"cccv1p0:{tag}", NAMESPACES)
        return el.text.strip() if el is not None and el.text else ""

    try:
        root = ElementTree.fromstring(xml_string)
    except Exception:
        # Malformed XML: treat the assignment as unpublished rather than fail.
        # FIX: was log.exception("Error parsing XML: %s", sys.stderr), which
        # formatted the stderr stream object instead of useful context.
        log.exception("Error parsing assignment_settings XML")
        return "unpublished"

    if (
        # workflow_state must be published,
        _get_text("workflow_state") != "published"
        # only_visible_to_overrides must not be true,
        or _get_text("only_visible_to_overrides") == "true"
        # and hide_in_gradebook must not be true (hidden from gradebook).
        or _get_text("hide_in_gradebook") == "true"
    ):
        return "unpublished"

    # A module lock or an active lock_at/unlock_at window also unpublishes.
    lock_at = _get_text("lock_at")
    unlock_at = _get_text("unlock_at")
    if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at):
        return "unpublished"

    return "published"
+
+
def _title_from_html(html: str) -> str:
    """
    Extract the title element from HTML content
    """
    title_tag = BeautifulSoup(html, "html.parser").find("title")
    if title_tag is None:
        # No <title> element at all.
        return ""
    return title_tag.get_text().strip()
+
+
def _title_from_assignment_settings(xml_string: str) -> str:
    """
    Extract the title from assignment_settings.xml.

    Args:
        xml_string (str): Raw XML of assignment_settings.xml.
    Returns:
        str: The assignment title, or "" when missing or unparseable.
    """
    try:
        root = ElementTree.fromstring(xml_string)
    except Exception:
        # FIX: was log.exception("Error parsing XML: %s", sys.stderr), which
        # formatted the stderr stream object instead of useful context.
        log.exception("Error parsing assignment_settings XML")
        return ""
    title_elem = root.find("cccv1p0:title", NAMESPACES)
    return title_elem.text.strip() if title_elem is not None and title_elem.text else ""
+
+
def parse_web_content(course_archive_path: str) -> dict:
    """
    Parse html pages and assignments and return publish/active status of resources.

    Args:
        course_archive_path (str): Path to the Canvas course export zip.
    Returns:
        dict: {"active": [...], "unpublished": [...]} where each entry has
        title, path, canvas_type ("assignment" or "page") and the files
        embedded in its HTML.
    """

    publish_status = {"active": [], "unpublished": []}
    course_settings = parse_canvas_settings(course_archive_path)
    # Syllabus content is ingested unless BOTH public-syllabus flags are
    # explicitly "false" (defaults lean toward ingesting).
    public_syllabus_setting = course_settings.get("public_syllabus", "true").lower()
    public_syllabus_to_auth_setting = course_settings.get(
        "public_syllabus_to_auth", "true"
    ).lower()
    ingest_syllabus = True
    if (
        public_syllabus_setting == "false"
        and public_syllabus_to_auth_setting == "false"
    ):
        ingest_syllabus = False
    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
        manifest_path = "imsmanifest.xml"
        if manifest_path not in course_archive.namelist():
            return publish_status
        manifest_xml = course_archive.read(manifest_path)
        resource_map = extract_resources_by_identifier(manifest_xml)

        # The loop reads member files, so it must stay inside the `with`.
        for item in resource_map:
            resource_map_item = resource_map[item]
            item_link = resource_map_item.get("href")
            assignment_settings = None
            # NOTE(review): if several files end in assignment_settings.xml,
            # the last one wins (no break) -- presumably at most one exists.
            for file in resource_map_item.get("files", []):
                if file.endswith("assignment_settings.xml"):
                    assignment_settings = file
            if item_link and item_link.endswith(".html"):
                file_path = resource_map_item["href"]
                # read() returns bytes; the downstream parsers accept bytes.
                html_content = course_archive.read(file_path)
                embedded_files = _embedded_files_from_html(html_content)
                if assignment_settings:
                    # Assignments carry state/title in assignment_settings.xml.
                    xml_content = course_archive.read(assignment_settings)
                    workflow_state = _workflow_state_from_xml(xml_content)
                    title = _title_from_assignment_settings(xml_content)
                    canvas_type = "assignment"
                else:
                    # Wiki pages embed state/title in the HTML itself.
                    workflow_state = _workflow_state_from_html(html_content)
                    title = _title_from_html(html_content)
                    canvas_type = "page"

                lom_elem = (
                    resource_map_item.get("metadata", {})
                    .get("lom", {})
                    .get("educational", {})
                )
                # Determine if the content is intended for authors or instructors only
                intended_role = lom_elem.get("intendedEndUserRole", {}).get("value")
                authors_only = intended_role and intended_role.lower() != "student"
                intended_use = resource_map_item.get("intendeduse", "")
                # Active: published, student-facing, non-syllabus content --
                # or syllabus content when syllabus ingestion is enabled.
                if (
                    workflow_state in ["active", "published"]
                    and not authors_only
                    and intended_use != "syllabus"
                ) or (ingest_syllabus and intended_use == "syllabus"):
                    publish_status["active"].append(
                        {
                            "title": title,
                            "path": file_path,
                            "canvas_type": canvas_type,
                            "embedded_files": embedded_files,
                        }
                    )
                else:
                    publish_status["unpublished"].append(
                        {
                            "title": title,
                            "path": file_path,
                            "canvas_type": canvas_type,
                            "embedded_files": embedded_files,
                        }
                    )
    return publish_status
+
+
def extract_resources_by_identifierref(manifest_xml: str) -> dict:
    """
    Extract resources from an IMS manifest file and
    return a map keyed by identifierref.
    """
    root = ElementTree.fromstring(manifest_xml)

    # Resources keyed by the item's identifierref attribute.
    resources_dict = defaultdict(list)
    for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES):
        identifierref = item.get("identifierref")
        title_elem = item.find("imscp:title", NAMESPACES)
        title = title_elem.text if title_elem is not None else ""
        resource = root.find(
            f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES
        )
        if resource is None:
            # Item points at a resource that is missing from the manifest.
            continue
        # Every file reference attached to this resource.
        files = [
            file_elem.get("href")
            for file_elem in resource.findall("imscp:file", NAMESPACES)
        ]
        resources_dict[identifierref].append(
            {
                "title": title,
                "files": files,
                "type": resource.get("type"),
                "intendeduse": resource.get("intendeduse"),
            }
        )
    return dict(resources_dict)
+
+
def extract_resources_by_identifier(manifest_xml: str) -> dict:
    """
    Extract resources from an IMS manifest
    file and return a map keyed by identifier.
    """
    root = ElementTree.fromstring(manifest_xml)
    resources_dict = {}
    for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES):
        identifier = resource.get("identifier")

        # Every file reference attached to this resource.
        files = [
            file_elem.get("href")
            for file_elem in resource.findall("imscp:file", NAMESPACES)
        ]

        # Flatten the optional <metadata> element into a plain nested dict.
        metadata = {}
        metadata_elem = resource.find("imscp:metadata", NAMESPACES)
        if metadata_elem is not None:
            metadata.update(_compact_element(metadata_elem))

        resources_dict[identifier] = {
            "identifier": identifier,
            "type": resource.get("type"),
            "href": resource.get("href"),
            "files": files,
            "metadata": metadata,
            "intendeduse": resource.get("intendeduse"),
        }
    return resources_dict
+
+
def parse_context_xml(course_archive_path: str) -> dict:
    """
    Parse course_settings/context.xml and return context info
    """
    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
        context_xml = course_archive.read("course_settings/context.xml")
    root = ElementTree.fromstring(context_xml)
    context_info = {}
    # Only these identity fields are extracted; absent tags are skipped.
    for key in ("course_id", "root_account_id", "canvas_domain", "root_account_name"):
        element = root.find(f"cccv1p0:{key}", NAMESPACES)
        if element is not None:
            context_info[key] = element.text
    return context_info
+
+
def is_date_locked(lock_at: str, unlock_at: str) -> bool:
    """
    Determine if a resource is currently date-locked based
    on lock_at and unlock_at strings.
    Args:
        lock_at (str): ISO 8601 date string when the resource locks
        unlock_at (str): ISO 8601 date string when the resource unlocks
    Returns:
        bool: True if the resource is currently locked, False otherwise
    """
    now = now_in_utc()
    # "nil" (case-insensitive) is treated the same as a missing value.
    if unlock_at and unlock_at.lower() != "nil":
        try:
            # NOTE(review): .replace(tzinfo=...) discards any timezone that
            # was present in the parsed string and force-tags it US/Eastern.
            # Assumes Canvas exports naive Eastern-time stamps -- confirm.
            unlock_dt = (
                dateutil.parser.parse(unlock_at)
                .replace(tzinfo=ZoneInfo("US/Eastern"))
                .astimezone(UTC)
            )

            # Still before the unlock time -> currently locked.
            if now < unlock_dt:
                return True
        except Exception:
            # Unparseable dates are logged and ignored (no unlock constraint).
            log.exception("Error parsing unlock_at date: %s", unlock_at)

    if lock_at and lock_at.lower() != "nil":
        try:
            lock_dt = (
                dateutil.parser.parse(lock_at)
                .replace(tzinfo=ZoneInfo("US/Eastern"))
                .astimezone(UTC)
            )
            # Past the lock time -> currently locked.
            if now > lock_dt:
                return True
        except Exception:
            # Unparseable dates are logged and ignored (no lock constraint).
            log.exception("Error parsing lock_at date: %s", lock_at)
    return False
+
+
def parse_canvas_settings(course_archive_path):
    """
    Get course attributes from a Canvas course archive.

    Args:
        course_archive_path: Path to the course export zip.
    Returns:
        dict: Tag name -> text value for every element in
        course_settings/course_settings.xml; "tab_configuration" is decoded
        from JSON into a dict keyed by tab id. Returns {} when the settings
        file is absent.
    """
    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
        settings_path = "course_settings/course_settings.xml"
        if settings_path not in course_archive.namelist():
            return {}
        xml_string = course_archive.read(settings_path)
    tree = ElementTree.fromstring(xml_string)
    attributes = {}
    for node in tree.iter():
        # Strip any namespace from the tag.
        tag = node.tag.split("}")[1] if "}" in node.tag else node.tag
        node_value = node.text
        # FIX: guard against an empty/self-closing tab_configuration element,
        # which previously crashed json.loads(None) with a TypeError.
        if tag == "tab_configuration" and node.text:
            tab_config = json.loads(node.text)

            # Key each tab's config by its numeric id for easy lookup.
            node_value = dict(zip([tc["id"] for tc in tab_config], tab_config))
        attributes[tag] = node_value
    return attributes
+
+
def canvas_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict:
    """
    Get URL (citation) config from the metadata JSON file.

    Args:
        bucket: S3-style bucket object exposing download_file(key, dest).
        export_tempdir (str): Local directory to download the config into.
        url_config_file (str): Bucket key of the JSON config file.
    Returns:
        dict: Lookup of "/<file path>" or item name/title -> config entry,
        with a normalized "url" field on assignment/page entries.
    """
    url_config_path = Path(export_tempdir, url_config_file.split("/")[-1])
    # download the url config file
    bucket.download_file(url_config_file, url_config_path)
    url_config = {}
    with Path.open(url_config_path, "rb") as f:
        url_json = json.loads(f.read().decode("utf-8"))
    for url_item in url_json.get("course_files", []):
        url_key = url_item["file_path"]
        # FIX: drop the leading top-level folder (e.g. "course files") so the
        # key starts at "/...". str.removeprefix removes the exact segment;
        # the previous .lstrip(first_segment) treated it as a character set
        # and only produced the right result by accident.
        url_key = unquote_plus(url_key.removeprefix(url_key.split("/")[0]))
        url_config[url_key] = url_item
    for url_item in url_json.get("assignments", []) + url_json.get("pages", []):
        url_key = url_item.get("name", url_item.get("title"))
        # normalize url field
        url_item["url"] = url_item.get("html_url")
        url_config[url_key] = url_item
    return url_config
+
+
def canvas_course_url(course_archive_path) -> str:
    """Build the base Canvas course URL from the archive's context.xml."""
    context = parse_context_xml(course_archive_path)
    domain = context.get("canvas_domain")
    course_id = context.get("course_id")
    return f"https://{domain}/courses/{course_id}/"
+
+
+def _url_config_key(item):
+ """
+ Get the key to look up an item from the url_config dictionary
+ """
+ if "web_resources" in str(item["path"]):
+ return str(item["path"]).split("web_resources")[-1]
+ return item.get("title")
+
+
+def _url_config_item_visible(item_configuration):
+ """
+ Determine if an item is visible based on its configuration
+ from the metadata json file
+ """
+ if item_configuration:
+ # check if explicitely unpublished
+ unpublished = not item_configuration.get("published", True)
+ lock_at = item_configuration.get("lock_at")
+ unlock_at = item_configuration.get("unlock_at")
+ return not any(
+ [
+ unpublished,
+ is_date_locked(lock_at, unlock_at),
+ item_configuration.get("hidden"), # file hidden
+ item_configuration.get("locked"), # file locked
+ item_configuration.get("folder", {}).get(
+ "hidden"
+ ), # parent folder hidden
+ item_configuration.get("folder", {}).get(
+ "locked"
+ ), # parent folder locked
+ ]
+ )
+ return True
+
+
def get_published_items(zipfile_path, url_config):
    """
    Get all published items from a Canvas course archive.

    Args:
        zipfile_path: Path to the Canvas course export zip.
        url_config (dict): Metadata-JSON lookup from canvas_url_config.
    Returns:
        dict: Resolved Path -> item info for every visible/published item,
        including files embedded in published pages/assignments.
    """
    published_items = {}
    course_settings = parse_canvas_settings(zipfile_path)
    tab_configuration = course_settings.get("tab_configuration", {})
    # Tab id 11 is the course "Files" navbar section; id mappings:
    # https://developerdocs.instructure.com/services/dap/dataset/dataset-additional-notes
    files_section_is_visible = not tab_configuration.get(11, {}).get("hidden", False)
    all_published_items = (
        parse_module_meta(zipfile_path)["active"]
        + parse_files_meta(zipfile_path)["active"]
        + parse_web_content(zipfile_path)["active"]
    )

    for item in all_published_items:
        path = Path(item["path"]).resolve()
        item_configuration = url_config.get(_url_config_key(item))
        item_visible = _url_config_item_visible(item_configuration)

        # Skip items hidden by their metadata config, and skip plain
        # web_resources files when the Files section is hidden from the navbar.
        if not item_visible or (
            str(Path(item["path"]).parent) == "web_resources"
            and not files_section_is_visible
        ):
            continue
        published_items[path] = item
        for embedded_file in item.get("embedded_files", []):
            embedded_path = Path(embedded_file).resolve()
            # Files embedded in a published page are reachable even when not
            # published themselves: add a stub entry, but never clobber an
            # existing real entry. (FIX: the old guard tested membership of a
            # Path in a list of dicts and could never match, so stubs always
            # overwrote; it also built an unused all_embedded_items list.)
            if embedded_path in published_items:
                continue
            published_items[embedded_path] = {
                "path": embedded_path,
                "title": "",
            }

    return published_items