From 80aeab0d19b047e837b6d0ca84cac18a4bc5b9d4 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 23 Sep 2025 09:52:39 -0400
Subject: [PATCH 1/7] stashing changes

---
 learning_resources/etl/canvas.py      | 506 +-------------------------
 learning_resources/etl/canvas_test.py |  14 +-
 2 files changed, 21 insertions(+), 499 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index e8b5049204..c01ea81504 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -1,21 +1,13 @@
 import base64
-import json
 import logging
-import sys
 import zipfile
-from collections import defaultdict
 from collections.abc import Generator
-from datetime import UTC, datetime
+from datetime import datetime
 from io import BytesIO
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from urllib.parse import unquote, unquote_plus
-from zoneinfo import ZoneInfo
 
-import dateutil
 import pypdfium2 as pdfium
-from bs4 import BeautifulSoup
-from defusedxml import ElementTree
 from django.conf import settings
 from litellm import completion
 from PIL import Image
@@ -25,6 +17,12 @@
     LearningResourceType,
     PlatformType,
 )
+from learning_resources.etl.canvas_utils import (
+    canvas_course_url,
+    canvas_url_config,
+    get_published_items,
+    parse_canvas_settings,
+)
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.utils import (
     _process_olx_path,
@@ -40,27 +38,9 @@
 from learning_resources_search.constants import (
     CONTENT_FILE_TYPE,
 )
-from main.utils import now_in_utc
 
 log = logging.getLogger(__name__)
 
-# list of file regexes we should ignore
-IGNORE_FILES = [
-    "course_settings.xml",
-    "context.xml",
-    "files_meta.xml",
-    "module_meta.xml",
-    "imsmanifest.xml",
-    "assignment_settings.xml",
-]
-
-NAMESPACES = {
-    "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0",
-    "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
-    "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
-    "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
-}
-
 
 def sync_canvas_archive(bucket, key: str, overwrite):
     """
@@ -73,7 +53,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
     with TemporaryDirectory() as export_tempdir:
         course_archive_path = Path(export_tempdir, key.split("/")[-1])
         bucket.download_file(key, course_archive_path)
-        url_config = _get_url_config(bucket, export_tempdir, url_config_file)
+        url_config = canvas_url_config(bucket, export_tempdir, url_config_file)
         resource_readable_id, run = run_for_canvas_archive(
             course_archive_path, course_folder=course_folder, overwrite=overwrite
         )
@@ -101,31 +81,6 @@ def sync_canvas_archive(bucket, key: str, overwrite):
     return resource_readable_id
 
 
-def _get_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict:
-    """
-    Get URL (citation) config from the metadata JSON file
-    """
-    url_config_path = Path(export_tempdir, url_config_file.split("/")[-1])
-    # download the url config file
-    bucket.download_file(url_config_file, url_config_path)
-    url_config = {}
-    with Path.open(url_config_path, "rb") as f:
-        url_json = json.loads(f.read().decode("utf-8"))
-        for url_item in url_json.get("course_files", []):
-            url_key = url_item["file_path"]
-            url_key = unquote_plus(url_key.lstrip(url_key.split("/")[0]))
-            url_config[url_key] = url_item["url"]
-        for url_item in url_json.get("assignments", []) + url_json.get("pages", []):
-            url_key = url_item.get("name", url_item.get("title"))
-            url_config[url_key] = url_item.get("html_url")
-    return url_config
-
-
-def _course_url(course_archive_path) -> str:
-    context_info = parse_context_xml(course_archive_path)
-    return f"https://{context_info.get('canvas_domain')}/courses/{context_info.get('course_id')}/"
-
-
 def run_for_canvas_archive(course_archive_path, course_folder, overwrite):
     """
     Generate and return a LearningResourceRun for a Canvas course
@@ -133,7 +88,7 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite):
     checksum = calc_checksum(course_archive_path)
     course_info = parse_canvas_settings(course_archive_path)
     course_title = course_info.get("title")
-    url = _course_url(course_archive_path)
+    url = canvas_course_url(course_archive_path)
     start_at = course_info.get("start_at")
     end_at = course_info.get("conclude_at")
     if start_at:
@@ -181,20 +136,6 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite):
     return resource_readable_id, run
 
 
-def parse_canvas_settings(course_archive_path):
-    """
-    Get course attributes from a Canvas course archive
-    """
-    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
-        xml_string = course_archive.read("course_settings/course_settings.xml")
-    tree = ElementTree.fromstring(xml_string)
-    attributes = {}
-    for node in tree.iter():
-        tag = node.tag.split("}")[1] if "}" in node.tag else node.tag
-        attributes[tag] = node.text
-    return attributes
-
-
 def transform_canvas_content_files(
     course_zipfile: Path, run: LearningResourceRun, url_config: dict, *, overwrite
 ) -> Generator[dict, None, None]:
@@ -203,23 +144,7 @@ def transform_canvas_content_files(
     """
     basedir = course_zipfile.name.split(".")[0]
     zipfile_path = course_zipfile.absolute()
-    all_published_items = (
-        parse_module_meta(zipfile_path)["active"]
-        + parse_files_meta(zipfile_path)["active"]
-        + parse_web_content(zipfile_path)["active"]
-    )
-    published_items = {}
-    for item in all_published_items:
-        path = Path(item["path"]).resolve()
-        published_items[path] = item
-        for embedded_file in item.get("embedded_files", []):
-            embedded_path = Path(embedded_file).resolve()
-            if embedded_path in all_published_items:
-                continue
-            published_items[embedded_path] = {
-                "path": embedded_path,
-                "title": "",
-            }
+    published_items = get_published_items(zipfile_path, url_config)
 
     def _generate_content():
         """Inner generator for yielding content data"""
@@ -242,10 +167,10 @@ def _generate_content():
                 item_meta = published_items.get(
                     Path(content_data["source_path"]).resolve(), {}
                 )
-
-                content_url = url_config.get(url_path) or url_config.get(
-                    item_meta.get("title")
+                item_url_config = url_config.get(url_path, {}) or url_config.get(
+                    item_meta.get("title"), {}
                 )
+                content_url = item_url_config.get("url")
                 content_data["content_title"] = item_meta.get("title")
                 if content_url:
                     content_data["url"] = content_url
@@ -312,411 +237,6 @@ def transform_canvas_problem_files(
             yield problem_file_data
 
 
-def parse_context_xml(course_archive_path: str) -> dict:
-    """
-    Parse course_settings/context.xml and return context info
-    """
-    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
-        context = course_archive.read("course_settings/context.xml")
-    root = ElementTree.fromstring(context)
-    context_info = {}
-    item_keys = ["course_id", "root_account_id", "canvas_domain", "root_account_name"]
-    for key in item_keys:
-        element = root.find(f"cccv1p0:{key}", NAMESPACES)
-        if element is not None:
-            context_info[key] = element.text
-
-    return context_info
-
-
-def is_date_locked(lock_at: str, unlock_at: str) -> bool:
-    """
-    Determine if a resource is currently date-locked based
-    on lock_at and unlock_at strings.
-    Args:
-        lock_at (str): ISO 8601 date string when the resource locks
-        unlock_at (str): ISO 8601 date string when the resource unlocks
-    Returns:
-        bool: True if the resource is currently locked, False otherwise
-    """
-    now = now_in_utc()
-    if unlock_at and unlock_at.lower() != "nil":
-        try:
-            unlock_dt = (
-                dateutil.parser.parse(unlock_at)
-                .replace(tzinfo=ZoneInfo("US/Eastern"))
-                .astimezone(UTC)
-            )
-
-            if now < unlock_dt:
-                return True
-        except Exception:
-            log.exception("Error parsing unlock_at date: %s", unlock_at)
-
-    if lock_at and lock_at.lower() != "nil":
-        try:
-            lock_dt = (
-                dateutil.parser.parse(lock_at)
-                .replace(tzinfo=ZoneInfo("US/Eastern"))
-                .astimezone(UTC)
-            )
-            if now > lock_dt:
-                return True
-        except Exception:
-            log.exception("Error parsing lock_at date: %s", lock_at)
-    return False
-
-
-def is_file_published(file_meta: dict) -> bool:
-    """
-    Determine if a Canvas file (from files_meta.xml) is published/visible to students.
-
-    Args:
-        file_meta (dict): Parsed metadata for a file.
-    Returns:
-        bool: True if file is published/visible, False otherwise.
-    """
-
-    hidden = str(file_meta.get("hidden", "false")).lower() == "true"
-    locked = str(file_meta.get("locked", "false")).lower() == "true"
-    unlock_at = file_meta.get("unlock_at")
-    lock_at = file_meta.get("lock_at")
-    visibility = file_meta.get("visibility", "inherit")
-    # If explicitly hidden or locked → unpublished
-    if hidden or locked:
-        return False
-
-    if is_date_locked(lock_at, unlock_at):
-        return False
-    # Visibility rules
-    if visibility in ("course", "inherit"):
-        return True
-    elif visibility in ("institution", "public"):
-        return True  # technically more visible
-    return False
-
-
-def parse_files_meta(course_archive_path: str) -> dict:
-    """
-    Parse course_settings/files_meta.xml and return publish/active status of resources.
-    """
-    publish_status = {"active": [], "unpublished": []}
-    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
-        files_meta_path = "course_settings/files_meta.xml"
-        if files_meta_path not in course_archive.namelist():
-            return publish_status
-        files_xml = course_archive.read(files_meta_path)
-        manifest_xml = course_archive.read("imsmanifest.xml")
-    resource_map = extract_resources_by_identifier(manifest_xml)
-    root = ElementTree.fromstring(files_xml)
-    try:
-        for file_elem in root.findall(".//cccv1p0:file", NAMESPACES):
-            meta = dict(file_elem.attrib)
-            for child in file_elem:
-                tag = child.tag
-                # strip namespace
-                if "}" in tag:
-                    tag = tag.split("}", 1)[1]
-                if child.attrib.get("nil") == "true":
-                    value = None
-                else:
-                    value = (child.text or "").strip()
-                meta[tag] = value
-            item_info = resource_map.get(meta.get("identifier"), {})
-            meta["published"] = is_file_published(meta)
-            for file in item_info.get("files", []):
-                file_data = meta.copy()
-                file_path = Path(file)
-                file_data["path"] = file_path
-                file_data["title"] = file_data.get("display_name")
-                # explicitly exclude files in web_resources/ai/tutor
-                if file_data["published"] and not file.startswith(
-                    settings.CANVAS_TUTORBOT_FOLDER
-                ):
-                    publish_status["active"].append(file_data)
-                else:
-                    publish_status["unpublished"].append(file_data)
-    except Exception:
-        log.exception("Error parsing XML: %s", sys.stderr)
-        return None
-    return publish_status
-
-
-def parse_module_meta(course_archive_path: str) -> dict:
-    """
-    Parse module_meta.xml and return publish/active status of resources.
-    """
-    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
-        module_xml = course_archive.read("course_settings/module_meta.xml")
-        manifest_xml = course_archive.read("imsmanifest.xml")
-    resource_map = extract_resources_by_identifierref(manifest_xml)
-    publish_status = {"active": [], "unpublished": []}
-    try:
-        root = ElementTree.fromstring(module_xml)
-        for module in root.findall(".//cccv1p0:module", NAMESPACES):
-            module_title = module.find("cccv1p0:title", NAMESPACES).text
-            for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES):
-                item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text
-                item_title = item.find("cccv1p0:title", NAMESPACES).text
-                identifierref = (
-                    item.find("cccv1p0:identifierref", NAMESPACES).text
-                    if item.find("cccv1p0:identifierref", NAMESPACES) is not None
-                    else None
-                )
-                content_type = item.find("cccv1p0:content_type", NAMESPACES).text
-                items = resource_map.get(identifierref, {})
-                for item_info in items:
-                    for file in item_info.get("files", []):
-                        file_path = Path(file)
-                        status = "active" if item_state == "active" else "unpublished"
-                        publish_status[status].append(
-                            {
-                                "title": item_title,
-                                "type": content_type,
-                                "path": file_path,
-                                "module": module_title,
-                            }
-                        )
-    except Exception:
-        log.exception("Error parsing XML: %s", sys.stderr)
-        return None
-    return publish_status
-
-
-def _compact_element(element) -> dict | str | None:
-    """Recursively compact an element into a nested dictionary"""
-    if len(element) == 0:  # No children, return text
-        return element.text.strip() if element.text else None
-    return {
-        child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element(
-            child
-        )
-        for child in element
-    }
-
-
-def _workflow_state_from_html(html: str) -> str:
-    """
-    Extract the workflow_state meta tag from html
-    """
-    soup = BeautifulSoup(html, "html.parser")
-    meta = soup.find("meta", attrs={"name": "workflow_state"})
-    return meta.get("content") if meta else None
-
-
-def _embedded_files_from_html(html: str) -> list[str]:
-    """
-    Extract Canvas file links from HTML, replacing $IMS-CC-FILEBASE$ with web_resources
-    and returning URL-decoded paths without query params.
-    """
-    soup = BeautifulSoup(html, "html.parser")
-    links = []
-
-    for a in soup.find_all("a", href=True):
-        href = a["href"]
-        if href.startswith("$IMS-CC-FILEBASE$"):
-            # Remove query parameters if present
-            clean_href = href.split("?")[0]
-            # Replace $IMS-CC-FILEBASE$ with "web_resources"
-            clean_href = clean_href.replace("$IMS-CC-FILEBASE$", "web_resources")
-            # URL decode
-            decoded = unquote(clean_href)
-            links.append(decoded)
-
-    return links
-
-
-def _workflow_state_from_xml(xml_string: str) -> bool:
-    """
-    Determine the workflow_state (published/unpublished) from assignment_settings.xml
-    """
-
-    def _get_text(tag):
-        el = root.find(f"cccv1p0:{tag}", NAMESPACES)
-        return el.text.strip() if el is not None and el.text else ""
-
-    try:
-        root = ElementTree.fromstring(xml_string)
-    except Exception:
-        log.exception("Error parsing XML: %s", sys.stderr)
-        return "unpublished"
-
-    if (
-        (
-            # workflow_state must be published
-            _get_text("workflow_state") != "published"
-        )
-        or (
-            # only_visible_to_overrides must not be true
-            _get_text("only_visible_to_overrides") == "true"
-        )
-        or (
-            # hide_in_gradebook must not be true (hidden from gradebook)
-            _get_text("hide_in_gradebook") == "true"
-        )
-    ):
-        return "unpublished"
-
-    lock_at = _get_text("lock_at")
-    unlock_at = _get_text("unlock_at")
-    if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at):
-        return "unpublished"
-
-    return "published"
-
-
-def _title_from_html(html: str) -> str:
-    """
-    Extract the title element from HTML content
-    """
-    soup = BeautifulSoup(html, "html.parser")
-    title = soup.find("title")
-    return title.get_text().strip() if title else ""
-
-
-def _title_from_assignment_settings(xml_string: str) -> str:
-    """
-    Extract the title from assignment_settings.xml
-    """
-    try:
-        root = ElementTree.fromstring(xml_string)
-    except Exception:
-        log.exception("Error parsing XML: %s", sys.stderr)
-        return ""
-    title_elem = root.find("cccv1p0:title", NAMESPACES)
-    return title_elem.text.strip() if title_elem is not None and title_elem.text else ""
-
-
-def parse_web_content(course_archive_path: str) -> dict:
-    """
-    Parse html pages and assignments and return publish/active status of resources
-    """
-
-    publish_status = {"active": [], "unpublished": []}
-
-    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
-        manifest_path = "imsmanifest.xml"
-        if manifest_path not in course_archive.namelist():
-            return publish_status
-        manifest_xml = course_archive.read(manifest_path)
-        resource_map = extract_resources_by_identifier(manifest_xml)
-        for item in resource_map:
-            resource_map_item = resource_map[item]
-            item_link = resource_map_item.get("href")
-            assignment_settings = None
-            for file in resource_map_item.get("files", []):
-                if file.endswith("assignment_settings.xml"):
-                    assignment_settings = file
-            if item_link and item_link.endswith(".html"):
-                file_path = resource_map_item["href"]
-                html_content = course_archive.read(file_path)
-                embedded_files = _embedded_files_from_html(html_content)
-                if assignment_settings:
-                    xml_content = course_archive.read(assignment_settings)
-                    workflow_state = _workflow_state_from_xml(xml_content)
-                    title = _title_from_assignment_settings(xml_content)
-                    canvas_type = "assignment"
-                else:
-                    workflow_state = _workflow_state_from_html(html_content)
-                    title = _title_from_html(html_content)
-                    canvas_type = "page"
-
-                lom_elem = (
-                    resource_map_item.get("metadata", {})
-                    .get("lom", {})
-                    .get("educational", {})
-                )
-                # Determine if the content is intended for authors or instructors only
-                intended_role = lom_elem.get("intendedEndUserRole", {}).get("value")
-                authors_only = intended_role and intended_role.lower() != "student"
-
-                if workflow_state in ["active", "published"] and not authors_only:
-                    publish_status["active"].append(
-                        {
-                            "title": title,
-                            "path": file_path,
-                            "canvas_type": canvas_type,
-                            "embedded_files": embedded_files,
-                        }
-                    )
-                else:
-                    publish_status["unpublished"].append(
-                        {
-                            "title": title,
-                            "path": file_path,
-                            "canvas_type": canvas_type,
-                            "embedded_files": embedded_files,
-                        }
-                    )
-    return publish_status
-
-
-def extract_resources_by_identifierref(manifest_xml: str) -> dict:
-    """
-    Extract resources from an IMS manifest file and
-    return a map keyed by identifierref.
-    """
-    root = ElementTree.fromstring(manifest_xml)
-
-    # Dictionary to hold resources keyed by identifierref
-    resources_dict = defaultdict(list)
-    # Find all item elements with identifierref attributes
-    for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES):
-        identifierref = item.get("identifierref")
-        title = (
-            item.find("imscp:title", NAMESPACES).text
-            if item.find("imscp:title", NAMESPACES) is not None
-            else ""
-        )
-        resource = root.find(
-            f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES
-        )
-        if resource is not None:
-            # Get all file elements within the resource
-            files = [
-                file_elem.get("href")
-                for file_elem in resource.findall("imscp:file", NAMESPACES)
-            ]
-
-            resources_dict[identifierref].append(
-                {"title": title, "files": files, "type": resource.get("type")}
-            )
-    return dict(resources_dict)
-
-
-def extract_resources_by_identifier(manifest_xml: str) -> dict:
-    """
-    Extract resources from an IMS manifest
-    file and return a map keyed by identifier.
-    """
-    root = ElementTree.fromstring(manifest_xml)
-    resources_dict = {}
-    # Find all resource elements
-    for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES):
-        identifier = resource.get("identifier")
-        resource_type = resource.get("type")
-        href = resource.get("href")
-
-        # Get all file elements within the resource
-        files = [
-            file_elem.get("href")
-            for file_elem in resource.findall("imscp:file", NAMESPACES)
-        ]
-        # Extract metadata if present
-        metadata = {}
-        metadata_elem = resource.find("imscp:metadata", NAMESPACES)
-        if metadata_elem is not None:
-            metadata.update(_compact_element(metadata_elem))
-        resources_dict[identifier] = {
-            "identifier": identifier,
-            "type": resource_type,
-            "href": href,
-            "files": files,
-            "metadata": metadata,
-        }
-    return resources_dict
-
-
 def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
     """
     Convert a PDF file to a list of base64 encoded images (one per page).
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 2c1c244022..4ca9fe7b17 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -10,15 +10,17 @@
 
 from learning_resources.constants import LearningResourceType, PlatformType
 from learning_resources.etl.canvas import (
+    run_for_canvas_archive,
+    transform_canvas_content_files,
+    transform_canvas_problem_files,
+)
+from learning_resources.etl.canvas_utils import (
     _compact_element,
     is_file_published,
     parse_canvas_settings,
     parse_files_meta,
     parse_module_meta,
     parse_web_content,
-    run_for_canvas_archive,
-    transform_canvas_content_files,
-    transform_canvas_problem_files,
 )
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.utils import get_edx_module_id
@@ -120,7 +122,7 @@ def test_run_for_canvas_archive_creates_resource_and_run(tmp_path, mocker):
         return_value={"title": "Test Course", "course_code": "TEST101"},
     )
     mocker.patch(
-        "learning_resources.etl.canvas.parse_context_xml",
+        "learning_resources.etl.canvas_utils.parse_context_xml",
         return_value={"course_id": "123", "canvas_domain": "mit.edu"},
     )
 
@@ -152,7 +154,7 @@ def test_run_for_canvas_archive_creates_run_if_none_exists(tmp_path, mocker):
         return_value={"title": "Test Course", "course_code": "TEST104"},
     )
     mocker.patch(
-        "learning_resources.etl.canvas.parse_context_xml",
+        "learning_resources.etl.canvas_utils.parse_context_xml",
         return_value={"course_id": "123", "canvas_domain": "mit.edu"},
     )
     mocker.patch(
@@ -472,7 +474,7 @@ def test_transform_canvas_content_files_url_assignment(mocker, tmp_path):
         return_value=mock_content_data,
     )
     mocker.patch(
-        "learning_resources.etl.canvas.parse_module_meta",
+        "learning_resources.etl.canvas_utils.parse_module_meta",
         return_value={"active": [], "unpublished": []},
     )
     # Use a real zip file

From 1013ec57957ca05b9bd15b36f704844c1fe3b9d9 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Wed, 24 Sep 2025 10:21:16 -0400
Subject: [PATCH 2/7] adding more permission/hidden checks

---
 learning_resources/etl/canvas_utils.py | 553 +++++++++++++++++++++++++
 1 file changed, 553 insertions(+)
 create mode 100644 learning_resources/etl/canvas_utils.py

diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py
new file mode 100644
index 0000000000..a62f62c377
--- /dev/null
+++ b/learning_resources/etl/canvas_utils.py
@@ -0,0 +1,553 @@
+import json
+import logging
+import sys
+import zipfile
+from collections import defaultdict
+from datetime import UTC
+from pathlib import Path
+from urllib.parse import unquote, unquote_plus
+from zoneinfo import ZoneInfo
+
+import dateutil
+from bs4 import BeautifulSoup
+from defusedxml import ElementTree
+from django.conf import settings
+
+from main.utils import now_in_utc
+
+log = logging.getLogger(__name__)
+
+# list of file regexes we should ignore (NOTE(review): entries are literal names)
+IGNORE_FILES = [
+    "course_settings.xml",
+    "context.xml",
+    "files_meta.xml",
+    "module_meta.xml",
+    "imsmanifest.xml",
+    "assignment_settings.xml",
+]
+
+NAMESPACES = {  # prefix -> namespace URI map passed to ElementTree find/findall
+    "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0",
+    "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
+    "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
+    "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
+}
+
+
+def is_file_published(file_meta: dict) -> bool:
+    """
+    Decide whether a file entry from files_meta.xml is visible to students.
+
+    Args:
+        file_meta (dict): Parsed metadata for a single file.
+    Returns:
+        bool: True when the file is published/visible, False otherwise.
+    """
+
+    def _flag(name: str) -> bool:
+        # values are normalised through str() before the "true" comparison
+        return str(file_meta.get(name, "false")).lower() == "true"
+
+    # An explicit hidden or locked flag always means unpublished
+    if _flag("hidden") or _flag("locked"):
+        return False
+
+    # Outside of the lock_at / unlock_at availability window (as judged by
+    # is_date_locked) the file counts as unpublished
+    if is_date_locked(file_meta.get("lock_at"), file_meta.get("unlock_at")):
+        return False
+
+    # Only the known visibility levels count as published
+    visible_levels = ("course", "inherit", "institution", "public")
+    return file_meta.get("visibility", "inherit") in visible_levels
+
+
+def parse_files_meta(course_archive_path: str) -> dict:
+    """
+    Parse course_settings/files_meta.xml and return publish/active status of resources.
+
+    Args:
+        course_archive_path (str): Path to the Canvas course export zip.
+    Returns:
+        dict: "active" and "unpublished" lists of per-file metadata dicts; both
+            are empty if files_meta.xml is missing or cannot be parsed.
+    """
+    publish_status = {"active": [], "unpublished": []}
+    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
+        files_meta_path = "course_settings/files_meta.xml"
+        if files_meta_path not in course_archive.namelist():
+            return publish_status
+        files_xml = course_archive.read(files_meta_path)
+        manifest_xml = course_archive.read("imsmanifest.xml")
+    resource_map = extract_resources_by_identifier(manifest_xml)
+    try:
+        # parse inside the try so malformed XML is logged instead of raised
+        root = ElementTree.fromstring(files_xml)
+        for file_elem in root.findall(".//cccv1p0:file", NAMESPACES):
+            meta = dict(file_elem.attrib)
+            for child in file_elem:
+                # strip the namespace; children flagged nil="true" become None
+                tag = child.tag.split("}", 1)[-1]
+                is_nil = child.attrib.get("nil") == "true"
+                meta[tag] = None if is_nil else (child.text or "").strip()
+            item_info = resource_map.get(meta.get("identifier"), {})
+            meta["published"] = is_file_published(meta)
+            for file in item_info.get("files", []):
+                file_data = {**meta, "path": Path(file)}
+                file_data["title"] = file_data.get("display_name")
+                # explicitly exclude files in web_resources/ai/tutor
+                if file_data["published"] and not file.startswith(
+                    settings.CANVAS_TUTORBOT_FOLDER
+                ):
+                    publish_status["active"].append(file_data)
+                else:
+                    publish_status["unpublished"].append(file_data)
+    except Exception:
+        log.exception("Error parsing files_meta.xml in %s", course_archive_path)
+        return {"active": [], "unpublished": []}
+    return publish_status
+
+
+def parse_module_meta(course_archive_path: str) -> dict:
+    """
+    Parse module_meta.xml and return publish/active status of resources.
+
+    Returns a dict with "active" and "unpublished" lists of module item dicts;
+    both lists are empty when the XML cannot be processed.
+    """
+    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
+        module_xml = course_archive.read("course_settings/module_meta.xml")
+        manifest_xml = course_archive.read("imsmanifest.xml")
+    resource_map = extract_resources_by_identifierref(manifest_xml)
+    publish_status = {"active": [], "unpublished": []}
+    try:
+        root = ElementTree.fromstring(module_xml)
+        for module in root.findall(".//cccv1p0:module", NAMESPACES):
+            module_title = module.find("cccv1p0:title", NAMESPACES).text
+            for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES):
+                item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text
+                item_title = item.find("cccv1p0:title", NAMESPACES).text
+                # identifierref is optional; items without one match no resource
+                ref_elem = item.find("cccv1p0:identifierref", NAMESPACES)
+                identifierref = ref_elem.text if ref_elem is not None else None
+                content_type = item.find("cccv1p0:content_type", NAMESPACES).text
+                status = "active" if item_state == "active" else "unpublished"
+                for item_info in resource_map.get(identifierref, []):
+                    for file in item_info.get("files", []):
+                        publish_status[status].append(
+                            {
+                                "title": item_title,
+                                "type": content_type,
+                                "path": Path(file),
+                                "module": module_title,
+                            }
+                        )
+    except Exception:
+        log.exception("Error parsing module_meta.xml in %s", course_archive_path)
+        # fail closed: callers index the result, so never hand back None
+        return {"active": [], "unpublished": []}
+    return publish_status
+
+
+def _compact_element(element) -> dict | str | None:
+    """Collapse an XML element into nested dicts keyed by namespace-less tag names"""
+    children = list(element)
+    if not children:
+        # leaf node: its stripped text, or None when there is no text
+        text = element.text
+        return text.strip() if text else None
+    # rpartition leaves tags without a "}" untouched; repeated tags keep the last
+    compacted = {kid.tag.rpartition("}")[2]: _compact_element(kid) for kid in children}
+    return compacted
+
+
+def _workflow_state_from_html(html: str) -> str:
+    """
+    Return the content of the workflow_state <meta> tag in html, or None if absent
+    """
+    wanted_attrs = {"name": "workflow_state"}
+    state_tag = BeautifulSoup(html, "html.parser").find("meta", attrs=wanted_attrs)
+    return None if not state_tag else state_tag.get("content")
+
+
+def _embedded_files_from_html(html: str) -> list[str]:
+    """
+    Collect the Canvas file links found in an HTML document.
+
+    Only hrefs starting with $IMS-CC-FILEBASE$ are kept: the query string is cut,
+    the placeholder becomes web_resources and the result is URL-decoded.
+    """
+    placeholder = "$IMS-CC-FILEBASE$"
+    anchors = BeautifulSoup(html, "html.parser").find_all("a", href=True)
+
+    def _archive_path(href: str) -> str:
+        # order matters: cut the query first, then swap the prefix, then decode
+        without_query = href.split("?")[0]
+        return unquote(without_query.replace(placeholder, "web_resources"))
+
+    return [
+        _archive_path(anchor["href"])
+        for anchor in anchors
+        if anchor["href"].startswith(placeholder)
+    ]
+
+
+def _workflow_state_from_xml(xml_string: str) -> str:
+    """
+    Determine the workflow_state (published/unpublished) from assignment_settings.xml
+
+    Args:
+        xml_string (str): Contents of an assignment_settings.xml file.
+    Returns:
+        str: "published" only if every visibility check passes, else "unpublished"
+            (also returned when the XML cannot be parsed).
+    """
+    try:
+        root = ElementTree.fromstring(xml_string)
+    except Exception:
+        log.exception("Error parsing assignment settings XML")
+        return "unpublished"
+
+    def _get_text(tag):
+        el = root.find(f"cccv1p0:{tag}", NAMESPACES)
+        return el.text.strip() if el is not None and el.text else ""
+
+    hidden_by_settings = (
+        # workflow_state must be published
+        _get_text("workflow_state") != "published"
+        # only_visible_to_overrides must not be true
+        or _get_text("only_visible_to_overrides") == "true"
+        # hide_in_gradebook must not be true (hidden from gradebook)
+        or _get_text("hide_in_gradebook") == "true"
+        # the assignment must not be flagged module_locked
+        or _get_text("module_locked") == "true"
+    )
+    if hidden_by_settings:
+        return "unpublished"
+
+    # finally honor the lock_at / unlock_at availability window
+    if is_date_locked(_get_text("lock_at"), _get_text("unlock_at")):
+        return "unpublished"
+    return "published"
+
+
+def _title_from_html(html: str) -> str:
+    """
+    Return the stripped text of the document's <title>, or "" when it has none
+    """
+    title_tag = BeautifulSoup(html, "html.parser").find("title")
+    # a missing <title> yields an empty title rather than None
+    return "" if not title_tag else title_tag.get_text().strip()
+
+
+def _title_from_assignment_settings(xml_string: str) -> str:
+    """
+    Read the assignment title out of an assignment_settings.xml document
+    """
+    try:
+        settings_root = ElementTree.fromstring(xml_string)
+    except Exception:
+        log.exception("Error parsing XML: %s", sys.stderr)
+        return ""
+    node = settings_root.find("cccv1p0:title", NAMESPACES)
+    return "" if node is None or not node.text else node.text.strip()
+
+
+def parse_web_content(course_archive_path: str) -> dict:
+    """
+    Parse html pages and assignments and return publish/active status of resources
+
+    Args:
+        course_archive_path (str): Path to the Canvas course export zip.
+    Returns:
+        dict: "active" and "unpublished" lists with one entry (title, path,
+            canvas_type, embedded_files) per html resource in the manifest.
+    """
+
+    publish_status = {"active": [], "unpublished": []}
+
+    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
+        manifest_path = "imsmanifest.xml"
+        if manifest_path not in course_archive.namelist():
+            return publish_status
+        manifest_xml = course_archive.read(manifest_path)
+        resource_map = extract_resources_by_identifier(manifest_xml)
+        for resource_map_item in resource_map.values():
+            file_path = resource_map_item.get("href")
+            if not file_path or not file_path.endswith(".html"):
+                continue
+            # a resource with an assignment_settings.xml file is an assignment;
+            # when several are listed the last one wins
+            assignment_settings = None
+            for file in resource_map_item.get("files", []):
+                if file.endswith("assignment_settings.xml"):
+                    assignment_settings = file
+
+            html_content = course_archive.read(file_path)
+            embedded_files = _embedded_files_from_html(html_content)
+            if assignment_settings:
+                xml_content = course_archive.read(assignment_settings)
+                workflow_state = _workflow_state_from_xml(xml_content)
+                title = _title_from_assignment_settings(xml_content)
+                canvas_type = "assignment"
+            else:
+                workflow_state = _workflow_state_from_html(html_content)
+                title = _title_from_html(html_content)
+                canvas_type = "page"
+
+            # Determine if the content is intended for authors or instructors
+            # only. Empty metadata elements are compacted to None, so every
+            # level falls back to {} before the next lookup.
+            metadata = resource_map_item.get("metadata") or {}
+            lom_elem = (metadata.get("lom") or {}).get("educational") or {}
+            role_elem = lom_elem.get("intendedEndUserRole") or {}
+            intended_role = role_elem.get("value")
+            authors_only = bool(intended_role) and intended_role.lower() != "student"
+
+            # anything not explicitly active/published is listed as unpublished
+            is_active = workflow_state in ["active", "published"] and not authors_only
+            publish_status["active" if is_active else "unpublished"].append(
+                {
+                    "title": title,
+                    "path": file_path,
+                    "canvas_type": canvas_type,
+                    "embedded_files": embedded_files,
+                }
+            )
+    return publish_status
+
+
+def extract_resources_by_identifierref(manifest_xml: str) -> dict:
+    """
+    Extract resources from an IMS manifest file and
+    return a map keyed by identifierref.
+
+    Each value is a list of {"title", "files", "type"} dicts, one per manifest
+    item that references an existing resource.
+    """
+    root = ElementTree.fromstring(manifest_xml)
+
+    # Index the resources once instead of running an XPath search per item; the
+    # first resource wins for duplicated identifiers, as a find() would return
+    resources_by_id = {}
+    for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES):
+        resources_by_id.setdefault(resource.get("identifier"), resource)
+
+    # Dictionary to hold resources keyed by identifierref
+    resources_dict = defaultdict(list)
+    for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES):
+        identifierref = item.get("identifierref")
+        resource = resources_by_id.get(identifierref)
+        if resource is None:
+            continue
+        title_elem = item.find("imscp:title", NAMESPACES)
+        title = title_elem.text if title_elem is not None else ""
+        # hrefs of all file elements within the resource
+        files = [f.get("href") for f in resource.findall("imscp:file", NAMESPACES)]
+        resources_dict[identifierref].append(
+            {"title": title, "files": files, "type": resource.get("type")}
+        )
+    return dict(resources_dict)
+
+
+def extract_resources_by_identifier(manifest_xml: str) -> dict:
+    """
+    Extract resources from an IMS manifest
+    file and return a map keyed by identifier.
+
+    Each value holds the resource's identifier, type, href, the hrefs of its
+    file elements and its compacted metadata ({} when there is none).
+    """
+    root = ElementTree.fromstring(manifest_xml)
+    resources_dict = {}
+    # Find all resource elements
+    for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES):
+        identifier = resource.get("identifier")
+        # hrefs of all file elements within the resource
+        files = [f.get("href") for f in resource.findall("imscp:file", NAMESPACES)]
+        # Extract metadata if present; an empty or text-only metadata element
+        # compacts to None/str rather than a dict and is treated as no metadata
+        metadata = {}
+        metadata_elem = resource.find("imscp:metadata", NAMESPACES)
+        if metadata_elem is not None:
+            compacted = _compact_element(metadata_elem)
+            if isinstance(compacted, dict):
+                metadata = compacted
+        resources_dict[identifier] = {
+            "identifier": identifier,
+            "type": resource.get("type"),
+            "href": resource.get("href"),
+            "files": files,
+            "metadata": metadata,
+        }
+    return resources_dict
+
+
+def parse_context_xml(course_archive_path: str) -> dict:
+    """
+    Read course_settings/context.xml from the archive and return the course
+    context values (course id, root account id/name, canvas domain) it defines
+    """
+    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
+        root = ElementTree.fromstring(
+            course_archive.read("course_settings/context.xml")
+        )
+    wanted = ("course_id", "root_account_id", "canvas_domain", "root_account_name")
+    # look each key up once and keep only those present in the document
+    found = ((key, root.find(f"cccv1p0:{key}", NAMESPACES)) for key in wanted)
+    context_info = {key: elem.text for key, elem in found if elem is not None}
+
+    return context_info
+
+
+def is_date_locked(lock_at: str, unlock_at: str) -> bool:
+    """
+    Determine if a resource is currently date-locked based
+    on lock_at and unlock_at strings.
+
+    Args:
+        lock_at (str): ISO 8601 date string when the resource locks
+        unlock_at (str): ISO 8601 date string when the resource unlocks
+    Returns:
+        bool: True if the resource is currently locked, False otherwise
+    """
+
+    def _to_utc(date_string):
+        # Only timestamps without an offset are assumed to be US/Eastern; an
+        # explicit offset (e.g. a trailing "Z") must not be overwritten
+        parsed = dateutil.parser.parse(date_string)
+        if parsed.tzinfo is None:
+            parsed = parsed.replace(tzinfo=ZoneInfo("US/Eastern"))
+        return parsed.astimezone(UTC)
+
+    now = now_in_utc()
+    if unlock_at and unlock_at.lower() != "nil":
+        try:
+            if now < _to_utc(unlock_at):
+                return True
+        except Exception:
+            # unparsable dates are logged and treated as "no restriction"
+            log.exception("Error parsing unlock_at date: %s", unlock_at)
+
+    if lock_at and lock_at.lower() != "nil":
+        try:
+            if now > _to_utc(lock_at):
+                return True
+        except Exception:
+            log.exception("Error parsing lock_at date: %s", lock_at)
+    return False
+
+
+def parse_canvas_settings(course_archive_path):
+    """
+    Get course attributes from a Canvas course archive
+    Returns a dict of tag -> text; tab_configuration becomes a dict keyed by tab id.
+    """
+    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
+        xml_string = course_archive.read("course_settings/course_settings.xml")
+    attributes = {}
+    for node in ElementTree.fromstring(xml_string).iter():
+        tag = node.tag.split("}")[1] if "}" in node.tag else node.tag
+        node_value = node.text
+        if tag == "tab_configuration":
+            # an empty element has no JSON to decode and means "no tab config"
+            tab_config = json.loads((node.text or "").strip() or "[]")
+            node_value = {tc["id"]: tc for tc in tab_config}
+        attributes[tag] = node_value
+    return attributes
+
+
+def canvas_url_config(bucket, export_tempdir: str, url_config_file: str) -> dict:
+    """
+    Download the metadata JSON file and build the URL (citation) config from it
+    """
+    local_path = Path(export_tempdir, url_config_file.split("/")[-1])
+    # fetch the url config file from the bucket into the export directory
+    bucket.download_file(url_config_file, local_path)
+    with local_path.open("rb") as config_file:
+        url_json = json.loads(config_file.read().decode("utf-8"))
+    url_config = {}
+    for file_item in url_json.get("course_files", []):
+        # files are keyed by their decoded path minus the first path segment
+        file_path = file_item["file_path"]
+        first_segment = file_path.split("/")[0]
+        url_config[unquote_plus(file_path.removeprefix(first_segment))] = file_item
+    for page_item in url_json.get("assignments", []) + url_json.get("pages", []):
+        # normalize url field
+        page_item["url"] = page_item.get("html_url")
+        url_config[page_item.get("name", page_item.get("title"))] = page_item
+    return url_config
+
+
+def canvas_course_url(course_archive_path) -> str:
+    ctx = parse_context_xml(course_archive_path)  # provides canvas_domain, course_id
+    return f"https://{ctx.get('canvas_domain')}/courses/{ctx.get('course_id')}/"
+
+
+def _url_config_key(item):
+    # files under web_resources are keyed by the path after it, others by title
+    _, marker, tail = str(item["path"]).rpartition("web_resources")
+    return tail if marker else item.get("title")
+
+
+def _url_config_item_visible(item_configuration):
+    """
+    Return False if the item's entry in the metadata json file marks it (or its
+    parent folder) as unpublished, hidden or locked
+    """
+    if item_configuration:
+        # check if explicitely unpublished
+        unpublished = not item_configuration.get("published", True)
+        return not any(
+            [
+                unpublished,
+                item_configuration.get("hidden"),  # file hidden
+                item_configuration.get("locked"),  # file locked
+                item_configuration.get("folder", {}).get(
+                    "hidden"
+                ),  # parent folder hidden
+                item_configuration.get("folder", {}).get(
+                    "locked"
+                ),  # parent folder locked
+            ]
+        )
+    return True  # no (or an empty) configuration entry: nothing restricts the item
+
+
+def get_published_items(zipfile_path, url_config):
+    """
+    Map the resolved path of every published item in the archive to its item dict.
+
+    Active items from module_meta, files_meta and the web content are kept unless
+    url_config hides them; every file embedded in one of those items is added too.
+    """
+    published_items = {}
+    course_settings = parse_canvas_settings(zipfile_path)
+    tab_configuration = course_settings.get("tab_configuration", {})
+    files_section_is_visible = not tab_configuration.get(11, {}).get("hidden", False)
+    all_published_items = (
+        parse_module_meta(zipfile_path)["active"]
+        + parse_files_meta(zipfile_path)["active"]
+        + parse_web_content(zipfile_path)["active"]
+    )
+
+    for item in all_published_items:
+        path = Path(item["path"]).resolve()
+        item_visible = _url_config_item_visible(url_config.get(_url_config_key(item)))
+        # items stored directly in web_resources additionally require that the
+        # files section (tab_configuration id 11) is not hidden
+        in_files_section = str(Path(item["path"]).parent) == "web_resources"
+
+        # keep the item if it is not explicitly hidden by its url_config entry
+        if item_visible and (files_section_is_visible or not in_files_section):
+            published_items[path] = item
+        for embedded_file in item.get("embedded_files", []):
+            embedded_path = Path(embedded_file).resolve()
+            # the title-less stub must never replace an entry that is already
+            # published; membership is checked on the path-keyed result dict
+            if embedded_path not in published_items:
+                published_items[embedded_path] = {"path": embedded_path, "title": ""}
+
+    return published_items

From 4f85aeaf12ba38402878fb33e00ced7f92b3f05b Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Wed, 24 Sep 2025 10:56:02 -0400
Subject: [PATCH 3/7] adding more permission/hidden checks

---
 learning_resources/etl/canvas_utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py
index a62f62c377..012871b444 100644
--- a/learning_resources/etl/canvas_utils.py
+++ b/learning_resources/etl/canvas_utils.py
@@ -500,9 +500,12 @@ def _url_config_item_visible(item_configuration):
     if item_configuration:
         # check if explicitely unpublished
         unpublished = not item_configuration.get("published", True)
+        lock_at = item_configuration.get("lock_at")
+        unlock_at = item_configuration.get("unlock_at")
         return not any(
             [
                 unpublished,
+                is_date_locked(lock_at, unlock_at),
                 item_configuration.get("hidden"),  # file hidden
                 item_configuration.get("locked"),  # file locked
                 item_configuration.get("folder", {}).get(
@@ -520,6 +523,10 @@ def get_published_items(zipfile_path, url_config):
     published_items = {}
     course_settings = parse_canvas_settings(zipfile_path)
     tab_configuration = course_settings.get("tab_configuration", {})
+    """
+    mappings for ids:
+    # https://developerdocs.instructure.com/services/dap/dataset/dataset-additional-notes
+    """
     files_section_is_visible = not tab_configuration.get(11, {}).get("hidden", False)
     all_published_items = (
         parse_module_meta(zipfile_path)["active"]

From e9da34ba353e361c6f056b4b9b848f7c9e3a7096 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Fri, 26 Sep 2025 10:05:45 -0400
Subject: [PATCH 4/7] test fix

---
 learning_resources/etl/canvas_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 00201cbef3..cba2138063 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -524,7 +524,7 @@ def test_transform_canvas_content_files_url_assignment(mocker, tmp_path):
     """
     run = MagicMock()
     run.id = 1
-    url_config = {"/folder/file1.html": "https://cdn.example.com/file1.html"}
+    url_config = {"/folder/file1.html": {"url": "https://cdn.example.com/file1.html"}}
     # Patch _process_olx_path to yield content_data with source_path
     mock_content_data = [
         {"source_path": "data/folder/file1.html", "key": "file1"},
@@ -1140,8 +1140,8 @@ def test_get_url_config_assignments_and_pages(mocker, tmp_path):
     hmtl_page_title = "html page"
 
     url_config = {
-        hmtl_page_title: "https://example.com/htmlpage",
-        "/file1.html": "https://example.com/file1",
+        hmtl_page_title: {"url": "https://example.com/htmlpage"},
+        "/file1.html": {"url": "https://example.com/file1"},
     }
 
     run = LearningResourceRunFactory.create()

From bf1a15793ebe20e266330d2243150bc5c6efbad8 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Fri, 26 Sep 2025 13:58:24 -0400
Subject: [PATCH 5/7] adding tests

---
 learning_resources/etl/canvas_test.py | 301 ++++++++++++++++++++++++++
 1 file changed, 301 insertions(+)

diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index cba2138063..1e96f49910 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -16,6 +16,7 @@
 )
 from learning_resources.etl.canvas_utils import (
     _compact_element,
+    get_published_items,
     is_file_published,
     parse_canvas_settings,
     parse_files_meta,
@@ -1235,3 +1236,303 @@ def test_get_url_config_assignments_and_pages(mocker, tmp_path):
     )
     assert results["html page"] == "https://example.com/htmlpage"
     assert results["Item 1"] == "https://example.com/file1"
+
+
+def test_get_published_items_for_unpublshed_file(tmp_path):
+    html_content = """
+    <html>
+    <head><meta name="workflow_state" content="active"/></head>
+        <body>
+            <a href="$IMS-CC-FILEBASE$/file1.pdf">Embedded File 1</a>
+        </body>
+    </html>
+    """
+    module_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <modules xmlns="http://canvas.instructure.com/xsd/cccv1p0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    >
+      <module>
+
+        <title>Module 1</title>
+        <items>
+          <item identifier="RES3">
+            <workflow_state>active</workflow_state>
+            <title>Item 1</title>
+            <hidden>false</hidden>
+             <locked>false</locked>
+            <identifierref>RES3</identifierref>
+            <content_type>resource</content_type>
+          </item>
+        </items>
+      </module>
+    </modules>
+    """
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="RES1" type="webcontent"  href="web_resources/file1.pdf">
+          <file href="web_resources/file1.pdf"/>
+        </resource>
+        <resource identifier="RES2" type="webcontent" href="web_resources/file2.html">
+          <file href="web_resources/file2.html"/>
+        </resource>
+        <resource identifier="RES3" type="webcontent" href="web_resources/html_page.html">
+          <file href="web_resources/html_page.html"/>
+        </resource>
+      </resources>
+    </manifest>
+    """
+    zip_path = make_canvas_zip(
+        tmp_path,
+        module_xml=module_xml,
+        manifest_xml=manifest_xml,
+        files=[
+            ("web_resources/file1.pdf", "content of file1"),
+            ("web_resources/file2.html", "content of file2"),
+            ("web_resources/html_page.html", html_content),
+        ],
+    )
+    url_config = {  # scenario 1: the page's config entry is locked
+        "/html_page.html": {
+            "url": "https://cdn.example.com/",
+            "locked": True,
+        },
+    }
+    published = [item.name for item in get_published_items(zip_path, url_config)]
+    url_config = {  # scenario 2: the page's config entry is hidden
+        "/html_page.html": {
+            "url": "https://cdn.example.com/",
+            "hidden": True,
+        },
+    }
+    published.extend([item.name for item in get_published_items(zip_path, url_config)])
+    assert "html_page.html" not in published  # checked across both scenarios
+
+
+def test_get_published_items_for_unpublshed_parent_folder(mocker, tmp_path):
+    html_content = """
+    <html>
+    <head><meta name="workflow_state" content="active"/></head>
+        <body>
+            <a href="$IMS-CC-FILEBASE$/file1.pdf">Embedded File 1</a>
+        </body>
+    </html>
+    """
+    module_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <modules xmlns="http://canvas.instructure.com/xsd/cccv1p0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    >
+      <module>
+
+        <title>Module 1</title>
+        <items>
+          <item identifier="RES3">
+            <workflow_state>active</workflow_state>
+            <title>Item 1</title>
+            <hidden>false</hidden>
+             <locked>false</locked>
+            <identifierref>RES3</identifierref>
+            <content_type>resource</content_type>
+          </item>
+        </items>
+      </module>
+    </modules>
+    """
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="RES1" type="webcontent"  href="web_resources/file1.pdf">
+          <file href="web_resources/file1.pdf"/>
+        </resource>
+        <resource identifier="RES2" type="webcontent" href="web_resources/file2.html">
+          <file href="web_resources/file2.html"/>
+        </resource>
+        <resource identifier="RES3" type="webcontent" href="web_resources/html_page.html">
+          <file href="web_resources/html_page.html"/>
+        </resource>
+      </resources>
+    </manifest>
+    """
+    zip_path = make_canvas_zip(
+        tmp_path,
+        module_xml=module_xml,
+        manifest_xml=manifest_xml,
+        files=[
+            ("web_resources/file1.pdf", "content of file1"),
+            ("web_resources/file2.html", "content of file2"),
+            ("web_resources/html_page.html", html_content),
+        ],
+    )
+    # the page entry is marked published, but its parent folder is flagged hidden
+    url_config = {
+        "/html_page.html": {
+            "url": "https://cdn.example.com/",
+            "published": True,
+            "folder": {
+                "hidden": True,
+            },
+        },
+    }
+    published = [item.name for item in get_published_items(zip_path, url_config)]
+    assert "html_page.html" not in published
+
+
+def test_get_published_items_with_hidden_file_section(mocker, tmp_path):
+    """
+    Test that if the files section in the navbar is hidden,
+    no files are considered published even if they are marked as published
+    """
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="RES1" type="webcontent"  href="web_resources/file1.pdf">
+          <file href="web_resources/file1.pdf"/>
+        </resource>
+        <resource identifier="RES2" type="webcontent" href="web_resources/file2.html">
+          <file href="web_resources/file2.html"/>
+        </resource>
+        <resource identifier="RES3" type="webcontent" href="web_resources/html_page.html">
+          <file href="web_resources/html_page.html"/>
+        </resource>
+      </resources>
+    </manifest>
+    """
+    files_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+        <fileMeta xmlns="http://canvas.instructure.com/xsd/cccv1p0"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://canvas.instructure.com/xsd/cccv1p0 https://canvas.instructure.com/xsd/cccv1p0.xsd">
+        <files>
+        <file identifier="RES1">
+          <category>uncategorized</category>
+        </file>
+        <file identifier="RES2">
+          <category>uncategorized</category>
+        </file>
+        <file identifier="RES3">
+          <category>uncategorized</category>
+        </file>
+        </files>
+        </fileMeta>
+    """
+    zip_path = make_canvas_zip(
+        tmp_path,
+        manifest_xml=manifest_xml,
+        files=[
+            ("course_settings/files_meta.xml", files_xml),
+            ("web_resources/file1.pdf", "content of file1"),
+            ("web_resources/file2.html", "content of file2"),
+            ("web_resources/html_page.html", ""),
+        ],
+    )
+    # "/file3.html" has no counterpart among the entries passed as files above
+    url_config = {
+        "/html_page.html": {
+            "url": "https://cdn.example.com/",
+            "published": True,
+        },
+        "/file2.html": {
+            "url": "https://cdn.example.com/file2.html",
+            "published": True,
+        },
+        "/file3.html": {
+            "url": "https://cdn.example.com/file2.html",
+            "published": True,
+        },
+    }
+    published = [item.name for item in get_published_items(zip_path, url_config)]
+    assert sorted(published) == sorted(["html_page.html", "file2.html", "file1.pdf"])
+
+    # hide the files section in the navbar
+    settings_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+        <course identifier="gfef28ec71f16246c57edfeef25b26a54"
+        xmlns="http://canvas.instructure.com/xsd/cccv1p0"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://canvas.instructure.com/xsd/cccv1p0
+        https://canvas.instructure.com/xsd/cccv1p0.xsd">
+            <title>Test Course Title</title>
+            <tab_configuration>[{"id":0},{"id":11, "hidden":true}]</tab_configuration>
+            <course_code>TEST-101</course_code>
+            <other_field>Other Value</other_field>
+        </course>
+    """
+    # same manifest and files as before, now with settings that hide tab id 11
+    zip_path = make_canvas_zip(
+        tmp_path,
+        settings_xml=settings_xml,
+        manifest_xml=manifest_xml,
+        files=[
+            ("course_settings/files_meta.xml", files_xml),
+            ("web_resources/file1.pdf", "content of file1"),
+            ("web_resources/file2.html", "content of file2"),
+            ("web_resources/html_page.html", ""),
+        ],
+    )
+    published = [item.name for item in get_published_items(zip_path, url_config)]
+    assert len(published) == 0
+
+
+def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path):
+    html_content = """
+    <html>
+    <head><meta name="workflow_state" content="active"/></head>
+        <body>
+            <a href="$IMS-CC-FILEBASE$/file1.pdf">Embedded File 1</a>
+        </body>
+    </html>
+    """  # page body links to file1.pdf via the $IMS-CC-FILEBASE$ placeholder
+    module_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <modules xmlns="http://canvas.instructure.com/xsd/cccv1p0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    >
+      <module>
+
+        <title>Module 1</title>
+        <items>
+          <item identifier="RES3">
+            <workflow_state>active</workflow_state>
+            <title>Item 1</title>
+            <hidden>false</hidden>
+             <locked>false</locked>
+            <identifierref>RES3</identifierref>
+            <content_type>resource</content_type>
+          </item>
+        </items>
+      </module>
+    </modules>
+    """  # the module's only item references RES3 and is active, not hidden
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="RES1" type="webcontent"  href="web_resources/file1.pdf">
+          <file href="web_resources/file1.pdf"/>
+        </resource>
+        <resource identifier="RES2" type="webcontent" href="web_resources/file2.html">
+          <file href="web_resources/file2.html"/>
+        </resource>
+        <resource identifier="RES3" type="webcontent" href="web_resources/html_page.html">
+          <file href="web_resources/html_page.html"/>
+        </resource>
+      </resources>
+    </manifest>
+    """  # RES1-RES3 map to the three files under web_resources/
+    zip_path = make_canvas_zip(  # archive whose module page links to file1.pdf
+        tmp_path,
+        module_xml=module_xml,
+        manifest_xml=manifest_xml,
+        files=[
+            ("web_resources/file1.pdf", "content of file1"),
+            ("web_resources/file2.html", "content of file2"),
+            ("web_resources/html_page.html", html_content),
+        ],
+    )
+    url_config = {  # only file1.pdf is configured, and it is marked locked
+        "/file1.pdf": {
+            "url": "https://cdn.example.com/file1.pdf",
+            "locked": True,
+        },
+    }
+    published = get_published_items(zip_path, url_config)
+    assert (
+        published[Path("/src/web_resources/html_page.html")]["embedded_files"][0]
+        == "web_resources/file1.pdf"
+    )

From c95cd67e53a1b84f343b17090160f135dad57842 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Fri, 26 Sep 2025 15:13:20 -0400
Subject: [PATCH 6/7] fix embedded file assertion in get_published_items test

---
 learning_resources/etl/canvas_test.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 1e96f49910..b756dd5897 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -1529,10 +1529,8 @@ def test_get_published_items_for_unpublshed_but_embedded(mocker, tmp_path):
         "/file1.pdf": {
             "url": "https://cdn.example.com/file1.pdf",
             "locked": True,
+            "hidden": True,
         },
     }
     published = get_published_items(zip_path, url_config)
-    assert (
-        published[Path("/src/web_resources/html_page.html")]["embedded_files"][0]
-        == "web_resources/file1.pdf"
-    )
+    assert Path("web_resources/file1.pdf").resolve() in published

From e0ab8afd7dcafa344d5314ceac56be9df61c20c3 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Mon, 29 Sep 2025 10:08:44 -0400
Subject: [PATCH 7/7] add missing docstrings

---
 learning_resources/etl/canvas_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py
index 2b4bb7718b..6001a3d03f 100644
--- a/learning_resources/etl/canvas_utils.py
+++ b/learning_resources/etl/canvas_utils.py
@@ -511,6 +511,9 @@ def canvas_course_url(course_archive_path) -> str:
 
 
 def _url_config_key(item):
+    """
+    Get the url_config lookup key: path text after "web_resources", else the title
+    """
     if "web_resources" in str(item["path"]):
         return str(item["path"]).split("web_resources")[-1]
     return item.get("title")
@@ -544,6 +547,9 @@ def _url_config_item_visible(item_configuration):
 
 
 def get_published_items(zipfile_path, url_config):
+    """
+    Get all published items from a Canvas course archive
+    """
     published_items = {}
     course_settings = parse_canvas_settings(zipfile_path)
     tab_configuration = course_settings.get("tab_configuration", {})