From 6e0815fdcf20ed4ffe0429cfda01e1eba02c340d Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 08:52:04 -0400
Subject: [PATCH 1/9] initial working version of gathering assignments and
 pages

---
 learning_resources/etl/canvas.py | 88 +++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 7 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index a6646042d8..7bd4967e1b 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -12,6 +12,7 @@
 from urllib.parse import unquote_plus
 
 import pypdfium2 as pdfium
+from bs4 import BeautifulSoup
 from defusedxml import ElementTree
 from django.conf import settings
 from litellm import completion
@@ -180,13 +181,20 @@ def transform_canvas_content_files(
     basedir = course_zipfile.name.split(".")[0]
     zipfile_path = course_zipfile.absolute()
     # grab published module and file items
-    published_items = [
-        Path(item["path"]).resolve()
-        for item in parse_module_meta(zipfile_path)["active"]
-    ] + [
-        Path(item["path"]).resolve()
-        for item in parse_files_meta(zipfile_path)["active"]
-    ]
+    published_items = (
+        [
+            Path(item["path"]).resolve()
+            for item in parse_module_meta(zipfile_path)["active"]
+        ]
+        + [
+            Path(item["path"]).resolve()
+            for item in parse_files_meta(zipfile_path)["active"]
+        ]
+        + [
+            Path(item["path"]).resolve()
+            for item in parse_web_content(zipfile_path)["active"]
+        ]
+    )
 
     def _generate_content():
         """Inner generator for yielding content data"""
@@ -419,6 +427,72 @@ def parse_module_meta(course_archive_path: str) -> dict:
     return publish_status
 
 
+def _workflow_state_from_html(html):
+    soup = BeautifulSoup(html, "html.parser")
+
+    meta = soup.find("meta", attrs={"name": "workflow_state"})
+    return meta.get("content") if meta else None
+
+
+def _workflow_state_from_xml(xml):
+    try:
+        root = ElementTree.fromstring(xml)
+        namespace = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"}
+        workflow_element = root.find("ns:workflow_state", namespace)
+        return workflow_element.text.strip() if workflow_element is not None else None
+
+    except Exception:
+        log.exception("Error parsing XML: %s", sys.stderr)
+
+
+def _title_from_html(html):
+    soup = BeautifulSoup(html, "html.parser")
+    title = soup.find("title")
+    return title.get_text().strip() if title else ""
+
+
+def parse_web_content(course_archive_path: str) -> dict:
+    """
+    Parse course_settings/files_meta.xml and return publish/active status of resources.
+    """
+
+    publish_status = {"active": [], "unpublished": []}
+
+    with zipfile.ZipFile(course_archive_path, "r") as course_archive:
+        manifest_xml = course_archive.read("imsmanifest.xml")
+        resource_map = extract_resources_by_identifier(manifest_xml)
+        for item in resource_map:
+            item_link = resource_map[item].get("href")
+            assignment_settings = None
+            for file in resource_map[item].get("files", []):
+                if file.endswith("assignment_settings.xml"):
+                    assignment_settings = file
+            if item_link and item_link.endswith(".html"):
+                file_path = resource_map[item]["href"]
+                html_content = course_archive.read(file_path)
+                if assignment_settings:
+                    xml_content = course_archive.read(assignment_settings)
+                    workflow_state = _workflow_state_from_xml(xml_content)
+                else:
+                    workflow_state = _workflow_state_from_html(html_content)
+                title = _title_from_html(html_content)
+                if workflow_state in ["active", "published"]:
+                    publish_status["active"].append(
+                        {
+                            "title": title,
+                            "path": file_path,
+                        }
+                    )
+                else:
+                    publish_status["unpublished"].append(
+                        {
+                            "title": title,
+                            "path": file_path,
+                        }
+                    )
+    return publish_status
+
+
 def extract_resources_by_identifierref(manifest_xml: str) -> dict:
     """
     Extract resources from an IMS manifest file and

From 20064af8f33af0878cb3d064b3f0a60cee954fde Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 09:42:55 -0400
Subject: [PATCH 2/9] test fix

---
 learning_resources/etl/canvas.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 7bd4967e1b..09e5eb8c7a 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -459,7 +459,10 @@ def parse_web_content(course_archive_path: str) -> dict:
     publish_status = {"active": [], "unpublished": []}
 
     with zipfile.ZipFile(course_archive_path, "r") as course_archive:
-        manifest_xml = course_archive.read("imsmanifest.xml")
+        manifest_path = "imsmanifest.xml"
+        if manifest_path not in course_archive.namelist():
+            return publish_status
+        manifest_xml = course_archive.read(manifest_path)
         resource_map = extract_resources_by_identifier(manifest_xml)
         for item in resource_map:
             item_link = resource_map[item].get("href")

From 3ada4b0562b8d488ac581778cd53914885a9a647 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 11:05:09 -0400
Subject: [PATCH 3/9] check metadata for visibility to students vs authors

---
 learning_resources/etl/canvas.py | 55 ++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 09e5eb8c7a..098df4f2cc 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -280,6 +280,9 @@ def transform_canvas_problem_files(
 
 
 def parse_context_xml(course_archive_path: str) -> dict:
+    """
+    Parse course_settings/context.xml and return context info
+    """
     with zipfile.ZipFile(course_archive_path, "r") as course_archive:
         context = course_archive.read("course_settings/context.xml")
     root = ElementTree.fromstring(context)
@@ -427,14 +430,31 @@ def parse_module_meta(course_archive_path: str) -> dict:
     return publish_status
 
 
+def _compact_element(element):
+    """Recursively compact an element into a nested dictionary"""
+    if len(element) == 0:  # No children, return text
+        return element.text.strip() if element.text else None
+    return {
+        child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element(
+            child
+        )
+        for child in element
+    }
+
+
 def _workflow_state_from_html(html):
+    """
+    Extract the workflow_state meta tag from html
+    """
     soup = BeautifulSoup(html, "html.parser")
-
     meta = soup.find("meta", attrs={"name": "workflow_state"})
     return meta.get("content") if meta else None
 
 
 def _workflow_state_from_xml(xml):
+    """
+    Extract the workflow_state element from xml
+    """
     try:
         root = ElementTree.fromstring(xml)
         namespace = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"}
@@ -446,6 +466,9 @@ def _workflow_state_from_xml(xml):
 
 
 def _title_from_html(html):
+    """
+    Extract the title element from HTML content
+    """
     soup = BeautifulSoup(html, "html.parser")
     title = soup.find("title")
     return title.get_text().strip() if title else ""
@@ -453,7 +476,7 @@ def _title_from_html(html):
 
 def parse_web_content(course_archive_path: str) -> dict:
     """
-    Parse course_settings/files_meta.xml and return publish/active status of resources.
+    Parse html pages and assignments and return publish/active status of resources
     """
 
     publish_status = {"active": [], "unpublished": []}
@@ -465,13 +488,14 @@ def parse_web_content(course_archive_path: str) -> dict:
         manifest_xml = course_archive.read(manifest_path)
         resource_map = extract_resources_by_identifier(manifest_xml)
         for item in resource_map:
-            item_link = resource_map[item].get("href")
+            resource_map_item = resource_map[item]
+            item_link = resource_map_item.get("href")
             assignment_settings = None
-            for file in resource_map[item].get("files", []):
+            for file in resource_map_item.get("files", []):
                 if file.endswith("assignment_settings.xml"):
                     assignment_settings = file
             if item_link and item_link.endswith(".html"):
-                file_path = resource_map[item]["href"]
+                file_path = resource_map_item["href"]
                 html_content = course_archive.read(file_path)
                 if assignment_settings:
                     xml_content = course_archive.read(assignment_settings)
@@ -479,7 +503,17 @@ def parse_web_content(course_archive_path: str) -> dict:
                 else:
                     workflow_state = _workflow_state_from_html(html_content)
                 title = _title_from_html(html_content)
-                if workflow_state in ["active", "published"]:
+
+                lom_elem = (
+                    resource_map_item.get("metadata", {})
+                    .get("lom", {})
+                    .get("educational", {})
+                )
+                # Determine if the content is intended for authors or instructors only
+                intended_role = lom_elem.get("intendedEndUserRole", {}).get("value")
+                authors_only = intended_role and intended_role.lower() != "student"
+
+                if workflow_state in ["active", "published"] and not authors_only:
                     publish_status["active"].append(
                         {
                             "title": title,
@@ -540,15 +574,12 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
     file and return a map keyed by identifier.
     """
     root = ElementTree.fromstring(manifest_xml)
-
     namespaces = {
         "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
         "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
         "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
     }
-
     resources_dict = {}
-
     # Find all resource elements
     for resource in root.findall(".//imscp:resource[@identifier]", namespaces):
         identifier = resource.get("identifier")
@@ -560,14 +591,11 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
             file_elem.get("href")
             for file_elem in resource.findall("imscp:file", namespaces)
         ]
-
         # Extract metadata if present
         metadata = {}
         metadata_elem = resource.find("imscp:metadata", namespaces)
         if metadata_elem is not None:
-            # You can expand this to extract specific metadata fields as needed
-            metadata["has_metadata"] = True
-
+            metadata.update(_compact_element(metadata_elem))
         resources_dict[identifier] = {
             "identifier": identifier,
             "type": resource_type,
@@ -575,7 +603,6 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
             "files": files,
             "metadata": metadata,
         }
-
     return resources_dict
 
 

From aca1bc5d4a97ce0e20a528fff5ee63e1f0a1f473 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 11:06:38 -0400
Subject: [PATCH 4/9] type hints

---
 learning_resources/etl/canvas.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 098df4f2cc..50261fa96b 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -430,7 +430,7 @@ def parse_module_meta(course_archive_path: str) -> dict:
     return publish_status
 
 
-def _compact_element(element):
+def _compact_element(element: ElementTree.Element) -> dict | str | None:
     """Recursively compact an element into a nested dictionary"""
     if len(element) == 0:  # No children, return text
         return element.text.strip() if element.text else None
@@ -442,7 +442,7 @@ def _compact_element(element):
     }
 
 
-def _workflow_state_from_html(html):
+def _workflow_state_from_html(html: str) -> str:
     """
     Extract the workflow_state meta tag from html
     """
@@ -465,7 +465,7 @@ def _workflow_state_from_xml(xml):
         log.exception("Error parsing XML: %s", sys.stderr)
 
 
-def _title_from_html(html):
+def _title_from_html(html: str) -> str:
     """
     Extract the title element from HTML content
     """
@@ -642,6 +642,9 @@ def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
 
 
 def _pdf_to_markdown(pdf_path):
+    """
+    Convert a PDF file to markdown using an llm
+    """
     markdown = ""
     for im in pdf_to_base64_images(pdf_path):
         response = completion(

From 09c2ddaa7ce397e6e391cf6cbf42b5ad387c4a81 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 11:13:02 -0400
Subject: [PATCH 5/9] remove typehint

---
 learning_resources/etl/canvas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 50261fa96b..9398a4ee8f 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -430,7 +430,7 @@ def parse_module_meta(course_archive_path: str) -> dict:
     return publish_status
 
 
-def _compact_element(element: ElementTree.Element) -> dict | str | None:
+def _compact_element(element) -> dict | str | None:
     """Recursively compact an element into a nested dictionary"""
     if len(element) == 0:  # No children, return text
         return element.text.strip() if element.text else None

From d2f277e01e6c716cbf1d517d0491dcd8bf83cef2 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 12:55:02 -0400
Subject: [PATCH 6/9] consolidation of namespaces

---
 learning_resources/etl/canvas.py | 65 ++++++++++++++------------------
 1 file changed, 28 insertions(+), 37 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 9398a4ee8f..4ee74b762d 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -43,6 +43,14 @@
 log = logging.getLogger(__name__)
 
 
+NAMESPACES = {
+    "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0",
+    "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
+    "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
+    "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
+}
+
+
 def sync_canvas_archive(bucket, key: str, overwrite):
     """
     Sync a Canvas course archive from S3
@@ -286,11 +294,10 @@ def parse_context_xml(course_archive_path: str) -> dict:
     with zipfile.ZipFile(course_archive_path, "r") as course_archive:
         context = course_archive.read("course_settings/context.xml")
     root = ElementTree.fromstring(context)
-    namespaces = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"}
     context_info = {}
     item_keys = ["course_id", "root_account_id", "canvas_domain", "root_account_name"]
     for key in item_keys:
-        element = root.find(f"ns:{key}", namespaces)
+        element = root.find(f"cccv1p0:{key}", NAMESPACES)
         if element is not None:
             context_info[key] = element.text
 
@@ -310,12 +317,9 @@ def is_file_published(file_meta: dict) -> bool:
 
     hidden = str(file_meta.get("hidden", "false")).lower() == "true"
     locked = str(file_meta.get("locked", "false")).lower() == "true"
-
     unlock_at = file_meta.get("unlock_at")
     lock_at = file_meta.get("lock_at")
-
     visibility = file_meta.get("visibility", "inherit")
-
     # If explicitly hidden or locked → unpublished
     if hidden or locked:
         return False
@@ -355,11 +359,9 @@ def parse_files_meta(course_archive_path: str) -> dict:
         files_xml = course_archive.read(files_meta_path)
         manifest_xml = course_archive.read("imsmanifest.xml")
     resource_map = extract_resources_by_identifier(manifest_xml)
-
     root = ElementTree.fromstring(files_xml)
-    namespaces = {"c": "http://canvas.instructure.com/xsd/cccv1p0"}
     try:
-        for file_elem in root.findall(".//c:file", namespaces):
+        for file_elem in root.findall(".//cccv1p0:file", NAMESPACES):
             meta = dict(file_elem.attrib)
             for child in file_elem:
                 tag = child.tag
@@ -398,19 +400,18 @@ def parse_module_meta(course_archive_path: str) -> dict:
     resource_map = extract_resources_by_identifierref(manifest_xml)
     publish_status = {"active": [], "unpublished": []}
     try:
-        namespaces = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"}
         root = ElementTree.fromstring(module_xml)
-        for module in root.findall(".//ns:module", namespaces):
-            module_title = module.find("ns:title", namespaces).text
-            for item in module.findall("ns:items/ns:item", namespaces):
-                item_state = item.find("ns:workflow_state", namespaces).text
-                item_title = item.find("ns:title", namespaces).text
+        for module in root.findall(".//cccv1p0:module", NAMESPACES):
+            module_title = module.find("cccv1p0:title", NAMESPACES).text
+            for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES):
+                item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text
+                item_title = item.find("cccv1p0:title", NAMESPACES).text
                 identifierref = (
-                    item.find("ns:identifierref", namespaces).text
-                    if item.find("ns:identifierref", namespaces) is not None
+                    item.find("cccv1p0:identifierref", NAMESPACES).text
+                    if item.find("cccv1p0:identifierref", NAMESPACES) is not None
                     else None
                 )
-                content_type = item.find("ns:content_type", namespaces).text
+                content_type = item.find("cccv1p0:content_type", NAMESPACES).text
                 items = resource_map.get(identifierref, {})
                 for item_info in items:
                     for file in item_info.get("files", []):
@@ -457,8 +458,8 @@ def _workflow_state_from_xml(xml):
     """
     try:
         root = ElementTree.fromstring(xml)
-        namespace = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"}
-        workflow_element = root.find("ns:workflow_state", namespace)
+
+        workflow_element = root.find("cccv1p0:workflow_state", NAMESPACES)
         return workflow_element.text.strip() if workflow_element is not None else None
 
     except Exception:
@@ -537,29 +538,24 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
     """
     root = ElementTree.fromstring(manifest_xml)
 
-    namespaces = {
-        "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
-        "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
-        "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
-    }
     # Dictionary to hold resources keyed by identifierref
     resources_dict = defaultdict(list)
     # Find all item elements with identifierref attributes
-    for item in root.findall(".//imscp:item[@identifierref]", namespaces):
+    for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES):
         identifierref = item.get("identifierref")
         title = (
-            item.find("imscp:title", namespaces).text
-            if item.find("imscp:title", namespaces) is not None
+            item.find("imscp:title", NAMESPACES).text
+            if item.find("imscp:title", NAMESPACES) is not None
             else "No Title"
         )
         resource = root.find(
-            f'.//imscp:resource[@identifier="{identifierref}"]', namespaces
+            f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES
         )
         if resource is not None:
             # Get all file elements within the resource
             files = [
                 file_elem.get("href")
-                for file_elem in resource.findall("imscp:file", namespaces)
+                for file_elem in resource.findall("imscp:file", NAMESPACES)
             ]
 
             resources_dict[identifierref].append(
@@ -574,14 +570,9 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
     file and return a map keyed by identifier.
     """
     root = ElementTree.fromstring(manifest_xml)
-    namespaces = {
-        "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
-        "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
-        "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
-    }
     resources_dict = {}
     # Find all resource elements
-    for resource in root.findall(".//imscp:resource[@identifier]", namespaces):
+    for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES):
         identifier = resource.get("identifier")
         resource_type = resource.get("type")
         href = resource.get("href")
@@ -589,11 +580,11 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
         # Get all file elements within the resource
         files = [
             file_elem.get("href")
-            for file_elem in resource.findall("imscp:file", namespaces)
+            for file_elem in resource.findall("imscp:file", NAMESPACES)
         ]
         # Extract metadata if present
         metadata = {}
-        metadata_elem = resource.find("imscp:metadata", namespaces)
+        metadata_elem = resource.find("imscp:metadata", NAMESPACES)
         if metadata_elem is not None:
             metadata.update(_compact_element(metadata_elem))
         resources_dict[identifier] = {

From 9f128d45846630b3bb5f95df0c68ece12e161b9f Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 13:42:31 -0400
Subject: [PATCH 7/9] explicitely exclude tutor folder no matter what and added
 tests

---
 learning_resources/etl/canvas.py      |   6 +-
 learning_resources/etl/canvas_test.py | 167 +++++++++++++++++++++++++-
 2 files changed, 170 insertions(+), 3 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 4ee74b762d..5a431a6f22 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -314,7 +314,6 @@ def is_file_published(file_meta: dict) -> bool:
         bool: True if file is published/visible, False otherwise.
     """
     now = now_in_utc()
-
     hidden = str(file_meta.get("hidden", "false")).lower() == "true"
     locked = str(file_meta.get("locked", "false")).lower() == "true"
     unlock_at = file_meta.get("unlock_at")
@@ -380,7 +379,10 @@ def parse_files_meta(course_archive_path: str) -> dict:
                 file_path = Path(file)
                 file_data["path"] = file_path
                 file_data["title"] = file_data.get("display_name")
-                if file_data["published"]:
+                # explicitly exclude files in web_resources/ai/tutor
+                if file_data["published"] and not file.startswith(
+                    settings.CANVAS_TUTORBOT_FOLDER
+                ):
                     publish_status["active"].append(file_data)
                 else:
                     publish_status["unpublished"].append(file_data)
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 0f9928e2ae..05e3dc3fc1 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -6,12 +6,15 @@
 from unittest.mock import MagicMock
 
 import pytest
+from defusedxml import ElementTree
 
 from learning_resources.constants import LearningResourceType, PlatformType
 from learning_resources.etl.canvas import (
+    _compact_element,
     is_file_published,
     parse_canvas_settings,
     parse_module_meta,
+    parse_web_content,
     run_for_canvas_archive,
     transform_canvas_content_files,
     transform_canvas_problem_files,
@@ -591,7 +594,7 @@ def test_is_file_published(file_meta, expected):
 
 def test_published_module_and_files_meta_content_ingestion(mocker, tmp_path):
     """
-    Test published files from files_meta and module_meta are retained,
+    Test published files from files_meta and module_meta are retained
     """
 
     module_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
@@ -692,3 +695,165 @@ def test_published_module_and_files_meta_content_ingestion(mocker, tmp_path):
     assert "/file1.html" in result_paths
     assert "/file3.html" in result_paths
     assert bulk_unpub.mock_calls[0].args[0] == [stale_contentfile.id]
+
+
+@pytest.mark.parametrize(
+    ("element", "expected"),
+    [
+        # Test case: Element with no children, only text
+        (
+            ElementTree.fromstring("""<tag>Sample Text</tag>"""),
+            "Sample Text",
+        ),
+        # Test case: Element with children, nested structure
+        (
+            ElementTree.fromstring(
+                """<parent>
+                <child1>Child 1 Text</child1>
+                <child2>
+                    <subchild>Subchild Text</subchild>
+                </child2>
+            </parent>
+        """
+            ),
+            {
+                "child1": "Child 1 Text",
+                "child2": {"subchild": "Subchild Text"},
+            },
+        ),
+        # Test case: Element with mixed text and children
+        (
+            ElementTree.fromstring(
+                """<mixed>Parent Text<child>Child Text</child></mixed>"""
+            ),
+            {
+                "child": "Child Text",
+            },
+        ),
+    ],
+)
+def test_compact_element(element, expected):
+    """
+    Test _compact_element function with nested tags.
+    """
+    result = _compact_element(element)
+    assert result == expected
+
+
+def test_parse_web_content_returns_active_and_unpublished(tmp_path):
+    """
+    Test that parse_web_content returns active and unpublished items
+    """
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="RES1" type="webcontent" href="file1.html">
+          <file href="file1.html"/>
+        </resource>
+        <resource identifier="RES2" type="webcontent" href="file2.html">
+          <file href="file2.html"/>
+        </resource>
+      </resources>
+    </manifest>
+    """
+    html_content_active = b"""<html>
+    <head><title>Active Content</title><meta name="workflow_state" content="active"></head>
+    <body>Content</body>
+    </html>
+    """
+    html_content_unpublished = b"""<html>
+    <head><title>Unpublished Content</title><meta name="workflow_state" content="unpublished"></head>
+    <body>Content</body>
+    </html>
+    """
+    zip_path = tmp_path / "canvas_course.zip"
+    with zipfile.ZipFile(zip_path, "w") as zf:
+        zf.writestr("imsmanifest.xml", manifest_xml)
+        zf.writestr("file1.html", html_content_active)
+        zf.writestr("file2.html", html_content_unpublished)
+
+    result = parse_web_content(zip_path)
+    assert "active" in result
+    assert "unpublished" in result
+
+    assert any(
+        item["title"] == "Active Content" and item["path"] == "file1.html"
+        for item in result["active"]
+    )
+    assert any(
+        item["title"] == "Unpublished Content" and item["path"] == "file2.html"
+        for item in result["unpublished"]
+    )
+
+
+def test_parse_web_content_handles_missing_workflow_state(tmp_path):
+    """
+    Test that parse_web_content handles missing workflow_state gracefully
+    """
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="RES1" type="webcontent" href="file1.html">
+          <file href="file1.html"/>
+        </resource>
+      </resources>
+    </manifest>
+    """
+    html_content = b"""<html>
+    <head><title>No Workflow State</title></head>
+    <body>Content</body>
+    </html>
+    """
+    zip_path = tmp_path / "canvas_course.zip"
+    with zipfile.ZipFile(zip_path, "w") as zf:
+        zf.writestr("imsmanifest.xml", manifest_xml)
+        zf.writestr("file1.html", html_content)
+
+    result = parse_web_content(zip_path)
+    assert "active" in result
+    assert "unpublished" in result
+    assert len(result["active"]) == 0
+    assert len(result["unpublished"]) == 1
+    assert result["unpublished"][0]["title"] == "No Workflow State"
+    assert result["unpublished"][0]["path"] == "file1.html"
+
+
+def test_parse_web_content_excludes_instructor_only_content(tmp_path):
+    """
+    Test that parse_web_content excludes content intended for Instructors only
+    """
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="RES1" type="webcontent" href="file1.html">
+          <file href="file1.html"/>
+          <metadata>
+            <lom>
+              <educational>
+                <intendedEndUserRole>
+                  <value>Instructor</value>
+                </intendedEndUserRole>
+              </educational>
+            </lom>
+          </metadata>
+        </resource>
+      </resources>
+    </manifest>
+    """
+    html_content = b"""<html>
+    <head><title>Instructor Only Content</title><meta name="workflow_state" content="active"></head>
+    <body>Content</body>
+    </html>
+    """
+    zip_path = tmp_path / "canvas_course.zip"
+    with zipfile.ZipFile(zip_path, "w") as zf:
+        zf.writestr("imsmanifest.xml", manifest_xml)
+        zf.writestr("file1.html", html_content)
+
+    result = parse_web_content(zip_path)
+    assert "active" in result
+    assert "unpublished" in result
+    assert len(result["active"]) == 0
+    assert len(result["unpublished"]) == 1
+    assert result["unpublished"][0]["title"] == "Instructor Only Content"
+    assert result["unpublished"][0]["path"] == "file1.html"

From f65ce82145433310e88d505d724fc1838fc5dd89 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 13:48:13 -0400
Subject: [PATCH 8/9] fix test

---
 learning_resources/etl/canvas_test.py | 49 +++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 05e3dc3fc1..5f9e5536e8 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -13,6 +13,7 @@
     _compact_element,
     is_file_published,
     parse_canvas_settings,
+    parse_files_meta,
     parse_module_meta,
     parse_web_content,
     run_for_canvas_archive,
@@ -857,3 +858,51 @@ def test_parse_web_content_excludes_instructor_only_content(tmp_path):
     assert len(result["unpublished"]) == 1
     assert result["unpublished"][0]["title"] == "Instructor Only Content"
     assert result["unpublished"][0]["path"] == "file1.html"
+
+
+def test_parse_files_meta_excludes_tutorbot_folder(tmp_path, settings):
+    """
+    Test that parse_files_meta explicitly excludes files in the tutorbot folder
+    even if they are marked as published in the files_meta.xml.
+    """
+    settings.CANVAS_TUTORBOT_FOLDER = "web_resources/ai/tutor/"
+    files_meta_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <files xmlns="http://canvas.instructure.com/xsd/cccv1p0">
+      <file identifier="file1">
+        <display_name>Published File</display_name>
+        <hidden>false</hidden>
+        <locked>false</locked>
+        <path>web_resources/ai/tutor/tutorfile.html</path>
+      </file>
+      <file identifier="file2">
+        <display_name>Regular File</display_name>
+        <hidden>false</hidden>
+        <locked>false</locked>
+        <path>regular_folder/file2.html</path>
+      </file>
+    </files>
+    """
+    manifest_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
+    <manifest xmlns="http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1">
+      <resources>
+        <resource identifier="file1" type="webcontent">
+          <file href="web_resources/ai/tutor/tutorfile.html"/>
+        </resource>
+        <resource identifier="file2" type="webcontent">
+          <file href="regular_folder/file2.html"/>
+        </resource>
+      </resources>
+    </manifest>
+    """
+    zip_path = tmp_path / "canvas_course.zip"
+    with zipfile.ZipFile(zip_path, "w") as zf:
+        zf.writestr("course_settings/files_meta.xml", files_meta_xml)
+        zf.writestr("imsmanifest.xml", manifest_xml)
+
+    result = parse_files_meta(zip_path)
+
+    # Ensure the file in the tutorbot folder is excluded
+    assert len(result["active"]) == 1
+    assert result["active"][0]["path"].name == "file2.html"
+    assert len(result["unpublished"]) == 1
+    assert result["unpublished"][0]["path"].name == "tutorfile.html"

From 48b7027d8f8123cf931191c578f099ecb42bd146 Mon Sep 17 00:00:00 2001
From: shankar ambady <ambady@mit.edu>
Date: Tue, 9 Sep 2025 20:16:28 -0400
Subject: [PATCH 9/9] more conditionals for assignment workflow state

---
 learning_resources/etl/canvas.py | 85 +++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 22 deletions(-)

diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index 5a431a6f22..f9fa6c7aff 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -304,6 +304,35 @@ def parse_context_xml(course_archive_path: str) -> dict:
     return context_info
 
 
+def is_date_locked(lock_at: str, unlock_at: str) -> bool:
+    """
+    Determine if a resource is currently date-locked based
+    on lock_at and unlock_at strings.
+    Args:
+        lock_at (str): ISO 8601 date string when the resource locks
+        unlock_at (str): ISO 8601 date string when the resource unlocks
+    Returns:
+        bool: True if the resource is currently locked, False otherwise
+    """
+    now = now_in_utc()
+    if unlock_at and unlock_at.lower() != "nil":
+        try:
+            unlock_dt = datetime.fromisoformat(unlock_at.replace("Z", "+00:00"))
+            if now < unlock_dt:
+                return True
+        except Exception:
+            log.exception("Error parsing unlock_at date: %s", unlock_at)
+
+    if lock_at and lock_at.lower() != "nil":
+        try:
+            lock_dt = datetime.fromisoformat(lock_at.replace("Z", "+00:00"))
+            if now > lock_dt:
+                return True
+        except Exception:
+            log.exception("Error parsing lock_at date: %s", lock_at)
+    return False
+
+
 def is_file_published(file_meta: dict) -> bool:
     """
     Determine if a Canvas file (from files_meta.xml) is published/visible to students.
@@ -313,7 +342,7 @@ def is_file_published(file_meta: dict) -> bool:
     Returns:
         bool: True if file is published/visible, False otherwise.
     """
-    now = now_in_utc()
+
     hidden = str(file_meta.get("hidden", "false")).lower() == "true"
     locked = str(file_meta.get("locked", "false")).lower() == "true"
     unlock_at = file_meta.get("unlock_at")
@@ -323,21 +352,8 @@ def is_file_published(file_meta: dict) -> bool:
     if hidden or locked:
         return False
 
-    if unlock_at and unlock_at.lower() != "nil":
-        try:
-            unlock_dt = datetime.fromisoformat(unlock_at.replace("Z", "+00:00"))
-            if now < unlock_dt:
-                return False
-        except Exception:
-            log.exception("Error parsing date: %s", unlock_at)
-
-    if lock_at and lock_at.lower() != "nil":
-        try:
-            lock_dt = datetime.fromisoformat(lock_at.replace("Z", "+00:00"))
-            if now > lock_dt:
-                return False
-        except Exception:
-            log.exception("Error parsing date: %s", lock_at)
+    if is_date_locked(lock_at, unlock_at):
+        return False
     # Visibility rules
     if visibility in ("course", "inherit"):
         return True
@@ -454,18 +470,43 @@ def _workflow_state_from_html(html: str) -> str:
     return meta.get("content") if meta else None
 
 
-def _workflow_state_from_xml(xml):
+def _workflow_state_from_xml(xml_string: str) -> bool:
     """
-    Extract the workflow_state element from xml
+    Determine the workflow_state (published/unpublished) from assignment_settings.xml
     """
-    try:
-        root = ElementTree.fromstring(xml)
 
-        workflow_element = root.find("cccv1p0:workflow_state", NAMESPACES)
-        return workflow_element.text.strip() if workflow_element is not None else None
+    def _get_text(tag):
+        el = root.find(f"cccv1p0:{tag}", NAMESPACES)
+        return el.text.strip() if el is not None and el.text else ""
 
+    try:
+        root = ElementTree.fromstring(xml_string)
     except Exception:
         log.exception("Error parsing XML: %s", sys.stderr)
+        return "unpublished"
+
+    if (
+        (
+            # workflow_state must be published
+            _get_text("workflow_state") != "published"
+        )
+        or (
+            # only_visible_to_overrides must not be true
+            _get_text("only_visible_to_overrides") == "true"
+        )
+        or (
+            # hide_in_gradebook must not be true (hidden from gradebook)
+            _get_text("hide_in_gradebook") == "true"
+        )
+    ):
+        return "unpublished"
+
+    lock_at = _get_text("lock_at")
+    unlock_at = _get_text("unlock_at")
+    if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at):
+        return "unpublished"
+
+    return "published"
 
 
 def _title_from_html(html: str) -> str: