diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py index e1fb19e511..277d27d540 100644 --- a/learning_resources/etl/canvas.py +++ b/learning_resources/etl/canvas.py @@ -85,7 +85,7 @@ def run_for_canvas_archive(course_archive_path, course_folder, overwrite): """ checksum = calc_checksum(course_archive_path) course_info = parse_canvas_settings(course_archive_path) - course_title = course_info.get("title") + course_title = course_info.get("title", f"canvas course {course_folder}") url = canvas_course_url(course_archive_path) start_at = course_info.get("start_at") end_at = course_info.get("conclude_at") diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py index 0fb111e068..710f80c853 100644 --- a/learning_resources/etl/canvas_test.py +++ b/learning_resources/etl/canvas_test.py @@ -241,9 +241,12 @@ def make_canvas_zip( files = files or [] zip_path = tmp_path / "canvas_course.zip" with zipfile.ZipFile(zip_path, "w") as zf: - zf.writestr("course_settings/course_settings.xml", settings_xml) - zf.writestr("course_settings/module_meta.xml", module_xml) - zf.writestr("imsmanifest.xml", manifest_xml) + if settings_xml: + zf.writestr("course_settings/course_settings.xml", settings_xml) + if module_xml: + zf.writestr("course_settings/module_meta.xml", module_xml) + if manifest_xml: + zf.writestr("imsmanifest.xml", manifest_xml) for filename, content in files: zf.writestr(filename, content) return zip_path @@ -1737,3 +1740,67 @@ def test_get_published_items_for_attachment_module(mocker, tmp_path): } published = get_published_items(zip_path, url_config) assert Path("web_resources/visible_attachment_module.txt").resolve() in published + + +def test_ingestion_finishes_with_missing_xml_files(tmp_path, mocker): + """ + Test that canvas course ingestion succeeds even if some config XML files are missing + """ + mocker.patch( + "learning_resources.etl.canvas_utils.parse_context_xml", + return_value={"course_id": "123", "canvas_domain": "mit.edu"}, + ) + manifest_xml = b""" + + + + + + + + + + + + + + """ + files_xml = b""" + + + + uncategorized + + + uncategorized + + + uncategorized + + + + """ + zip_path = make_canvas_zip( + tmp_path, + manifest_xml=manifest_xml, + files=[ + ("course_settings/files_meta.xml", files_xml), + ("web_resources/file1.pdf", "content of file1"), + ("web_resources/file2.html", "content of file2"), + ("web_resources/html_page.html", ""), + ], + ) + mocker.patch( + "learning_resources.etl.utils.extract_text_metadata", + return_value={"content": "test"}, + ) + _, run = run_for_canvas_archive(zip_path, tmp_path, overwrite=True) + content_results = list( + transform_canvas_content_files( + Path(zip_path), run, url_config={}, overwrite=True + ) + ) + assert run is not None + assert len(content_results) > 0 diff --git a/learning_resources/etl/canvas_utils.py b/learning_resources/etl/canvas_utils.py index e915bf4d0a..516f82214d 100644 --- a/learning_resources/etl/canvas_utils.py +++ b/learning_resources/etl/canvas_utils.py @@ -114,7 +114,10 @@ def parse_module_meta(course_archive_path: str) -> dict: """ Parse module_meta.xml and return publish/active status of resources. """ + with zipfile.ZipFile(course_archive_path, "r") as course_archive: + if "course_settings/module_meta.xml" not in course_archive.namelist(): + return {"active": [], "unpublished": []} module_xml = course_archive.read("course_settings/module_meta.xml") manifest_xml = course_archive.read("imsmanifest.xml") resource_map = extract_resources_by_identifierref(manifest_xml) @@ -412,6 +415,8 @@ def parse_context_xml(course_archive_path: str) -> dict: Parse course_settings/context.xml and return context info """ with zipfile.ZipFile(course_archive_path, "r") as course_archive: + if "course_settings/context.xml" not in course_archive.namelist(): + return {} context = course_archive.read("course_settings/context.xml") root = ElementTree.fromstring(context) context_info = {}