diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 121cffd28f..4340078b60 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,7 +28,7 @@ jobs:
- 6379:6379
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
- name: update apt
run: sudo apt-get update -y
@@ -92,7 +92,7 @@ jobs:
javascript-tests:
runs-on: ubuntu-22.04
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
node-version: "^22.0.0"
@@ -151,7 +151,7 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
- name: Build the Docker image
env:
@@ -199,7 +199,7 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
@@ -223,7 +223,7 @@ jobs:
GENERATOR_OUTPUT_DIR_VC: ./frontends/api/src/generated/v0
runs-on: ubuntu-22.04
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
node-version: "^22.0.0"
@@ -262,7 +262,7 @@ jobs:
GENERATOR_OUTPUT_DIR_VC: ./frontends/api/src/generated/v1
runs-on: ubuntu-22.04
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
node-version: "^22.0.0"
diff --git a/.github/workflows/openapi-diff.yml b/.github/workflows/openapi-diff.yml
index 7611fe521b..2e963acf57 100644
--- a/.github/workflows/openapi-diff.yml
+++ b/.github/workflows/openapi-diff.yml
@@ -5,12 +5,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout HEAD
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
with:
ref: ${{ github.head_ref }}
path: head
- name: Checkout BASE
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
with:
ref: ${{ github.base_ref }}
path: base
diff --git a/.github/workflows/publish-pages.yml b/.github/workflows/publish-pages.yml
index 8be9f1897d..90271f976d 100644
--- a/.github/workflows/publish-pages.yml
+++ b/.github/workflows/publish-pages.yml
@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
+ uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
diff --git a/RELEASE.rst b/RELEASE.rst
index 5de3bd8312..389c1bfe6c 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -1,6 +1,19 @@
Release Notes
=============
+Version 0.42.3
+--------------
+
+- ingest canvas html content (#2502)
+- Fix userlist stale data (#2501)
+- redirect org users to org dashboard (#2482)
+- Set page size to 30 for the course query for programs in the org dash (#2494)
+- Update dependency Django to v4.2.24 (#2496)
+- adding fix for flaky test (#2491)
+- Toggle NextJS image optimization with an env var (#2385)
+- Stop running Sloan news/event tasks (#2493)
+- Update actions/checkout digest to 08eba0b (#2454)
+
Version 0.42.2 (Released September 08, 2025)
--------------
diff --git a/authentication/views.py b/authentication/views.py
index 992988186a..6ab3592a3c 100644
--- a/authentication/views.py
+++ b/authentication/views.py
@@ -1,14 +1,16 @@
"""Authentication views"""
import logging
+from urllib.parse import urljoin
from django.contrib.auth import logout
from django.shortcuts import redirect
from django.utils.http import url_has_allowed_host_and_scheme, urlencode
+from django.utils.text import slugify
from django.views import View
from main import settings
-from main.middleware.apisix_user import ApisixUserMiddleware
+from main.middleware.apisix_user import ApisixUserMiddleware, decode_apisix_headers
log = logging.getLogger(__name__)
@@ -64,6 +66,8 @@ class CustomLoginView(View):
Redirect the user to the appropriate url after login
"""
+ header = "HTTP_X_USERINFO"
+
def get(
self,
request,
@@ -76,10 +80,32 @@ def get(
redirect_url = get_redirect_url(request)
if not request.user.is_anonymous:
profile = request.user.profile
- if not profile.has_logged_in:
- profile.has_logged_in = True
- profile.save()
- if (
+
+ apisix_header = decode_apisix_headers(request, self.header)
+
+ # Check if user belongs to any organizations
+ user_organizations = (
+ apisix_header.get("organizations", {}) if apisix_header else {}
+ )
+
+ if user_organizations:
+ # First-time login for org user: redirect to org dashboard
+ if not profile.has_logged_in:
+ first_org_name = next(iter(user_organizations.keys()))
+ org_slug = slugify(first_org_name)
+
+ log.info(
+ "User %s belongs to organization: %s (slug: %s)",
+ request.user.email,
+ first_org_name,
+ org_slug,
+ )
+
+ redirect_url = urljoin(
+ settings.APP_BASE_URL, f"/dashboard/organization/{org_slug}"
+ )
+ # Non-organization users: apply onboarding logic
+ elif (
not profile.completed_onboarding
and request.GET.get("skip_onboarding", "0") == "0"
):
@@ -87,4 +113,9 @@ def get(
redirect_url = f"{settings.MITOL_NEW_USER_LOGIN_URL}?{params}"
profile.completed_onboarding = True
profile.save()
+
+ if not profile.has_logged_in:
+ profile.has_logged_in = True
+ profile.save()
+
return redirect(redirect_url)
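
Editor's note: the redirect branching added above is easiest to read in isolation. A minimal sketch, assuming `decode_apisix_headers` yields the decoded userinfo dict (or `None`); `APP_BASE_URL` and `pick_redirect` are illustrative stand-ins, not code from this diff:

```python
from urllib.parse import urljoin

from django.utils.text import slugify

APP_BASE_URL = "https://learn.mit.edu"  # stand-in for settings.APP_BASE_URL


def pick_redirect(userinfo, has_logged_in, default="/app"):
    """Mirror the view's branching: first-time org users go to their org dashboard."""
    organizations = (userinfo or {}).get("organizations", {})
    if organizations and not has_logged_in:
        first_org_name = next(iter(organizations))  # first org wins
        return urljoin(
            APP_BASE_URL, f"/dashboard/organization/{slugify(first_org_name)}"
        )
    # Org users who have logged in before, and all non-org users, fall through
    # (non-org users may still be diverted to onboarding by the view).
    return default


assert pick_redirect({"organizations": {"Test Organization": {}}}, False).endswith(
    "/dashboard/organization/test-organization"
)
assert pick_redirect({"organizations": {"Test Organization": {}}}, True) == "/app"
```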
diff --git a/authentication/views_test.py b/authentication/views_test.py
index 637af019ba..5f8da73272 100644
--- a/authentication/views_test.py
+++ b/authentication/views_test.py
@@ -3,9 +3,9 @@
import json
from base64 import b64encode
from unittest.mock import MagicMock
+from urllib.parse import urljoin
import pytest
-from django.conf import settings
from django.test import RequestFactory
from django.urls import reverse
@@ -28,10 +28,18 @@ def test_custom_login(mocker, next_url, allowed):
assert get_redirect_url(mock_request) == (next_url if allowed else "/app")
-@pytest.mark.parametrize("has_apisix_header", [True, False])
-@pytest.mark.parametrize("next_url", ["/search", None])
-def test_logout(mocker, next_url, client, user, has_apisix_header):
+@pytest.mark.parametrize(
+ "test_params",
+ [
+ (True, "/search"),
+ (True, None),
+ (False, "/search"),
+ (False, None),
+ ],
+)
+def test_logout(mocker, client, user, test_params, settings):
"""User should be properly redirected and logged out"""
+ has_apisix_header, next_url = test_params
header_str = b64encode(
json.dumps(
{
@@ -55,10 +63,10 @@ def test_logout(mocker, next_url, client, user, has_apisix_header):
mock_logout.assert_called_once()
-@pytest.mark.parametrize("is_authenticated", [True])
-@pytest.mark.parametrize("has_next", [False])
-def test_next_logout(mocker, client, user, is_authenticated, has_next):
+@pytest.mark.parametrize("test_params", [(True, False)])
+def test_next_logout(mocker, client, user, test_params, settings):
"""Test logout redirect cache assignment"""
+ is_authenticated, has_next = test_params
next_url = "https://ocw.mit.edu"
mock_request = mocker.MagicMock(
GET={"next": next_url if has_next else None},
@@ -211,3 +219,60 @@ def test_custom_login_view_first_time_login_sets_has_logged_in(mocker):
# Verify redirect was called with the correct URL
mock_redirect.assert_called_once_with("/dashboard")
+
+
+@pytest.mark.parametrize(
+ "test_case",
+ [
+ (
+ (False, False),
+ "/dashboard/organization/test-organization",
+ ), # First-time login → org dashboard
+ (
+ (False, True),
+ "/dashboard/organization/test-organization",
+ ), # First-time login → org dashboard
+ ((True, False), "/app"), # Subsequent login → normal app (not onboarding!)
+ ((True, True), "/app"), # Subsequent login → normal app
+ ],
+)
+def test_login_org_user_redirect(mocker, client, user, test_case, settings):
+ """Test organization user redirect behavior - org users skip onboarding regardless of onboarding status"""
+ # Unpack test case
+ profile_state, expected_url = test_case
+ has_logged_in, completed_onboarding = profile_state
+
+ # Set up user profile based on test scenario
+ user.profile.has_logged_in = has_logged_in
+ user.profile.completed_onboarding = completed_onboarding
+ user.profile.save()
+
+ header_str = b64encode(
+ json.dumps(
+ {
+ "preferred_username": user.username,
+ "email": user.email,
+ "sub": user.global_id,
+ "organization": {
+ "Test Organization": {
+ "role": "member",
+ "id": "org-123",
+ }
+ },
+ }
+ ).encode()
+ )
+ client.force_login(user)
+ response = client.get(
+ "/login/",
+ follow=False,
+ HTTP_X_USERINFO=header_str,
+ )
+ assert response.status_code == 302
+ # Handle environment differences - in some envs it returns full URL, in others just path
+ expected_full_url = urljoin(settings.APP_BASE_URL, expected_url)
+ assert response.url in [expected_url, expected_full_url]
+
+ # Verify that org users are never sent to onboarding
+ # (onboarding URL would contain settings.MITOL_NEW_USER_LOGIN_URL)
+ assert settings.MITOL_NEW_USER_LOGIN_URL not in response.url
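
Editor's note: for manual testing against a local APISIX setup, the userinfo header can be crafted the same way the tests do. A sketch, with claim names copied from the fixture above and values hypothetical:

```python
import json
from base64 import b64encode

userinfo = {
    "preferred_username": "testuser",  # hypothetical values
    "email": "testuser@example.com",
    "sub": "global-id-123",
    "organization": {
        "Test Organization": {"role": "member", "id": "org-123"},
    },
}
header_str = b64encode(json.dumps(userinfo).encode())
# With the Django test client the header is passed as HTTP_X_USERINFO:
#   client.get("/login/", HTTP_X_USERINFO=header_str)
```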
diff --git a/env/backend.env b/env/backend.env
index acb1a8eecd..ce267160e2 100644
--- a/env/backend.env
+++ b/env/backend.env
@@ -48,6 +48,7 @@ KEYCLOAK_CLIENT_ID=apisix
KEYCLOAK_CLIENT_SECRET=HckCZXToXfaetbBx0Fo3xbjnC468oMi4 # pragma: allowlist-secret
KEYCLOAK_DISCOVERY_URL=http://kc.ol.local:8066/realms/ol-local/.well-known/openid-configuration
KEYCLOAK_REALM_NAME=ol-local
+# For some organization-related functionality, add organization:* to the scopes below if you have orgs enabled
KEYCLOAK_SCOPES="openid profile ol-profile"
KEYCLOAK_SVC_KEYSTORE_PASSWORD=supertopsecret1234
KEYCLOAK_SVC_HOSTNAME=kc.ol.local
diff --git a/env/backend.local.example.env b/env/backend.local.example.env
index ef9edf1fba..89ad1cf4e3 100644
--- a/env/backend.local.example.env
+++ b/env/backend.local.example.env
@@ -31,6 +31,7 @@ KEYCLOAK_CLIENT_ID=apisix
KEYCLOAK_CLIENT_SECRET=HckCZXToXfaetbBx0Fo3xbjnC468oMi4 # pragma: allowlist-secret
KEYCLOAK_DISCOVERY_URL=http://kc.ol.local:8066/realms/ol-local/.well-known/openid-configuration
KEYCLOAK_REALM_NAME=ol-local
+# For some organization-related functionality, add organization:* to the scopes below if you have orgs enabled
KEYCLOAK_SCOPES="openid profile ol-profile"
KEYCLOAK_SVC_KEYSTORE_PASSWORD=supertopsecret1234
KEYCLOAK_SVC_HOSTNAME=kc.ol.local
diff --git a/env/frontend.env b/env/frontend.env
index 0434f13695..0225b9a761 100644
--- a/env/frontend.env
+++ b/env/frontend.env
@@ -1,6 +1,7 @@
NODE_ENV=development
PORT=8062
SENTRY_ENV=dev # Re-enable sentry
+OPTIMIZE_IMAGES="true"
# Environment variables with `NEXT_PUBLIC_` prefix are exposed to the client side
NEXT_PUBLIC_ORIGIN=${MITOL_APP_BASE_URL}
diff --git a/frontends/api/src/hooks/learningResources/index.ts b/frontends/api/src/hooks/learningResources/index.ts
index b8dad9135a..34774d47c6 100644
--- a/frontends/api/src/hooks/learningResources/index.ts
+++ b/frontends/api/src/hooks/learningResources/index.ts
@@ -97,7 +97,16 @@ const useLearningResourceSetUserListRelationships = () => {
params: LearningResourcesApiLearningResourcesUserlistsPartialUpdateRequest,
) => learningResourcesApi.learningResourcesUserlistsPartialUpdate(params),
onSettled: () => {
- queryClient.invalidateQueries({ queryKey: userlistKeys.membershipList() })
+ /**
+ * We need to invalidate:
+ * - membership check
+ * - list of user lists (count has changed)
+ * - userlist detail and listing for any lists we modified
+ *
+ * That's a lot. Let's just invalidate root.
+ * Additionally, the lists we've removed from the resource are not easily available.
+ */
+ queryClient.invalidateQueries({ queryKey: userlistKeys.root })
},
})
}
@@ -110,9 +119,16 @@ const useLearningResourceSetLearningPathRelationships = () => {
) =>
learningResourcesApi.learningResourcesLearningPathsPartialUpdate(params),
onSettled: () => {
- queryClient.invalidateQueries({
- queryKey: learningPathKeys.membershipList(),
- })
+ /**
+ * We need to invalidate:
+ * - membership check
+ * - list of learning paths (count has changed)
+ * - learning path detail and listing for any paths we modified
+ *
+ * That's a lot. Let's just invalidate root.
+ * Additionally, the lists we've removed from the resource are not easily available.
+ */
+ queryClient.invalidateQueries({ queryKey: learningPathKeys.root })
},
})
}
diff --git a/frontends/main/next.config.js b/frontends/main/next.config.js
index f2f968dbb7..23a66e4ddb 100644
--- a/frontends/main/next.config.js
+++ b/frontends/main/next.config.js
@@ -3,6 +3,10 @@ const { validateEnv } = require("./validateEnv")
validateEnv()
+const OPTIMIZE_IMAGES = (process.env.OPTIMIZE_IMAGES ?? "true") === "true"
+
const processFeatureFlags = () => {
const featureFlagPrefix =
process.env.NEXT_PUBLIC_POSTHOG_FEATURE_PREFIX || "FEATURE_"
@@ -90,6 +94,7 @@ const nextConfig = {
transpilePackages: ["@mitodl/smoot-design/ai"],
images: {
+ unoptimized: !OPTIMIZE_IMAGES,
remotePatterns: [
{
hostname: "**",
diff --git a/frontends/main/src/app-pages/DashboardPage/CoursewareDisplay/test-utils.ts b/frontends/main/src/app-pages/DashboardPage/CoursewareDisplay/test-utils.ts
index ae3ca0c42a..e97fe24e60 100644
--- a/frontends/main/src/app-pages/DashboardPage/CoursewareDisplay/test-utils.ts
+++ b/frontends/main/src/app-pages/DashboardPage/CoursewareDisplay/test-utils.ts
@@ -154,13 +154,21 @@ const setupProgramsAndCourses = () => {
{ results: [programB] },
)
setMockResponse.get(
- urls.courses.coursesList({ id: programA.courses, org_id: orgX.id }),
+ urls.courses.coursesList({
+ id: programA.courses,
+ org_id: orgX.id,
+ page_size: 30,
+ }),
{
results: coursesA.results,
},
)
setMockResponse.get(
- urls.courses.coursesList({ id: programB.courses, org_id: orgX.id }),
+ urls.courses.coursesList({
+ id: programB.courses,
+ org_id: orgX.id,
+ page_size: 30,
+ }),
{
results: coursesB.results,
},
@@ -262,6 +270,7 @@ function setupOrgDashboardMocks(
mitxonline.urls.courses.coursesList({
id: program.courses,
org_id: org.id,
+ page_size: 30,
}),
{ results: courses },
)
diff --git a/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx b/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx
index 27b91f0861..f3dacd8356 100644
--- a/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx
+++ b/frontends/main/src/app-pages/DashboardPage/OrganizationContent.tsx
@@ -250,7 +250,11 @@ const OrgProgramDisplay: React.FC<{
)
const hasValidCertificate = !!programEnrollment?.certificate
const courses = useQuery(
- coursesQueries.coursesList({ id: program.courseIds, org_id: orgId }),
+ coursesQueries.coursesList({
+ id: program.courseIds,
+ org_id: orgId,
+ page_size: 30,
+ }),
)
const skeleton = (
diff --git a/frontends/main/src/app/dashboard/my-lists/[id]/page.tsx b/frontends/main/src/app/dashboard/my-lists/[id]/page.tsx
index ce9897ceee..4567085512 100644
--- a/frontends/main/src/app/dashboard/my-lists/[id]/page.tsx
+++ b/frontends/main/src/app/dashboard/my-lists/[id]/page.tsx
@@ -1,14 +1,15 @@
import React from "react"
import { UserListDetailsContent } from "@/app-pages/DashboardPage/UserListDetailsContent"
-import { PageParams } from "@/app/types"
import invariant from "tiny-invariant"
+import { PageParams } from "@/app/types"
-const Page: React.FC<PageParams> = async ({
+const Page: React.FC<PageParams<{ id: string }>> = async ({
   params,
 }) => {
-  const resolved = await params
-  invariant(resolved?.id, "id is required")
-  return <UserListDetailsContent userListId={resolved.id} />
+  const resolved = await params!
+  const id = Number(resolved.id)
+  invariant(id, "id is required")
+  return <UserListDetailsContent userListId={id} />
}
export default Page
diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
index a6646042d8..f9fa6c7aff 100644
--- a/learning_resources/etl/canvas.py
+++ b/learning_resources/etl/canvas.py
@@ -12,6 +12,7 @@
from urllib.parse import unquote_plus
import pypdfium2 as pdfium
+from bs4 import BeautifulSoup
from defusedxml import ElementTree
from django.conf import settings
from litellm import completion
@@ -42,6 +43,14 @@
log = logging.getLogger(__name__)
+NAMESPACES = {
+ "cccv1p0": "http://canvas.instructure.com/xsd/cccv1p0",
+ "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
+ "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
+ "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
+}
+
+
def sync_canvas_archive(bucket, key: str, overwrite):
"""
Sync a Canvas course archive from S3
@@ -180,13 +189,20 @@ def transform_canvas_content_files(
basedir = course_zipfile.name.split(".")[0]
zipfile_path = course_zipfile.absolute()
# grab published module and file items
- published_items = [
- Path(item["path"]).resolve()
- for item in parse_module_meta(zipfile_path)["active"]
- ] + [
- Path(item["path"]).resolve()
- for item in parse_files_meta(zipfile_path)["active"]
- ]
+ published_items = (
+ [
+ Path(item["path"]).resolve()
+ for item in parse_module_meta(zipfile_path)["active"]
+ ]
+ + [
+ Path(item["path"]).resolve()
+ for item in parse_files_meta(zipfile_path)["active"]
+ ]
+ + [
+ Path(item["path"]).resolve()
+ for item in parse_web_content(zipfile_path)["active"]
+ ]
+ )
def _generate_content():
"""Inner generator for yielding content data"""
@@ -272,20 +288,51 @@ def transform_canvas_problem_files(
def parse_context_xml(course_archive_path: str) -> dict:
+ """
+ Parse course_settings/context.xml and return context info
+ """
with zipfile.ZipFile(course_archive_path, "r") as course_archive:
context = course_archive.read("course_settings/context.xml")
root = ElementTree.fromstring(context)
- namespaces = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"}
context_info = {}
item_keys = ["course_id", "root_account_id", "canvas_domain", "root_account_name"]
for key in item_keys:
- element = root.find(f"ns:{key}", namespaces)
+ element = root.find(f"cccv1p0:{key}", NAMESPACES)
if element is not None:
context_info[key] = element.text
return context_info
+def is_date_locked(lock_at: str, unlock_at: str) -> bool:
+ """
+ Determine if a resource is currently date-locked based
+ on lock_at and unlock_at strings.
+ Args:
+ lock_at (str): ISO 8601 date string when the resource locks
+ unlock_at (str): ISO 8601 date string when the resource unlocks
+ Returns:
+ bool: True if the resource is currently locked, False otherwise
+ """
+ now = now_in_utc()
+ if unlock_at and unlock_at.lower() != "nil":
+ try:
+ unlock_dt = datetime.fromisoformat(unlock_at.replace("Z", "+00:00"))
+ if now < unlock_dt:
+ return True
+ except Exception:
+ log.exception("Error parsing unlock_at date: %s", unlock_at)
+
+ if lock_at and lock_at.lower() != "nil":
+ try:
+ lock_dt = datetime.fromisoformat(lock_at.replace("Z", "+00:00"))
+ if now > lock_dt:
+ return True
+ except Exception:
+ log.exception("Error parsing lock_at date: %s", lock_at)
+ return False
+
+
def is_file_published(file_meta: dict) -> bool:
"""
Determine if a Canvas file (from files_meta.xml) is published/visible to students.
@@ -295,35 +342,18 @@ def is_file_published(file_meta: dict) -> bool:
Returns:
bool: True if file is published/visible, False otherwise.
"""
- now = now_in_utc()
hidden = str(file_meta.get("hidden", "false")).lower() == "true"
locked = str(file_meta.get("locked", "false")).lower() == "true"
-
unlock_at = file_meta.get("unlock_at")
lock_at = file_meta.get("lock_at")
-
visibility = file_meta.get("visibility", "inherit")
-
# If explicitly hidden or locked → unpublished
if hidden or locked:
return False
- if unlock_at and unlock_at.lower() != "nil":
- try:
- unlock_dt = datetime.fromisoformat(unlock_at.replace("Z", "+00:00"))
- if now < unlock_dt:
- return False
- except Exception:
- log.exception("Error parsing date: %s", unlock_at)
-
- if lock_at and lock_at.lower() != "nil":
- try:
- lock_dt = datetime.fromisoformat(lock_at.replace("Z", "+00:00"))
- if now > lock_dt:
- return False
- except Exception:
- log.exception("Error parsing date: %s", lock_at)
+ if is_date_locked(lock_at, unlock_at):
+ return False
# Visibility rules
if visibility in ("course", "inherit"):
return True
@@ -344,11 +374,9 @@ def parse_files_meta(course_archive_path: str) -> dict:
files_xml = course_archive.read(files_meta_path)
manifest_xml = course_archive.read("imsmanifest.xml")
resource_map = extract_resources_by_identifier(manifest_xml)
-
root = ElementTree.fromstring(files_xml)
- namespaces = {"c": "http://canvas.instructure.com/xsd/cccv1p0"}
try:
- for file_elem in root.findall(".//c:file", namespaces):
+ for file_elem in root.findall(".//cccv1p0:file", NAMESPACES):
meta = dict(file_elem.attrib)
for child in file_elem:
tag = child.tag
@@ -367,7 +395,10 @@ def parse_files_meta(course_archive_path: str) -> dict:
file_path = Path(file)
file_data["path"] = file_path
file_data["title"] = file_data.get("display_name")
- if file_data["published"]:
+ # explicitly exclude files in web_resources/ai/tutor
+ if file_data["published"] and not file.startswith(
+ settings.CANVAS_TUTORBOT_FOLDER
+ ):
publish_status["active"].append(file_data)
else:
publish_status["unpublished"].append(file_data)
@@ -387,19 +418,18 @@ def parse_module_meta(course_archive_path: str) -> dict:
resource_map = extract_resources_by_identifierref(manifest_xml)
publish_status = {"active": [], "unpublished": []}
try:
- namespaces = {"ns": "http://canvas.instructure.com/xsd/cccv1p0"}
root = ElementTree.fromstring(module_xml)
- for module in root.findall(".//ns:module", namespaces):
- module_title = module.find("ns:title", namespaces).text
- for item in module.findall("ns:items/ns:item", namespaces):
- item_state = item.find("ns:workflow_state", namespaces).text
- item_title = item.find("ns:title", namespaces).text
+ for module in root.findall(".//cccv1p0:module", NAMESPACES):
+ module_title = module.find("cccv1p0:title", NAMESPACES).text
+ for item in module.findall("cccv1p0:items/cccv1p0:item", NAMESPACES):
+ item_state = item.find("cccv1p0:workflow_state", NAMESPACES).text
+ item_title = item.find("cccv1p0:title", NAMESPACES).text
identifierref = (
- item.find("ns:identifierref", namespaces).text
- if item.find("ns:identifierref", namespaces) is not None
+ item.find("cccv1p0:identifierref", NAMESPACES).text
+ if item.find("cccv1p0:identifierref", NAMESPACES) is not None
else None
)
- content_type = item.find("ns:content_type", namespaces).text
+ content_type = item.find("cccv1p0:content_type", NAMESPACES).text
items = resource_map.get(identifierref, {})
for item_info in items:
for file in item_info.get("files", []):
@@ -419,6 +449,131 @@ def parse_module_meta(course_archive_path: str) -> dict:
return publish_status
+def _compact_element(element) -> dict | str | None:
+ """Recursively compact an element into a nested dictionary"""
+ if len(element) == 0: # No children, return text
+ return element.text.strip() if element.text else None
+ return {
+ child.tag.split("}")[-1] if "}" in child.tag else child.tag: _compact_element(
+ child
+ )
+ for child in element
+ }
+
+
+def _workflow_state_from_html(html: str) -> str | None:
+ """
+ Extract the workflow_state meta tag from html
+ """
+ soup = BeautifulSoup(html, "html.parser")
+ meta = soup.find("meta", attrs={"name": "workflow_state"})
+ return meta.get("content") if meta else None
+
+
+def _workflow_state_from_xml(xml_string: str) -> str:
+ """
+ Determine the workflow_state (published/unpublished) from assignment_settings.xml
+ """
+
+ def _get_text(tag):
+ el = root.find(f"cccv1p0:{tag}", NAMESPACES)
+ return el.text.strip() if el is not None and el.text else ""
+
+ try:
+ root = ElementTree.fromstring(xml_string)
+ except Exception:
+ log.exception("Error parsing XML: %s", sys.stderr)
+ return "unpublished"
+
+ if (
+ (
+ # workflow_state must be published
+ _get_text("workflow_state") != "published"
+ )
+ or (
+ # only_visible_to_overrides must not be true
+ _get_text("only_visible_to_overrides") == "true"
+ )
+ or (
+ # hide_in_gradebook must not be true (hidden from gradebook)
+ _get_text("hide_in_gradebook") == "true"
+ )
+ ):
+ return "unpublished"
+
+ lock_at = _get_text("lock_at")
+ unlock_at = _get_text("unlock_at")
+ if _get_text("module_locked") == "true" or is_date_locked(lock_at, unlock_at):
+ return "unpublished"
+
+ return "published"
+
+
+def _title_from_html(html: str) -> str:
+ """
+ Extract the title element from HTML content
+ """
+ soup = BeautifulSoup(html, "html.parser")
+ title = soup.find("title")
+ return title.get_text().strip() if title else ""
+
+
+def parse_web_content(course_archive_path: str) -> dict:
+ """
+ Parse html pages and assignments and return publish/active status of resources
+ """
+
+ publish_status = {"active": [], "unpublished": []}
+
+ with zipfile.ZipFile(course_archive_path, "r") as course_archive:
+ manifest_path = "imsmanifest.xml"
+ if manifest_path not in course_archive.namelist():
+ return publish_status
+ manifest_xml = course_archive.read(manifest_path)
+ resource_map = extract_resources_by_identifier(manifest_xml)
+ for item in resource_map:
+ resource_map_item = resource_map[item]
+ item_link = resource_map_item.get("href")
+ assignment_settings = None
+ for file in resource_map_item.get("files", []):
+ if file.endswith("assignment_settings.xml"):
+ assignment_settings = file
+ if item_link and item_link.endswith(".html"):
+ file_path = resource_map_item["href"]
+ html_content = course_archive.read(file_path)
+ if assignment_settings:
+ xml_content = course_archive.read(assignment_settings)
+ workflow_state = _workflow_state_from_xml(xml_content)
+ else:
+ workflow_state = _workflow_state_from_html(html_content)
+ title = _title_from_html(html_content)
+
+ lom_elem = (
+ resource_map_item.get("metadata", {})
+ .get("lom", {})
+ .get("educational", {})
+ )
+ # Determine if the content is intended for authors or instructors only
+ intended_role = lom_elem.get("intendedEndUserRole", {}).get("value")
+ authors_only = intended_role and intended_role.lower() != "student"
+
+ if workflow_state in ["active", "published"] and not authors_only:
+ publish_status["active"].append(
+ {
+ "title": title,
+ "path": file_path,
+ }
+ )
+ else:
+ publish_status["unpublished"].append(
+ {
+ "title": title,
+ "path": file_path,
+ }
+ )
+ return publish_status
+
+
def extract_resources_by_identifierref(manifest_xml: str) -> dict:
"""
Extract resources from an IMS manifest file and
@@ -426,29 +581,24 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
"""
root = ElementTree.fromstring(manifest_xml)
- namespaces = {
- "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
- "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
- "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
- }
# Dictionary to hold resources keyed by identifierref
resources_dict = defaultdict(list)
# Find all item elements with identifierref attributes
- for item in root.findall(".//imscp:item[@identifierref]", namespaces):
+ for item in root.findall(".//imscp:item[@identifierref]", NAMESPACES):
identifierref = item.get("identifierref")
title = (
- item.find("imscp:title", namespaces).text
- if item.find("imscp:title", namespaces) is not None
+ item.find("imscp:title", NAMESPACES).text
+ if item.find("imscp:title", NAMESPACES) is not None
else "No Title"
)
resource = root.find(
- f'.//imscp:resource[@identifier="{identifierref}"]', namespaces
+ f'.//imscp:resource[@identifier="{identifierref}"]', NAMESPACES
)
if resource is not None:
# Get all file elements within the resource
files = [
file_elem.get("href")
- for file_elem in resource.findall("imscp:file", namespaces)
+ for file_elem in resource.findall("imscp:file", NAMESPACES)
]
resources_dict[identifierref].append(
@@ -463,17 +613,9 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
file and return a map keyed by identifier.
"""
root = ElementTree.fromstring(manifest_xml)
-
- namespaces = {
- "imscp": "http://www.imsglobal.org/xsd/imsccv1p1/imscp_v1p1",
- "lom": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource",
- "lomimscc": "http://ltsc.ieee.org/xsd/imsccv1p1/LOM/manifest",
- }
-
resources_dict = {}
-
# Find all resource elements
- for resource in root.findall(".//imscp:resource[@identifier]", namespaces):
+ for resource in root.findall(".//imscp:resource[@identifier]", NAMESPACES):
identifier = resource.get("identifier")
resource_type = resource.get("type")
href = resource.get("href")
@@ -481,16 +623,13 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
# Get all file elements within the resource
files = [
file_elem.get("href")
- for file_elem in resource.findall("imscp:file", namespaces)
+ for file_elem in resource.findall("imscp:file", NAMESPACES)
]
-
# Extract metadata if present
metadata = {}
- metadata_elem = resource.find("imscp:metadata", namespaces)
+ metadata_elem = resource.find("imscp:metadata", NAMESPACES)
if metadata_elem is not None:
- # You can expand this to extract specific metadata fields as needed
- metadata["has_metadata"] = True
-
+ metadata.update(_compact_element(metadata_elem))
resources_dict[identifier] = {
"identifier": identifier,
"type": resource_type,
@@ -498,7 +637,6 @@ def extract_resources_by_identifier(manifest_xml: str) -> dict:
"files": files,
"metadata": metadata,
}
-
return resources_dict
@@ -538,6 +676,9 @@ def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
def _pdf_to_markdown(pdf_path):
+ """
+ Convert a PDF file to markdown using an LLM
+ """
markdown = ""
for im in pdf_to_base64_images(pdf_path):
response = completion(
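
Editor's note: `_compact_element` is what turns the manifest's LOM metadata into the plain dict that `parse_web_content` reads `intendedEndUserRole` from. A sketch of its output on a hypothetical minimal fixture:

```python
from defusedxml import ElementTree

from learning_resources.etl.canvas import _compact_element

xml = """
<metadata>
  <lom xmlns="http://ltsc.ieee.org/xsd/imsccv1p1/LOM/resource">
    <educational>
      <intendedEndUserRole>
        <value>Instructor</value>
      </intendedEndUserRole>
    </educational>
  </lom>
</metadata>
"""
elem = ElementTree.fromstring(xml)
# Namespaces are stripped to local tag names; text-only leaves become strings:
assert _compact_element(elem) == {
    "lom": {"educational": {"intendedEndUserRole": {"value": "Instructor"}}}
}
```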
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
index 0f9928e2ae..5f9e5536e8 100644
--- a/learning_resources/etl/canvas_test.py
+++ b/learning_resources/etl/canvas_test.py
@@ -6,12 +6,16 @@
from unittest.mock import MagicMock
import pytest
+from defusedxml import ElementTree
from learning_resources.constants import LearningResourceType, PlatformType
from learning_resources.etl.canvas import (
+ _compact_element,
is_file_published,
parse_canvas_settings,
+ parse_files_meta,
parse_module_meta,
+ parse_web_content,
run_for_canvas_archive,
transform_canvas_content_files,
transform_canvas_problem_files,
@@ -591,7 +595,7 @@ def test_is_file_published(file_meta, expected):
def test_published_module_and_files_meta_content_ingestion(mocker, tmp_path):
"""
- Test published files from files_meta and module_meta are retained,
+ Test published files from files_meta and module_meta are retained
"""
module_xml = b"""
@@ -692,3 +696,213 @@ def test_published_module_and_files_meta_content_ingestion(mocker, tmp_path):
assert "/file1.html" in result_paths
assert "/file3.html" in result_paths
assert bulk_unpub.mock_calls[0].args[0] == [stale_contentfile.id]
+
+
+@pytest.mark.parametrize(
+ ("element", "expected"),
+ [
+ # Test case: Element with no children, only text
+ (
+ ElementTree.fromstring("""Sample Text"""),
+ "Sample Text",
+ ),
+ # Test case: Element with children, nested structure
+ (
+ ElementTree.fromstring(
+ """
+ Child 1 Text
+
+ Subchild Text
+
+
+ """
+ ),
+ {
+ "child1": "Child 1 Text",
+ "child2": {"subchild": "Subchild Text"},
+ },
+ ),
+ # Test case: Element with mixed text and children
+ (
+ ElementTree.fromstring(
+ """Parent TextChild Text"""
+ ),
+ {
+ "child": "Child Text",
+ },
+ ),
+ ],
+)
+def test_compact_element(element, expected):
+ """
+ Test _compact_element function with nested tags.
+ """
+ result = _compact_element(element)
+ assert result == expected
+
+
+def test_parse_web_content_returns_active_and_unpublished(tmp_path):
+ """
+ Test that parse_web_content returns active and unpublished items
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+
+
+
+ """
+ html_content_active = b"""
+ Active Content
+ Content
+
+ """
+ html_content_unpublished = b"""
+ Unpublished Content
+ Content
+
+ """
+ zip_path = tmp_path / "canvas_course.zip"
+ with zipfile.ZipFile(zip_path, "w") as zf:
+ zf.writestr("imsmanifest.xml", manifest_xml)
+ zf.writestr("file1.html", html_content_active)
+ zf.writestr("file2.html", html_content_unpublished)
+
+ result = parse_web_content(zip_path)
+ assert "active" in result
+ assert "unpublished" in result
+
+ assert any(
+ item["title"] == "Active Content" and item["path"] == "file1.html"
+ for item in result["active"]
+ )
+ assert any(
+ item["title"] == "Unpublished Content" and item["path"] == "file2.html"
+ for item in result["unpublished"]
+ )
+
+
+def test_parse_web_content_handles_missing_workflow_state(tmp_path):
+ """
+ Test that parse_web_content handles missing workflow_state gracefully
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+ """
+ html_content = b"""
+ No Workflow State
+ Content
+
+ """
+ zip_path = tmp_path / "canvas_course.zip"
+ with zipfile.ZipFile(zip_path, "w") as zf:
+ zf.writestr("imsmanifest.xml", manifest_xml)
+ zf.writestr("file1.html", html_content)
+
+ result = parse_web_content(zip_path)
+ assert "active" in result
+ assert "unpublished" in result
+ assert len(result["active"]) == 0
+ assert len(result["unpublished"]) == 1
+ assert result["unpublished"][0]["title"] == "No Workflow State"
+ assert result["unpublished"][0]["path"] == "file1.html"
+
+
+def test_parse_web_content_excludes_instructor_only_content(tmp_path):
+ """
+ Test that parse_web_content excludes content intended for Instructors only
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+
+ Instructor
+
+
+
+
+
+
+
+ """
+ html_content = b"""
+ Instructor Only Content
+ Content
+
+ """
+ zip_path = tmp_path / "canvas_course.zip"
+ with zipfile.ZipFile(zip_path, "w") as zf:
+ zf.writestr("imsmanifest.xml", manifest_xml)
+ zf.writestr("file1.html", html_content)
+
+ result = parse_web_content(zip_path)
+ assert "active" in result
+ assert "unpublished" in result
+ assert len(result["active"]) == 0
+ assert len(result["unpublished"]) == 1
+ assert result["unpublished"][0]["title"] == "Instructor Only Content"
+ assert result["unpublished"][0]["path"] == "file1.html"
+
+
+def test_parse_files_meta_excludes_tutorbot_folder(tmp_path, settings):
+ """
+ Test that parse_files_meta explicitly excludes files in the tutorbot folder
+ even if they are marked as published in the files_meta.xml.
+ """
+ settings.CANVAS_TUTORBOT_FOLDER = "web_resources/ai/tutor/"
+ files_meta_xml = b"""
+
+
+ Published File
+ false
+ false
+ web_resources/ai/tutor/tutorfile.html
+
+
+ Regular File
+ false
+ false
+ regular_folder/file2.html
+
+
+ """
+ manifest_xml = b"""
+
+
+
+
+
+
+
+
+
+
+ """
+ zip_path = tmp_path / "canvas_course.zip"
+ with zipfile.ZipFile(zip_path, "w") as zf:
+ zf.writestr("course_settings/files_meta.xml", files_meta_xml)
+ zf.writestr("imsmanifest.xml", manifest_xml)
+
+ result = parse_files_meta(zip_path)
+
+ # Ensure the file in the tutorbot folder is excluded
+ assert len(result["active"]) == 1
+ assert result["active"][0]["path"].name == "file2.html"
+ assert len(result["unpublished"]) == 1
+ assert result["unpublished"][0]["path"].name == "tutorfile.html"
diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py
index 152c26590d..26e060e4e3 100644
--- a/learning_resources/tasks_test.py
+++ b/learning_resources/tasks_test.py
@@ -16,6 +16,7 @@
from learning_resources.etl.constants import MARKETING_PAGE_FILE_TYPE, ETLSource
from learning_resources.factories import (
LearningResourceFactory,
+ LearningResourcePlatformFactory,
)
from learning_resources.models import LearningResource
from learning_resources.tasks import (
@@ -686,8 +687,19 @@ def test_remove_duplicate_resources(mocker, mocked_celery):
"""
duplicate_id = "duplicate_id"
- LearningResourceFactory.create_batch(3, readable_id=duplicate_id, published=False)
- LearningResourceFactory.create(readable_id=duplicate_id)
+ for platform_type in [PlatformType.edx, PlatformType.xpro, PlatformType.youtube]:
+ LearningResourceFactory.create(
+ readable_id=duplicate_id,
+ published=False,
+ platform=LearningResourcePlatformFactory.create(code=platform_type.name),
+ )
+
+ LearningResourceFactory.create(
+ readable_id=duplicate_id,
+ platform=LearningResourcePlatformFactory.create(
+ code=platform_type.mitxonline.name
+ ),
+ )
assert LearningResource.objects.filter(readable_id=duplicate_id).count() == 4
with pytest.raises(mocked_celery.replace_exception_class):
remove_duplicate_resources()
diff --git a/main/settings.py b/main/settings.py
index 71f5793700..3db1565ee2 100644
--- a/main/settings.py
+++ b/main/settings.py
@@ -34,7 +34,7 @@
from main.settings_pluggy import * # noqa: F403
from openapi.settings_spectacular import open_spectacular_settings
-VERSION = "0.42.2"
+VERSION = "0.42.3"
log = logging.getLogger()
@@ -356,6 +356,7 @@
"first_name": "given_name",
"last_name": "family_name",
"name": "name",
+ "organizations": "organization",
},
"profiles.Profile": {
"name": "name",
diff --git a/main/settings_celery.py b/main/settings_celery.py
index 41995aa665..eb6c0e265f 100644
--- a/main/settings_celery.py
+++ b/main/settings_celery.py
@@ -108,18 +108,6 @@
"NEWS_EVENTS_MITPE_NEWS_SCHEDULE_SECONDS", 60 * 60 * 3
), # default is every 3 hours
},
- "update_sloan_news": {
- "task": "news_events.tasks.get_sloan_exec_news",
- "schedule": get_int(
- "NEWS_EVENTS_SLOAN_EXEC_NEWS_SCHEDULE_SECONDS", 60 * 60 * 3
- ), # default is every 3 hours
- },
- "update_sloan_webinars": {
- "task": "news_events.tasks.get_sloan_exec_webinars",
- "schedule": get_int(
- "NEWS_EVENTS_SLOAN_EXEC_WEBINAR_SCHEDULE_SECONDS", 60 * 60 * 12
- ), # default is every 12 hours
- },
"update_sloan_courses": {
"task": "learning_resources.tasks.get_sloan_data",
"schedule": crontab(minute=30, hour=4), # 12:30am EST
diff --git a/poetry.lock b/poetry.lock
index d535c8d88d..115dc17662 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1431,14 +1431,14 @@ static3 = "*"
[[package]]
name = "django"
-version = "4.2.23"
+version = "4.2.24"
description = "A high-level Python web framework that encourages rapid development and clean, pragmatic design."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
- {file = "django-4.2.23-py3-none-any.whl", hash = "sha256:dafbfaf52c2f289bd65f4ab935791cb4fb9a198f2a5ba9faf35d7338a77e9803"},
- {file = "django-4.2.23.tar.gz", hash = "sha256:42fdeaba6e6449d88d4f66de47871015097dc6f1b87910db00a91946295cfae4"},
+ {file = "django-4.2.24-py3-none-any.whl", hash = "sha256:a6527112c58821a0dfc5ab73013f0bdd906539790a17196658e36e66af43c350"},
+ {file = "django-4.2.24.tar.gz", hash = "sha256:40cd7d3f53bc6cd1902eadce23c337e97200888df41e4a73b42d682f23e71d80"},
]
[package.dependencies]
@@ -9063,4 +9063,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "~3.12"
-content-hash = "aaf3bc99e70404eca795dc3c392b8730509d674637e69836e51ac25aaca0b5ce"
+content-hash = "9b822fba05eb04db225295485b3c412f1dc240bd8a34270ff24bf0980026098b"
diff --git a/pyproject.toml b/pyproject.toml
index ac8873b96c..e5314fbebe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ requires-poetry = ">2.1,<3"
[tool.poetry.dependencies]
python = "~3.12"
-Django = "4.2.23"
+Django = "4.2.24"
attrs = "^25.0.0"
base36 = "^0.1.1"
beautifulsoup4 = "^4.8.2"