From 990aeebc7a6cd42d9de7838a275d125e088857d6 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Mon, 24 Apr 2023 13:20:53 -0400 Subject: [PATCH 1/7] chore: Update catalog-info.yaml --- catalog-info.yaml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/catalog-info.yaml b/catalog-info.yaml index 6fb2ae8..28736d3 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -2,9 +2,9 @@ # https://open-edx-proposals.readthedocs.io/en/latest/processes/oep-0055-proc-project-maintainers.html apiVersion: backstage.io/v1alpha1 -kind: "" +kind: "Component" metadata: - name: 'openedx_event_sink_clickhouse' + name: 'openedx-event-sink-clickhouse' description: "A sink for Open edX events to send them to ClickHouse" annotations: # (Optional) Annotation keys and values can be whatever you want. @@ -15,18 +15,10 @@ metadata: spec: # (Required) This can be a group(`group:` or a user(`user:`) - owner: "" + owner: "group:openedx-event-sink-clickhouse-maintainers" # (Required) Acceptable Type Values: service, website, library - type: '' + type: 'library' # (Required) Acceptable Lifecycle Values: experimental, production, deprecated lifecycle: 'experimental' - - # (Optional) The value can be the name of any known component. - subcomponentOf: '' - - # (Optional) An array of different components or resources. - dependsOn: - - '' - - '' From 7747b55ba6ccd32887c2ced4f178f07cd425d5b3 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Mon, 24 Apr 2023 16:22:59 -0400 Subject: [PATCH 2/7] feat: Add event listener for course publish Creates the edx-platform plugin plumbing, adds some new requirements, maps the appropriate Django Signal to push course structure to ClickHouse. --- README.rst | 52 +++- event_sink_clickhouse/apps.py | 39 +++ event_sink_clickhouse/settings/__init__.py | 0 event_sink_clickhouse/settings/common.py | 19 ++ event_sink_clickhouse/settings/production.py | 13 + event_sink_clickhouse/signals.py | 13 + event_sink_clickhouse/sinks/__init__.py | 0 event_sink_clickhouse/sinks/base_sink.py | 26 ++ .../sinks/course_published.py | 226 ++++++++++++++++++ event_sink_clickhouse/tasks.py | 30 +++ requirements/base.in | 5 +- requirements/base.txt | 74 +++++- requirements/dev.txt | 115 +++++++++ requirements/doc.txt | 124 ++++++++-- requirements/quality.txt | 119 ++++++++- requirements/test.in | 1 + requirements/test.txt | 131 +++++++++- setup.py | 8 +- test_utils/helpers.py | 210 ++++++++++++++++ tests/test_course_published.py | 117 +++++++++ tests/test_django_settings.py | 55 +++++ 21 files changed, 1345 insertions(+), 32 deletions(-) create mode 100644 event_sink_clickhouse/settings/__init__.py create mode 100644 event_sink_clickhouse/settings/common.py create mode 100644 event_sink_clickhouse/settings/production.py create mode 100644 event_sink_clickhouse/signals.py create mode 100644 event_sink_clickhouse/sinks/__init__.py create mode 100644 event_sink_clickhouse/sinks/base_sink.py create mode 100644 event_sink_clickhouse/sinks/course_published.py create mode 100644 event_sink_clickhouse/tasks.py create mode 100644 test_utils/helpers.py create mode 100644 tests/test_course_published.py create mode 100644 tests/test_django_settings.py diff --git a/README.rst b/README.rst index 3ad6808..9172454 100644 --- a/README.rst +++ b/README.rst @@ -7,17 +7,25 @@ Event Sink ClickHouse Purpose ******* -A listener for `Open edX events`_ to send them to ClickHouse. This project -acts as a plugin to the Edx Platform, listens for configured Open edX events, -and sends them to a ClickHouse database for analytics or other processing. This -is being maintained as part of the Open Analytics Reference System (OARS) -project. +This project acts as a plugin to the `Edx Platform`_, listens for +configured `Open edX events`_, and sends them to a `ClickHouse`_ database for +analytics or other processing. This is being maintained as part of the Open +Analytics Reference System (`OARS`_) project. OARS consumes the data sent to ClickHouse by this plugin as part of data enrichment for reporting, or capturing data that otherwise does not fit in xAPI. +Currently the only sink is in the CMS. It listens for the ``COURSE_PUBLISHED`` +signal and serializes a subset of the published course blocks into one table +and the relationships between blocks into another table. With those we are +able to recreate the "graph" of the course and get relevant data, such as +block names, for reporting. + .. _Open edX events: https://github.com/openedx/openedx-events +.. _Edx Platform: https://github.com/openedx/edx-platform +.. _ClickHouse: https://clickhouse.com +.. _OARS: https://docs.openedx.org/projects/openedx-oars/en/latest/index.html Getting Started *************** @@ -75,12 +83,38 @@ Every time you develop something in this repo Deploying ========= -TODO: How can a new user go about deploying this component? Is it just a few -commands? Is there a larger how-to that should be linked here? +This plugin will be deployed by default in an OARS Tutor environment. For other +deployments install the library or add it to private requirements of your +virtual environment ( ``requirements/private.txt`` ). + +#. Run ``pip install openedx-event-sink-clickhouse``. + +#. Run migrations: + +- ``python manage.py lms migrate`` + +- ``python manage.py cms migrate`` -PLACEHOLDER: For details on how to deploy this component, see the `deployment how-to`_ +#. Restart LMS service and celery workers of edx-platform. + +Configuration +=============== + +Currently all events will be listened to by default (there is only one). So +the only necessary configuration is a ClickHouse connection: + +.. code-block:: -.. _deployment how-to: https://docs.openedx.org/projects/openedx-event-sink-clickhouse/how-tos/how-to-deploy-this-component.html + EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG = { + # URL to a running ClickHouse server's HTTP interface. ex: https://foo.openedx.org:8443/ or + # http://foo.openedx.org:8123/ . Note that we only support the ClickHouse HTTP interface + # to avoid pulling in more dependencies to the platform than necessary. + "url": "http://clickhouse:8123", + "username": "changeme", + "password": "changeme", + "database": "event_sink", + "timeout_secs": 3, + } Getting Help ************ diff --git a/event_sink_clickhouse/apps.py b/event_sink_clickhouse/apps.py index 4981b8f..2d666d2 100644 --- a/event_sink_clickhouse/apps.py +++ b/event_sink_clickhouse/apps.py @@ -3,6 +3,7 @@ """ from django.apps import AppConfig +from edx_django_utils.plugins import PluginSettings, PluginSignals class EventSinkClickhouseConfig(AppConfig): @@ -11,3 +12,41 @@ class EventSinkClickhouseConfig(AppConfig): """ name = 'event_sink_clickhouse' + verbose_name = "Event Sink ClickHouse" + + plugin_app = { + PluginSettings.CONFIG: { + 'lms.djangoapp': { + 'production': {PluginSettings.RELATIVE_PATH: 'settings.production'}, + 'common': {PluginSettings.RELATIVE_PATH: 'settings.common'}, + 'devstack': {PluginSettings.RELATIVE_PATH: 'settings.devstack'}, + }, + 'cms.djangoapp': { + 'production': {PluginSettings.RELATIVE_PATH: 'settings.production'}, + 'common': {PluginSettings.RELATIVE_PATH: 'settings.common'}, + 'devstack': {PluginSettings.RELATIVE_PATH: 'settings.devstack'}, + } + }, + # Configuration setting for Plugin Signals for this app. + PluginSignals.CONFIG: { + # Configure the Plugin Signals for each Project Type, as needed. + 'cms.djangoapp': { + # List of all plugin Signal receivers for this app and project type. + PluginSignals.RECEIVERS: [{ + # The name of the app's signal receiver function. + PluginSignals.RECEIVER_FUNC_NAME: 'receive_course_publish', + + # The full path to the module where the signal is defined. + PluginSignals.SIGNAL_PATH: 'xmodule.modulestore.django.COURSE_PUBLISHED', + }], + } + }, + } + + def ready(self): + """ + Import our Celery tasks for initialization. + """ + super().ready() + + from . import tasks # pylint: disable=import-outside-toplevel, unused-import diff --git a/event_sink_clickhouse/settings/__init__.py b/event_sink_clickhouse/settings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/event_sink_clickhouse/settings/common.py b/event_sink_clickhouse/settings/common.py new file mode 100644 index 0000000..1b6d117 --- /dev/null +++ b/event_sink_clickhouse/settings/common.py @@ -0,0 +1,19 @@ +""" +Default settings for the openedx_event_sink_clickhouse app. +""" + + +def plugin_settings(settings): + """ + Adds default settings + """ + settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG = { + # URL to a running ClickHouse server's HTTP interface. ex: https://foo.openedx.org:8443/ or + # http://foo.openedx.org:8123/ . Note that we only support the ClickHouse HTTP interface + # to avoid pulling in more dependencies to the platform than necessary. + "url": "http://clickhouse:8123", + "username": "changeme", + "password": "changeme", + "database": "event_sink", + "timeout_secs": 3, + } diff --git a/event_sink_clickhouse/settings/production.py b/event_sink_clickhouse/settings/production.py new file mode 100644 index 0000000..18fee66 --- /dev/null +++ b/event_sink_clickhouse/settings/production.py @@ -0,0 +1,13 @@ +""" +Production settings for the openedx_event_sink_clickhouse app. +""" + + +def plugin_settings(settings): + """ + Override the default app settings with production settings. + """ + settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG = settings.ENV_TOKENS.get( + 'EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG', + settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG + ) diff --git a/event_sink_clickhouse/signals.py b/event_sink_clickhouse/signals.py new file mode 100644 index 0000000..f752120 --- /dev/null +++ b/event_sink_clickhouse/signals.py @@ -0,0 +1,13 @@ +""" +Signal handler functions, mapped to specific signals in apps.py. +""" + + +def receive_course_publish(sender, course_key, **kwargs): # pylint: disable=unused-argument + """ + Receives COURSE_PUBLISHED signal and queues the dump job. + """ + # import here, because signal is registered at startup, but items in tasks are not yet able to be loaded + from .tasks import dump_course_to_clickhouse # pylint: disable=import-outside-toplevel + + dump_course_to_clickhouse.delay(str(course_key)) diff --git a/event_sink_clickhouse/sinks/__init__.py b/event_sink_clickhouse/sinks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/event_sink_clickhouse/sinks/base_sink.py b/event_sink_clickhouse/sinks/base_sink.py new file mode 100644 index 0000000..89ca440 --- /dev/null +++ b/event_sink_clickhouse/sinks/base_sink.py @@ -0,0 +1,26 @@ +""" +Base classes for event sinks +""" + +from django.conf import settings + + +class BaseSink: + """ + Base class for ClickHouse event sink, allows overwriting of default settings + """ + def __init__(self, connection_overrides, log): + self.log = log + self.ch_url = settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["url"] + self.ch_auth = (settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["username"], + settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["password"]) + self.ch_database = settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["database"] + self.ch_timeout_secs = settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["timeout_secs"] + + # If any overrides to the ClickHouse connection + if connection_overrides: + self.ch_url = connection_overrides.get("url", self.ch_url) + self.ch_auth = (connection_overrides.get("username", self.ch_auth[0]), + connection_overrides.get("password", self.ch_auth[1])) + self.ch_database = connection_overrides.get("database", self.ch_database) + self.ch_timeout_secs = connection_overrides.get("timeout_secs", self.ch_timeout_secs) diff --git a/event_sink_clickhouse/sinks/course_published.py b/event_sink_clickhouse/sinks/course_published.py new file mode 100644 index 0000000..92c7567 --- /dev/null +++ b/event_sink_clickhouse/sinks/course_published.py @@ -0,0 +1,226 @@ +""" +Handler for the CMS COURSE_PUBLISHED event + +Does the following: +- Pulls the course structure from modulestore +- Serialize the xblocks and their parent/child relationships +- Sends them to ClickHouse in CSV format + +Note that the serialization format does not include all fields as there may be things like +LTI passwords and other secrets. We just take the fields necessary for reporting at this time. +""" + +import csv +import io + +import requests +from django.utils import timezone + +from .base_sink import BaseSink + + +class CoursePublishedSink(BaseSink): + """ + Event sink for the COURSE_PUBLISHED signal + """ + @staticmethod + def _get_detached_xblock_types(): + """ + Import and return DETACHED_XBLOCK_TYPES. + Placed here to avoid model import at startup and to facilitate mocking them in testing. + """ + # pylint: disable=import-outside-toplevel,import-error + from xmodule.modulestore.store_utilities import DETACHED_XBLOCK_TYPES + return DETACHED_XBLOCK_TYPES + + @staticmethod + def _get_modulestore(): + """ + Import and return modulestore. + Placed here to avoid model import at startup and to facilitate mocking them in testing. + """ + # pylint: disable=import-outside-toplevel,import-error + from xmodule.modulestore.django import modulestore + return modulestore() + + @staticmethod + def _get_course_overview_model(): + """ + Import and return CourseOverview. + Placed here to avoid model import at startup and to facilitate mocking them in testing. + """ + # pylint: disable=import-outside-toplevel,import-error + from openedx.core.djangoapps.content.course_overviews.models import CourseOverview + return CourseOverview + + @staticmethod + def strip_branch_and_version(location): + """ + Removes the branch and version information from a location. + Args: + location: an xblock's location. + Returns: that xblock's location without branch and version information. + """ + return location.for_branch(None) + + @staticmethod + def get_course_last_published(course_key): + """ + Get approximate last publish date for the given course. + + We use the 'modified' column in the CourseOverview table as a quick and easy + (although perhaps inexact) way of determining when a course was last + published. This works because CourseOverview rows are re-written upon + course publish. + + Args: + course_key: a CourseKey + + Returns: The datetime the course was last published at, stringified. + Uses Python's default str(...) implementation for datetimes, which + is sortable and similar to ISO 8601: + https://docs.python.org/3/library/datetime.html#datetime.date.__str__ + """ + CourseOverview = CoursePublishedSink._get_course_overview_model() + approx_last_published = CourseOverview.get_from_id(course_key).modified + return str(approx_last_published) + + @staticmethod + def serialize_item(item, index, detached_xblock_types): + """ + Args: + item: an XBlock + index: a number indicating where the item falls in the course hierarchy + + Returns: + fields: a *limited* dictionary of an XBlock's field names and values + block_type: the name of the XBlock's type (i.e. 'course' + or 'problem') + """ + course_key = item.scope_ids.usage_id.course_key + block_type = item.scope_ids.block_type + + rtn_fields = { + 'org': course_key.org, + 'course_key': str(course_key), + 'course': course_key.course, + 'run': course_key.run, + 'location': str(item.location), + 'display_name': item.display_name_with_default.replace("'", "\'"), + 'block_type': block_type, + 'detached': 1 if block_type in detached_xblock_types else 0, + 'edited_on': str(getattr(item, 'edited_on', '')), + 'time_last_dumped': str(timezone.now()), + 'order': index, + } + + return rtn_fields + + def serialize_course(self, course_id): + """ + Serializes a course into a CSV of nodes and relationships. + + Args: + course_id: CourseKey of the course we want to serialize + + Returns: + nodes: a list of dicts representing xblocks for the course + relationships: a list of dicts representing relationships between nodes + """ + modulestore = CoursePublishedSink._get_modulestore() + detached_xblock_types = CoursePublishedSink._get_detached_xblock_types() + + # create a location to node mapping we'll need later for + # writing relationships + location_to_node = {} + items = modulestore.get_items(course_id) + + # create nodes + i = 0 + for item in items: + i += 1 + fields = self.serialize_item(item, i, detached_xblock_types) + location_to_node[self.strip_branch_and_version(item.location)] = fields + + # create relationships + relationships = [] + for item in items: + for index, child in enumerate(item.get_children()): + parent_node = location_to_node.get(self.strip_branch_and_version(item.location)) + child_node = location_to_node.get(self.strip_branch_and_version(child.location)) + + if parent_node is not None and child_node is not None: + relationship = { + 'course_key': str(course_id), + 'parent_location': str(parent_node["location"]), + 'child_location': str(child_node["location"]), + 'order': index + } + relationships.append(relationship) + + nodes = list(location_to_node.values()) + return nodes, relationships + + def dump(self, course_key): + """ + Do the serialization and send to ClickHouse + """ + nodes, relationships = self.serialize_course(course_key) + + self.log.info( + "Now dumping %s to ClickHouse: %d nodes and %d relationships", + course_key, + len(nodes), + len(relationships), + ) + + course_string = str(course_key) + + try: + # Params that begin with "param_" will be used in the query replacement + # all others are ClickHouse settings. + params = { + # Fail early on bulk inserts + "input_format_allow_errors_num": 1, + "input_format_allow_errors_ratio": 0.1, + } + + # "query" is a special param for the query, it's the best way to get the FORMAT CSV in there. + params["query"] = f"INSERT INTO {self.ch_database}.course_blocks FORMAT CSV" + + output = io.StringIO() + writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC) + + for node in nodes: + writer.writerow(node.values()) + + response = requests.post(self.ch_url, data=output.getvalue(), params=params, auth=self.ch_auth, + timeout=self.ch_timeout_secs) + self.log.info(response.headers) + self.log.info(response) + self.log.info(response.text) + response.raise_for_status() + + # Just overwriting the previous query + params["query"] = f"INSERT INTO {self.ch_database}.course_relationships FORMAT CSV" + output = io.StringIO() + writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC) + + for relationship in relationships: + writer.writerow(relationship.values()) + + response = requests.post(self.ch_url, data=output.getvalue(), params=params, auth=self.ch_auth, + timeout=self.ch_timeout_secs) + self.log.info(response.headers) + self.log.info(response) + self.log.info(response.text) + response.raise_for_status() + + self.log.info("Completed dumping %s to ClickHouse", course_key) + + except Exception: + self.log.exception( + "Error trying to dump course %s to ClickHouse!", + course_string + ) + raise diff --git a/event_sink_clickhouse/tasks.py b/event_sink_clickhouse/tasks.py new file mode 100644 index 0000000..60d54e1 --- /dev/null +++ b/event_sink_clickhouse/tasks.py @@ -0,0 +1,30 @@ +""" +This file contains a management command for exporting course modulestore data to ClickHouse. +""" + +import logging + +from celery import shared_task +from edx_django_utils.monitoring import set_code_owner_attribute +from opaque_keys.edx.keys import CourseKey + +from .sinks.course_published import CoursePublishedSink + +log = logging.getLogger(__name__) +celery_log = logging.getLogger('edx.celery.task') + + +@shared_task +@set_code_owner_attribute +def dump_course_to_clickhouse(course_key_string, connection_overrides=None): + """ + Serialize a course and writes it to ClickHouse. + + Arguments: + course_key_string: course key for the course to be exported + connection_overrides (dict): overrides to ClickHouse connection + parameters specified in `settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG`. + """ + course_key = CourseKey.from_string(course_key_string) + sink = CoursePublishedSink(connection_overrides=connection_overrides, log=celery_log) + sink.dump(course_key) diff --git a/requirements/base.in b/requirements/base.in index a954780..c08ac0a 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -1,5 +1,8 @@ # Core requirements for using this application -c constraints.txt +celery # Asynchronous task execution library Django # Web application framework - +requests # HTTP request library +edx-django-utils # Django utilities, we use caching and monitoring +edx-opaque-keys # Parsing library for course and usage keys diff --git a/requirements/base.txt b/requirements/base.txt index a615544..6dc376e 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,13 +4,85 @@ # # make upgrade # +amqp==5.1.1 + # via kombu asgiref==3.6.0 # via django +billiard==3.6.4.0 + # via celery +celery==5.2.7 + # via -r requirements/base.in +certifi==2022.12.7 + # via requests +cffi==1.15.1 + # via pynacl +charset-normalizer==3.1.0 + # via requests +click==8.1.3 + # via + # celery + # click-didyoumean + # click-plugins + # click-repl + # edx-django-utils +click-didyoumean==0.3.0 + # via celery +click-plugins==1.1.1 + # via celery +click-repl==0.2.0 + # via celery django==3.2.18 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/base.in + # django-crum + # edx-django-utils +django-crum==0.7.9 + # via edx-django-utils +django-waffle==3.0.0 + # via edx-django-utils +edx-django-utils==5.4.0 + # via -r requirements/base.in +edx-opaque-keys==2.3.0 + # via -r requirements/base.in +idna==3.4 + # via requests +kombu==5.2.4 + # via celery +newrelic==8.8.0 + # via edx-django-utils +pbr==5.11.1 + # via stevedore +prompt-toolkit==3.0.38 + # via click-repl +psutil==5.9.5 + # via edx-django-utils +pycparser==2.21 + # via cffi +pymongo==3.13.0 + # via edx-opaque-keys +pynacl==1.5.0 + # via edx-django-utils pytz==2023.3 - # via django + # via + # celery + # django +requests==2.29.0 + # via -r requirements/base.in +six==1.16.0 + # via click-repl sqlparse==0.4.4 # via django +stevedore==5.0.0 + # via + # edx-django-utils + # edx-opaque-keys +urllib3==1.26.15 + # via requests +vine==5.0.0 + # via + # amqp + # celery + # kombu +wcwidth==0.2.6 + # via prompt-toolkit diff --git a/requirements/dev.txt b/requirements/dev.txt index 2392e4a..bea09b2 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -4,6 +4,10 @@ # # make upgrade # +amqp==5.1.1 + # via + # -r requirements/quality.txt + # kombu asgiref==3.6.0 # via # -r requirements/quality.txt @@ -13,24 +17,59 @@ astroid==2.15.4 # -r requirements/quality.txt # pylint # pylint-celery +billiard==3.6.4.0 + # via + # -r requirements/quality.txt + # celery build==0.10.0 # via # -r requirements/pip-tools.txt # pip-tools +celery==5.2.7 + # via -r requirements/quality.txt +certifi==2022.12.7 + # via + # -r requirements/quality.txt + # requests +cffi==1.15.1 + # via + # -r requirements/quality.txt + # pynacl chardet==5.1.0 # via diff-cover +charset-normalizer==3.1.0 + # via + # -r requirements/quality.txt + # requests click==8.1.3 # via # -r requirements/pip-tools.txt # -r requirements/quality.txt + # celery + # click-didyoumean # click-log + # click-plugins + # click-repl # code-annotations + # edx-django-utils # edx-lint # pip-tools +click-didyoumean==0.3.0 + # via + # -r requirements/quality.txt + # celery click-log==0.4.0 # via # -r requirements/quality.txt # edx-lint +click-plugins==1.1.1 + # via + # -r requirements/quality.txt + # celery +click-repl==0.2.0 + # via + # -r requirements/quality.txt + # celery code-annotations==1.3.0 # via # -r requirements/quality.txt @@ -53,11 +92,25 @@ django==3.2.18 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/quality.txt + # django-crum + # edx-django-utils # edx-i18n-tools +django-crum==0.7.9 + # via + # -r requirements/quality.txt + # edx-django-utils +django-waffle==3.0.0 + # via + # -r requirements/quality.txt + # edx-django-utils +edx-django-utils==5.4.0 + # via -r requirements/quality.txt edx-i18n-tools==0.9.2 # via -r requirements/dev.in edx-lint==5.3.4 # via -r requirements/quality.txt +edx-opaque-keys==2.3.0 + # via -r requirements/quality.txt exceptiongroup==1.1.1 # via # -r requirements/quality.txt @@ -67,6 +120,10 @@ filelock==3.12.0 # -r requirements/ci.txt # tox # virtualenv +idna==3.4 + # via + # -r requirements/quality.txt + # requests iniconfig==2.0.0 # via # -r requirements/quality.txt @@ -80,6 +137,10 @@ jinja2==3.1.2 # -r requirements/quality.txt # code-annotations # diff-cover +kombu==5.2.4 + # via + # -r requirements/quality.txt + # celery lazy-object-proxy==1.9.0 # via # -r requirements/quality.txt @@ -92,6 +153,10 @@ mccabe==0.7.0 # via # -r requirements/quality.txt # pylint +newrelic==8.8.0 + # via + # -r requirements/quality.txt + # edx-django-utils packaging==23.1 # via # -r requirements/ci.txt @@ -123,12 +188,24 @@ pluggy==1.0.0 # tox polib==1.2.0 # via edx-i18n-tools +prompt-toolkit==3.0.38 + # via + # -r requirements/quality.txt + # click-repl +psutil==5.9.5 + # via + # -r requirements/quality.txt + # edx-django-utils py==1.11.0 # via # -r requirements/ci.txt # tox pycodestyle==2.10.0 # via -r requirements/quality.txt +pycparser==2.21 + # via + # -r requirements/quality.txt + # cffi pydocstyle==6.3.0 # via -r requirements/quality.txt pygments==2.15.1 @@ -153,6 +230,14 @@ pylint-plugin-utils==0.7 # -r requirements/quality.txt # pylint-celery # pylint-django +pymongo==3.13.0 + # via + # -r requirements/quality.txt + # edx-opaque-keys +pynacl==1.5.0 + # via + # -r requirements/quality.txt + # edx-django-utils pyproject-hooks==1.0.0 # via # -r requirements/pip-tools.txt @@ -173,16 +258,25 @@ python-slugify==8.0.1 pytz==2023.3 # via # -r requirements/quality.txt + # celery # django pyyaml==6.0 # via # -r requirements/quality.txt # code-annotations # edx-i18n-tools + # responses +requests==2.29.0 + # via + # -r requirements/quality.txt + # responses +responses==0.23.1 + # via -r requirements/quality.txt six==1.16.0 # via # -r requirements/ci.txt # -r requirements/quality.txt + # click-repl # edx-lint # tox snowballstemmer==2.2.0 @@ -197,6 +291,8 @@ stevedore==5.0.0 # via # -r requirements/quality.txt # code-annotations + # edx-django-utils + # edx-opaque-keys text-unidecode==1.3 # via # -r requirements/quality.txt @@ -223,15 +319,34 @@ tox==3.28.0 # tox-battery tox-battery==0.6.1 # via -r requirements/ci.txt +types-pyyaml==6.0.12.9 + # via + # -r requirements/quality.txt + # responses typing-extensions==4.5.0 # via # -r requirements/quality.txt # astroid # pylint +urllib3==1.26.15 + # via + # -r requirements/quality.txt + # requests + # responses +vine==5.0.0 + # via + # -r requirements/quality.txt + # amqp + # celery + # kombu virtualenv==20.23.0 # via # -r requirements/ci.txt # tox +wcwidth==0.2.6 + # via + # -r requirements/quality.txt + # prompt-toolkit wheel==0.40.0 # via # -r requirements/pip-tools.txt diff --git a/requirements/doc.txt b/requirements/doc.txt index 55ff9ea..23fc8d3 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -8,6 +8,10 @@ accessible-pygments==0.0.4 # via pydata-sphinx-theme alabaster==0.7.13 # via sphinx +amqp==5.1.1 + # via + # -r requirements/test.txt + # kombu asgiref==3.6.0 # via # -r requirements/test.txt @@ -18,32 +22,69 @@ babel==2.12.1 # sphinx beautifulsoup4==4.12.2 # via pydata-sphinx-theme +billiard==3.6.4.0 + # via + # -r requirements/test.txt + # celery bleach==6.0.0 # via readme-renderer build==0.10.0 # via -r requirements/doc.in +celery==5.2.7 + # via -r requirements/test.txt certifi==2022.12.7 - # via requests + # via + # -r requirements/test.txt + # requests cffi==1.15.1 - # via cryptography + # via + # -r requirements/test.txt + # pynacl charset-normalizer==3.1.0 - # via requests + # via + # -r requirements/test.txt + # requests click==8.1.3 # via # -r requirements/test.txt + # celery + # click-didyoumean + # click-plugins + # click-repl # code-annotations + # edx-django-utils +click-didyoumean==0.3.0 + # via + # -r requirements/test.txt + # celery +click-plugins==1.1.1 + # via + # -r requirements/test.txt + # celery +click-repl==0.2.0 + # via + # -r requirements/test.txt + # celery code-annotations==1.3.0 # via -r requirements/test.txt coverage[toml]==7.2.5 # via # -r requirements/test.txt # pytest-cov -cryptography==40.0.2 - # via secretstorage django==3.2.18 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/test.txt + # django-crum + # edx-django-utils +django-crum==0.7.9 + # via + # -r requirements/test.txt + # edx-django-utils +django-waffle==3.0.0 + # via + # -r requirements/test.txt + # edx-django-utils doc8==1.1.1 # via -r requirements/doc.in docutils==0.19 @@ -53,12 +94,18 @@ docutils==0.19 # readme-renderer # restructuredtext-lint # sphinx +edx-django-utils==5.4.0 + # via -r requirements/test.txt +edx-opaque-keys==2.3.0 + # via -r requirements/test.txt exceptiongroup==1.1.1 # via # -r requirements/test.txt # pytest idna==3.4 - # via requests + # via + # -r requirements/test.txt + # requests imagesize==1.4.1 # via sphinx importlib-metadata==6.6.0 @@ -74,10 +121,6 @@ iniconfig==2.0.0 # pytest jaraco-classes==3.2.3 # via keyring -jeepney==0.8.0 - # via - # keyring - # secretstorage jinja2==3.1.2 # via # -r requirements/test.txt @@ -85,6 +128,10 @@ jinja2==3.1.2 # sphinx keyring==23.13.1 # via twine +kombu==5.2.4 + # via + # -r requirements/test.txt + # celery markdown-it-py==2.2.0 # via rich markupsafe==2.1.2 @@ -95,6 +142,10 @@ mdurl==0.1.2 # via markdown-it-py more-itertools==9.1.0 # via jaraco-classes +newrelic==8.8.0 + # via + # -r requirements/test.txt + # edx-django-utils packaging==23.1 # via # -r requirements/test.txt @@ -112,8 +163,18 @@ pluggy==1.0.0 # via # -r requirements/test.txt # pytest +prompt-toolkit==3.0.38 + # via + # -r requirements/test.txt + # click-repl +psutil==5.9.5 + # via + # -r requirements/test.txt + # edx-django-utils pycparser==2.21 - # via cffi + # via + # -r requirements/test.txt + # cffi pydata-sphinx-theme==0.13.3 # via sphinx-book-theme pygments==2.15.1 @@ -124,6 +185,14 @@ pygments==2.15.1 # readme-renderer # rich # sphinx +pymongo==3.13.0 + # via + # -r requirements/test.txt + # edx-opaque-keys +pynacl==1.5.0 + # via + # -r requirements/test.txt + # edx-django-utils pyproject-hooks==1.0.0 # via build pytest==7.3.1 @@ -143,30 +212,37 @@ pytz==2023.3 # via # -r requirements/test.txt # babel + # celery # django pyyaml==6.0 # via # -r requirements/test.txt # code-annotations + # responses readme-renderer==37.3 # via twine requests==2.29.0 # via + # -r requirements/test.txt # requests-toolbelt + # responses # sphinx # twine -requests-toolbelt==0.10.1 +requests-toolbelt==1.0.0 # via twine +responses==0.23.1 + # via -r requirements/test.txt restructuredtext-lint==1.4.0 # via doc8 rfc3986==2.0.0 # via twine rich==13.3.5 # via twine -secretstorage==3.3.3 - # via keyring six==1.16.0 - # via bleach + # via + # -r requirements/test.txt + # bleach + # click-repl snowballstemmer==2.2.0 # via sphinx soupsieve==2.4.1 @@ -200,6 +276,8 @@ stevedore==5.0.0 # -r requirements/test.txt # code-annotations # doc8 + # edx-django-utils + # edx-opaque-keys text-unidecode==1.3 # via # -r requirements/test.txt @@ -214,14 +292,30 @@ tomli==2.0.1 # pytest twine==4.0.2 # via -r requirements/doc.in +types-pyyaml==6.0.12.9 + # via + # -r requirements/test.txt + # responses typing-extensions==4.5.0 # via # pydata-sphinx-theme # rich urllib3==1.26.15 # via + # -r requirements/test.txt # requests + # responses # twine +vine==5.0.0 + # via + # -r requirements/test.txt + # amqp + # celery + # kombu +wcwidth==0.2.6 + # via + # -r requirements/test.txt + # prompt-toolkit webencodings==0.5.1 # via bleach zipp==3.15.0 diff --git a/requirements/quality.txt b/requirements/quality.txt index 328a6ee..241350d 100644 --- a/requirements/quality.txt +++ b/requirements/quality.txt @@ -4,6 +4,10 @@ # # make upgrade # +amqp==5.1.1 + # via + # -r requirements/test.txt + # kombu asgiref==3.6.0 # via # -r requirements/test.txt @@ -12,14 +16,49 @@ astroid==2.15.4 # via # pylint # pylint-celery +billiard==3.6.4.0 + # via + # -r requirements/test.txt + # celery +celery==5.2.7 + # via -r requirements/test.txt +certifi==2022.12.7 + # via + # -r requirements/test.txt + # requests +cffi==1.15.1 + # via + # -r requirements/test.txt + # pynacl +charset-normalizer==3.1.0 + # via + # -r requirements/test.txt + # requests click==8.1.3 # via # -r requirements/test.txt + # celery + # click-didyoumean # click-log + # click-plugins + # click-repl # code-annotations + # edx-django-utils # edx-lint +click-didyoumean==0.3.0 + # via + # -r requirements/test.txt + # celery click-log==0.4.0 # via edx-lint +click-plugins==1.1.1 + # via + # -r requirements/test.txt + # celery +click-repl==0.2.0 + # via + # -r requirements/test.txt + # celery code-annotations==1.3.0 # via # -r requirements/test.txt @@ -34,12 +73,30 @@ django==3.2.18 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/test.txt + # django-crum + # edx-django-utils +django-crum==0.7.9 + # via + # -r requirements/test.txt + # edx-django-utils +django-waffle==3.0.0 + # via + # -r requirements/test.txt + # edx-django-utils +edx-django-utils==5.4.0 + # via -r requirements/test.txt edx-lint==5.3.4 # via -r requirements/quality.in +edx-opaque-keys==2.3.0 + # via -r requirements/test.txt exceptiongroup==1.1.1 # via # -r requirements/test.txt # pytest +idna==3.4 + # via + # -r requirements/test.txt + # requests iniconfig==2.0.0 # via # -r requirements/test.txt @@ -52,6 +109,10 @@ jinja2==3.1.2 # via # -r requirements/test.txt # code-annotations +kombu==5.2.4 + # via + # -r requirements/test.txt + # celery lazy-object-proxy==1.9.0 # via astroid markupsafe==2.1.2 @@ -60,6 +121,10 @@ markupsafe==2.1.2 # jinja2 mccabe==0.7.0 # via pylint +newrelic==8.8.0 + # via + # -r requirements/test.txt + # edx-django-utils packaging==23.1 # via # -r requirements/test.txt @@ -74,8 +139,20 @@ pluggy==1.0.0 # via # -r requirements/test.txt # pytest +prompt-toolkit==3.0.38 + # via + # -r requirements/test.txt + # click-repl +psutil==5.9.5 + # via + # -r requirements/test.txt + # edx-django-utils pycodestyle==2.10.0 # via -r requirements/quality.in +pycparser==2.21 + # via + # -r requirements/test.txt + # cffi pydocstyle==6.3.0 # via -r requirements/quality.in pylint==2.17.3 @@ -92,6 +169,14 @@ pylint-plugin-utils==0.7 # via # pylint-celery # pylint-django +pymongo==3.13.0 + # via + # -r requirements/test.txt + # edx-opaque-keys +pynacl==1.5.0 + # via + # -r requirements/test.txt + # edx-django-utils pytest==7.3.1 # via # -r requirements/test.txt @@ -108,13 +193,24 @@ python-slugify==8.0.1 pytz==2023.3 # via # -r requirements/test.txt + # celery # django pyyaml==6.0 # via # -r requirements/test.txt # code-annotations + # responses +requests==2.29.0 + # via + # -r requirements/test.txt + # responses +responses==0.23.1 + # via -r requirements/test.txt six==1.16.0 - # via edx-lint + # via + # -r requirements/test.txt + # click-repl + # edx-lint snowballstemmer==2.2.0 # via pydocstyle sqlparse==0.4.4 @@ -125,6 +221,8 @@ stevedore==5.0.0 # via # -r requirements/test.txt # code-annotations + # edx-django-utils + # edx-opaque-keys text-unidecode==1.3 # via # -r requirements/test.txt @@ -137,9 +235,28 @@ tomli==2.0.1 # pytest tomlkit==0.11.8 # via pylint +types-pyyaml==6.0.12.9 + # via + # -r requirements/test.txt + # responses typing-extensions==4.5.0 # via # astroid # pylint +urllib3==1.26.15 + # via + # -r requirements/test.txt + # requests + # responses +vine==5.0.0 + # via + # -r requirements/test.txt + # amqp + # celery + # kombu +wcwidth==0.2.6 + # via + # -r requirements/test.txt + # prompt-toolkit wrapt==1.15.0 # via astroid diff --git a/requirements/test.in b/requirements/test.in index 6797160..bb78ea0 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -6,3 +6,4 @@ pytest-cov # pytest extension for code coverage statistics pytest-django # pytest extension for better Django support code-annotations # provides commands used by the pii_check make target. +responses # mocks for the requests library diff --git a/requirements/test.txt b/requirements/test.txt index 66eec31..dd30475 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -4,12 +4,53 @@ # # make upgrade # +amqp==5.1.1 + # via + # -r requirements/base.txt + # kombu asgiref==3.6.0 # via # -r requirements/base.txt # django +billiard==3.6.4.0 + # via + # -r requirements/base.txt + # celery +celery==5.2.7 + # via -r requirements/base.txt +certifi==2022.12.7 + # via + # -r requirements/base.txt + # requests +cffi==1.15.1 + # via + # -r requirements/base.txt + # pynacl +charset-normalizer==3.1.0 + # via + # -r requirements/base.txt + # requests click==8.1.3 - # via code-annotations + # via + # -r requirements/base.txt + # celery + # click-didyoumean + # click-plugins + # click-repl + # code-annotations + # edx-django-utils +click-didyoumean==0.3.0 + # via + # -r requirements/base.txt + # celery +click-plugins==1.1.1 + # via + # -r requirements/base.txt + # celery +click-repl==0.2.0 + # via + # -r requirements/base.txt + # celery code-annotations==1.3.0 # via -r requirements/test.in coverage[toml]==7.2.5 @@ -17,20 +58,68 @@ coverage[toml]==7.2.5 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/base.txt + # django-crum + # edx-django-utils +django-crum==0.7.9 + # via + # -r requirements/base.txt + # edx-django-utils +django-waffle==3.0.0 + # via + # -r requirements/base.txt + # edx-django-utils +edx-django-utils==5.4.0 + # via -r requirements/base.txt +edx-opaque-keys==2.3.0 + # via -r requirements/base.txt exceptiongroup==1.1.1 # via pytest +idna==3.4 + # via + # -r requirements/base.txt + # requests iniconfig==2.0.0 # via pytest jinja2==3.1.2 # via code-annotations +kombu==5.2.4 + # via + # -r requirements/base.txt + # celery markupsafe==2.1.2 # via jinja2 +newrelic==8.8.0 + # via + # -r requirements/base.txt + # edx-django-utils packaging==23.1 # via pytest pbr==5.11.1 - # via stevedore + # via + # -r requirements/base.txt + # stevedore pluggy==1.0.0 # via pytest +prompt-toolkit==3.0.38 + # via + # -r requirements/base.txt + # click-repl +psutil==5.9.5 + # via + # -r requirements/base.txt + # edx-django-utils +pycparser==2.21 + # via + # -r requirements/base.txt + # cffi +pymongo==3.13.0 + # via + # -r requirements/base.txt + # edx-opaque-keys +pynacl==1.5.0 + # via + # -r requirements/base.txt + # edx-django-utils pytest==7.3.1 # via # pytest-cov @@ -44,18 +133,52 @@ python-slugify==8.0.1 pytz==2023.3 # via # -r requirements/base.txt + # celery # django pyyaml==6.0 - # via code-annotations + # via + # code-annotations + # responses +requests==2.29.0 + # via + # -r requirements/base.txt + # responses +responses==0.23.1 + # via -r requirements/test.in +six==1.16.0 + # via + # -r requirements/base.txt + # click-repl sqlparse==0.4.4 # via # -r requirements/base.txt # django stevedore==5.0.0 - # via code-annotations + # via + # -r requirements/base.txt + # code-annotations + # edx-django-utils + # edx-opaque-keys text-unidecode==1.3 # via python-slugify tomli==2.0.1 # via # coverage # pytest +types-pyyaml==6.0.12.9 + # via responses +urllib3==1.26.15 + # via + # -r requirements/base.txt + # requests + # responses +vine==5.0.0 + # via + # -r requirements/base.txt + # amqp + # celery + # kombu +wcwidth==0.2.6 + # via + # -r requirements/base.txt + # prompt-toolkit diff --git a/setup.py b/setup.py index ea1feb7..55e68c3 100755 --- a/setup.py +++ b/setup.py @@ -112,7 +112,13 @@ def is_requirement(line): include=['event_sink_clickhouse', 'event_sink_clickhouse.*'], exclude=["*tests"], ), - + entry_points={ + "lms.djangoapp": [ + ], + "cms.djangoapp": [ + "event-sink-clickhouse = event_sink_clickhouse.apps:EventSinkClickhouseConfig", + ], + }, include_package_data=True, install_requires=load_requirements('requirements/base.in'), python_requires=">=3.8", diff --git a/test_utils/helpers.py b/test_utils/helpers.py new file mode 100644 index 0000000..b1de1fd --- /dev/null +++ b/test_utils/helpers.py @@ -0,0 +1,210 @@ +""" +Helper functions for tests +""" + +import csv +import random +import string +from datetime import datetime +from io import StringIO +from unittest.mock import MagicMock, Mock + +from opaque_keys.edx.keys import CourseKey +from opaque_keys.edx.locator import BlockUsageLocator + +from event_sink_clickhouse.sinks.course_published import CoursePublishedSink + +ORIG_IMPORT = __import__ +ORG = "testorg" +COURSE = "testcourse" +COURSE_RUN = "2023_Fall" + + +class FakeXBlock: + """ + Fakes the parameters of an XBlock that we care about. + """ + def __init__(self, identifier, detached_block=False): + self.block_type = "course_info" if detached_block else "vertical" + self.scope_ids = Mock() + self.scope_ids.usage_id.course_key = course_key_factory() + self.scope_ids.block_type = self.block_type + self.location = block_usage_locator_factory() + self.display_name_with_default = f"Display name {identifier}" + self.edited_on = datetime.now() + self.children = [] + + def get_children(self): + """ + Fakes the method of the same name from an XBlock. + """ + return self.children + + +def course_str_factory(): + """ + Return a valid course key string. + """ + course_str = f"course-v1:{ORG}+{COURSE}+{COURSE_RUN}" + return course_str + + +def course_key_factory(): + """ + Return a CourseKey object from our course key string. + """ + return CourseKey.from_string(course_str_factory()) + + +def block_usage_locator_factory(): + """ + Create a BlockUsageLocator with a random id. + """ + block_id = ''.join(random.choices(string.ascii_letters, k=10)) + return BlockUsageLocator(course_key_factory(), block_type="category", block_id=block_id, deprecated=True) + + +def mock_course_overview(): + """ + Create a fake CourseOverview object that supports just the things we care about. + """ + mock_overview = MagicMock() + mock_overview.get_from_id = MagicMock() + mock_overview.get_from_id.return_value.modified = datetime.now() + return mock_overview + + +def mock_detached_xblock_types(): + """ + Mock the return results of xmodule.modulestore.store_utilities.DETACHED_XBLOCK_TYPES + """ + # Current values as of 2023-05-01 + return {'static_tab', 'about', 'course_info'} + + +def get_clickhouse_http_params(): + """ + Get the params used in ClickHouse queries. + """ + blocks_params = { + "input_format_allow_errors_num": 1, + "input_format_allow_errors_ratio": 0.1, + "query": "INSERT INTO cool_data.course_blocks FORMAT CSV" + } + relationships_params = { + "input_format_allow_errors_num": 1, + "input_format_allow_errors_ratio": 0.1, + "query": "INSERT INTO cool_data.course_relationships FORMAT CSV" + } + + return blocks_params, relationships_params + + +def course_factory(): + """ + Return a fake course structure that exercises most of the serialization features. + """ + # Create a base block + top_block = FakeXBlock("top") + course = [top_block, ] + + # Create a few children + for i in range(3): + block = FakeXBlock(f"Child {i}") + course.append(block) + top_block.children.append(block) + + # Create grandchildren on some children + if i > 0: + sub_block = FakeXBlock(f"Grandchild {i}") + course.append(sub_block) + block.children.append(sub_block) + + # Create some detached blocks at the top level + for i in range(3): + course.append(FakeXBlock(f"Detached {i}", detached_block=True)) + + return course + + +def check_block_csv_matcher(course): + """ + Match the course structure CSV against the test course. + + This is a matcher for the "responses" library. It returns a function + that actually does the matching. + """ + def match(request): + body = request.body + lines = body.split("\n")[:-1] + + # There should be one CSV line for each block in the test course + if len(lines) != len(course): + return False, f"Body has {len(lines)} lines, course has {len(course)}" + + f = StringIO(body) + reader = csv.reader(f) + + i = 0 + try: + # The CSV should be in the same order as our course, make sure + # everything matches + for row in reader: + block = course[i] + assert row[0] == block.location.org + assert row[1] == str(block.location.course_key) + assert row[2] == block.location.course + assert row[3] == block.location.run + assert row[4] == str(course[i].location) + assert row[5] == block.display_name_with_default + assert row[6] == str(block.block_type) + i += 1 + except AssertionError as e: + return False, f"Mismatch in row {i}: {e}" + return True, "" + return match + + +def check_relationship_csv_matcher(course): + """ + Match the relationship CSV against the test course. + + This is a matcher for the "responses" library. It returns a function + that actually does the matching. + """ + # Build our own copy of the test relationships first + relationships = [] + for block in course: + course_key = str(block.location.course_key) + for _, child in enumerate(block.get_children()): + parent_node = str(CoursePublishedSink.strip_branch_and_version(block.location)) + child_node = str(CoursePublishedSink.strip_branch_and_version(child.location)) + relationships.append((course_key, parent_node, child_node)) + + def match(request): + body = request.body + lines = body.split("\n")[:-1] + + # The relationships CSV should have the same number of relationships as our test + if len(lines) != len(relationships): + return False, f"Body has {len(lines)} lines but there are {len(relationships)} relationships" + + f = StringIO(body) + reader = csv.reader(f) + + i = 0 + try: + # The CSV should be in the same order as our relationships, make sure + # everything matches + for row in reader: + print(row) + print(relationships[i]) + relation = relationships[i] + assert row[0] == relation[0] + assert row[1] == relation[1] + assert row[2] == relation[2] + i += 1 + except AssertionError as e: + return False, f"Mismatch in row {i}: {e}" + return True, "" + return match diff --git a/tests/test_course_published.py b/tests/test_course_published.py new file mode 100644 index 0000000..e7bc4c5 --- /dev/null +++ b/tests/test_course_published.py @@ -0,0 +1,117 @@ +""" +Tests for the course_published sinks. +""" +import logging +from datetime import datetime +from unittest.mock import patch + +import pytest +import responses +from responses import matchers +from responses.registries import OrderedRegistry + +from event_sink_clickhouse.sinks.course_published import CoursePublishedSink +from event_sink_clickhouse.tasks import dump_course_to_clickhouse +from test_utils.helpers import ( + check_block_csv_matcher, + check_relationship_csv_matcher, + course_factory, + course_str_factory, + get_clickhouse_http_params, + mock_course_overview, + mock_detached_xblock_types, +) + + +@responses.activate(registry=OrderedRegistry) # pylint: disable=unexpected-keyword-arg,no-value-for-parameter +@patch("event_sink_clickhouse.sinks.course_published.CoursePublishedSink._get_detached_xblock_types") +@patch("event_sink_clickhouse.sinks.course_published.CoursePublishedSink._get_modulestore") +def test_course_publish_success(mock_modulestore, mock_detached, caplog): + """ + Test of a successful end-to-end run. + """ + # Necessary to get logs from the task + caplog.set_level(logging.INFO, logger="edx.celery.task") + + # Create a fake course structure with a few fake XBlocks + course = course_factory() + mock_modulestore.return_value.get_items.return_value = course + + # Fake the "detached types" list since we can't import it here + mock_detached.return_value = mock_detached_xblock_types() + + # Use the responses library to catch the POSTs to ClickHouse + # and match them against the expected values, including CSV + # content + blocks_params, relationships_params = get_clickhouse_http_params() + + responses.post( + "https://foo.bar/", + match=[ + matchers.query_param_matcher(blocks_params), + check_block_csv_matcher(course) + ], + ) + responses.post( + "https://foo.bar/", + match=[ + matchers.query_param_matcher(relationships_params), + check_relationship_csv_matcher(course) + ], + ) + + course = course_str_factory() + dump_course_to_clickhouse(course) + + # Just to make sure we're not calling things more than we need to + assert mock_modulestore.call_count == 1 + assert mock_detached.call_count == 1 + + +@responses.activate(registry=OrderedRegistry) # pylint: disable=unexpected-keyword-arg,no-value-for-parameter +@patch("event_sink_clickhouse.sinks.course_published.CoursePublishedSink._get_detached_xblock_types") +@patch("event_sink_clickhouse.sinks.course_published.CoursePublishedSink._get_modulestore") +def test_course_publish_clickhouse_error(mock_modulestore, mock_detached, caplog): + """ + Test the case where a ClickHouse POST fails. + """ + caplog.set_level(logging.INFO, logger="edx.celery.task") + course = course_factory() + mock_modulestore.return_value.get_items.return_value = course + mock_detached.return_value = mock_detached_xblock_types() + + # This will raise an exception when we try to post to ClickHouse + responses.post( + "https://foo.bar/", + body=Exception("Bogus test exception in ClickHouse call") + ) + + course = course_str_factory() + + with pytest.raises(Exception): + dump_course_to_clickhouse(course) + + # Make sure everything was called as we expect + assert mock_modulestore.call_count == 1 + assert mock_detached.call_count == 1 + + # Make sure our log message went through. + assert f"Error trying to dump course {course} to ClickHouse!" in caplog.text + + +@patch("event_sink_clickhouse.sinks.course_published.CoursePublishedSink._get_course_overview_model") +def test_get_course_last_published(mock_overview): + """ + This function isn't in use yet, but we'll need it for the management command + """ + # Create a fake course overview, which will return a datetime object + course = mock_course_overview() + mock_overview.return_value = course + + # Request our course last published date + course_key = course_str_factory() + + # Confirm that the string date we get back is a valid date + last_published_date = CoursePublishedSink.get_course_last_published(course_key) + dt = datetime.strptime(last_published_date, "%Y-%m-%d %H:%M:%S.%f") + assert dt diff --git a/tests/test_django_settings.py b/tests/test_django_settings.py new file mode 100644 index 0000000..52ff506 --- /dev/null +++ b/tests/test_django_settings.py @@ -0,0 +1,55 @@ +""" +Test plugin settings +""" + +from django.conf import settings +from django.test import TestCase + +from event_sink_clickhouse.settings import common as common_settings +from event_sink_clickhouse.settings import production as production_setttings + + +class TestPluginSettings(TestCase): + """ + Tests plugin settings + """ + + def test_common_settings(self): + """ + Test common settings + """ + common_settings.plugin_settings(settings) + + for key in ("url", "username", "password", "database", "timeout_secs"): + assert key in settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG + + def test_production_settings(self): + """ + Test production settings + """ + test_url = "https://foo.bar" + test_username = "bob" + test_password = "secret" + test_database = "cool_data" + test_timeout = 1 + + settings.ENV_TOKENS = { + 'EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG': { + "url": test_url, + "username": test_username, + "password": test_password, + "database": test_database, + "timeout_secs": test_timeout + } + } + production_setttings.plugin_settings(settings) + + for key, val in ( + ("url", test_url), + ("username", test_username), + ("password", test_password), + ("database", test_database), + ("timeout_secs", test_timeout), + ): + assert key in settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG + assert settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG[key] == val From 185c12c4f844d0d239336c363b93e6c7f26194f1 Mon Sep 17 00:00:00 2001 From: edX requirements bot Date: Sun, 30 Apr 2023 20:08:47 -0400 Subject: [PATCH 3/7] chore: Updating Python Requirements --- requirements/dev.txt | 1 + requirements/doc.txt | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/requirements/dev.txt b/requirements/dev.txt index bea09b2..89c3dcd 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -328,6 +328,7 @@ typing-extensions==4.5.0 # -r requirements/quality.txt # astroid # pylint +virtualenv==20.23.0 urllib3==1.26.15 # via # -r requirements/quality.txt diff --git a/requirements/doc.txt b/requirements/doc.txt index 23fc8d3..d043172 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -71,6 +71,8 @@ coverage[toml]==7.2.5 # via # -r requirements/test.txt # pytest-cov +cryptography==40.0.2 + # via secretstorage django==3.2.18 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt @@ -238,6 +240,8 @@ rfc3986==2.0.0 # via twine rich==13.3.5 # via twine +secretstorage==3.3.3 + # via keyring six==1.16.0 # via # -r requirements/test.txt From e2a0e089bb2d2a0299c188913749046684a17367 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Tue, 2 May 2023 19:20:57 -0400 Subject: [PATCH 4/7] docs: Update changelog --- CHANGELOG.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 12c711d..83c2841 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,7 +14,9 @@ Change Log Unreleased ********** -* +* First functional version, includes a CMS listener for COURSE_PUBLISHED +* README updates +* New configuration settings for connection to ClickHouse 0.1.0 – 2023-04-24 ********************************************** From db42bc78618a9f53ce3fa1cc676cfeb8a78a6c02 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Wed, 3 May 2023 08:39:49 -0400 Subject: [PATCH 5/7] fix: Add unique dump id and consistent dump time In order to connect the nodes in a dump, where there may be many dumps per course, these columns are necessary to find the dump that corresponds most closely to an event or set of events. --- event_sink_clickhouse/sinks/course_published.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/event_sink_clickhouse/sinks/course_published.py b/event_sink_clickhouse/sinks/course_published.py index 92c7567..6096102 100644 --- a/event_sink_clickhouse/sinks/course_published.py +++ b/event_sink_clickhouse/sinks/course_published.py @@ -12,6 +12,7 @@ import csv import io +import uuid import requests from django.utils import timezone @@ -86,7 +87,7 @@ def get_course_last_published(course_key): return str(approx_last_published) @staticmethod - def serialize_item(item, index, detached_xblock_types): + def serialize_item(item, index, detached_xblock_types, dump_id, dump_timestamp): """ Args: item: an XBlock @@ -109,9 +110,10 @@ def serialize_item(item, index, detached_xblock_types): 'display_name': item.display_name_with_default.replace("'", "\'"), 'block_type': block_type, 'detached': 1 if block_type in detached_xblock_types else 0, - 'edited_on': str(getattr(item, 'edited_on', '')), - 'time_last_dumped': str(timezone.now()), 'order': index, + 'edited_on': str(getattr(item, 'edited_on', '')), + 'dump_id': dump_id, + 'time_last_dumped': dump_timestamp, } return rtn_fields @@ -130,6 +132,9 @@ def serialize_course(self, course_id): modulestore = CoursePublishedSink._get_modulestore() detached_xblock_types = CoursePublishedSink._get_detached_xblock_types() + dump_id = str(uuid.uuid4()) + dump_timestamp = str(timezone.now()) + # create a location to node mapping we'll need later for # writing relationships location_to_node = {} @@ -139,7 +144,7 @@ def serialize_course(self, course_id): i = 0 for item in items: i += 1 - fields = self.serialize_item(item, i, detached_xblock_types) + fields = self.serialize_item(item, i, detached_xblock_types, dump_id, dump_timestamp) location_to_node[self.strip_branch_and_version(item.location)] = fields # create relationships @@ -154,7 +159,9 @@ def serialize_course(self, course_id): 'course_key': str(course_id), 'parent_location': str(parent_node["location"]), 'child_location': str(child_node["location"]), - 'order': index + 'order': index, + 'dump_id': dump_id, + 'time_last_dumped': dump_timestamp, } relationships.append(relationship) From d6af144f7bbdc31977c9ec4f021d0d25debdbb25 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Wed, 3 May 2023 11:17:06 -0400 Subject: [PATCH 6/7] chore: Run make upgrade --- requirements/base.txt | 2 +- requirements/dev.txt | 3 +-- requirements/doc.txt | 6 +----- requirements/quality.txt | 2 +- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 6dc376e..9345f68 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -31,7 +31,7 @@ click-plugins==1.1.1 # via celery click-repl==0.2.0 # via celery -django==3.2.18 +django==3.2.19 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/base.in diff --git a/requirements/dev.txt b/requirements/dev.txt index 89c3dcd..721236d 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -88,7 +88,7 @@ distlib==0.3.6 # via # -r requirements/ci.txt # virtualenv -django==3.2.18 +django==3.2.19 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/quality.txt @@ -328,7 +328,6 @@ typing-extensions==4.5.0 # -r requirements/quality.txt # astroid # pylint -virtualenv==20.23.0 urllib3==1.26.15 # via # -r requirements/quality.txt diff --git a/requirements/doc.txt b/requirements/doc.txt index d043172..c7d3a6a 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -71,9 +71,7 @@ coverage[toml]==7.2.5 # via # -r requirements/test.txt # pytest-cov -cryptography==40.0.2 - # via secretstorage -django==3.2.18 +django==3.2.19 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/test.txt @@ -240,8 +238,6 @@ rfc3986==2.0.0 # via twine rich==13.3.5 # via twine -secretstorage==3.3.3 - # via keyring six==1.16.0 # via # -r requirements/test.txt diff --git a/requirements/quality.txt b/requirements/quality.txt index 241350d..70c0acd 100644 --- a/requirements/quality.txt +++ b/requirements/quality.txt @@ -69,7 +69,7 @@ coverage[toml]==7.2.5 # pytest-cov dill==0.3.6 # via pylint -django==3.2.18 +django==3.2.19 # via # -c https://raw.githubusercontent.com/edx/edx-lint/master/edx_lint/files/common_constraints.txt # -r requirements/test.txt From 61a1c6efd1b8f8d282565a66a4641dd2e18b02f4 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Thu, 4 May 2023 14:20:26 -0400 Subject: [PATCH 7/7] refactor: Address PR feedback and refactor ClickHouse HTTP requests The requests themselves have been moved into the base class to consolidate error handling, and the CSV / Request generation moved into their own methods. --- event_sink_clickhouse/apps.py | 2 - event_sink_clickhouse/sinks/base_sink.py | 29 ++++- .../sinks/course_published.py | 123 ++++++++++-------- event_sink_clickhouse/tasks.py | 2 +- pii_report/2023-24-04-10-29-05.yaml | 79 ----------- tests/test_course_published.py | 9 +- 6 files changed, 102 insertions(+), 142 deletions(-) delete mode 100644 pii_report/2023-24-04-10-29-05.yaml diff --git a/event_sink_clickhouse/apps.py b/event_sink_clickhouse/apps.py index 2d666d2..85c287e 100644 --- a/event_sink_clickhouse/apps.py +++ b/event_sink_clickhouse/apps.py @@ -19,12 +19,10 @@ class EventSinkClickhouseConfig(AppConfig): 'lms.djangoapp': { 'production': {PluginSettings.RELATIVE_PATH: 'settings.production'}, 'common': {PluginSettings.RELATIVE_PATH: 'settings.common'}, - 'devstack': {PluginSettings.RELATIVE_PATH: 'settings.devstack'}, }, 'cms.djangoapp': { 'production': {PluginSettings.RELATIVE_PATH: 'settings.production'}, 'common': {PluginSettings.RELATIVE_PATH: 'settings.common'}, - 'devstack': {PluginSettings.RELATIVE_PATH: 'settings.devstack'}, } }, # Configuration setting for Plugin Signals for this app. diff --git a/event_sink_clickhouse/sinks/base_sink.py b/event_sink_clickhouse/sinks/base_sink.py index 89ca440..6bcd40a 100644 --- a/event_sink_clickhouse/sinks/base_sink.py +++ b/event_sink_clickhouse/sinks/base_sink.py @@ -1,9 +1,13 @@ """ Base classes for event sinks """ +from collections import namedtuple +import requests from django.conf import settings +ClickHouseAuth = namedtuple("ClickHouseAuth", ["username", "password"]) + class BaseSink: """ @@ -12,15 +16,32 @@ class BaseSink: def __init__(self, connection_overrides, log): self.log = log self.ch_url = settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["url"] - self.ch_auth = (settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["username"], - settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["password"]) + self.ch_auth = ClickHouseAuth(settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["username"], + settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["password"]) self.ch_database = settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["database"] self.ch_timeout_secs = settings.EVENT_SINK_CLICKHOUSE_BACKEND_CONFIG["timeout_secs"] # If any overrides to the ClickHouse connection if connection_overrides: self.ch_url = connection_overrides.get("url", self.ch_url) - self.ch_auth = (connection_overrides.get("username", self.ch_auth[0]), - connection_overrides.get("password", self.ch_auth[1])) + self.ch_auth = ClickHouseAuth(connection_overrides.get("username", self.ch_auth.username), + connection_overrides.get("password", self.ch_auth.password)) self.ch_database = connection_overrides.get("database", self.ch_database) self.ch_timeout_secs = connection_overrides.get("timeout_secs", self.ch_timeout_secs) + + def _send_clickhouse_request(self, request): + """ + Perform the actual HTTP requests to ClickHouse. + """ + session = requests.Session() + prepared_request = request.prepare() + + try: + response = session.send(prepared_request, timeout=self.ch_timeout_secs) + response.raise_for_status() + except requests.exceptions.HTTPError as e: + self.log.error(str(e)) + self.log.error(e.response.headers) + self.log.error(e.response) + self.log.error(e.response.text) + raise diff --git a/event_sink_clickhouse/sinks/course_published.py b/event_sink_clickhouse/sinks/course_published.py index 6096102..5c01afd 100644 --- a/event_sink_clickhouse/sinks/course_published.py +++ b/event_sink_clickhouse/sinks/course_published.py @@ -17,7 +17,13 @@ import requests from django.utils import timezone -from .base_sink import BaseSink +from event_sink_clickhouse.sinks.base_sink import BaseSink + +# Defaults we want to ensure we fail early on bulk inserts +CLICKHOUSE_BULK_INSERT_PARAMS = { + "input_format_allow_errors_num": 1, + "input_format_allow_errors_ratio": 0.1, +} class CoursePublishedSink(BaseSink): @@ -101,7 +107,7 @@ def serialize_item(item, index, detached_xblock_types, dump_id, dump_timestamp): course_key = item.scope_ids.usage_id.course_key block_type = item.scope_ids.block_type - rtn_fields = { + serialized_block = { 'org': course_key.org, 'course_key': str(course_key), 'course': course_key.course, @@ -116,7 +122,7 @@ def serialize_item(item, index, detached_xblock_types, dump_id, dump_timestamp): 'time_last_dumped': dump_timestamp, } - return rtn_fields + return serialized_block def serialize_course(self, course_id): """ @@ -135,19 +141,19 @@ def serialize_course(self, course_id): dump_id = str(uuid.uuid4()) dump_timestamp = str(timezone.now()) - # create a location to node mapping we'll need later for - # writing relationships + # Create a location to node mapping as a lookup for writing relationships later location_to_node = {} items = modulestore.get_items(course_id) - # create nodes - i = 0 + # Serialize the XBlocks to dicts and map them with their location as keys the + # whole map needs to be completed before we can define relationships + index = 0 for item in items: - i += 1 - fields = self.serialize_item(item, i, detached_xblock_types, dump_id, dump_timestamp) + index += 1 + fields = self.serialize_item(item, index, detached_xblock_types, dump_id, dump_timestamp) location_to_node[self.strip_branch_and_version(item.location)] = fields - # create relationships + # Create a list of relationships between blocks, using their locations as identifiers relationships = [] for item in items: for index, child in enumerate(item.get_children()): @@ -168,63 +174,74 @@ def serialize_course(self, course_id): nodes = list(location_to_node.values()) return nodes, relationships + def _send_xblocks(self, serialized_xblocks): + """ + Create the insert query and CSV to send the serialized XBlocks to ClickHouse. + """ + params = CLICKHOUSE_BULK_INSERT_PARAMS.copy() + + # "query" is a special param for the query, it's the best way to get the FORMAT CSV in there. + params["query"] = f"INSERT INTO {self.ch_database}.course_blocks FORMAT CSV" + + output = io.StringIO() + writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC) + + for node in serialized_xblocks: + writer.writerow(node.values()) + + request = requests.Request( + 'POST', + self.ch_url, + data=output.getvalue(), + params=params, + auth=self.ch_auth + ) + + self._send_clickhouse_request(request) + + def _send_relationships(self, relationships): + """ + Create the insert query and CSV to send the serialized relationships to ClickHouse. + """ + params = CLICKHOUSE_BULK_INSERT_PARAMS.copy() + + # "query" is a special param for the query, it's the best way to get the FORMAT CSV in there. + params["query"] = f"INSERT INTO {self.ch_database}.course_relationships FORMAT CSV" + output = io.StringIO() + writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC) + + for relationship in relationships: + writer.writerow(relationship.values()) + + request = requests.Request( + 'POST', + self.ch_url, + data=output.getvalue(), + params=params, + auth=self.ch_auth + ) + + self._send_clickhouse_request(request) + def dump(self, course_key): """ Do the serialization and send to ClickHouse """ - nodes, relationships = self.serialize_course(course_key) + serialized_blocks, relationships = self.serialize_course(course_key) self.log.info( - "Now dumping %s to ClickHouse: %d nodes and %d relationships", + "Now dumping %s to ClickHouse: %d serialized_blocks and %d relationships", course_key, - len(nodes), + len(serialized_blocks), len(relationships), ) course_string = str(course_key) try: - # Params that begin with "param_" will be used in the query replacement - # all others are ClickHouse settings. - params = { - # Fail early on bulk inserts - "input_format_allow_errors_num": 1, - "input_format_allow_errors_ratio": 0.1, - } - - # "query" is a special param for the query, it's the best way to get the FORMAT CSV in there. - params["query"] = f"INSERT INTO {self.ch_database}.course_blocks FORMAT CSV" - - output = io.StringIO() - writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC) - - for node in nodes: - writer.writerow(node.values()) - - response = requests.post(self.ch_url, data=output.getvalue(), params=params, auth=self.ch_auth, - timeout=self.ch_timeout_secs) - self.log.info(response.headers) - self.log.info(response) - self.log.info(response.text) - response.raise_for_status() - - # Just overwriting the previous query - params["query"] = f"INSERT INTO {self.ch_database}.course_relationships FORMAT CSV" - output = io.StringIO() - writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC) - - for relationship in relationships: - writer.writerow(relationship.values()) - - response = requests.post(self.ch_url, data=output.getvalue(), params=params, auth=self.ch_auth, - timeout=self.ch_timeout_secs) - self.log.info(response.headers) - self.log.info(response) - self.log.info(response.text) - response.raise_for_status() - + self._send_xblocks(serialized_blocks) + self._send_relationships(relationships) self.log.info("Completed dumping %s to ClickHouse", course_key) - except Exception: self.log.exception( "Error trying to dump course %s to ClickHouse!", diff --git a/event_sink_clickhouse/tasks.py b/event_sink_clickhouse/tasks.py index 60d54e1..9089bab 100644 --- a/event_sink_clickhouse/tasks.py +++ b/event_sink_clickhouse/tasks.py @@ -8,7 +8,7 @@ from edx_django_utils.monitoring import set_code_owner_attribute from opaque_keys.edx.keys import CourseKey -from .sinks.course_published import CoursePublishedSink +from event_sink_clickhouse.sinks.course_published import CoursePublishedSink log = logging.getLogger(__name__) celery_log = logging.getLogger('edx.celery.task') diff --git a/pii_report/2023-24-04-10-29-05.yaml b/pii_report/2023-24-04-10-29-05.yaml deleted file mode 100644 index e9c88aa..0000000 --- a/pii_report/2023-24-04-10-29-05.yaml +++ /dev/null @@ -1,79 +0,0 @@ -.annotation_safe_list.yml: -- annotation_data: This model has no PII - annotation_token: '.. no_pii:' - extra: - full_comment: '{''.. no_pii:'': ''This model has no PII''}' - object_id: contenttypes.ContentType - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 1 -- annotation_data: This model has no PII - annotation_token: '.. no_pii:' - extra: - full_comment: '{''.. no_pii:'': ''This model has no PII''}' - object_id: admin.LogEntry - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 2 -- annotation_data: This model has no PII - annotation_token: '.. no_pii:' - extra: - full_comment: '{''.. no_pii:'': ''This model has no PII''}' - object_id: auth.Permission - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 3 -- annotation_data: This model has no PII - annotation_token: '.. no_pii:' - extra: - full_comment: '{''.. no_pii:'': ''This model has no PII''}' - object_id: sessions.Session - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 4 -- annotation_data: This model minimally contains a username, password, and email - annotation_token: .. pii - extra: - full_comment: '{''.. pii'': ''This model minimally contains a username, password, - and email'', ''.. pii_types'': ''username, email_address, password'', ''.. pii_retirement'': - ''consumer_api''}' - object_id: auth.User - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 5 -- annotation_data: username, email_address, password - annotation_token: .. pii_types - extra: - full_comment: '{''.. pii'': ''This model minimally contains a username, password, - and email'', ''.. pii_types'': ''username, email_address, password'', ''.. pii_retirement'': - ''consumer_api''}' - object_id: auth.User - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 5 -- annotation_data: consumer_api - annotation_token: .. pii_retirement - extra: - full_comment: '{''.. pii'': ''This model minimally contains a username, password, - and email'', ''.. pii_types'': ''username, email_address, password'', ''.. pii_retirement'': - ''consumer_api''}' - object_id: auth.User - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 5 -- annotation_data: This model has no PII - annotation_token: '.. no_pii:' - extra: - full_comment: '{''.. no_pii:'': ''This model has no PII''}' - object_id: auth.Group - filename: .annotation_safe_list.yml - found_by: safelist - line_number: 0 - report_group_id: 6 diff --git a/tests/test_course_published.py b/tests/test_course_published.py index e7bc4c5..037be6a 100644 --- a/tests/test_course_published.py +++ b/tests/test_course_published.py @@ -6,6 +6,7 @@ from unittest.mock import patch import pytest +import requests import responses from responses import matchers from responses.registries import OrderedRegistry @@ -83,19 +84,21 @@ def test_course_publish_clickhouse_error(mock_modulestore, mock_detached, caplog # This will raise an exception when we try to post to ClickHouse responses.post( "https://foo.bar/", - body=Exception("Bogus test exception in ClickHouse call") + body="Test Bad Request error", + status=400 ) course = course_str_factory() - with pytest.raises(Exception): + with pytest.raises(requests.exceptions.RequestException): dump_course_to_clickhouse(course) # Make sure everything was called as we expect assert mock_modulestore.call_count == 1 assert mock_detached.call_count == 1 - # Make sure our log message went through. + # Make sure our log messages went through. + assert "Test Bad Request error" in caplog.text assert f"Error trying to dump course {course} to ClickHouse!" in caplog.text