From 90f5a3d3f3db486a9fcff5fab0f280c790481318 Mon Sep 17 00:00:00 2001 From: Brian Mesick Date: Wed, 3 May 2023 08:39:49 -0400 Subject: [PATCH] fix: Add unique dump id and consistent dump time In order to connect the nodes in a dump, where there may be many dumps per course, these columns are necessary to find the dump that corresponds most closely to an event or set of events. --- event_sink_clickhouse/sinks/course_published.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/event_sink_clickhouse/sinks/course_published.py b/event_sink_clickhouse/sinks/course_published.py index 92c7567..12a6b40 100644 --- a/event_sink_clickhouse/sinks/course_published.py +++ b/event_sink_clickhouse/sinks/course_published.py @@ -12,6 +12,7 @@ import csv import io +import uuid import requests from django.utils import timezone @@ -86,7 +87,7 @@ def get_course_last_published(course_key): return str(approx_last_published) @staticmethod - def serialize_item(item, index, detached_xblock_types): + def serialize_item(item, index, detached_xblock_types, dump_id, dump_timestamp): """ Args: item: an XBlock @@ -109,9 +110,10 @@ def serialize_item(item, index, detached_xblock_types): 'display_name': item.display_name_with_default.replace("'", "\'"), 'block_type': block_type, 'detached': 1 if block_type in detached_xblock_types else 0, - 'edited_on': str(getattr(item, 'edited_on', '')), - 'time_last_dumped': str(timezone.now()), 'order': index, + 'edited_on': str(getattr(item, 'edited_on', '')), + 'dump_id': dump_id, + 'time_last_dumped': dump_timestamp, } return rtn_fields @@ -130,6 +132,9 @@ def serialize_course(self, course_id): modulestore = CoursePublishedSink._get_modulestore() detached_xblock_types = CoursePublishedSink._get_detached_xblock_types() + dump_id = str(uuid.uuid4()) + dump_timestamp = str(timezone.now()) + # create a location to node mapping we'll need later for # writing relationships location_to_node = {} @@ -139,7 +144,7 @@ def serialize_course(self, course_id): i = 0 for item in items: i += 1 - fields = self.serialize_item(item, i, detached_xblock_types) + fields = self.serialize_item(item, i, detached_xblock_types, dump_id, dump_timestamp) location_to_node[self.strip_branch_and_version(item.location)] = fields # create relationships @@ -154,6 +159,8 @@ def serialize_course(self, course_id): 'course_key': str(course_id), 'parent_location': str(parent_node["location"]), 'child_location': str(child_node["location"]), + 'dump_id': dump_id, + 'time_last_dumped': dump_timestamp, 'order': index } relationships.append(relationship)