From 159556dbf10ef736dc1f7ade80b33f22bc9b2337 Mon Sep 17 00:00:00 2001 From: jbannister Date: Thu, 27 Oct 2022 13:09:50 +0100 Subject: [PATCH 1/2] Ensure that the delete function actually deletes the gridfs blobs (#122) --- notebooker/serialization/mongo.py | 121 ++++++++++++++++++++---------- tests/integration/test_mongo.py | 75 ++++++++++++++++-- 2 files changed, 149 insertions(+), 47 deletions(-) diff --git a/notebooker/serialization/mongo.py b/notebooker/serialization/mongo.py index cabb04ec..a2981e47 100644 --- a/notebooker/serialization/mongo.py +++ b/notebooker/serialization/mongo.py @@ -27,6 +27,66 @@ def _add_deleted_status_to_filter(base_filter): return base_filter +def ignore_missing_files(f): + def _ignore_missing_files(path, *args, **kwargs): + try: + return f(path, *args, **kwargs) + except NoFile: + logger.error("Could not find file %s", path) + return "" + + return _ignore_missing_files + + +@ignore_missing_files +def read_file(result_data_store, path, is_json=False): + r = result_data_store.get_last_version(path).read() + try: + return "" if not r else json.loads(r) if is_json else r.decode("utf8") + except UnicodeDecodeError: + return r + + +@ignore_missing_files +def read_bytes_file(result_data_store, path): + return result_data_store.get_last_version(path).read() + + +def load_files_from_gridfs(result_data_store: gridfs.GridFS, result: Dict, do_read=True) -> List[str]: + + gridfs_filenames = [] + all_html_output_paths = result.get("raw_html_resources", {}).get("outputs", []) + gridfs_filenames.extend(all_html_output_paths) + if do_read: + outputs = {path: read_file(result_data_store, path) for path in all_html_output_paths} + result["raw_html_resources"]["outputs"] = outputs + if result.get("generate_pdf_output"): + pdf_filename = _pdf_filename(result["job_id"]) + if do_read: + result["pdf"] = read_bytes_file(result_data_store, pdf_filename) + gridfs_filenames.append(pdf_filename) + if not result.get("raw_ipynb_json"): + json_filename = 
_raw_json_filename(result["job_id"]) + if do_read: + result["raw_ipynb_json"] = read_file(result_data_store, json_filename, is_json=True) + gridfs_filenames.append(json_filename) + if not result.get("raw_html"): + html_filename = _raw_html_filename(result["job_id"]) + if do_read: + result["raw_html"] = read_file(result_data_store, html_filename) + gridfs_filenames.append(html_filename) + if not result.get("email_html"): + email_filename = _raw_email_html_filename(result["job_id"]) + result["email_html"] = read_file(result_data_store, email_filename) + gridfs_filenames.append(email_filename) + if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"): + css_inlining_filename = _css_inlining_filename(result["job_id"]) + if do_read: + result["raw_html_resources"]["inlining"] = read_file(result_data_store, css_inlining_filename, is_json=True) + gridfs_filenames.append(css_inlining_filename) + return gridfs_filenames + + class MongoResultSerializer(ABC): # This class is the interface between Mongo and the rest of the application @@ -212,47 +272,12 @@ def _convert_result( if cls is None: return None - def ignore_missing_files(f): - def _ignore_missing_files(path, *args, **kwargs): - try: - return f(path, *args, **kwargs) - except NoFile: - logger.error("Could not find file %s in %s", path, self.result_data_store) - return "" - return _ignore_missing_files - - @ignore_missing_files - def read_file(path, is_json=False): - r = self.result_data_store.get_last_version(path).read() - try: - return "" if not r else json.loads(r) if is_json else r.decode("utf8") - except UnicodeDecodeError: - return r - - @ignore_missing_files - def read_bytes_file(path): - return self.result_data_store.get_last_version(path).read() - if not load_payload: result.pop("stdout", None) if cls == NotebookResultComplete: if load_payload: - outputs = {path: read_file(path) for path in result.get("raw_html_resources", {}).get("outputs", [])} - 
result["raw_html_resources"]["outputs"] = outputs - if result.get("generate_pdf_output"): - pdf_filename = _pdf_filename(result["job_id"]) - result["pdf"] = read_bytes_file(pdf_filename) - if not result.get("raw_ipynb_json"): - result["raw_ipynb_json"] = read_file(_raw_json_filename(result["job_id"]), is_json=True) - if not result.get("raw_html"): - result["raw_html"] = read_file(_raw_html_filename(result["job_id"])) - if not result.get("email_html"): - result["email_html"] = read_file(_raw_email_html_filename(result["job_id"])) - if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"): - result["raw_html_resources"]["inlining"] = read_file( - _css_inlining_filename(result["job_id"]), is_json=True - ) + load_files_from_gridfs(self.result_data_store, result, do_read=True) else: result.pop("raw_html", None) result.pop("raw_ipynb_json", None) @@ -298,7 +323,7 @@ def read_bytes_file(path): elif cls == NotebookResultError: if load_payload: if not result.get("error_info"): - result["error_info"] = read_file(_error_info_filename(result["job_id"])) + result["error_info"] = read_file(self.result_data_store, _error_info_filename(result["job_id"])) else: result.pop("error_info", None) return NotebookResultError( @@ -319,11 +344,14 @@ def read_bytes_file(path): else: raise ValueError("Could not deserialise {} into result object.".format(result)) + def _get_raw_check_result(self, job_id: str): + return self.library.find_one({"job_id": job_id}, {"_id": 0}) + def get_check_result( - self, job_id: AnyStr + self, job_id: AnyStr, load_payload: bool = True ) -> Optional[Union[NotebookResultError, NotebookResultComplete, NotebookResultPending]]: - result = self.library.find_one({"job_id": job_id}, {"_id": 0}) - return self._convert_result(result) + result = self._get_raw_check_result(job_id) + return self._convert_result(result, load_payload=load_payload) def _get_raw_results(self, base_filter, projection, limit): base_filter = 
_add_deleted_status_to_filter(base_filter) @@ -462,8 +490,19 @@ def get_latest_successful_job_ids_for_name_all_params(self, report_name: str) -> def n_all_results_for_report_name(self, report_name: str) -> int: return self._get_result_count({"report_name": report_name}) - def delete_result(self, job_id: AnyStr) -> None: + def delete_result(self, job_id: AnyStr) -> Dict[str, Any]: + result = self._get_raw_check_result(job_id) + status = JobStatus.from_string(result["status"]) + gridfs_filenames = [] + if status == JobStatus.DONE: + gridfs_filenames = load_files_from_gridfs(self.result_data_store, result, do_read=False) + elif status in (JobStatus.ERROR, JobStatus.TIMEOUT, JobStatus.CANCELLED): + gridfs_filenames = [_error_info_filename(job_id)] self.update_check_status(job_id, JobStatus.DELETED) + for filename in gridfs_filenames: + logger.info(f"Deleting {filename}") + self.result_data_store.delete(filename) + return {"deleted_result_document": result, "gridfs_filenames": gridfs_filenames} def _pdf_filename(job_id: str) -> str: diff --git a/tests/integration/test_mongo.py b/tests/integration/test_mongo.py index 885897a3..b3a6ccf6 100644 --- a/tests/integration/test_mongo.py +++ b/tests/integration/test_mongo.py @@ -2,8 +2,9 @@ import uuid import pytest +from gridfs.errors import NoFile -from notebooker.constants import JobStatus, NotebookResultComplete +from notebooker.constants import JobStatus, NotebookResultComplete, NotebookResultError from notebooker.serialization.serialization import initialize_serializer_from_config from notebooker.utils.filesystem import initialise_base_dirs @@ -23,11 +24,11 @@ def test_mongo_saving_ipynb_json_to_gridfs(bson_library, webapp_config): update_time=datetime.datetime(2018, 1, 12, 2, 32), job_start_time=datetime.datetime(2018, 1, 12, 2, 30), job_finish_time=datetime.datetime(2018, 1, 12, 2, 58), - raw_ipynb_json="x" * 32 * (2 ** 20), # 16MB document max - raw_html="x" * 32 * (2 ** 20), # 16MB document max - email_html="x" * 32 * 
(2 ** 20), # 16MB document max - pdf=b"x" * 32 * (2 ** 20), # 16MB document max - raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2 ** 20)}}, + raw_ipynb_json="x" * 32 * (2**20), # 16MB document max + raw_html="x" * 32 * (2**20), # 16MB document max + email_html="x" * 32 * (2**20), # 16MB document max + pdf=b"x" * 32 * (2**20), # 16MB document max + raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}}, ) ) result = serializer.get_check_result(job_id) @@ -52,3 +53,65 @@ def test_cant_serialise_done_job_via_update(bson_library, webapp_config): raw_html="", email_html="", ) + + +def test_delete(bson_library, webapp_config): + initialise_base_dirs(webapp_config=webapp_config) + serializer = initialize_serializer_from_config(webapp_config) + + job_id = str(uuid.uuid4()) + report_name = str(uuid.uuid4()) + raw_html = "x" * 32 * (2**20) + serializer.save_check_result( + NotebookResultComplete( + job_id=job_id, + report_name=report_name, + report_title=report_name, + status=JobStatus.DONE, + update_time=datetime.datetime(2018, 1, 12, 2, 32), + job_start_time=datetime.datetime(2018, 1, 12, 2, 30), + job_finish_time=datetime.datetime(2018, 1, 12, 2, 58), + raw_ipynb_json="x" * 32 * (2**20), # 16MB document max + raw_html=raw_html, # 16MB document max + email_html="x" * 32 * (2**20), # 16MB document max + pdf=b"x" * 32 * (2**20), # 16MB document max + raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}, "other_stuff": "Yep"}, + ) + ) + assert bson_library.find_one({"job_id": job_id}) is not None + result = serializer.get_check_result(job_id) + assert result is not None + assert result.raw_html == raw_html + deleted_stuff = serializer.delete_result(job_id) + for filename in deleted_stuff["gridfs_filenames"]: + with pytest.raises(NoFile): + serializer.result_data_store.get(filename) + assert serializer.get_check_result(job_id) is None + + +@pytest.mark.parametrize("job_status", [JobStatus.CANCELLED, JobStatus.TIMEOUT, JobStatus.ERROR]) +def 
test_delete_error(bson_library, webapp_config, job_status): + initialise_base_dirs(webapp_config=webapp_config) + serializer = initialize_serializer_from_config(webapp_config) + + job_id = str(uuid.uuid4()) + report_name = str(uuid.uuid4()) + serializer.save_check_result( + NotebookResultError( + job_id=job_id, + report_name=report_name, + report_title=report_name, + status=job_status, + update_time=datetime.datetime(2018, 1, 12, 2, 32), + job_start_time=datetime.datetime(2018, 1, 12, 2, 30), + error_info="Oh no!", + ) + ) + assert bson_library.find_one({"job_id": job_id}) is not None + result = serializer.get_check_result(job_id) + assert result is not None + deleted_stuff = serializer.delete_result(job_id) + for filename in deleted_stuff["gridfs_filenames"]: + with pytest.raises(NoFile): + serializer.result_data_store.get(filename) + assert serializer.get_check_result(job_id) is None From 6a97130da8f4aa2373b128e7df9f2c109a6dd1d7 Mon Sep 17 00:00:00 2001 From: jbannister Date: Fri, 28 Oct 2022 14:38:31 +0100 Subject: [PATCH 2/2] Add to CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcb324b7..84e47234 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ------------------ * Bugfix: Small bugfix for synchronous report execution +* Improvement: Delete functionality in mongo now also deletes files from GridFS 0.4.5 (2022-09-29)