1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -2,6 +2,7 @@
 ------------------
 
 * Bugfix: Small bugfix for synchronous report execution
+* Improvement: Delete functionality in mongo now also deletes files from GridFS
 
 
 0.4.5 (2022-09-29)
121 changes: 80 additions & 41 deletions notebooker/serialization/mongo.py
@@ -27,6 +27,66 @@ def _add_deleted_status_to_filter(base_filter):
     return base_filter
 
 
+def ignore_missing_files(f):
+    def _ignore_missing_files(result_data_store, path, *args, **kwargs):
+        try:
+            return f(result_data_store, path, *args, **kwargs)
+        except NoFile:
+            logger.error("Could not find file %s", path)
+            return ""
+
+    return _ignore_missing_files
+
+
+@ignore_missing_files
+def read_file(result_data_store, path, is_json=False):
+    r = result_data_store.get_last_version(path).read()
+    try:
+        return "" if not r else json.loads(r) if is_json else r.decode("utf8")
+    except UnicodeDecodeError:
+        return r
+
+
+@ignore_missing_files
+def read_bytes_file(result_data_store, path):
+    return result_data_store.get_last_version(path).read()
+
+
+def load_files_from_gridfs(result_data_store: gridfs.GridFS, result: Dict, do_read=True) -> List[str]:
+
+    gridfs_filenames = []
+    all_html_output_paths = result.get("raw_html_resources", {}).get("outputs", [])
+    gridfs_filenames.extend(all_html_output_paths)
+    if do_read:
+        outputs = {path: read_file(result_data_store, path) for path in all_html_output_paths}
+        result["raw_html_resources"]["outputs"] = outputs
+    if result.get("generate_pdf_output"):
+        pdf_filename = _pdf_filename(result["job_id"])
+        if do_read:
+            result["pdf"] = read_bytes_file(result_data_store, pdf_filename)
+        gridfs_filenames.append(pdf_filename)
+    if not result.get("raw_ipynb_json"):
+        json_filename = _raw_json_filename(result["job_id"])
+        if do_read:
+            result["raw_ipynb_json"] = read_file(result_data_store, json_filename, is_json=True)
+        gridfs_filenames.append(json_filename)
+    if not result.get("raw_html"):
+        html_filename = _raw_html_filename(result["job_id"])
+        if do_read:
+            result["raw_html"] = read_file(result_data_store, html_filename)
+        gridfs_filenames.append(html_filename)
+    if not result.get("email_html"):
+        email_filename = _raw_email_html_filename(result["job_id"])
+        result["email_html"] = read_file(result_data_store, email_filename)
+        gridfs_filenames.append(email_filename)
+    if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"):
+        css_inlining_filename = _css_inlining_filename(result["job_id"])
+        if do_read:
+            result["raw_html_resources"]["inlining"] = read_file(result_data_store, css_inlining_filename, is_json=True)
+        gridfs_filenames.append(css_inlining_filename)
+    return gridfs_filenames
+
+
 class MongoResultSerializer(ABC):
     # This class is the interface between Mongo and the rest of the application
 
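These module-level helpers are thin wrappers over GridFS's filename-versioned storage: get_last_version returns the newest file stored under a name, and NoFile is the miss that ignore_missing_files turns into an empty default. A minimal sketch of the underlying pattern, assuming a local MongoDB reachable via pymongo (the "demo" database name and file paths are illustrative, not Notebooker's):

import gridfs
from gridfs.errors import NoFile
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed local instance
store = gridfs.GridFS(client["demo"])  # illustrative database name

# Two writes under one filename: GridFS keeps both versions.
store.put(b"v1", filename="results/job-1/report.html")
store.put(b"v2", filename="results/job-1/report.html")
assert store.get_last_version("results/job-1/report.html").read() == b"v2"

try:
    store.get_last_version("results/missing.html")
except NoFile:
    print("missing file -> fall back to an empty payload")

Falling back to "" on a missing file keeps a stored result renderable even when one of its backing files was never written, e.g. a report that produced no PDF.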
@@ -212,47 +272,12 @@ def _convert_result(
         if cls is None:
             return None
 
-        def ignore_missing_files(f):
-            def _ignore_missing_files(path, *args, **kwargs):
-                try:
-                    return f(path, *args, **kwargs)
-                except NoFile:
-                    logger.error("Could not find file %s in %s", path, self.result_data_store)
-                    return ""
-            return _ignore_missing_files
-
-        @ignore_missing_files
-        def read_file(path, is_json=False):
-            r = self.result_data_store.get_last_version(path).read()
-            try:
-                return "" if not r else json.loads(r) if is_json else r.decode("utf8")
-            except UnicodeDecodeError:
-                return r
-
-        @ignore_missing_files
-        def read_bytes_file(path):
-            return self.result_data_store.get_last_version(path).read()
-
         if not load_payload:
             result.pop("stdout", None)
 
         if cls == NotebookResultComplete:
             if load_payload:
-                outputs = {path: read_file(path) for path in result.get("raw_html_resources", {}).get("outputs", [])}
-                result["raw_html_resources"]["outputs"] = outputs
-                if result.get("generate_pdf_output"):
-                    pdf_filename = _pdf_filename(result["job_id"])
-                    result["pdf"] = read_bytes_file(pdf_filename)
-                if not result.get("raw_ipynb_json"):
-                    result["raw_ipynb_json"] = read_file(_raw_json_filename(result["job_id"]), is_json=True)
-                if not result.get("raw_html"):
-                    result["raw_html"] = read_file(_raw_html_filename(result["job_id"]))
-                if not result.get("email_html"):
-                    result["email_html"] = read_file(_raw_email_html_filename(result["job_id"]))
-                if result.get("raw_html_resources") and not result.get("raw_html_resources", {}).get("inlining"):
-                    result["raw_html_resources"]["inlining"] = read_file(
-                        _css_inlining_filename(result["job_id"]), is_json=True
-                    )
+                load_files_from_gridfs(self.result_data_store, result, do_read=True)
             else:
                 result.pop("raw_html", None)
                 result.pop("raw_ipynb_json", None)
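The rewrite above collapses the per-field GridFS reads into one call, so a single enumeration of filenames serves two purposes: hydrating a result for display, or listing the files that back it. A sketch against a hypothetical result document (store as in the earlier sketch; note that the email_html branch in load_files_from_gridfs still performs its read even when do_read=False, though the filename is collected either way):

result = {"job_id": "job-1", "generate_pdf_output": True, "raw_html_resources": {"outputs": []}}

# Serving a result: pull the payload fields out of GridFS into the dict.
load_files_from_gridfs(store, result, do_read=True)

# Deleting a result: enumerate the backing filenames without reading them.
filenames = load_files_from_gridfs(store, result, do_read=False)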
@@ -298,7 +323,7 @@ def read_bytes_file(path):
         elif cls == NotebookResultError:
             if load_payload:
                 if not result.get("error_info"):
-                    result["error_info"] = read_file(_error_info_filename(result["job_id"]))
+                    result["error_info"] = read_file(self.result_data_store, _error_info_filename(result["job_id"]))
             else:
                 result.pop("error_info", None)
             return NotebookResultError(
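Since the closures were hoisted to module level, every call site now passes the GridFS handle explicitly, which is what lets delete_result below reuse the same helpers. The call shapes, as a sketch (store as above; the paths are illustrative, the real ones come from the _*_filename helpers):

error_text = read_file(store, "errors/job-1.txt")  # str, "" if missing
pdf_bytes = read_bytes_file(store, "pdfs/job-1.pdf")  # raw bytes
inlining = read_file(store, "inlining/job-1.json", is_json=True)  # parsed JSON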
@@ -319,11 +344,14 @@ def read_bytes_file(path):
         else:
             raise ValueError("Could not deserialise {} into result object.".format(result))
 
+    def _get_raw_check_result(self, job_id: str):
+        return self.library.find_one({"job_id": job_id}, {"_id": 0})
+
     def get_check_result(
-        self, job_id: AnyStr
+        self, job_id: AnyStr, load_payload: bool = True
     ) -> Optional[Union[NotebookResultError, NotebookResultComplete, NotebookResultPending]]:
-        result = self.library.find_one({"job_id": job_id}, {"_id": 0})
-        return self._convert_result(result)
+        result = self._get_raw_check_result(job_id)
+        return self._convert_result(result, load_payload=load_payload)
 
     def _get_raw_results(self, base_filter, projection, limit):
         base_filter = _add_deleted_status_to_filter(base_filter)
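get_check_result gains a load_payload flag so that status polls no longer drag multi-megabyte payloads out of GridFS. A usage sketch, with serializer built by initialize_serializer_from_config as in the tests below and job_id referring to some saved result:

# Cheap status check: payload fields are popped rather than hydrated.
summary = serializer.get_check_result(job_id, load_payload=False)
print(summary.status)

# Full fetch (the default) rehydrates raw_html, pdf, raw_ipynb_json, etc.
full = serializer.get_check_result(job_id)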
@@ -462,8 +490,19 @@ def get_latest_successful_job_ids_for_name_all_params(self, report_name: str) ->
     def n_all_results_for_report_name(self, report_name: str) -> int:
         return self._get_result_count({"report_name": report_name})
 
-    def delete_result(self, job_id: AnyStr) -> None:
+    def delete_result(self, job_id: AnyStr) -> Dict[str, Any]:
+        result = self._get_raw_check_result(job_id)
+        status = JobStatus.from_string(result["status"])
+        gridfs_filenames = []
+        if status == JobStatus.DONE:
+            gridfs_filenames = load_files_from_gridfs(self.result_data_store, result, do_read=False)
+        elif status in (JobStatus.ERROR, JobStatus.TIMEOUT, JobStatus.CANCELLED):
+            gridfs_filenames = [_error_info_filename(job_id)]
         self.update_check_status(job_id, JobStatus.DELETED)
+        for filename in gridfs_filenames:
+            logger.info(f"Deleting {filename}")
+            self.result_data_store.delete(filename)
+        return {"deleted_result_document": result, "gridfs_filenames": gridfs_filenames}
 
 
 def _pdf_filename(job_id: str) -> str:
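delete_result is therefore a soft delete of the Mongo document (its status flips to DELETED, which _add_deleted_status_to_filter screens out of subsequent queries) paired with a hard delete of the GridFS payload, returning both the raw document and the filenames for inspection. One hedged caveat: pymongo's stock gridfs.GridFS.delete takes a file _id rather than a filename, so if result_data_store is a plain GridFS instance, removing every version stored under a name looks like the sketch below; a store wrapper that accepts filenames directly would equally satisfy the loop above.

# Deleting all stored versions of a filename with stock pymongo GridFS:
for grid_out in store.find({"filename": "results/job-1/report.html"}):
    store.delete(grid_out._id)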
75 changes: 69 additions & 6 deletions tests/integration/test_mongo.py
@@ -2,8 +2,9 @@
 import uuid
 
 import pytest
+from gridfs.errors import NoFile
 
-from notebooker.constants import JobStatus, NotebookResultComplete
+from notebooker.constants import JobStatus, NotebookResultComplete, NotebookResultError
 from notebooker.serialization.serialization import initialize_serializer_from_config
 from notebooker.utils.filesystem import initialise_base_dirs
 
@@ -23,11 +24,11 @@ def test_mongo_saving_ipynb_json_to_gridfs(bson_library, webapp_config):
             update_time=datetime.datetime(2018, 1, 12, 2, 32),
             job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
             job_finish_time=datetime.datetime(2018, 1, 12, 2, 58),
-            raw_ipynb_json="x" * 32 * (2 ** 20),  # 16MB document max
-            raw_html="x" * 32 * (2 ** 20),  # 16MB document max
-            email_html="x" * 32 * (2 ** 20),  # 16MB document max
-            pdf=b"x" * 32 * (2 ** 20),  # 16MB document max
-            raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2 ** 20)}},
+            raw_ipynb_json="x" * 32 * (2**20),  # 16MB document max
+            raw_html="x" * 32 * (2**20),  # 16MB document max
+            email_html="x" * 32 * (2**20),  # 16MB document max
+            pdf=b"x" * 32 * (2**20),  # 16MB document max
+            raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}},
         )
     )
     result = serializer.get_check_result(job_id)
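The fixture sizes here are deliberate: 32 * (2**20) bytes is 32 MiB, double MongoDB's 16 MB BSON document cap, so these payloads cannot be stored inline and must round-trip through GridFS. The whitespace change itself is consistent with newer Black releases, which format simple powers as 2**20 without spaces.

assert 32 * (2**20) == 33_554_432  # 32 MiB, twice the 16 MB BSON document limit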
@@ -52,3 +53,65 @@ def test_cant_serialise_done_job_via_update(bson_library, webapp_config):
         raw_html="",
         email_html="",
     )
+
+
+def test_delete(bson_library, webapp_config):
+    initialise_base_dirs(webapp_config=webapp_config)
+    serializer = initialize_serializer_from_config(webapp_config)
+
+    job_id = str(uuid.uuid4())
+    report_name = str(uuid.uuid4())
+    raw_html = "x" * 32 * (2**20)
+    serializer.save_check_result(
+        NotebookResultComplete(
+            job_id=job_id,
+            report_name=report_name,
+            report_title=report_name,
+            status=JobStatus.DONE,
+            update_time=datetime.datetime(2018, 1, 12, 2, 32),
+            job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
+            job_finish_time=datetime.datetime(2018, 1, 12, 2, 58),
+            raw_ipynb_json="x" * 32 * (2**20),  # 16MB document max
+            raw_html=raw_html,  # 16MB document max
+            email_html="x" * 32 * (2**20),  # 16MB document max
+            pdf=b"x" * 32 * (2**20),  # 16MB document max
+            raw_html_resources={"inlining": {"big_thing": "a" * 32 * (2**20)}, "other_stuff": "Yep"},
+        )
+    )
+    assert bson_library.find_one({"job_id": job_id}) is not None
+    result = serializer.get_check_result(job_id)
+    assert result is not None
+    assert result.raw_html == raw_html
+    deleted_stuff = serializer.delete_result(job_id)
+    for filename in deleted_stuff["gridfs_filenames"]:
+        with pytest.raises(NoFile):
+            serializer.result_data_store.get(filename)
+    assert serializer.get_check_result(job_id) is None
+
+
+@pytest.mark.parametrize("job_status", [JobStatus.CANCELLED, JobStatus.TIMEOUT, JobStatus.ERROR])
+def test_delete_error(bson_library, webapp_config, job_status):
+    initialise_base_dirs(webapp_config=webapp_config)
+    serializer = initialize_serializer_from_config(webapp_config)
+
+    job_id = str(uuid.uuid4())
+    report_name = str(uuid.uuid4())
+    serializer.save_check_result(
+        NotebookResultError(
+            job_id=job_id,
+            report_name=report_name,
+            report_title=report_name,
+            status=job_status,
+            update_time=datetime.datetime(2018, 1, 12, 2, 32),
+            job_start_time=datetime.datetime(2018, 1, 12, 2, 30),
+            error_info="Oh no!",
+        )
+    )
+    assert bson_library.find_one({"job_id": job_id}) is not None
+    result = serializer.get_check_result(job_id)
+    assert result is not None
+    deleted_stuff = serializer.delete_result(job_id)
+    for filename in deleted_stuff["gridfs_filenames"]:
+        with pytest.raises(NoFile):
+            serializer.result_data_store.get(filename)
+    assert serializer.get_check_result(job_id) is None