Skip to content

Commit

Permalink
Run reports in a separate Python process. (#41)
Browse files Browse the repository at this point in the history
The runpy module we've been using to run reports provides very little
isolation. In particular, "any side effects (such as cached imports of
other modules) will remain in place after the functions have returned".
This means that if a user splits their report into multiple files, with
one file importing another, the latter will be cached and never reloaded
by Python, even if it is modified. Similarly, if the module was already
loaded before, the report runs against old code that does not match the
files included in the packet.

Using a separate subprocess gives us a much stronger isolation,
including all global variables and not sharing any of previously loaded
modules. It means a user can run a report, modify some of the imported
code, run the report again and have this behave as expected.

The sandbox is implemented by pickling the function and its argument
into a file, starting a new Python process which unpickles the file and
calls the target. Similarly, the function's return value, or any
exception it may throw, is pickled to an output file, which is read by
the parent.

I tried to use the multiprocessing module to implement this, but it puts
inconvenient requirements on the way the top-level source file is
written. The module re-evaluates the file in the subprocess, meaning its
contents must check for `__name__ == "__main__"` to avoid doing the work
again. It also doesn't work too well with the REPL. Using our own
implementation fixes these issues.

The Packet class is refactored a little and its scope is reduced to just
creating the packet and its metadata, without inserting it into the
repository. The insertion is now handled by a separate function. This is
done as a way of distinguishing what happens in the child process and
what happens in the parent.
  • Loading branch information
plietar committed May 3, 2024
1 parent 14260e4 commit c42fca9
Show file tree
Hide file tree
Showing 13 changed files with 407 additions and 149 deletions.
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ dependencies = [
"jsonschema",
"pygit2",
"outpack-query-parser",
"humanize"
"humanize",
"tblib"
]

[project.urls]
Expand Down Expand Up @@ -214,3 +215,7 @@ ignore_missing_imports = true
[[tool.mypy.overrides]]
module = "outpack_query_parser"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "tblib"
ignore_missing_imports = true
112 changes: 80 additions & 32 deletions src/orderly/run.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import runpy
import shutil
from pathlib import Path
from typing import Tuple

from orderly.core import Description
from orderly.current import ActiveOrderlyContext
from orderly.current import ActiveOrderlyContext, OrderlyCustomMetadata
from orderly.read import orderly_read
from outpack.ids import outpack_id
from outpack.packet import Packet
from outpack.metadata import MetadataCore
from outpack.packet import Packet, insert_packet
from outpack.root import root_open
from outpack.util import all_normal_files, run_script
from outpack.sandbox import run_in_sandbox
from outpack.util import all_normal_files


def orderly_run(
Expand All @@ -19,36 +22,89 @@ def orderly_run(
path_src, entrypoint = _validate_src_directory(name, root)

dat = orderly_read(path_src / entrypoint)
envir = _validate_parameters(parameters, dat["parameters"])
parameters = _validate_parameters(parameters, dat["parameters"])

packet_id = outpack_id()
path_dest = root.path / "draft" / name / packet_id
path_dest.mkdir(parents=True)

_copy_resources_implicit(path_src, path_dest)

metadata = run_in_sandbox(
_packet_builder,
args=(
root.path,
packet_id,
name,
path_dest,
path_src,
entrypoint,
parameters,
search_options,
),
cwd=path_dest,
)

insert_packet(root, path_dest, metadata)

# This is intentionally not in a `try-finally` block. If creating the
# packet fails and an exception was raised, we want to keep the packet in
# the drafts directory for the user to examine.
shutil.rmtree(path_dest)

return packet_id


def _packet_builder(
root, id, name, path, path_src, entrypoint, parameters, search_options
) -> MetadataCore:
root = root_open(root, locate=False)
packet = Packet(
root, path_dest, name, id=packet_id, locate=False, parameters=envir
root,
path,
name,
id=id,
locate=False,
parameters=parameters,
)

packet.mark_file_immutable(entrypoint)

try:
orderly = _run_report_script(
packet, path, path_src, entrypoint, parameters, search_options
)
_check_artefacts(orderly, path)
except:
packet.end(succesful=False)
raise

packet.add_custom_metadata("orderly", _custom_metadata(entrypoint, orderly))
return packet.end()


def _run_report_script(
packet, path, path_src, entrypoint, parameters, search_options
) -> OrderlyCustomMetadata:
try:
with ActiveOrderlyContext(packet, path_src, search_options) as orderly:
packet.mark_file_immutable(entrypoint)
run_script(path_dest, entrypoint, envir)
# Calling runpy with the full path to the script gives us better
# tracebacks
runpy.run_path(
str(path / entrypoint),
init_globals=parameters,
run_name="__main__",
)

except Exception as error:
_orderly_cleanup_failure(packet)
# This is pretty barebones for now; we will need to do some
# work to make sure that we retain enough contextual errors
# for the user to see that the report failed, and that it
# failed *because* something else failed.
msg = "Running orderly report failed!"
raise Exception(msg) from error

try:
_orderly_cleanup_success(packet, entrypoint, orderly)
except Exception:
_orderly_cleanup_failure(packet)
raise
return packet_id
return orderly


def _validate_src_directory(name, root) -> Tuple[Path, str]:
Expand Down Expand Up @@ -105,22 +161,6 @@ def _copy_resources_implicit(src, dest):
shutil.copy2(src / p, p_dest)


def _orderly_cleanup_success(packet, entrypoint, orderly):
missing = set()
for artefact in orderly.artefacts:
for path in artefact.files:
if not packet.path.joinpath(path).exists():
missing.add(path)
if missing:
missing = ", ".join(f"'{x}'" for x in sorted(missing))
msg = f"Script did not produce the expected artefacts: {missing}"
raise Exception(msg)
# check files (either strict or relaxed) -- but we can't do that yet
packet.add_custom_metadata("orderly", _custom_metadata(entrypoint, orderly))
packet.end(insert=True)
shutil.rmtree(packet.path)


def _custom_metadata(entrypoint, orderly):
role = [{"path": entrypoint, "role": "orderly"}]
for p in orderly.resources:
Expand All @@ -136,5 +176,13 @@ def _custom_metadata(entrypoint, orderly):
}


def _orderly_cleanup_failure(packet):
packet.end(insert=False)
def _check_artefacts(metadata, path):
missing = set()
for artefact in metadata.artefacts:
for file in artefact.files:
if not path.joinpath(file).exists():
missing.add(file)
if missing:
missing = ", ".join(f"'{x}'" for x in sorted(missing))
msg = f"Script did not produce the expected artefacts: {missing}"
raise Exception(msg)
20 changes: 9 additions & 11 deletions src/outpack/packet.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def add_custom_metadata(self, key, value):
raise Exception(msg)
self.custom[key] = value

def end(self, *, insert=True):
def end(self, *, succesful=True):
if self.metadata:
msg = f"Packet '{id}' already ended"
raise Exception(msg)
Expand All @@ -96,11 +96,14 @@ def end(self, *, insert=True):
]
_check_immutable_files(self.files, self.immutable)
self.metadata = self._build_metadata()

validate(self.metadata.to_dict(), "outpack/metadata.json")
if insert:
_insert(self.root, self.path, self.metadata)
else:
_cancel(self.root, self.path, self.metadata)
if not succesful:
self.path.joinpath("outpack.json").write_text(
self.metadata.to_json()
)

return self.metadata

def _build_metadata(self):
return MetadataCore(
Expand All @@ -116,7 +119,7 @@ def _build_metadata(self):
)


def _insert(root, path, meta):
def insert_packet(root, path, meta):
# check that we have not already inserted this packet; in R we
# look to see if it's unpacked but actually the issue is if it is
# present as metadata at all.
Expand All @@ -141,11 +144,6 @@ def _insert(root, path, meta):
mark_known(root, meta.id, "local", hash_meta, time.time())


def _cancel(_root, path, meta):
with path.joinpath("outpack.json").open("w") as f:
f.write(meta.to_json())


def _check_immutable_files(files, immutable):
if not immutable:
return
Expand Down
79 changes: 79 additions & 0 deletions src/outpack/sandbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
import pickle
import subprocess
import sys

from tblib import pickling_support

from outpack.util import openable_temporary_file


def run_in_sandbox(target, args=(), cwd=None, syspath=None):
    """
    Run a function as a separate process.

    The function, its arguments and return value must be picklable.

    Parameters
    ----------
    target:
        The function to run in the subprocess. This function must be accessible
        by name from the top level of a module in order to be pickled.
    args:
        The arguments to be passed to the function
    cwd:
        The working directory in which the subprocess runs. If None, it inherits
        the current process' working directory.
    syspath:
        A list of paths to be added to the child process' Python search path.
        This is used when the target function's module is not globally
        available.

    Returns
    -------
    The value returned by ``target``. If ``target`` raised, that exception is
    re-raised in the parent.
    """
    with openable_temporary_file() as input_file:
        with openable_temporary_file(mode="rb") as output_file:
            # Serialize the call for the child; it unpickles and invokes it
            # (see the __main__ driver of this module).
            pickle.dump((target, args), input_file)
            input_file.flush()

            cmd = [
                sys.executable,
                "-m",
                "outpack.sandbox",
                input_file.name,
                output_file.name,
            ]

            if syspath is not None:
                env = os.environ.copy()
                # os.pathsep (":" on POSIX, ";" on Windows) keeps the
                # PYTHONPATH handling portable; a hard-coded ":" would
                # silently break path lookup on Windows.
                pythonpath = os.pathsep.join(str(s) for s in syspath)

                if "PYTHONPATH" in env:
                    env["PYTHONPATH"] = (
                        f"{pythonpath}{os.pathsep}{env['PYTHONPATH']}"
                    )
                else:
                    env["PYTHONPATH"] = pythonpath
            else:
                env = None

            p = subprocess.run(cmd, cwd=cwd, env=env, check=False)  # noqa: S603
            p.check_returncode()

            # The child writes a (ok, value) pair: the return value on
            # success, or the (traceback-enriched) exception on failure.
            (ok, value) = pickle.load(output_file)  # noqa: S301
            if ok:
                return value
            else:
                raise value


# Child-process entry point: invoked by run_in_sandbox as
# `python -m outpack.sandbox <input-file> <output-file>`.
if __name__ == "__main__":
    # argv[1]: pickled (target, args) pair produced by the parent.
    with open(sys.argv[1], "rb") as input_file:
        (target, args) = pickle.load(input_file)  # noqa: S301

    try:
        result = (True, target(*args))
    except BaseException as e:
        # This allows the traceback to be pickled and communicated out of
        # the sandbox.
        pickling_support.install(e)
        result = (False, e)

    # argv[2]: pickled (ok, value) pair read back by the parent.
    with open(sys.argv[2], "wb") as output_file:
        pickle.dump(result, output_file)
10 changes: 0 additions & 10 deletions src/outpack/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import datetime
import os
import runpy
import tempfile
import time
from contextlib import contextmanager
Expand Down Expand Up @@ -49,15 +48,6 @@ def all_normal_files(path):
return result


def run_script(wd, path, init_globals):
with transient_working_directory(wd):
# other ways to do this include importlib, subprocess and
# multiprocess
runpy.run_path(
str(wd / path), init_globals=init_globals, run_name="__main__"
)


@contextmanager
def transient_working_directory(path):
origin = os.getcwd()
Expand Down
8 changes: 5 additions & 3 deletions tests/helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from outpack.init import outpack_init
from outpack.location import outpack_location_add_path
from outpack.metadata import MetadataCore, PacketDepends
from outpack.packet import Packet
from outpack.packet import Packet, insert_packet
from outpack.root import root_open
from outpack.schema import outpack_schema_version
from outpack.util import openable_temporary_file
Expand All @@ -29,15 +29,17 @@ def create_packet(root, name, *, packet_id=None, parameters=None):
packet can be populated in the block's body. The packet gets completed and
added to the repository when the context manager is exited.
"""
root = root_open(root, locate=False)
with TemporaryDirectory() as src:
p = Packet(root, src, name, id=packet_id, parameters=parameters)
try:
yield p
except BaseException:
p.end(insert=False)
p.end(succesful=False)
raise
else:
p.end(insert=True)
metadata = p.end(succesful=True)
insert_packet(root, Path(src), metadata)


def create_random_packet(root, name="data", *, parameters=None, packet_id=None):
Expand Down
2 changes: 2 additions & 0 deletions tests/orderly/examples/imports/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def get_description():
    """Return the description string for the multi-file example report."""
    message = "Hello from module"
    return message
4 changes: 4 additions & 0 deletions tests/orderly/examples/imports/imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Example report split across two files: it imports a sibling module so the
# test suite can check that a modified helpers.py is re-imported on the next
# run (per the commit message, runpy cached such imports; the subprocess
# sandbox does not).
import orderly
from helpers import get_description  # type: ignore

orderly.description(display=get_description())
Loading

0 comments on commit c42fca9

Please sign in to comment.