Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Signal handling fixes #10758

Merged
merged 8 commits into from Sep 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
118 changes: 54 additions & 64 deletions src/python/pants/base/exception_sink.py
Expand Up @@ -10,7 +10,7 @@
import threading
import traceback
from contextlib import contextmanager
from typing import Optional
from typing import Callable, Dict, Iterator, Optional

import setproctitle

Expand All @@ -33,7 +33,7 @@ class SignalHandler:
"""

@property
def signal_handler_mapping(self):
def signal_handler_mapping(self) -> Dict[signal.Signals, Callable]:
"""A dict mapping (signal number) -> (a method handling the signal)."""
# Could use an enum here, but we never end up doing any matching on the specific signal value,
# instead just iterating over the registered signals to set handlers, so a dict is probably
Expand All @@ -44,41 +44,23 @@ def signal_handler_mapping(self):
signal.SIGTERM: self.handle_sigterm,
}

def __init__(self):
def __init__(self, *, pantsd_instance: bool):
self._ignore_sigint_lock = threading.Lock()
self._threads_ignoring_sigint = 0
self._ignoring_sigint_v2_engine = False
self._ignoring_sigint = False
self._pantsd_instance = pantsd_instance

def _check_sigint_gate_is_correct(self):
assert (
self._threads_ignoring_sigint >= 0
), "This should never happen, someone must have modified the counter outside of SignalHandler."

def _handle_sigint_if_enabled(self, signum, _frame):
with self._ignore_sigint_lock:
self._check_sigint_gate_is_correct()
threads_ignoring_sigint = self._threads_ignoring_sigint
ignoring_sigint_v2_engine = self._ignoring_sigint_v2_engine
if threads_ignoring_sigint == 0 and not ignoring_sigint_v2_engine:
self.handle_sigint(signum, _frame)

def _toggle_ignoring_sigint_v2_engine(self, toggle: bool):
def _handle_sigint_if_enabled(self, signum: int, _frame):
with self._ignore_sigint_lock:
self._ignoring_sigint_v2_engine = toggle
if not self._ignoring_sigint:
self.handle_sigint(signum, _frame)

@contextmanager
def _ignoring_sigint(self):
with self._ignore_sigint_lock:
self._check_sigint_gate_is_correct()
self._threads_ignoring_sigint += 1
try:
yield
finally:
def _toggle_ignoring_sigint(self, toggle: bool) -> None:
if not self._pantsd_instance:
with self._ignore_sigint_lock:
self._threads_ignoring_sigint -= 1
self._check_sigint_gate_is_correct()
self._ignoring_sigint = toggle

def handle_sigint(self, signum, _frame):
def handle_sigint(self, signum: int, _frame):
ExceptionSink._signal_sent = signum
raise KeyboardInterrupt("User interrupted execution with control-c!")

# TODO(#7406): figure out how to let sys.exit work in a signal handler instead of having to raise
Expand All @@ -97,10 +79,18 @@ def __init__(self, signum, signame):
self.traceback_lines = traceback.format_stack()
super(SignalHandler.SignalHandledNonLocalExit, self).__init__()

if "I/O operation on closed file" in self.traceback_lines:
logger.debug(
"SignalHandledNonLocalExit: unexpected appearance of "
"'I/O operation on closed file' in traceback"
)

def handle_sigquit(self, signum, _frame):
ExceptionSink._signal_sent = signum
raise self.SignalHandledNonLocalExit(signum, "SIGQUIT")

def handle_sigterm(self, signum, _frame):
ExceptionSink._signal_sent = signum
raise self.SignalHandledNonLocalExit(signum, "SIGTERM")


Expand All @@ -110,26 +100,35 @@ class ExceptionSink:
# NB: see the bottom of this file where we call reset_log_location() and other mutators in order
# to properly setup global state.
_log_dir = None

# Where to log stacktraces to in a SIGUSR2 handler.
_interactive_output_stream = None

# Whether to print a stacktrace in any fatal error message printed to the terminal.
_should_print_backtrace_to_terminal = True

# An instance of `SignalHandler` which is invoked to handle a static set of specific
# nonfatal signals (these signal handlers are allowed to make pants exit, but unlike SIGSEGV they
# don't need to exit immediately).
_signal_handler: Optional[SignalHandler] = None
_signal_handler: SignalHandler = SignalHandler(pantsd_instance=False)

# These persistent open file descriptors are kept so the signal handler can do almost no work
# (and lets faulthandler figure out signal safety).
_pid_specific_error_fileobj = None
_shared_error_fileobj = None

_signal_sent: Optional[int] = None

def __new__(cls, *args, **kwargs):
raise TypeError("Instances of {} are not allowed to be constructed!".format(cls.__name__))

class ExceptionSinkError(Exception):
pass

@classmethod
def signal_sent(cls) -> Optional[int]:
return cls._signal_sent

@classmethod
def reset_should_print_backtrace_to_terminal(cls, should_print_backtrace):
"""Set whether a backtrace gets printed to the terminal error stream on a fatal error.
Expand Down Expand Up @@ -281,20 +280,14 @@ def _try_write_with_flush(cls, fileobj, payload):
fileobj.flush()

@classmethod
def reset_signal_handler(cls, signal_handler):
"""Class state:
def reset_signal_handler(cls, signal_handler: SignalHandler) -> SignalHandler:
"""Given a SignalHandler, uses the `signal` std library functionality to set the pants
process's signal handlers to those specified in the object.

- Overwrites `cls._signal_handler`.
OS state:
- Overwrites signal handlers for SIGINT, SIGQUIT, and SIGTERM.

NB: This method calls signal.signal(), which will crash if not called from the main thread!

:returns: The :class:`SignalHandler` that was previously registered, or None if this is
the first time this method was called.
Note that since this calls `signal.signal()`, it will crash if not the main thread. Returns
the previously-registered signal handler.
"""
assert isinstance(signal_handler, SignalHandler)
# NB: Modify process-global state!

for signum, handler in signal_handler.signal_handler_mapping.items():
signal.signal(signum, handler)
# Retry any system calls interrupted by any of the signals we just installed handlers for
Expand All @@ -303,13 +296,13 @@ def reset_signal_handler(cls, signal_handler):
signal.siginterrupt(signum, False)

previous_signal_handler = cls._signal_handler
# NB: Mutate the class variables!
cls._signal_handler = signal_handler

return previous_signal_handler

@classmethod
@contextmanager
def trapped_signals(cls, new_signal_handler):
def trapped_signals(cls, new_signal_handler: SignalHandler) -> Iterator[None]:
"""A contextmanager which temporarily overrides signal handling.

NB: This method calls signal.signal(), which will crash if not called from the main thread!
Expand All @@ -321,23 +314,16 @@ def trapped_signals(cls, new_signal_handler):
cls.reset_signal_handler(previous_signal_handler)

@classmethod
@contextmanager
def ignoring_sigint(cls):
"""A contextmanager which disables handling sigint in the current signal handler. This
allows threads that are not the main thread to ignore sigint.

NB: Only use this if you can't use ExceptionSink.trapped_signals().

Class state:
- Toggles `self._ignore_sigint` in `cls._signal_handler`.
def toggle_ignoring_sigint(cls, toggle: bool) -> None:
"""This method is used to temporarily disable responding to the SIGINT signal sent by a
Ctrl-C in the terminal.

We currently only use this to implement disabling catching SIGINT while an
InteractiveProcess is running (where we want that process to catch it), and only when pantsd
is not enabled (if pantsd is enabled, the client will actually catch SIGINT and forward it
to the server, so we don't want the server process to ignore it.
"""
with cls._signal_handler._ignoring_sigint():
yield

@classmethod
def toggle_ignoring_sigint_v2_engine(cls, toggle: bool) -> None:
assert cls._signal_handler is not None
cls._signal_handler._toggle_ignoring_sigint_v2_engine(toggle)
cls._signal_handler._toggle_ignoring_sigint(toggle)

@classmethod
def _iso_timestamp_for_now(cls):
Expand Down Expand Up @@ -453,12 +439,16 @@ def _handle_signal_gracefully(cls, signum, signame, traceback_lines):
# import time.
# Set the log location for writing logs before bootstrap options are parsed.
ExceptionSink.reset_log_location(os.getcwd())

# NB: Mutate process-global state!
sys.excepthook = ExceptionSink.log_exception

# Sets a SIGUSR2 handler.
ExceptionSink.reset_interactive_output_stream(sys.stderr.buffer)
# Sets a handler that logs nonfatal signals to the exception sink.
ExceptionSink.reset_signal_handler(SignalHandler())

# Setup a default signal handler.
ExceptionSink.reset_signal_handler(SignalHandler(pantsd_instance=False))

# Set whether to print stacktraces on exceptions or signals during import time.
# NB: This will be overridden by bootstrap options in PantsRunner, so we avoid printing out a full
# stacktrace when a user presses control-c during import time unless the environment variable is set
Expand Down
4 changes: 0 additions & 4 deletions src/python/pants/base/exception_sink_integration_test.py
Expand Up @@ -32,8 +32,6 @@ def _assert_unhandled_exception_log_matches(self, pid, file_contents, namespace)
ResolveError: 'this-target-does-not-exist' was not found in namespace '{namespace}'\\. Did you mean one of:
""",
)
# Ensure we write all output such as stderr and reporting files before closing any streams.
self.assertNotIn("Exception message: I/O operation on closed file.", file_contents)

def _get_log_file_paths(self, workdir, pid):
pid_specific_log_file = ExceptionSink.exceptions_log_path(for_pid=pid, in_dir=workdir)
Expand Down Expand Up @@ -135,8 +133,6 @@ def _assert_graceful_signal_log_matches(self, pid, signum, signame, contents):
pid=pid, signum=signum, signame=signame
),
)
# Ensure we write all output such as stderr and reporting files before closing any streams.
self.assertNotIn("Exception message: I/O operation on closed file.", contents)

def test_dumps_logs_on_signal(self):
"""Send signals which are handled, but don't get converted into a KeyboardInterrupt."""
Expand Down
28 changes: 10 additions & 18 deletions src/python/pants/bin/remote_pants_runner.py
Expand Up @@ -4,7 +4,6 @@
import logging
import sys
import time
from contextlib import contextmanager
from typing import List, Mapping

from pants.base.exception_sink import ExceptionSink, SignalHandler
Expand All @@ -20,12 +19,11 @@


class PailgunClientSignalHandler(SignalHandler):
def __init__(self, pailgun_client, pid, timeout=1, *args, **kwargs):
assert isinstance(pailgun_client, NailgunClient)
def __init__(self, pailgun_client: NailgunClient, pid: int, timeout: float = 1):
self._pailgun_client = pailgun_client
self._timeout = timeout
self.pid = pid
super().__init__(*args, **kwargs)
super().__init__(pantsd_instance=False)

def _forward_signal_with_timeout(self, signum, signame):
# TODO Consider not accessing the private function _maybe_last_pid here, or making it public.
Expand All @@ -36,7 +34,7 @@ def _forward_signal_with_timeout(self, signum, signame):
)
self._pailgun_client.set_exit_timeout(
timeout=self._timeout,
reason=KeyboardInterrupt("Interrupted by user over pailgun client!"),
reason=KeyboardInterrupt("Sending user interrupt to pantsd"),
)
self._pailgun_client.maybe_send_signal(signum)

Expand Down Expand Up @@ -82,17 +80,6 @@ def __init__(
self._bootstrap_options = options_bootstrapper.bootstrap_options
self._client = PantsDaemonClient(self._bootstrap_options)

@contextmanager
def _trapped_signals(self, client, pid: int):
"""A contextmanager that handles SIGINT (control-c) and SIGQUIT (control-\\) remotely."""
signal_handler = PailgunClientSignalHandler(
client,
pid=pid,
timeout=self._bootstrap_options.for_global_scope().pantsd_pailgun_quit_timeout,
)
with ExceptionSink.trapped_signals(signal_handler):
yield

@staticmethod
def _backoff(attempt):
"""Minimal backoff strategy for daemon restarts."""
Expand Down Expand Up @@ -140,14 +127,17 @@ def run(self) -> ExitCode:
def _connect_and_execute(self, pantsd_handle: PantsDaemonClient.Handle) -> ExitCode:
port = pantsd_handle.port
pid = pantsd_handle.pid

global_options = self._bootstrap_options.for_global_scope()

# Merge the nailgun TTY capability environment variables with the passed environment dict.
ng_env = NailgunProtocol.ttynames_to_env(sys.stdin, sys.stdout.buffer, sys.stderr.buffer)
modified_env = {
**self._env,
**ng_env,
"PANTSD_RUNTRACKER_CLIENT_START_TIME": str(self._start_time),
"PANTSD_REQUEST_TIMEOUT_LIMIT": str(
self._bootstrap_options.for_global_scope().pantsd_timeout_when_multiple_invocations
global_options.pantsd_timeout_when_multiple_invocations
),
}

Expand All @@ -162,7 +152,9 @@ def _connect_and_execute(self, pantsd_handle: PantsDaemonClient.Handle) -> ExitC
metadata_base_dir=pantsd_handle.metadata_base_dir,
)

with self._trapped_signals(client, pantsd_handle.pid), STTYSettings.preserved():
timeout = global_options.pantsd_pailgun_quit_timeout
pantsd_signal_handler = PailgunClientSignalHandler(client, pid=pid, timeout=timeout)
with ExceptionSink.trapped_signals(pantsd_signal_handler), STTYSettings.preserved():
# Execute the command on the pailgun.
return client.execute(self._args[0], self._args[1:], modified_env)

Expand Down
15 changes: 13 additions & 2 deletions src/python/pants/engine/internals/scheduler.py
Expand Up @@ -314,9 +314,20 @@ def poll_workunits(self, session, max_log_verbosity: LogLevel) -> PolledWorkunit
return {"started": result[0], "completed": result[1]}

def _run_and_return_roots(self, session, execution_request):
def python_signal() -> bool:
"""This function checks to see whether the main Python thread has responded to a signal.

It is invoked by the Rust scheduler, and if it returns true, the scheduler will
gracefully shut down.
"""
return ExceptionSink.signal_sent() is not None

try:
raw_roots = self._native.lib.scheduler_execute(
self._scheduler, session, execution_request
self._scheduler,
session,
execution_request,
python_signal,
)
except self._native.lib.PollTimeout:
raise ExecutionTimeoutError("Timed out")
Expand Down Expand Up @@ -481,7 +492,7 @@ def execute(self, execution_request: ExecutionRequest):
),
)

ExceptionSink.toggle_ignoring_sigint_v2_engine(False)
ExceptionSink.toggle_ignoring_sigint(False)

self._maybe_visualize()

Expand Down
2 changes: 1 addition & 1 deletion src/python/pants/engine/process.py
Expand Up @@ -309,7 +309,7 @@ class InteractiveRunner:
_scheduler: "SchedulerSession"

def run(self, request: InteractiveProcess) -> InteractiveProcessResult:
ExceptionSink.toggle_ignoring_sigint_v2_engine(True)
ExceptionSink.toggle_ignoring_sigint(True)
return self._scheduler.run_local_interactive_process(request)
Comment on lines +312 to 313
Copy link
Sponsor Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this would be a good place to use a context manager / with block... any idea why we stopped doing that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code never used a with block, although I agree that that would be a good improvement, and I might implement that shortly in a followup commit.



Expand Down
4 changes: 2 additions & 2 deletions src/python/pants/java/nailgun_client.py
Expand Up @@ -43,7 +43,7 @@ def __init__(self, sock, in_file, out_file, err_file, exit_on_broken_pipe=False)
self._exit_timeout = None
self._exit_reason = None

def _set_exit_timeout(self, timeout, reason):
def _set_exit_timeout(self, timeout: float, reason: type) -> None:
"""Set a timeout for the remainder of the session, along with an exception to raise. which
is implemented by NailgunProtocol.

Expand Down Expand Up @@ -245,7 +245,7 @@ def try_connect(self):
else:
return sock

def set_exit_timeout(self, timeout, reason):
def set_exit_timeout(self, timeout: float, reason: type) -> None:
"""Expose the inner session object's exit timeout setter."""
self._session._set_exit_timeout(timeout, reason)

Expand Down