Skip to content

Commit

Permalink
Crash pailgun client on SIGINT if we haven't received the pid yet (#7924
Browse files Browse the repository at this point in the history
)

## Problem

If the client hadn't received the remote process PID when it received a Ctrl-C signal, it was unable to forward it to the daemon. The client receives the PID once the actual pants work starts, so there was no way of Ctrl-C-ing out of pants when a request was waiting to aquire the pantsd lock. More context and examples here: #7920

## Solution

Crash the client when we receive ctrl-c and haven't received a remote pid yet, so that the request is no longer valid, and the server can stop trying to serve it.

## Result

Ctrl-C now works when waiting for the daemon lock, and it doesn't kill the daemon.
  • Loading branch information
blorente committed Jun 26, 2019
1 parent 4514aa5 commit 8edbcef
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 2 deletions.
8 changes: 7 additions & 1 deletion src/python/pants/bin/remote_pants_runner.py
Expand Up @@ -29,6 +29,7 @@ def __init__(self, pailgun_client, timeout=1, *args, **kwargs):
super().__init__(*args, **kwargs)

def _forward_signal_with_timeout(self, signum, signame):
# TODO Consider not accessing the private function _maybe_last_pid here, or making it public.
logger.info(
'Sending {} to pantsd with pid {}, waiting up to {} seconds before sending SIGKILL...'
.format(signame, self._pailgun_client._maybe_last_pid(), self._timeout))
Expand All @@ -38,7 +39,12 @@ def _forward_signal_with_timeout(self, signum, signame):
self._pailgun_client.maybe_send_signal(signum)

def handle_sigint(self, signum, _frame):
self._forward_signal_with_timeout(signum, 'SIGINT')
if self._pailgun_client._maybe_last_pid():
self._forward_signal_with_timeout(signum, 'SIGINT')
else:
# NB: We consider not having received a PID yet as "not having started substantial work".
# So in this case, we let the client die gracefully, and the server handle the closed socket.
super(PailgunClientSignalHandler, self).handle_sigint(signum, _frame)

def handle_sigquit(self, signum, _frame):
self._forward_signal_with_timeout(signum, 'SIGQUIT')
Expand Down
3 changes: 3 additions & 0 deletions src/python/pants/java/nailgun_client.py
Expand Up @@ -314,6 +314,9 @@ def maybe_send_signal(self, signum, include_pgrp=True):
"""Send the signal `signum` send if the PID and/or PGRP chunks have been received.
No error is raised if the pid or pgrp are None or point to an already-dead process.
:param signum: The signal number to send to the remote process.
:param include_pgrp: If True, it will try to kill the pgrp as well
"""
remote_pid = self._maybe_last_pid()
if remote_pid is not None:
Expand Down
4 changes: 3 additions & 1 deletion src/python/pants/option/global_options.py
Expand Up @@ -266,7 +266,9 @@ def register_bootstrap_options(cls, register):
# In practice, this means that if this is set, a run will not even try to use pantsd.
# NB: Eventually, we would like to deprecate this flag in favor of making pantsd runs parallelizable.
register('--concurrent', advanced=True, type=bool, default=False, daemon=False,
help='Enable concurrent runs of pants.')
help='Enable concurrent runs of pants. Without this enabled, pants will '
'start up all concurrent invocations (e.g. in other terminals) without pantsd. '
'Enabling this option requires parallel pants invocations to block on the first')

# Shutdown pantsd after the current run.
# This needs to be accessed at the same time as enable_pantsd,
Expand Down
4 changes: 4 additions & 0 deletions src/python/pants/pantsd/pailgun_server.py
Expand Up @@ -266,6 +266,7 @@ def yield_and_release(time_waited):
with yield_and_release(time_polled):
yield
else:
self.logger.debug("request {} didn't aquire the lock on the first try, polling...".format(request))
# We have to wait for another request to finish being handled.
NailgunProtocol.send_stderr(request, "Another pants invocation is running. Will wait {} for it to finish before giving up.\n".format(
"forever" if self._should_poll_forever(timeout) else "up to {} seconds".format(timeout)
Expand All @@ -292,6 +293,9 @@ def process_request_thread(self, request, client_address):
# Attempt to handle a request with the handler.
handler.handle_request()
self.request_complete_callback()
except BrokenPipeError as e:
# The client has closed the connection, most likely from a SIGINT
self.logger.error("Request {} abruptly closed with {}, probably because the client crashed or was sent a SIGINT.".format(request, type(e)))
except Exception as e:
# If that fails, (synchronously) handle the error with the error handler sans-fork.
try:
Expand Down
50 changes: 50 additions & 0 deletions tests/python/pants_test/pantsd/test_pantsd_integration.py
Expand Up @@ -600,6 +600,56 @@ def test_signal_pailgun_stream_timeout(self):
# NB: Make the timeout very small to ensure the warning message will reliably occur in CI!
quit_timeout=1e-6)

def test_sigint_kills_request_waiting_for_lock(self):
"""
Test that, when a pailgun request is blocked waiting for another one to end,
sending SIGINT to the blocked run will kill it.
Regression test for issue: #7920
"""
config = {'GLOBAL': {
'pantsd_timeout_when_multiple_invocations': -1,
'level': 'debug'
}}
with self.pantsd_test_context(extra_config=config) as (workdir, config, checker):
# Run a repl, so that any other run waiting to acquire the daemon lock waits forever.
first_run_handle = self.run_pants_with_workdir_without_waiting(
command=['repl', 'examples/src/python/example/hello::'],
workdir=workdir,
config=config
)
checker.assert_started()
checker.assert_running()

blocking_run_handle = self.run_pants_with_workdir_without_waiting(
command=['goals'],
workdir=workdir,
config=config
)

# Block until the second request is waiting for the lock.
blocked = True
while blocked:
log = '\n'.join(read_pantsd_log(workdir))
if "didn't aquire the lock on the first try, polling." in log:
blocked = False
# NB: This sleep is totally deterministic, it's just so that we don't spend too many cycles
# busy waiting.
time.sleep(0.1)

# Sends SIGINT to the run that is waiting.
blocking_run_client_pid = blocking_run_handle.process.pid
os.kill(blocking_run_client_pid, signal.SIGINT)
blocking_run_handle.join()

# Check that pantsd is still serving the other request.
checker.assert_running()

# Send exit() to the repl, and exit it.
result = first_run_handle.join(stdin_data='exit()')
self.assert_success(result)
checker.assert_running()

def test_pantsd_environment_scrubbing(self):
# This pair of JVM options causes the JVM to always crash, so the command will fail if the env
# isn't stripped.
Expand Down

0 comments on commit 8edbcef

Please sign in to comment.