Crash pailgun client on SIGINT if we haven't received the pid yet (#7924

) ## Problem If the client hadn't received the remote process PID when it received a Ctrl-C signal, it was unable to forward it to the daemon. The client receives the PID once the actual pants work starts, so there was no way of Ctrl-C-ing out of pants when a request was waiting to aquire the pantsd lock. More context and examples here: #7920 ## Solution Crash the client when we receive ctrl-c and haven't received a remote pid yet, so that the request is no longer valid, and the server can stop trying to serve it. ## Result Ctrl-C now works when waiting for the daemon lock, and it doesn't kill the daemon.
pantsbuild · Jun 26, 2019 · 8edbcef · 8edbcef
1 parent 4514aa5
commit 8edbcef
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 2 deletions.
diff --git a/src/python/pants/bin/remote_pants_runner.py b/src/python/pants/bin/remote_pants_runner.py
@@ -29,6 +29,7 @@ def __init__(self, pailgun_client, timeout=1, *args, **kwargs):
     super().__init__(*args, **kwargs)
 
   def _forward_signal_with_timeout(self, signum, signame):
+    # TODO Consider not accessing the private function _maybe_last_pid here, or making it public.
     logger.info(
       'Sending {} to pantsd with pid {}, waiting up to {} seconds before sending SIGKILL...'
       .format(signame, self._pailgun_client._maybe_last_pid(), self._timeout))
@@ -38,7 +39,12 @@ def _forward_signal_with_timeout(self, signum, signame):
     self._pailgun_client.maybe_send_signal(signum)
 
   def handle_sigint(self, signum, _frame):
-    self._forward_signal_with_timeout(signum, 'SIGINT')
+    if self._pailgun_client._maybe_last_pid():
+      self._forward_signal_with_timeout(signum, 'SIGINT')
+    else:
+      # NB: We consider not having received a PID yet as "not having started substantial work".
+      # So in this case, we let the client die gracefully, and the server handle the closed socket.
+      super(PailgunClientSignalHandler, self).handle_sigint(signum, _frame)
 
   def handle_sigquit(self, signum, _frame):
     self._forward_signal_with_timeout(signum, 'SIGQUIT')

diff --git a/src/python/pants/java/nailgun_client.py b/src/python/pants/java/nailgun_client.py
@@ -314,6 +314,9 @@ def maybe_send_signal(self, signum, include_pgrp=True):
     """Send the signal `signum` send if the PID and/or PGRP chunks have been received.
 
     No error is raised if the pid or pgrp are None or point to an already-dead process.
+
+    :param signum: The signal number to send to the remote process.
+    :param include_pgrp: If True, it will try to kill the pgrp as well
     """
     remote_pid = self._maybe_last_pid()
     if remote_pid is not None:

diff --git a/src/python/pants/option/global_options.py b/src/python/pants/option/global_options.py
@@ -266,7 +266,9 @@ def register_bootstrap_options(cls, register):
     # In practice, this means that if this is set, a run will not even try to use pantsd.
     # NB: Eventually, we would like to deprecate this flag in favor of making pantsd runs parallelizable.
     register('--concurrent', advanced=True, type=bool, default=False, daemon=False,
-             help='Enable concurrent runs of pants.')
+             help='Enable concurrent runs of pants. Without this enabled, pants will '
+                  'start up all concurrent invocations (e.g. in other terminals) without pantsd. '
+                  'Enabling this option requires parallel pants invocations to block on the first')
 
     # Shutdown pantsd after the current run.
     # This needs to be accessed at the same time as enable_pantsd,

diff --git a/src/python/pants/pantsd/pailgun_server.py b/src/python/pants/pantsd/pailgun_server.py
@@ -266,6 +266,7 @@ def yield_and_release(time_waited):
       with yield_and_release(time_polled):
         yield
     else:
+      self.logger.debug("request {} didn't aquire the lock on the first try, polling...".format(request))
       # We have to wait for another request to finish being handled.
       NailgunProtocol.send_stderr(request, "Another pants invocation is running. Will wait {} for it to finish before giving up.\n".format(
         "forever" if self._should_poll_forever(timeout) else "up to {} seconds".format(timeout)
@@ -292,6 +293,9 @@ def process_request_thread(self, request, client_address):
         # Attempt to handle a request with the handler.
         handler.handle_request()
         self.request_complete_callback()
+    except BrokenPipeError as e:
+      # The client has closed the connection, most likely from a SIGINT
+      self.logger.error("Request {} abruptly closed with {}, probably because the client crashed or was sent a SIGINT.".format(request, type(e)))
     except Exception as e:
       # If that fails, (synchronously) handle the error with the error handler sans-fork.
       try:

diff --git a/tests/python/pants_test/pantsd/test_pantsd_integration.py b/tests/python/pants_test/pantsd/test_pantsd_integration.py
@@ -600,6 +600,56 @@ def test_signal_pailgun_stream_timeout(self):
       # NB: Make the timeout very small to ensure the warning message will reliably occur in CI!
       quit_timeout=1e-6)
 
+  def test_sigint_kills_request_waiting_for_lock(self):
+    """
+    Test that, when a pailgun request is blocked waiting for another one to end,
+    sending SIGINT to the blocked run will kill it.
+
+    Regression test for issue: #7920
+    """
+    config = {'GLOBAL': {
+      'pantsd_timeout_when_multiple_invocations': -1,
+      'level': 'debug'
+    }}
+    with self.pantsd_test_context(extra_config=config) as (workdir, config, checker):
+      # Run a repl, so that any other run waiting to acquire the daemon lock waits forever.
+      first_run_handle = self.run_pants_with_workdir_without_waiting(
+        command=['repl', 'examples/src/python/example/hello::'],
+        workdir=workdir,
+        config=config
+      )
+      checker.assert_started()
+      checker.assert_running()
+
+      blocking_run_handle = self.run_pants_with_workdir_without_waiting(
+        command=['goals'],
+        workdir=workdir,
+        config=config
+      )
+
+      # Block until the second request is waiting for the lock.
+      blocked = True
+      while blocked:
+        log = '\n'.join(read_pantsd_log(workdir))
+        if "didn't aquire the lock on the first try, polling." in log:
+          blocked = False
+        # NB: This sleep is totally deterministic, it's just so that we don't spend too many cycles
+        # busy waiting.
+        time.sleep(0.1)
+
+      # Sends SIGINT to the run that is waiting.
+      blocking_run_client_pid = blocking_run_handle.process.pid
+      os.kill(blocking_run_client_pid, signal.SIGINT)
+      blocking_run_handle.join()
+
+      # Check that pantsd is still serving the other request.
+      checker.assert_running()
+
+      # Send exit() to the repl, and exit it.
+      result = first_run_handle.join(stdin_data='exit()')
+      self.assert_success(result)
+      checker.assert_running()
+
   def test_pantsd_environment_scrubbing(self):
     # This pair of JVM options causes the JVM to always crash, so the command will fail if the env
     # isn't stripped.