Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

add check_pid, and handle stale PID info in ipcluster.

    
ipcluster now actually checks whether a process with the recorded PID is running, which lets it better handle stale (or fake) PID files.
    
Checks are via `kernel32.OpenProcess` on Windows and `ps x` on everything else.  If the operation fails, then the process is assumed to exist, effectively falling back on prior behavior.

closes gh-428
  • Loading branch information...
commit bf35b32821db536d3ea72c0bf9bbb98afb0b1497 1 parent 8879791
Min RK authored
31 IPython/parallel/apps/clusterdir.py
@@ -23,6 +23,8 @@
23 23 import shutil
24 24 import sys
25 25
  26 +from subprocess import Popen, PIPE
  27 +
26 28 from IPython.config.loader import PyFileConfigLoader
27 29 from IPython.config.configurable import Configurable
28 30 from IPython.core.application import Application, BaseAppConfigLoader
@@ -534,4 +536,31 @@ def get_pid_from_file(self):
534 536 return pid
535 537 else:
536 538 raise PIDFileError('pid file not found: %s' % pid_file)
537   -
  539 +
  540 + def check_pid(self, pid):
  541 + if os.name == 'nt':
  542 + try:
  543 + import ctypes
  544 + # returns 0 if no such process (of ours) exists
  545 + # positive int otherwise
  546 + p = ctypes.windll.kernel32.OpenProcess(1,0,pid)
  547 + except Exception:
  548 + self.log.warn(
  549 + "Could not determine whether pid %i is running via `OpenProcess`. "
  550 + " Making the likely assumption that it is."%pid
  551 + )
  552 + return True
  553 + return bool(p)
  554 + else:
  555 + try:
  556 + p = Popen(['ps','x'], stdout=PIPE, stderr=PIPE)
  557 + output,_ = p.communicate()
  558 + except OSError:
  559 + self.log.warn(
  560 + "Could not determine whether pid %i is running via `ps x`. "
  561 + " Making the likely assumption that it is."%pid
  562 + )
  563 + return True
  564 + pids = map(int, re.findall(r'^\W*\d+', output, re.MULTILINE))
  565 + return pid in pids
  566 +
62 IPython/parallel/apps/ipclusterapp.py
@@ -21,6 +21,7 @@
21 21 import re
22 22 import signal
23 23
  24 +from subprocess import check_call, CalledProcessError, PIPE
24 25 import zmq
25 26 from zmq.eventloop import ioloop
26 27
@@ -497,13 +498,17 @@ def start_app_start(self):
497 498 except PIDFileError:
498 499 pass
499 500 else:
500   - self.log.critical(
501   - 'Cluster is already running with [pid=%s]. '
502   - 'use "ipcluster stop" to stop the cluster.' % pid
503   - )
504   - # Here I exit with a unusual exit status that other processes
505   - # can watch for to learn how I existed.
506   - self.exit(ALREADY_STARTED)
  501 + if self.check_pid(pid):
  502 + self.log.critical(
  503 + 'Cluster is already running with [pid=%s]. '
  504 + 'use "ipcluster stop" to stop the cluster.' % pid
  505 + )
  506 + # Here I exit with an unusual exit status that other processes
  507 + # can watch for to learn how I exited.
  508 + self.exit(ALREADY_STARTED)
  509 + else:
  510 + self.remove_pid_file()
  511 +
507 512
508 513 # Now log and daemonize
509 514 self.log.info(
@@ -526,7 +531,8 @@ def start_app_start(self):
526 531 pass
527 532 else:
528 533 raise
529   - self.remove_pid_file()
  534 + finally:
  535 + self.remove_pid_file()
530 536
531 537 def start_app_engines(self):
532 538 """Start the app for the start subcommand."""
@@ -563,23 +569,41 @@ def start_app_stop(self):
563 569 pid = self.get_pid_from_file()
564 570 except PIDFileError:
565 571 self.log.critical(
566   - 'Problem reading pid file, cluster is probably not running.'
  572 + 'Could not read pid file, cluster is probably not running.'
567 573 )
568 574 # Here I exit with an unusual exit status that other processes
569 575 # can watch for to learn how I exited.
  576 + self.remove_pid_file()
570 577 self.exit(ALREADY_STOPPED)
571   - else:
572   - if os.name=='posix':
573   - sig = config.Global.signal
574   - self.log.info(
575   - "Stopping cluster [pid=%r] with [signal=%r]" % (pid, sig)
576   - )
  578 +
  579 + if not self.check_pid(pid):
  580 + self.log.critical(
  581 + 'Cluster [pid=%r] is not running.' % pid
  582 + )
  583 + self.remove_pid_file()
  584 + # Here I exit with an unusual exit status that other processes
  585 + # can watch for to learn how I exited.
  586 + self.exit(ALREADY_STOPPED)
  587 +
  588 + elif os.name=='posix':
  589 + sig = config.Global.signal
  590 + self.log.info(
  591 + "Stopping cluster [pid=%r] with [signal=%r]" % (pid, sig)
  592 + )
  593 + try:
577 594 os.kill(pid, sig)
578   - elif os.name=='nt':
579   - # As of right now, we don't support daemonize on Windows, so
580   - # stop will not do anything. Minimally, it should clean up the
581   - # old .pid files.
  595 + except OSError:
  596 + self.log.error("Stopping cluster failed, assuming already dead.",
  597 + exc_info=True)
582 598 self.remove_pid_file()
  599 + elif os.name=='nt':
  600 + try:
  601 + # kill the whole tree
  602 + p = check_call(['taskkill', '-pid', str(pid), '-t', '-f'], stdout=PIPE,stderr=PIPE)
  603 + except (CalledProcessError, OSError):
  604 + self.log.error("Stopping cluster failed, assuming already dead.",
  605 + exc_info=True)
  606 + self.remove_pid_file()
583 607
584 608
585 609 def launch_new_instance():

0 comments on commit bf35b32

Please sign in to comment.
Something went wrong with that request. Please try again.