Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

add check_pid, and handle stale PID info in ipcluster.

    
ipcluster now actually checks whether a process with the recorded PID is running, so stale (or fake) PID files are handled better.
    
Checks are via `kernel32.OpenProcess` on Windows and `ps x` on everything else.  If the operation fails, then the process is assumed to exist, effectively falling back on prior behavior.

closes gh-428
  • Loading branch information...
commit bf35b32821db536d3ea72c0bf9bbb98afb0b1497 1 parent 8879791
Min RK authored
31 IPython/parallel/apps/clusterdir.py
View
@@ -23,6 +23,8 @@
import shutil
import sys
+from subprocess import Popen, PIPE
+
from IPython.config.loader import PyFileConfigLoader
from IPython.config.configurable import Configurable
from IPython.core.application import Application, BaseAppConfigLoader
@@ -534,4 +536,31 @@ def get_pid_from_file(self):
return pid
else:
raise PIDFileError('pid file not found: %s' % pid_file)
-
+
def check_pid(self, pid):
    """Report whether a process with the given pid appears to be running.

    On Windows the check asks ``kernel32.OpenProcess`` for a handle to the
    process; everywhere else it scans the pid column of ``ps x`` output.
    Whenever the check itself cannot be carried out, a warning is logged
    and the process is assumed to be running.

    Parameters
    ----------
    pid : int
        The process id to look for.

    Returns
    -------
    bool
        True if the process was found or the check could not be performed,
        False if the check ran and did not find the pid.
    """
    if os.name == 'nt':
        try:
            import ctypes
            kernel32 = ctypes.windll.kernel32
            # Access right 1 is PROCESS_TERMINATE.  OpenProcess returns 0
            # if no such process (of ours) exists, a non-zero handle
            # otherwise.
            handle = kernel32.OpenProcess(1, 0, pid)
            if handle:
                # Only the existence of the process matters here; release
                # the handle so repeated checks do not leak handles.
                kernel32.CloseHandle(handle)
        except Exception:
            self.log.warn(
                "Could not determine whether pid %i is running via `OpenProcess`. "
                " Making the likely assumption that it is."%pid
            )
            return True
        return bool(handle)

    # Imported locally so this method does not depend on a module-level
    # ``import re`` being present in the file.
    import re

    try:
        p = Popen(['ps', 'x'], stdout=PIPE, stderr=PIPE)
        output, _ = p.communicate()
        ps_ok = (p.returncode == 0)
    except OSError:
        ps_ok = False

    if not ps_ok:
        # Either `ps` could not be launched or it exited with an error;
        # its output cannot be trusted, so fall back to "still running".
        self.log.warn(
            "Could not determine whether pid %i is running via `ps x`. "
            " Making the likely assumption that it is."%pid
        )
        return True

    # `communicate()` returns bytes, so match with a bytes pattern.  Only
    # leading whitespace may precede the pid column; capturing just the
    # digits keeps stray punctuation (e.g. '-') from reaching int().
    pids = [int(m) for m in re.findall(br'^\s*(\d+)', output, re.MULTILINE)]
    return pid in pids
+
62 IPython/parallel/apps/ipclusterapp.py
View
@@ -21,6 +21,7 @@
import re
import signal
+from subprocess import check_call, CalledProcessError, PIPE
import zmq
from zmq.eventloop import ioloop
@@ -497,13 +498,17 @@ def start_app_start(self):
except PIDFileError:
pass
else:
- self.log.critical(
- 'Cluster is already running with [pid=%s]. '
- 'use "ipcluster stop" to stop the cluster.' % pid
- )
- # Here I exit with a unusual exit status that other processes
- # can watch for to learn how I existed.
- self.exit(ALREADY_STARTED)
+ if self.check_pid(pid):
+ self.log.critical(
+ 'Cluster is already running with [pid=%s]. '
+ 'use "ipcluster stop" to stop the cluster.' % pid
+ )
+ # Here I exit with an unusual exit status that other processes
+ # can watch for to learn how I exited.
+ self.exit(ALREADY_STARTED)
+ else:
+ self.remove_pid_file()
+
# Now log and daemonize
self.log.info(
@@ -526,7 +531,8 @@ def start_app_start(self):
pass
else:
raise
- self.remove_pid_file()
+ finally:
+ self.remove_pid_file()
def start_app_engines(self):
"""Start the app for the start subcommand."""
@@ -563,23 +569,41 @@ def start_app_stop(self):
pid = self.get_pid_from_file()
except PIDFileError:
self.log.critical(
- 'Problem reading pid file, cluster is probably not running.'
+ 'Could not read pid file, cluster is probably not running.'
)
# Here I exit with an unusual exit status that other processes
# can watch for to learn how I exited.
+ self.remove_pid_file()
self.exit(ALREADY_STOPPED)
- else:
- if os.name=='posix':
- sig = config.Global.signal
- self.log.info(
- "Stopping cluster [pid=%r] with [signal=%r]" % (pid, sig)
- )
+
+ if not self.check_pid(pid):
+ self.log.critical(
+ 'Cluster [pid=%r] is not running.' % pid
+ )
+ self.remove_pid_file()
+ # Here I exit with an unusual exit status that other processes
+ # can watch for to learn how I exited.
+ self.exit(ALREADY_STOPPED)
+
+ elif os.name=='posix':
+ sig = config.Global.signal
+ self.log.info(
+ "Stopping cluster [pid=%r] with [signal=%r]" % (pid, sig)
+ )
+ try:
os.kill(pid, sig)
- elif os.name=='nt':
- # As of right now, we don't support daemonize on Windows, so
- # stop will not do anything. Minimally, it should clean up the
- # old .pid files.
+ except OSError:
+ self.log.error("Stopping cluster failed, assuming already dead.",
+ exc_info=True)
self.remove_pid_file()
+ elif os.name=='nt':
+ try:
+ # kill the whole tree
+ p = check_call(['taskkill', '-pid', str(pid), '-t', '-f'], stdout=PIPE,stderr=PIPE)
+ except (CalledProcessError, OSError):
+ self.log.error("Stopping cluster failed, assuming already dead.",
+ exc_info=True)
+ self.remove_pid_file()
def launch_new_instance():
Please sign in to comment.
Something went wrong with that request. Please try again.