Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Bug 795417 - Persistent job queues. r=bc

Phones also no longer reboot by default.  Added a reboot command,
and made device commands handle "all" instead of a device ID.
Cleaned up some old comments.
  • Loading branch information...
commit 63933d50f835c0b7c2f87d4de089ec94fc027b49 1 parent cc8be50
@markrcote markrcote authored
Showing with 226 additions and 136 deletions.
  1. +1 −0  .gitignore
  2. +30 −43 autophone.py
  3. +80 −0 jobs.py
  4. +115 −93 worker.py
View
1  .gitignore
@@ -3,3 +3,4 @@
*.log
*~
configs/unittest_defaults.ini
+jobs.sqlite
View
73 autophone.py
@@ -24,6 +24,7 @@
import builds
import buildserver
+import jobs
import phonetest
from mailer import Mailer
@@ -65,7 +66,7 @@ def handle(self):
response = self.server.cmd_cb(line)
self.request.send(response + '\n')
- def __init__(self, clear_cache, reboot_phones, test_path, cachefile,
+ def __init__(self, clear_cache, test_path, cachefile,
ipaddr, port, logfile, loglevel, emailcfg, enable_pulse,
repos, buildtypes, build_cache_port):
self._test_path = test_path
@@ -85,6 +86,7 @@ def __init__(self, clear_cache, reboot_phones, test_path, cachefile,
self._stop = False
self._next_worker_num = 0
+ self.jobs = jobs.Jobs()
self.phone_workers = {} # indexed by mac address
self.worker_lock = threading.Lock()
self.cmd_lock = threading.Lock()
@@ -106,8 +108,9 @@ def __init__(self, clear_cache, reboot_phones, test_path, cachefile,
else:
# Otherwise assume cache is valid and read from it
self.read_cache()
- if reboot_phones:
- self.reset_phones()
+
+ if clear_cache:
+ self.jobs.clear_all()
self.server = None
self.server_thread = None
@@ -175,13 +178,6 @@ def check_for_dead_workers(self):
logging.info(traceback.format_exc())
def worker_msg_loop(self):
- # FIXME: look up worker by msg.phoneid and have worker process
- # message. worker, as part of the main process, can log status
- # and store it for later querying.
- # also, store first instance of current status (e.g. idle for 30
- # minutes, last update 1 minute ago). store test name and start time
- # if status is WORKING. All this will help us determine if and where
- # a phone/worker process is stuck.
try:
while not self._stop:
self.check_for_dead_workers()
@@ -197,14 +193,15 @@ def worker_msg_loop(self):
self.stop()
# Start the phones for testing
- def start_tests(self, build_url, devices=None):
+ def new_job(self, build_url, devices=None):
self.worker_lock.acquire()
for p in self.phone_workers.values():
- if devices and p.phone_cfg['phoneid'] not in devices:
+ phoneid = p.phone_cfg['phoneid']
+ if devices and phoneid not in devices:
continue
- logging.info('Notifying device %s of new build.' %
- p.phone_cfg['phoneid'])
- p.new_build(build_url)
+ self.jobs.new_job(build_url, phoneid)
+ logging.info('Notifying device %s of new job.' % phoneid)
+ p.new_job()
self.worker_lock.release()
def route_cmd(self, data):
@@ -247,28 +244,22 @@ def route_cmd(self, data):
response += ' previous state %s ago:\n %s\n' % (now - w.last_status_of_previous_type.timestamp, w.last_status_of_previous_type.short_desc())
response += 'ok'
elif (cmd == 'disable' or cmd == 'enable' or cmd == 'debug' or
- cmd == 'ping'):
+ cmd == 'ping' or cmd == 'reboot'):
# Commands that take a phone as a parameter
# FIXME: need start, stop, and remove
- # Note that disable means that the device will still be pinged
- # periodically. Do we need permanently disabled/stopped?
phoneid, space, params = params.partition(' ')
- worker = None
- for w in self.phone_workers.values():
- if (w.phone_cfg['serial'] == phoneid or
- w.phone_cfg['phoneid'] == phoneid):
- worker = w
- break
- if worker:
- f = getattr(worker, cmd)
- if params:
- f(params)
- else:
- f()
- response = 'ok'
- self.update_phone_cache()
- else:
- response = 'error: phone not found'
+ response = 'error: phone not found'
+ for worker in self.phone_workers.values():
+ if (phoneid.lower() == 'all' or
+ worker.phone_cfg['serial'] == phoneid or
+ worker.phone_cfg['phoneid'] == phoneid):
+ f = getattr(worker, cmd)
+ if params:
+ f(params)
+ else:
+ f()
+ response = 'ok'
+ self.update_phone_cache()
else:
response = 'Unknown command "%s"\n' % cmd
self.cmd_lock.release()
@@ -377,7 +368,7 @@ def trigger_jobs(self, data):
args = data.split(' ')
if not args:
return 'invalid args'
- self.start_tests(args[0], args[1:])
+ self.new_job(args[0], args[1:])
return 'ok'
def reset_phones(self):
@@ -395,7 +386,7 @@ def on_build(self, msg):
# those, and only run the ones with real URLs
# We create jobs for all the phones and push them into the queue
if 'buildurl' in msg:
- self.start_tests(msg['buildurl'])
+ self.new_job(msg['buildurl'])
def stop(self):
self._stop = True
@@ -405,7 +396,7 @@ def stop(self):
self.server_thread.join()
-def main(clear_cache, reboot_phones, test_path, cachefile, ipaddr, port,
+def main(clear_cache, test_path, cachefile, ipaddr, port,
logfile, loglevel_name, emailcfg, enable_pulse, enable_unittests,
cache_dir, override_build_dir, repos, buildtypes, build_cache_port):
@@ -459,7 +450,7 @@ def sigterm_handler(signum, frame):
print '%s Starting server on port %d.' % (
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), port)
- autophone = AutoPhone(clear_cache, reboot_phones, test_path, cachefile,
+ autophone = AutoPhone(clear_cache, test_path, cachefile,
ipaddr, port, logfile, loglevel, emailcfg,
enable_pulse, repos, buildtypes, build_cache_port)
signal.signal(signal.SIGTERM, sigterm_handler)
@@ -480,10 +471,6 @@ def sigterm_handler(signum, frame):
default=False,
help='If specified, we clear the information in the '
'autophone cache before starting')
- parser.add_option('--no-reboot', action='store_false', dest='reboot_phones',
- default=True, help='Indicates that phones should not be '
- 'rebooted when autophone starts (ignored if '
- '--clear-cache is used')
parser.add_option('--ipaddr', action='store', type='string', dest='ipaddr',
default=None, help='IP address of interface to use for '
'phone callbacks, e.g. after rebooting. If not given, '
@@ -558,7 +545,7 @@ def sigterm_handler(signum, frame):
if not options.buildtypes:
options.buildtypes = ['opt']
- exit_code = main(options.clear_cache, options.reboot_phones,
+ exit_code = main(options.clear_cache,
options.test_path, options.cachefile, options.ipaddr,
options.port, options.logfile, options.loglevel,
options.emailcfg, options.enable_pulse,
View
80 jobs.py
@@ -0,0 +1,80 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import datetime
+import os
+import sqlite3
+
+
+class Jobs(object):
+
+ MAX_ATTEMPTS = 3
+
+ def __init__(self, default_device=None):
+ self.default_device = default_device
+ self.filename = 'jobs.sqlite'
+ if not os.path.exists(self.filename):
+ conn = self._conn()
+ c = conn.cursor()
+ c.execute('create table jobs '
+ '(created text, last_attempt text, build_url text, '
+ 'attempts int, device text)')
+ conn.commit()
+
+ def _conn(self):
+ return sqlite3.connect(self.filename)
+
+ def clear_all(self):
+ conn = self._conn()
+ conn.cursor().execute('delete from jobs')
+ conn.commit()
+
+ def new_job(self, build_url, device=None):
+ if not device:
+ device = self.default_device
+ now = datetime.datetime.now().isoformat()
+ conn = self._conn()
+ conn.cursor().execute('insert into jobs values (?, ?, ?, 0, ?)',
+ (now, None, build_url, device))
+ conn.commit()
+
+ def jobs_pending(self, device=None):
+ if not device:
+ device = self.default_device
+ return self._conn().cursor().execute(
+ 'select count(ROWID) from jobs where device=?',
+ (device,)).fetchone()[0]
+
+ def get_next_job(self, device=None):
+ if not device:
+ device = self.default_device
+ conn = self._conn()
+ c = conn.cursor()
+ c.execute('delete from jobs where device=? and attempts>=?',
+ (device, self.MAX_ATTEMPTS))
+ conn.commit()
+ jobs = [{'id': job[0],
+ 'created': job[1],
+ 'last_attempt': job[2],
+ 'build_url': job[3],
+ 'attempts': job[4]}
+ for job in c.execute(
+ 'select ROWID as id,created,last_attempt,build_url,attempts'
+ ' from jobs where device=? order by created', (device,))]
+ if not jobs:
+ return None
+ next_job = jobs[0]
+ next_job['attempts'] += 1
+ next_job['last_attempt'] = datetime.datetime.now().isoformat()
+ c.execute('update jobs set attempts=?, last_attempt=? where ROWID=?',
+ (next_job['attempts'], next_job['last_attempt'],
+ next_job['id']))
+ conn.commit()
+ return next_job
+
+ def job_completed(self, job_id):
+ conn = self._conn()
+ c = conn.cursor()
+ c.execute('delete from jobs where ROWID=?', (job_id,))
+ conn.commit()
View
208 worker.py
@@ -6,6 +6,7 @@
import Queue
import datetime
+import jobs
import logging
import multiprocessing
import os
@@ -73,8 +74,8 @@ def start(self, status=phonetest.PhoneTestMessage.IDLE):
def stop(self):
self.subprocess.stop()
- def new_build(self, build_url):
- self.cmd_queue.put_nowait(('build', build_url))
+ def new_job(self):
+ self.cmd_queue.put_nowait(('job', None))
def reboot(self):
self.cmd_queue.put_nowait(('reboot', None))
@@ -138,9 +139,11 @@ def __init__(self, worker_num, ipaddr, tests, phone_cfg, user_cfg,
self.loglevel = loglevel
self.mailer = mailer
self.build_cache_port = build_cache_port
+ self._stop = False
self.p = None
- self.skipped_job_queue = []
+ self.jobs = jobs.Jobs(self.phone_cfg['phoneid'])
self.current_build = None
+ self.last_ping = None
self._dm = None
self.status = None
@@ -169,7 +172,7 @@ def start(self, status):
return
del self.p
self.status = status
- self.p = multiprocessing.Process(target=self.loop)
+ self.p = multiprocessing.Process(target=self.run)
self.p.start()
def stop(self):
@@ -345,11 +348,7 @@ def run_tests(self, build_metadata):
# may have gotten an error trying to reboot, so test again
if self.has_error():
- logging.info('Phone is in error state; queuing job '
- 'for later.')
- # FIXME: Write these to a file to resume eventually.
- # We'll need a way (command, cli option) to automatically
- # clear these as well.
+ logging.info('Phone is in error state; not running test.')
return False
self.status_update(phonetest.PhoneTestMessage(
@@ -388,13 +387,116 @@ def run_tests(self, build_metadata):
exc = 'Uncaught device error while running test!\n\n%s' % \
traceback.format_exc()
logging.error(exc)
- # FIXME: We should retry the whole thing; as it is, we
- # actually skip this job.
self.phone_disconnected(exc)
return False
return True
- def loop(self):
+ def handle_timeout(self):
+ if (self.status != phonetest.PhoneTestMessage.DISABLED and
+ (not self.last_ping or
+ (datetime.datetime.now() - self.last_ping >
+ datetime.timedelta(seconds=self.PING_SECONDS)))):
+ self.last_ping = datetime.datetime.now()
+ if self.ping():
+ if self.status == phonetest.PhoneTestMessage.DISCONNECTED:
+ self.recover_phone()
+ if not self.has_error():
+ self.status_update(phonetest.PhoneTestMessage(
+ self.phone_cfg['phoneid'],
+ phonetest.PhoneTestMessage.IDLE,
+ self.current_build))
+ else:
+ logging.info('Ping unanswered.')
+ # No point in trying to recover, since we couldn't
+ # even perform a simple action.
+ if not self.has_error():
+ self.phone_disconnected('No response to ping.')
+
+ def handle_job(self, job):
+ logging.info('Starting job %s.' % job['build_url'])
+ client = buildserver.BuildCacheClient(port=self.build_cache_port)
+ logging.info('Fetching build...')
+ cache_response = client.get(job['build_url'])
+ client.close()
+ if not cache_response['success']:
+ logging.warn('Errors occured getting build %s: %s' %
+ (job['build_url'], cache_response['error']))
+ return
+ if self.run_tests(cache_response['metadata']):
+ logging.info('Job completed.')
+ self.jobs.job_completed(job['id'])
+ self.status_update(phonetest.PhoneTestMessage(
+ self.phone_cfg['phoneid'],
+ phonetest.PhoneTestMessage.IDLE,
+ self.current_build))
+ else:
+ logging.error('Job failed.')
+
+ def handle_cmd(self, request):
+ if not request:
+ pass
+ elif request[0] == 'stop':
+ self._stop = True
+ elif request[0] == 'job':
+ # This is just a notification that breaks us from waiting on the
+ # command queue; it's not essential, since jobs are stored in
+ # a db, but it allows the worker to react quickly to a request if
+ # it isn't doing anything else.
+ pass
+ elif request[0] == 'reboot':
+ logging.info('Rebooting at user\'s request...')
+ self.reboot()
+ elif request[0] == 'disable':
+ self.disable_phone('Disabled at user\'s request', False)
+ elif request[0] == 'enable':
+ logging.info('Enabling phone at user\'s request...')
+ if self.has_error():
+ self.status_update(phonetest.PhoneTestMessage(
+ self.phone_cfg['phoneid'],
+ phonetest.PhoneTestMessage.IDLE,
+ self.current_build))
+ self.last_ping = None
+ elif request[0] == 'debug':
+ self.user_cfg['debug'] = request[1]
+ DeviceManagerSUT.debug = self.user_cfg['debug']
+ # update any existing DeviceManagerSUT objects
+ if self._dm:
+ self._dm.debug = self.user_cfg['debug']
+ for t in self.tests:
+ t.set_dm_debug(self.user_cfg['debug'])
+ elif request[0] == 'ping':
+ self.ping()
+
+ def main_loop(self):
+ # Commands take higher priority than jobs, so we deal with all
+ # immediately available commands, then start the next job, if there is
+ # one. If neither a job nor a command is currently available,
+ # block on the command queue for CMD_QUEUE_TIMEOUT_SECONDS.
+ while True:
+ request = None
+ while ((self.has_error() or not self.jobs.jobs_pending()) and
+ not request and self.cmd_queue.empty()):
+ try:
+ request = self.cmd_queue.get(
+ timeout=self.CMD_QUEUE_TIMEOUT_SECONDS)
+ except Queue.Empty:
+ request = None
+ self.handle_timeout()
+ while request:
+ self.handle_cmd(request)
+ if self._stop:
+ return
+ try:
+ request = self.cmd_queue.get_nowait()
+ except Queue.Empty:
+ request = None
+ if self.has_error():
+ continue
+ job = self.jobs.get_next_job()
+ if job:
+ self.handle_job(job)
+
+ def run(self):
sys.stdout = file(self.outfile, 'a', 0)
sys.stderr = sys.stdout
print '%s Worker starting up.' % \
@@ -414,8 +516,6 @@ def loop(self):
for t in self.tests:
t.status_cb = self.status_update
- last_ping = None
-
self.status_update(phonetest.PhoneTestMessage(
self.phone_cfg['phoneid'], self.status))
@@ -425,82 +525,4 @@ def loop(self):
if self.has_error():
logging.error('Initial SD card check failed.')
- while True:
- request = None
- try:
- request = self.cmd_queue.get(
- timeout=self.CMD_QUEUE_TIMEOUT_SECONDS)
- except Queue.Empty:
- if (self.status != phonetest.PhoneTestMessage.DISABLED and
- (not last_ping or
- (datetime.datetime.now() - last_ping >
- datetime.timedelta(seconds=self.PING_SECONDS)))):
- last_ping = datetime.datetime.now()
- if self.ping():
- if (self.status ==
- phonetest.PhoneTestMessage.DISCONNECTED):
- self.recover_phone()
- if not self.has_error():
- self.status_update(phonetest.PhoneTestMessage(
- self.phone_cfg['phoneid'],
- phonetest.PhoneTestMessage.IDLE,
- self.current_build))
- else:
- logging.info('Ping unanswered.')
- # No point in trying to recover, since we couldn't
- # even perform a simple action.
- if not self.has_error():
- self.phone_disconnected('No response to ping.')
- except KeyboardInterrupt:
- return
-
- if not request:
- continue
- if request[0] == 'stop':
- return
- if request[0] == 'build':
- build_url = request[1]
- logging.info('Got notification of build %s.' % build_url)
- client = buildserver.BuildCacheClient(
- port=self.build_cache_port)
- logging.info('Fetching build...')
- cache_response = client.get(build_url)
- client.close()
- if not cache_response['success']:
- logging.warn('Errors occured getting build %s: %s' %
- (build_url, cache_response['error']))
- continue
- if self.run_tests(cache_response['metadata']):
- logging.info('Job completed.')
- self.status_update(phonetest.PhoneTestMessage(
- self.phone_cfg['phoneid'],
- phonetest.PhoneTestMessage.IDLE,
- self.current_build))
- else:
- logging.error('Job failed; queuing it for later.')
- self.skipped_job_queue.append(cache_response['metadata'])
- elif request[0] == 'reboot':
- logging.info('Rebooting at user\'s request...')
- self.reboot()
- elif request[0] == 'disable':
- self.disable_phone('Disabled at user\'s request', False)
- elif request[0] == 'enable':
- logging.info('Enabling phone at user\'s request...')
- if self.has_error():
- self.status_update(phonetest.PhoneTestMessage(
- self.phone_cfg['phoneid'],
- phonetest.PhoneTestMessage.IDLE,
- self.current_build))
- last_ping = None
- for j in self.skipped_job_queue:
- self.cmd_queue.put(('job', j))
- elif request[0] == 'debug':
- self.user_cfg['debug'] = request[1]
- DeviceManagerSUT.debug = self.user_cfg['debug']
- # update any existing DeviceManagerSUT objects
- if self._dm:
- self._dm.debug = self.user_cfg['debug']
- for t in self.tests:
- t.set_dm_debug(self.user_cfg['debug'])
- elif request[0] == 'ping':
- self.ping()
+ self.main_loop()
Please sign in to comment.
Something went wrong with that request. Please try again.