Skip to content
This repository has been archived by the owner on Jan 20, 2022. It is now read-only.

Commit

Permalink
Merge pull request #25 from mrrrgn/backoff
Browse files Browse the repository at this point in the history
Bug 1144445 - Add jitter to runner retries
  • Loading branch information
usize committed Mar 18, 2015
2 parents 9cf64ee + 91bae4e commit 5017ea4
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 4 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ Configuration is done with INI style configuration files.
## [runner] section
Keys:

- `sleep_time`: how long to wait between retries
- `sleep_time`: minimum time to wait between retries
- `retry_jitter`: upper bound of a random interval (in seconds) added to `sleep_time` before each retry
- `max_tries`: how many times to retry before giving up
- `halt_task`: which task to run to "halt" the process. This could perhaps shut
the machine down or terminate the EC2 instance
Expand Down
13 changes: 10 additions & 3 deletions runner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import time
import shlex
import json
import random
import subprocess

from lib.config import Config, TaskConfig
Expand Down Expand Up @@ -93,6 +94,7 @@ def process_taskdir(config, dirname):
"max_time": int(config.max_time),
"max_tries": int(config.max_tries),
"sleep_time": int(config.sleep_time),
"retry_jitter": int(config.retry_jitter),
"interpreter": config.interpreter,
}

Expand Down Expand Up @@ -156,9 +158,14 @@ def process_taskdir(config, dirname):
log.info("halting")
run_task(halt_cmd, env, max_time=task_config['max_time'])
return False
# Sleep and try again
log.debug("sleeping for %i", task_config['sleep_time'])
time.sleep(task_config['sleep_time'])
# Sleep and try again, sleep time is the lower bound within a
# random jitter. Note: the 1.14 was chosen at random
# and has no special meaning.
sleep_time = int((1.14**try_num) * random.randint(
task_config['sleep_time'],
task_config['sleep_time'] + task_config['retry_jitter']))
log.debug("sleeping for %i", sleep_time)
time.sleep(sleep_time)
break
elif r == "HALT":
log.info("halting")
Expand Down
1 change: 1 addition & 0 deletions runner/lib/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

class Config(object):
sleep_time = 1
retry_jitter = 30
max_tries = 5
max_time = 600
halt_task = "halt.sh"
Expand Down
3 changes: 3 additions & 0 deletions tests/test-process-taskdir.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def teardown_logfile():
#os.remove(logfile)
pass


def test_tasks_default_config():
config = Config()
assert runner.process_taskdir(config, tasksd) is True
Expand All @@ -37,6 +38,7 @@ def test_tasks_pre_post_hooks():

config.max_time = 1
config.max_tries = 1
config.retry_jitter = 0
config.task_hook = "python %s runner-test %s" % (pre_post_hook, logfile)
runner.process_taskdir(config, tasksd)

Expand Down Expand Up @@ -100,6 +102,7 @@ def test_task_retries():
config = Config()
config.max_time = 1
config.max_tries = 2
config.retry_jitter = 0
fake_halt_task_name = 'mrrrgns_lil_halt_task'
config.halt_task = fake_halt_task_name

Expand Down

0 comments on commit 5017ea4

Please sign in to comment.