From 54935d96a48240de81d89aaf80b25a407db48c3f Mon Sep 17 00:00:00 2001 From: Jamie Hewland Date: Thu, 29 Sep 2016 12:56:08 +0200 Subject: [PATCH] Add an endpoint to reload HAProxy config (#312) * First attempt at a reload endpoint * Add --api-listen parameter * Parse API listen URI before launching thread * Catch errors early * Apparently the http:// is important... * Refactor config validation into separate method * Simplify logic around skipping validation * In preparation for testing existing config... * Add the ability to reload existing HAProxy config * /reload?existing * Fix notifying bools in event processor * Improve debug logging around config comparisons * Tweak response messages for reload API * Do a slightly better check on query params * Add a README note about the endpoint * Regenerate Longhelp.md * Remove API and add signal handling to the event processor * Revert "Regenerate Longhelp.md" This reverts commit eef20c87e1dffdb30e9d9e50d32df2332700a9e8. * Revert "Add a README note about the endpoint" This reverts commit c5a5c501581a55534e6a66ba04fac22a8182409f. * Add Lua script and endpoints to send signals to marathon-lb * Write about new endpoints in README * Regenerate Longhelp.md * Add note to README about marathon-lb expecting to be in its own PID namespace --- Longhelp.md | 6 ++ README.md | 6 +- config.py | 6 ++ marathon_lb.py | 197 +++++++++++++++++++++++--------------- signalmlb.lua | 40 ++++++++ tests/test_marathon_lb.py | 6 ++ 6 files changed, 184 insertions(+), 77 deletions(-) create mode 100644 signalmlb.lua diff --git a/Longhelp.md b/Longhelp.md index e1902b42..cacbd75e 100644 --- a/Longhelp.md +++ b/Longhelp.md @@ -379,6 +379,7 @@ global lua-load /marathon-lb/getpids.lua lua-load /marathon-lb/getconfig.lua lua-load /marathon-lb/getmaps.lua + lua-load /marathon-lb/signalmlb.lua defaults load-server-state-from-file global log global @@ -410,6 +411,11 @@ listen stats http-request use-service lua.getappmap if getappmap acl getconfig path /_haproxy_getconfig http-request use-service lua.getconfig if getconfig + + acl signalmlbhup path /_mlb_signal/hup + http-request use-service lua.signalmlbhup if signalmlbhup + acl signalmlbusr1 path /_mlb_signal/usr1 + http-request use-service lua.signalmlbusr1 if signalmlbusr1 ``` ## `HAPROXY_HTTPS_FRONTEND_ACL` *Overridable* diff --git a/README.md b/README.md index 43f3ce0d..75f1543c 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,10 @@ Marathon-lb exposes a few endpoints on port 9090 (by default). They are: | `:9090/_haproxy_getvhostmap` | Returns the HAProxy vhost to backend map. This endpoint returns HAProxy map file only when the `--haproxy-map` flag is enabled, it returns an empty string otherwise. Implemented in [`getmaps.lua`](getmaps.lua). | | `:9090/_haproxy_getappmap` | Returns the HAProxy app ID to backend map. Like `_haproxy_getvhostmap`, this requires the `--haproxy-map` flag to be enabled and returns an empty string otherwise. Also implemented in `getmaps.lua`. | | `:9090/_haproxy_getpids` | Returns the PIDs for all HAProxy instances within the current process namespace. This literally returns `$(pidof haproxy)`. Implemented in [`getpids.lua`](getpids.lua). This is also used by the [`zdd.py`](zdd.py) script to determine if connections have finished draining during a deploy. | +| `:9090/_mlb_signal/hup`* | Sends a `SIGHUP` signal to the marathon-lb process, causing it to fetch the running apps from Marathon and reload the HAProxy config as though an event was received from Marathon. | +| `:9090/_mlb_signal/usr1`* | Sends a `SIGUSR1` signal to the marathon-lb process, causing it to restart HAProxy with the existing config, without checking Marathon for changes. | +\* These endpoints won't function when marathon-lb is in `poll` mode as there is no marathon-lb process to be signaled in this mode (marathon-lb exits after each poll). ## HAProxy Configuration @@ -265,6 +268,7 @@ are [documented here](Longhelp.md#templates). > < HTTP/1.1 200 OK ``` + * Some of the features of marathon-lb assume that it is the only instance of itself running in a PID namespace. i.e. marathon-lb assumes that it is running in a container. Certain features like the `/_mlb_signal` endpoints and the `/_haproxy_getpids` endpoint (and by extension, zero-downtime deployments) may behave unexpectedly if more than one instance of marathon-lb is running in the same PID namespace or if there are other HAProxy processes in the same PID namespace. ## Zero-downtime Deployments @@ -352,4 +356,4 @@ PRs are welcome, but here are a few general guidelines: ``` bash /path/to/marathon-lb/scripts/install-git-hooks.sh - ``` \ No newline at end of file + ``` diff --git a/config.py b/config.py index 76e9cbfb..d4624d5a 100644 --- a/config.py +++ b/config.py @@ -60,6 +60,7 @@ def load(self): lua-load /marathon-lb/getpids.lua lua-load /marathon-lb/getconfig.lua lua-load /marathon-lb/getmaps.lua + lua-load /marathon-lb/signalmlb.lua defaults load-server-state-from-file global log global @@ -91,6 +92,11 @@ def load(self): http-request use-service lua.getappmap if getappmap acl getconfig path /_haproxy_getconfig http-request use-service lua.getconfig if getconfig + + acl signalmlbhup path /_mlb_signal/hup + http-request use-service lua.signalmlbhup if signalmlbhup + acl signalmlbusr1 path /_mlb_signal/usr1 + http-request use-service lua.signalmlbusr1 if signalmlbusr1 ''', overridable=False, description='''\ diff --git a/marathon_lb.py b/marathon_lb.py index 1e724bb1..68a3d562 100755 --- a/marathon_lb.py +++ b/marathon_lb.py @@ -28,6 +28,7 @@ import random import re import shlex +import signal import stat import subprocess import sys @@ -1055,28 +1056,14 @@ def writeConfigAndValidate( # Change the file paths in the config to (temporarily) point to the # temporary map files so those can also be checked when the config is # validated - if not args.skip_validation: - temp_config = config.replace( - domain_map_file, domain_temp_map_file - ).replace(app_map_file, app_temp_map_file) + temp_config = config.replace( + domain_map_file, domain_temp_map_file + ).replace(app_map_file, app_temp_map_file) # Write the new config to a temporary file haproxyTempConfigFile = writeReplacementTempFile(temp_config, config_file) - # If skip validation flag is provided, don't check. - if args.skip_validation: - logger.debug("skipping validation.") - if haproxy_map: - moveTempFile(domain_temp_map_file, domain_map_file) - moveTempFile(app_temp_map_file, app_map_file) - moveTempFile(haproxyTempConfigFile, config_file) - return True - - # Check that config is valid - cmd = ['haproxy', '-f', haproxyTempConfigFile, '-c'] - logger.debug("checking config with command: " + str(cmd)) - returncode = subprocess.call(args=cmd) - if returncode == 0: + if validateConfig(haproxyTempConfigFile): # Move into place if haproxy_map: moveTempFile(domain_temp_map_file, domain_map_file) @@ -1085,11 +1072,13 @@ def writeConfigAndValidate( # Edit the config file again to point to the actual map paths with open(haproxyTempConfigFile, 'w') as tempConfig: tempConfig.write(config) + else: + truncateMapFileIfExists(domain_map_file) + truncateMapFileIfExists(app_map_file) moveTempFile(haproxyTempConfigFile, config_file) return True else: - logger.error("haproxy returned non-zero when checking config") return False @@ -1113,12 +1102,38 @@ def writeReplacementTempFile(content, file_to_replace): return tempFile +def validateConfig(haproxy_config_file): + # If skip validation flag is provided, don't check. + if args.skip_validation: + logger.debug("skipping validation.") + return True + + # Check that config is valid + cmd = ['haproxy', '-f', haproxy_config_file, '-c'] + logger.debug("checking config with command: " + str(cmd)) + returncode = subprocess.call(args=cmd) + if returncode == 0: + return True + else: + logger.error("haproxy returned non-zero when checking config") + return False + + def moveTempFile(temp_file, dest_file): # Replace the old file with the new from its temporary location logger.debug("moving temp file %s to %s", temp_file, dest_file) move(temp_file, dest_file) +def truncateMapFileIfExists(map_file): + if os.path.isfile(map_file): + logger.debug("Truncating map file as haproxy-map flag " + "is disabled %s", map_file) + fd = os.open(map_file, os.O_RDWR) + os.ftruncate(fd, 0) + os.close(fd) + + def compareWriteAndReloadConfig(config, config_file, domain_map_array, app_map_array, haproxy_map): # See if the last config on disk matches this, and if so don't reload @@ -1153,8 +1168,9 @@ def compareWriteAndReloadConfig(config, config_file, domain_map_array, app_map_string, app_map_file, haproxy_map): reloadConfig() else: - logger.warning("skipping reload: config not valid") - + logger.warning("skipping reload: config/map not valid") + else: + logger.debug("skipping reload: config/map unchanged") else: truncateMapFileIfExists(domain_map_file) truncateMapFileIfExists(app_map_file) @@ -1168,6 +1184,8 @@ def compareWriteAndReloadConfig(config, config_file, domain_map_array, reloadConfig() else: logger.warning("skipping reload: config not valid") + else: + logger.debug("skipping reload: config unchanged") def generateMapString(map_array): @@ -1197,15 +1215,6 @@ def compareMapFile(map_file, map_string): return runningmap != map_string -def truncateMapFileIfExists(map_file): - if os.path.isfile(map_file): - logger.debug("Truncating map file as haproxy-map flag " - "is disabled %s", map_file) - fd = os.open(map_file, os.O_RDWR) - os.ftruncate(fd, 0) - os.close(fd) - - def get_health_check(app, portIndex): for check in app['healthChecks']: if check.get('port'): @@ -1459,8 +1468,9 @@ def __init__(self, marathon, config_file, groups, self.__ssl_certs = ssl_certs self.__condition = threading.Condition() - self.__thread = threading.Thread(target=self.do_reset) + self.__thread = threading.Thread(target=self.try_reset) self.__pending_reset = False + self.__pending_reload = False self.__stop = False self.__haproxy_map = haproxy_map self.__thread.start() @@ -1468,39 +1478,66 @@ def __init__(self, marathon, config_file, groups, # Fetch the base data self.reset_from_tasks() - def do_reset(self): + def try_reset(self): with self.__condition: logger.info('starting event processor thread') while True: self.__condition.acquire() + if self.__stop: logger.info('stopping event processor thread') return - if not self.__pending_reset: + + if not self.__pending_reset and not self.__pending_reload: if not self.__condition.wait(300): logger.info('condition wait expired') + + pending_reset = self.__pending_reset + pending_reload = self.__pending_reload self.__pending_reset = False + self.__pending_reload = False + self.__condition.release() - try: - start_time = time.time() - - self.__apps = get_apps(self.__marathon) - regenerate_config(self.__apps, - self.__config_file, - self.__groups, - self.__bind_http_https, - self.__ssl_certs, - self.__templater, - self.__haproxy_map) - - logger.debug("updating tasks finished, took %s seconds", - time.time() - start_time) - except requests.exceptions.ConnectionError as e: - logger.error("Connection error({0}): {1}".format( - e.errno, e.strerror)) - except: - logger.exception("Unexpected error!") + # Reset takes precedence over reload + if pending_reset: + self.do_reset() + elif pending_reload: + self.do_reload() + else: + # Timed out waiting on the condition variable, just do a + # full reset for good measure (as was done before). + self.do_reset() + + def do_reset(self): + try: + start_time = time.time() + + self.__apps = get_apps(self.__marathon) + regenerate_config(self.__apps, + self.__config_file, + self.__groups, + self.__bind_http_https, + self.__ssl_certs, + self.__templater, + self.__haproxy_map) + + logger.debug("updating tasks finished, took %s seconds", + time.time() - start_time) + except requests.exceptions.ConnectionError as e: + logger.error("Connection error({0}): {1}".format( + e.errno, e.strerror)) + except: + logger.exception("Unexpected error!") + + def do_reload(self): + try: + # Validate the existing config before reloading + logger.debug("attempting to reload existing config...") + if validateConfig(self.__config_file): + reloadConfig() + except: + logger.exception("Unexpected error!") def stop(self): self.__condition.acquire() @@ -1514,12 +1551,28 @@ def reset_from_tasks(self): self.__condition.notify() self.__condition.release() + def reload_existing_config(self): + self.__condition.acquire() + self.__pending_reload = True + self.__condition.notify() + self.__condition.release() + def handle_event(self, event): if event['eventType'] == 'status_update_event' or \ event['eventType'] == 'health_status_changed_event' or \ event['eventType'] == 'api_post_event': self.reset_from_tasks() + def handle_signal(self, sig, stack): + if sig == signal.SIGHUP: + logger.debug('received signal SIGHUP - reloading config') + self.reset_from_tasks() + elif sig == signal.SIGUSR1: + logger.debug('received signal SIGUSR1 - reloading existing config') + self.reload_existing_config() + else: + logger.warning('received unknown signal %d' % (sig,)) + def get_arg_parser(): parser = argparse.ArgumentParser( @@ -1601,13 +1654,7 @@ def get_arg_parser(): return parser -def run_server(marathon, listen_addr, callback_url, config_file, groups, - bind_http_https, ssl_certs, haproxy_map, marathon_ca_cert): - processor = MarathonEventProcessor(marathon, - config_file, - groups, - bind_http_https, - ssl_certs, haproxy_map) +def run_server(marathon, listen_addr, callback_url, processor): try: marathon.add_subscriber(callback_url) @@ -1635,13 +1682,7 @@ def clear_callbacks(marathon, callback_url): marathon.remove_subscriber(callback_url) -def process_sse_events(marathon, config_file, groups, - bind_http_https, ssl_certs, haproxy_map): - processor = MarathonEventProcessor(marathon, - config_file, - groups, - bind_http_https, - ssl_certs, haproxy_map) +def process_sse_events(marathon, processor): try: events = marathon.get_event_stream() for event in events: @@ -1726,6 +1767,18 @@ def process_sse_events(marathon, config_file, groups, get_marathon_auth_params(args), args.marathon_ca_cert) + # If we're going to be handling events, set up the event processor and + # hook it up to the process signals. + if args.listening or args.sse: + processor = MarathonEventProcessor(marathon, + args.haproxy_config, + args.group, + not args.dont_bind_http_https, + args.ssl_certs, + args.haproxy_map) + signal.signal(signal.SIGHUP, processor.handle_signal) + signal.signal(signal.SIGUSR1, processor.handle_signal) + # If in listening mode, spawn a webserver waiting for events. Otherwise # just write the config. if args.listening: @@ -1733,10 +1786,7 @@ def process_sse_events(marathon, config_file, groups, "and will be removed in future releases") callback_url = args.callback_url or args.listening try: - run_server(marathon, args.listening, callback_url, - args.haproxy_config, args.group, - not args.dont_bind_http_https, args.ssl_certs, - args.haproxy_map, args.marathon_ca_cert) + run_server(marathon, args.listening, callback_url, processor) finally: clear_callbacks(marathon, callback_url) elif args.sse: @@ -1744,12 +1794,7 @@ def process_sse_events(marathon, config_file, groups, while True: stream_started = time.time() try: - process_sse_events(marathon, - args.haproxy_config, - args.group, - not args.dont_bind_http_https, - args.ssl_certs, - args.haproxy_map) + process_sse_events(marathon, processor) except: logger.exception("Caught exception") backoff = backoff * 1.5 diff --git a/signalmlb.lua b/signalmlb.lua new file mode 100644 index 00000000..0e0db641 --- /dev/null +++ b/signalmlb.lua @@ -0,0 +1,40 @@ +-- A simple Lua module for HAProxy that sends signals to the marathon-lb process + +function run(cmd) + local file = io.popen(cmd) + local output = file:read('*a') + local success, _, code = file:close() + return output, success, code +end + +function send_response(applet, code, response) + applet:set_status(code) + applet:add_header("content-length", string.len(response)) + applet:add_header("content-type", "text/plain") + applet:start_response() + applet:send(response) +end + +core.register_service("signalmlbhup", "http", function(applet) + local _, success, code = run("pkill -HUP -f '^python.*marathon_lb.py'") + if not success then + send_response(applet, 500, string.format( + "Failed to send SIGHUP signal to marathon-lb (exit code %d). Is \z + marathon-lb running in 'poll' mode?", code)) + return + end + + send_response(applet, 200, "Sent SIGHUP signal to marathon-lb") +end) + +core.register_service("signalmlbusr1", "http", function(applet) + local _, success, code = run("pkill -USR1 -f '^python.*marathon_lb.py'") + if not success then + send_response(applet, 500, string.format( + "Failed to send SIGUSR1 signal to marathon-lb (exit code %d). Is \z + marathon-lb running in 'poll' mode?", code)) + return + end + + send_response(applet, 200, "Sent SIGUSR1 signal to marathon-lb") +end) diff --git a/tests/test_marathon_lb.py b/tests/test_marathon_lb.py index 6abd0cbf..8f47f275 100644 --- a/tests/test_marathon_lb.py +++ b/tests/test_marathon_lb.py @@ -44,6 +44,7 @@ def setUp(self): lua-load /marathon-lb/getpids.lua lua-load /marathon-lb/getconfig.lua lua-load /marathon-lb/getmaps.lua + lua-load /marathon-lb/signalmlb.lua defaults load-server-state-from-file global log global @@ -75,6 +76,11 @@ def setUp(self): http-request use-service lua.getappmap if getappmap acl getconfig path /_haproxy_getconfig http-request use-service lua.getconfig if getconfig + + acl signalmlbhup path /_mlb_signal/hup + http-request use-service lua.signalmlbhup if signalmlbhup + acl signalmlbusr1 path /_mlb_signal/usr1 + http-request use-service lua.signalmlbusr1 if signalmlbusr1 ''' def test_config_no_apps(self):