Skip to content

Commit

Permalink
Gracefully restart dnsmasq to not break tcp DNS
Browse files Browse the repository at this point in the history
When talking to dnsmasq using DNS over tcp dnsmasq will fork out for
TCP connections. Forked processes will stay until all connections have
been closed, meaning that dangling connections will keep the processes
and with that will also keep the tcp/53 port in listening state. On
dnsmasq restart (e.g. on network update, subnet create, ...) the parent
process is killed with SIGKILL and a new process is started. This new
process cannot listen on tcp/53, as it is still in use by the old child
with the dangling connection.

To prevent dangling dnsmasq connections on tcp we need to properly
shutdown the child. This is done by first sending SIGTERM and only send
a SIGKILL if the process is not shutting down properly. With that we
get proper cleanup of all children and tcp will come up after a restart.

Change-Id: Ie633148c512f5124e978648c50a4c6318c61baa8
Closes-bug: #1998621
  • Loading branch information
sebageek committed Dec 6, 2022
1 parent 16399a2 commit 74224e7
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 2 deletions.
15 changes: 13 additions & 2 deletions neutron/agent/linux/dhcp.py
Expand Up @@ -21,6 +21,7 @@
import os
import re
import shutil
import signal
import time

import netaddr
Expand All @@ -45,6 +46,7 @@
from neutron.privileged.agent.linux import dhcp as priv_dhcp

LOG = logging.getLogger(__name__)
SIGTERM_TIMEOUT = 5

DNS_PORT = 53
WIN2k3_STATIC_DNS = 249
Expand Down Expand Up @@ -349,9 +351,18 @@ def _get_process_manager(self, cmd_callback=None):
def disable(self, retain_port=False, block=False):
"""Disable DHCP for this network by killing the local process."""
self.process_monitor.unregister(self.network.id, DNSMASQ_SERVICE_NAME)
self._get_process_manager().disable()
pm = self._get_process_manager()
pm.disable(sig=str(int(signal.SIGTERM)))
if block:
common_utils.wait_until_true(lambda: not self.active)
try:
common_utils.wait_until_true(lambda: not self.active,
timeout=SIGTERM_TIMEOUT)
except common_utils.WaitTimeout:
LOG.warning('dnsmasq process %s did not finish after SIGTERM '
'signal in %s seconds, sending SIGKILL signal',
pm.pid, SIGTERM_TIMEOUT)
pm.disable(sig=str(int(signal.SIGKILL)))
common_utils.wait_until_true(lambda: not self.active)
self._del_running_interface(self.interface_name)
if not retain_port:
self._destroy_namespace_and_port()
Expand Down
32 changes: 32 additions & 0 deletions neutron/tests/unit/agent/linux/test_dhcp.py
Expand Up @@ -15,6 +15,7 @@

import copy
import os
import signal
from unittest import mock

import netaddr
Expand All @@ -32,6 +33,7 @@
from neutron.agent.linux import dhcp
from neutron.agent.linux import ip_lib
from neutron.cmd import runtime_checks as checks
from neutron.common import utils as common_utils
from neutron.conf.agent import common as config
from neutron.conf.agent import dhcp as dhcp_config
from neutron.conf import common as base_config
Expand Down Expand Up @@ -1272,6 +1274,36 @@ def test_disable_config_dir_removed_after_destroy(self):
parent.assert_has_calls(expected)
delete_ns.assert_called_with('qdhcp-ns')

@mock.patch.object(common_utils, 'wait_until_true')
def test_disable_blocking(self, mock_wait_until):
lp = LocalChild(self.conf, FakeDualNetwork())
mock_pm = mock.Mock()
with mock.patch('neutron.agent.linux.ip_lib.'
'delete_network_namespace'), \
mock.patch.object(dhcp.DhcpLocalProcess,
'_get_process_manager',
return_value=mock_pm):
lp.disable(block=True)
self.assertEqual(1, mock_wait_until.call_count)
mock_pm.disable.assert_called_once_with(sig=str(int(signal.SIGTERM)))

@mock.patch.object(common_utils, 'wait_until_true')
def test_disable_blocking_sigterm_sigkill(self, mock_wait_until):
mock_wait_until.side_effect = [common_utils.WaitTimeout, None]

lp = LocalChild(self.conf, FakeDualNetwork())
mock_pm = mock.Mock()
with mock.patch('neutron.agent.linux.ip_lib.'
'delete_network_namespace'), \
mock.patch.object(dhcp.DhcpLocalProcess,
'_get_process_manager',
return_value=mock_pm):
lp.disable(block=True)
self.assertEqual(2, mock_wait_until.call_count)
mock_pm.disable.assert_has_calls([
mock.call(sig=str(int(signal.SIGTERM))),
mock.call(sig=str(int(signal.SIGKILL)))])

def test_get_interface_name(self):
net = FakeDualNetwork()
path = '/dhcp/%s/interface' % net.id
Expand Down

0 comments on commit 74224e7

Please sign in to comment.