CNI: Don't wait for missing pods on DEL
We've seen some issues related to the fact that on CNI DEL we wait for
the pod annotation to appear in the CNI registry:

1. An overloaded kuryr-daemon HTTP server: many stale DEL requests
   from the CRI exhaust the pool of free connections, causing CNI to
   answer requests extremely slowly.
2. Some CRIs will not delete the network namespace before getting a
   successful CNI DEL. If we then assign the same IP (or VLAN) to the
   next pod, we might end up with an IP or VLAN conflict and a
   NetlinkError.

This commit makes sure we only wait for VIFs for up to 5 seconds on DEL
requests, which is enough time for the watcher to populate the registry
after a restart. Also, some code got moved around to make implementing
the above simpler, and some logs got clarified to make debugging of
such issues easier later on.

Change-Id: I9221b6bc9166597837a4b53382862aa6c6f3e94c
Closes-Bug: 1882083
Related-Bug: 1854928
dulek authored and MaysaMacedo committed Jun 19, 2020
1 parent 968713c commit 586d639
Showing 3 changed files with 41 additions and 42 deletions.
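
For orientation before the diffs: the bounded wait boils down to retrying a registry lookup with a hard deadline. Below is a minimal sketch of that pattern using the same `retrying` library the daemon uses; `registry` and `wait_for_vifs` are illustrative names, not the actual kuryr code.

import retrying

RETRY_DELAY = 1000  # ms between tries, matching the daemon's constant
DEL_TIMEOUT = 5     # s; the short cap this commit applies on DEL


def wait_for_vifs(registry, pod_name, timeout):
    """Poll the registry for up to `timeout` seconds, then give up."""

    @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                    retry_on_exception=lambda e: isinstance(e, KeyError))
    def find():
        # A KeyError triggers another try until the deadline passes,
        # after which it propagates to the caller.
        return registry[pod_name]['vifs']

    return find()

ADD keeps waiting for the full vif_annotation_timeout; DEL now passes 5 seconds, so stale DEL requests can no longer hold daemon connections open for long.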
17 changes: 6 additions & 11 deletions kuryr_kubernetes/cni/binding/nested.py
@@ -79,17 +79,12 @@ def connect(self, vif, ifname, netns, container_id):
             # NOTE(dulek): This is related to bug 1854928. It's super-rare,
             #              so aim of this piece is to gather any info useful
             #              for determining when it happens.
-            LOG.exception('Creation of pod interface failed, most likely '
-                          'due to duplicated VLAN id. This will probably '
-                          'cause kuryr-daemon to crashloop. Trying to '
-                          'gather debugging information.')
-
-            with b_base.get_ipdb() as h_ipdb:
-                LOG.error('List of host interfaces: %s', h_ipdb.interfaces)
-
-            with b_base.get_ipdb(netns) as c_ipdb:
-                LOG.error('List of pod namespace interfaces: %s',
-                          c_ipdb.interfaces)
+            LOG.exception(f'Creation of pod interface failed due to VLAN '
+                          f'ID (vlan_info={args}) conflict. Probably the '
+                          f'CRI had not cleaned up the network namespace '
+                          f'of deleted pods. This should not be a '
+                          f'permanent issue but may cause restart of '
+                          f'kuryr-cni pod.')
             raise
 
         with b_base.get_ipdb(netns) as c_ipdb:
4 changes: 2 additions & 2 deletions kuryr_kubernetes/cni/daemon/service.py
@@ -103,10 +103,10 @@ def delete(self):
             self.plugin.delete(params)
         except exceptions.ResourceNotReady:
             # NOTE(dulek): It's better to ignore this error - most of the time
-            #              it will happen when pod is long gone and kubelet
+            #              it will happen when pod is long gone and CRI
             #              overzealously tries to delete it from the network.
             #              We cannot really do anything without VIF annotation,
-            #              so let's just tell kubelet to move along.
+            #              so let's just tell CRI to move along.
             LOG.warning('Error when processing delNetwork request. '
                         'Ignoring this error, pod is most likely gone')
             return '', httplib.NO_CONTENT, self.headers
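
The comment change above tracks the actual behavior: a DEL for a pod that is already gone must still succeed, because CNI DEL is expected to be idempotent. As a hedged sketch (not the daemon's real server code; ResourceNotReady here is a stand-in for kuryr's exception class):

import http.client as httplib


class ResourceNotReady(Exception):
    """Stand-in for kuryr_kubernetes.exceptions.ResourceNotReady."""


def del_network(plugin, params, headers):
    try:
        plugin.delete(params)
    except ResourceNotReady:
        # Pod and its VIF annotation are long gone; nothing to unplug.
        # Report success so the CRI stops retrying the DEL.
        pass
    return '', httplib.NO_CONTENT, headers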
62 changes: 33 additions & 29 deletions kuryr_kubernetes/cni/plugins/k8s_cni_registry.py
@@ -51,9 +51,28 @@ def _get_pod_name(self, params):
             'name': params.args.K8S_POD_NAME}
 
     def add(self, params):
-        vifs = self._do_work(params, b_base.connect, confirm=True)
-
         pod_name = self._get_pod_name(params)
+        timeout = CONF.cni_daemon.vif_annotation_timeout
+
+        # Try to confirm if pod in the registry is not stale cache. If it is,
+        # remove it.
+        with lockutils.lock(pod_name, external=True):
+            if pod_name in self.registry:
+                cached_pod = self.registry[pod_name]['pod']
+                try:
+                    pod = self.k8s.get(cached_pod['metadata']['selfLink'])
+                except Exception:
+                    LOG.exception('Error when getting pod %s', pod_name)
+                    raise exceptions.ResourceNotReady(pod_name)
+
+                if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
+                    LOG.warning('Stale pod %s detected in cache. (API '
+                                'uid=%s, cached uid=%s). Removing it from '
+                                'cache.', pod_name, pod['metadata']['uid'],
+                                cached_pod['metadata']['uid'])
+                    del self.registry[pod_name]
+
+        vifs = self._do_work(params, b_base.connect, timeout)
 
         # NOTE(dulek): Saving containerid to be able to distinguish old DEL
         #              requests that we should ignore. We need a lock to
@@ -66,9 +85,6 @@ def add(self, params):
         LOG.debug('Saved containerid = %s for pod %s',
                   params.CNI_CONTAINERID, pod_name)
 
-        # Wait for VIFs to become active.
-        timeout = CONF.cni_daemon.vif_annotation_timeout
-
         # Wait for timeout sec, 1 sec between tries, retry when even one
         # vif is not active.
         @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
@@ -96,12 +112,20 @@ def delete(self, params):
                 # NOTE(dulek): This is a DEL request for some older (probably
                 #              failed) ADD call. We should ignore it or we'll
                 #              unplug a running pod.
-                LOG.warning('Received DEL request for unknown ADD call. '
-                            'Ignoring.')
+                LOG.warning('Received DEL request for unknown ADD call for '
+                            'pod %s (CNI_CONTAINERID=%s). Ignoring.', pod_name,
+                            params.CNI_CONTAINERID)
                 return
         except KeyError:
             pass
-        self._do_work(params, b_base.disconnect)
 
+        # Passing arbitrary 5 seconds as timeout, as it does not make any sense
+        # to wait on CNI DEL. If pod got deleted from API - VIF info is gone.
+        # If pod got the annotation removed - it is now gone too. The number's
+        # not 0, because we need to anticipate for restarts and delay before
+        # registry is populated by watcher.
+        self._do_work(params, b_base.disconnect, 5)
+
         # NOTE(ndesh): We need to lock here to avoid race condition
         #              with the deletion code in the watcher to ensure that
         #              we delete the registry entry exactly once
@@ -126,29 +150,9 @@ def report_drivers_health(self, driver_healthy):
LOG.debug("Reporting CNI driver not healthy.")
self.healthy.value = driver_healthy

def _do_work(self, params, fn, confirm=False):
def _do_work(self, params, fn, timeout):
pod_name = self._get_pod_name(params)

timeout = CONF.cni_daemon.vif_annotation_timeout

if confirm:
# Try to confirm if pod in the registry is not stale cache.
with lockutils.lock(pod_name, external=True):
if pod_name in self.registry:
cached_pod = self.registry[pod_name]['pod']
try:
pod = self.k8s.get(cached_pod['metadata']['selfLink'])
except Exception:
LOG.exception('Error when getting pod %s', pod_name)
raise exceptions.ResourceNotReady(pod_name)

if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
LOG.warning('Stale pod %s detected in cache. (API '
'uid=%s, cached uid=%s). Removing it from '
'cache.', pod_name, pod['metadata']['uid'],
cached_pod['metadata']['uid'])
del self.registry[pod_name]

# In case of KeyError retry for `timeout` s, wait 1 s between tries.
@retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
retry_on_exception=lambda e: isinstance(e, KeyError))
Expand Down
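
For reference, the stale-cache eviction that add() now performs up front reduces to the sketch below; `k8s_get` is an assumed stand-in for the Kubernetes API client, not the real kuryr helper.

def drop_if_stale(registry, pod_name, k8s_get):
    """Evict a cached pod whose uid no longer matches the live object."""
    if pod_name not in registry:
        return
    cached = registry[pod_name]['pod']
    # k8s_get is a hypothetical lookup; it raises if the pod is gone.
    live = k8s_get(cached['metadata']['selfLink'])
    if live['metadata']['uid'] != cached['metadata']['uid']:
        # Same name, different uid: the pod was deleted and recreated,
        # so the cached entry describes an object that no longer exists.
        del registry[pod_name]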
