Skip to content

Commit

Permalink
Merge pull request #2061 from ricky-rav/OCPBUGS-20336_414
Browse files Browse the repository at this point in the history
OCPBUGS-29231:[release-4.14] Separate timeout for handler sync from informer sync & do not resync services during node tracker startup
  • Loading branch information
openshift-merge-bot[bot] committed Feb 14, 2024
2 parents 243fa48 + b84bbbc commit 87016e7
Show file tree
Hide file tree
Showing 11 changed files with 84 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -168,20 +168,20 @@ func (c *Controller) Start(threadiness int) error {
defer utilruntime.HandleCrash()

klog.Infof("Starting Egress Services Controller")
if !util.WaitForNamedCacheSyncWithTimeout("egressservices", c.stopCh, c.egressServiceSynced) {
return fmt.Errorf("timed out waiting for egressservice caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices", c.stopCh, c.egressServiceSynced) {
return fmt.Errorf("timed out waiting for egress service caches to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_services", c.stopCh, c.servicesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_services", c.stopCh, c.servicesSynced) {
return fmt.Errorf("timed out waiting for service caches (for egress services) to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_endpointslices", c.stopCh, c.endpointSlicesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_endpointslices", c.stopCh, c.endpointSlicesSynced) {
return fmt.Errorf("timed out waiting for endpoint slice caches (for egress services) to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_nodes", c.stopCh, c.nodesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_nodes", c.stopCh, c.nodesSynced) {
return fmt.Errorf("timed out waiting for node caches (for egress services) to sync")
}

klog.Infof("Repairing Egress Services")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ func (nadController *NetAttachDefinitionController) Start() error {

func (nadController *NetAttachDefinitionController) start() error {
nadController.nadFactory.Start(nadController.stopChan)
if !util.WaitForNamedCacheSyncWithTimeout(nadController.name, nadController.stopChan, nadController.netAttachDefSynced) {
return fmt.Errorf("stop requested while syncing caches")
if !util.WaitForInformerCacheSyncWithTimeout(nadController.name, nadController.stopChan, nadController.netAttachDefSynced) {
return fmt.Errorf("stop requested while syncing %s caches", nadController.name)
}

err := nadController.SyncNetworkControllers()
Expand Down
2 changes: 1 addition & 1 deletion go-controller/pkg/node/controllers/egressip/egressip.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ func (c *Controller) Run(stopCh <-chan struct{}, wg *sync.WaitGroup, threads int
syncWg.Add(1)
go func(resourceName string, syncFn cache.InformerSynced) {
defer syncWg.Done()
if !util.WaitForNamedCacheSyncWithTimeout(resourceName, stopCh, syncFn) {
if !util.WaitForInformerCacheSyncWithTimeout(resourceName, stopCh, syncFn) {
syncErrs = append(syncErrs, fmt.Errorf("timed out waiting for %q caches to sync", resourceName))
}
}(se.resourceName, se.syncFn)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func runController(testNS ns.NetNS, c *Controller) (cleanupFn, error) {
{"eippod", c.podInformer.HasSynced},
} {
func(resourceName string, syncFn cache.InformerSynced) {
if !util.WaitForNamedCacheSyncWithTimeout(resourceName, stopCh, syncFn) {
if !util.WaitForInformerCacheSyncWithTimeout(resourceName, stopCh, syncFn) {
gomega.PanicWith(fmt.Sprintf("timed out waiting for %q caches to sync", resourceName))
}
}(se.resourceName, se.syncFn)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,16 @@ func (c *Controller) Run(wg *sync.WaitGroup, threadiness int) error {

klog.Infof("Starting Egress Services Controller")

if !util.WaitForNamedCacheSyncWithTimeout("egressservices", c.stopCh, c.egressServiceSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices", c.stopCh, c.egressServiceSynced) {
return fmt.Errorf("timed out waiting for egress service caches to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_services", c.stopCh, c.servicesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_services", c.stopCh, c.servicesSynced) {
return fmt.Errorf("timed out waiting for service caches (for egress services) to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_endpointslices", c.stopCh, c.endpointSlicesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_endpointslices", c.stopCh, c.endpointSlicesSynced) {
return fmt.Errorf("timed out waiting for endpoint slice caches (for egress services) to sync")
}

klog.Infof("Repairing Egress Services")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,8 @@ func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) {

// Wait for the caches to be synced
klog.Info("Waiting for informer caches to sync")
if !util.WaitForNamedCacheSyncWithTimeout(c.controllerName, stopCh, c.anpCacheSynced, c.banpCacheSynced, c.anpNamespaceSynced, c.anpPodSynced) {
utilruntime.HandleError(fmt.Errorf("timed out waiting for caches to sync"))
if !util.WaitForInformerCacheSyncWithTimeout(c.controllerName, stopCh, c.anpCacheSynced, c.banpCacheSynced, c.anpNamespaceSynced, c.anpPodSynced) {
utilruntime.HandleError(fmt.Errorf("timed out waiting for admin network policy caches to sync"))
klog.Errorf("Error syncing caches for admin network policy and baseline admin network policy")
return
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,20 +204,20 @@ func (c *Controller) Run(wg *sync.WaitGroup, threadiness int) error {

klog.Infof("Starting Egress Services Controller")

if !util.WaitForNamedCacheSyncWithTimeout("egressservices", c.stopCh, c.egressServiceSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices", c.stopCh, c.egressServiceSynced) {
return fmt.Errorf("timed out waiting for egress service caches to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_services", c.stopCh, c.servicesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_services", c.stopCh, c.servicesSynced) {
return fmt.Errorf("timed out waiting for service caches (for egress services) to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_endpointslices", c.stopCh, c.endpointSlicesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_endpointslices", c.stopCh, c.endpointSlicesSynced) {
return fmt.Errorf("timed out waiting for endpoint slice caches (for egress services) to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressservices_nodes", c.stopCh, c.nodesSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressservices_nodes", c.stopCh, c.nodesSynced) {
return fmt.Errorf("timed out waiting for node caches (for egress services) to sync")
}

klog.Infof("Repairing Egress Services")
Expand Down
49 changes: 33 additions & 16 deletions go-controller/pkg/ovn/controller/services/services_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ type Controller struct {
// nodeTracker
nodeTracker *nodeTracker

// startupDone is false up until the node tracker, service and endpointslice initial sync
// in Run() is completed
startupDone bool
startupDoneLock sync.RWMutex

// Per node information and template variables. The latter expand to each
// chassis' node IP (v4 and v6).
// Must be accessed only with the nodeInfo mutex taken.
Expand Down Expand Up @@ -167,10 +172,13 @@ func (c *Controller) Run(workers int, stopCh <-chan struct{}, runRepair, useLBGr
if err != nil {
return err
}
// We need node cache to be synced first, as we rely on it to properly reprogram initial per node load balancers
klog.Info("Waiting for node tracker caches to sync")
if !util.WaitForNamedCacheSyncWithTimeout(nodeControllerName, stopCh, nodeHandler.HasSynced) {
return fmt.Errorf("error syncing cache")
// We need the node tracker to be synced first, as we rely on it to properly reprogram initial per node load balancers
klog.Info("Waiting for node tracker handler to sync")
c.startupDoneLock.Lock()
c.startupDone = false
c.startupDoneLock.Unlock()
if !util.WaitForHandlerSyncWithTimeout(nodeControllerName, stopCh, types.HandlerSyncTimeout, nodeHandler.HasSynced) {
return fmt.Errorf("error syncing node tracker handler")
}

klog.Info("Setting up event handlers for services")
Expand All @@ -193,10 +201,9 @@ func (c *Controller) Run(workers int, stopCh <-chan struct{}, runRepair, useLBGr
return err
}

// Wait for the caches to be synced
klog.Info("Waiting for service and endpoint caches to sync")
if !util.WaitForNamedCacheSyncWithTimeout(controllerName, stopCh, svcHandler.HasSynced, endpointHandler.HasSynced) {
return fmt.Errorf("error syncing cache")
klog.Info("Waiting for service and endpoint handlers to sync")
if !util.WaitForHandlerSyncWithTimeout(controllerName, stopCh, types.HandlerSyncTimeout, svcHandler.HasSynced, endpointHandler.HasSynced) {
return fmt.Errorf("error syncing service and endpoint handlers")
}

if runRepair {
Expand All @@ -210,6 +217,10 @@ func (c *Controller) Run(workers int, stopCh <-chan struct{}, runRepair, useLBGr
return fmt.Errorf("error initializing alreadyApplied cache: %w", err)
}

c.startupDoneLock.Lock()
c.startupDone = true
c.startupDoneLock.Unlock()

// Start the workers after the repair loop to avoid races
klog.Info("Starting workers")
for i := 0; i < workers; i++ {
Expand Down Expand Up @@ -512,15 +523,21 @@ func (c *Controller) RequestFullSync(nodeInfos []nodeInfo) {
// Resync node infos and node IP templates.
c.syncNodeInfos(nodeInfos)

// Resync services.
services, err := c.serviceLister.List(labels.Everything())
if err != nil {
klog.Errorf("Cached lister failed!? %v", err)
return
}
// Resync all services unless we're processing the initial node tracker sync (in which case
// the service add will happen at the next step in the services controller Run() and workers
// aren't up yet anyway: no need to do it during node tracker startup then)
c.startupDoneLock.RLock()
defer c.startupDoneLock.RUnlock()
if c.startupDone {
services, err := c.serviceLister.List(labels.Everything())
if err != nil {
klog.Errorf("Cached lister failed!? %v", err)
return
}

for _, service := range services {
c.onServiceAdd(service)
for _, service := range services {
c.onServiceAdd(service)
}
}
}

Expand Down
12 changes: 6 additions & 6 deletions go-controller/pkg/ovn/egressqos.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,16 +225,16 @@ func (oc *DefaultNetworkController) runEgressQoSController(wg *sync.WaitGroup, t

klog.Infof("Starting EgressQoS Controller")

if !util.WaitForNamedCacheSyncWithTimeout("egressqosnodes", stopCh, oc.egressQoSNodeSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressqosnodes", stopCh, oc.egressQoSNodeSynced) {
return fmt.Errorf("timed out waiting for egress QoS node caches to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressqospods", stopCh, oc.egressQoSPodSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressqospods", stopCh, oc.egressQoSPodSynced) {
return fmt.Errorf("timed out waiting for egress QoS pods caches to sync")
}

if !util.WaitForNamedCacheSyncWithTimeout("egressqos", stopCh, oc.egressQoSSynced) {
return fmt.Errorf("timed out waiting for caches to sync")
if !util.WaitForInformerCacheSyncWithTimeout("egressqos", stopCh, oc.egressQoSSynced) {
return fmt.Errorf("timed out waiting for egress QoS caches to sync")
}

klog.Infof("Repairing EgressQoSes")
Expand Down
7 changes: 6 additions & 1 deletion go-controller/pkg/types/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,12 @@ const (
// Logical Switch or Router Port
MaxLogicalPortTunnelKey = 32767

// InformerSyncTimeout is used to wait from the initial informer cache sync.
// InformerSyncTimeout is used when waiting for the initial informer cache sync
// (i.e. all existing objects should be listed by the informer).
// It allows ~4 list() retries with the default reflector exponential backoff config
InformerSyncTimeout = 20 * time.Second

// HandlerSyncTimeout is used when waiting for initial object handler sync.
// (i.e. all the ADD events should be processed for the existing objects by the event handler)
HandlerSyncTimeout = 20 * time.Second
)
12 changes: 11 additions & 1 deletion go-controller/pkg/util/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@ func GetChildStopChanWithTimeout(parentStopChan <-chan struct{}, duration time.D
return childStopChan
}

func WaitForNamedCacheSyncWithTimeout(controllerName string, stopCh <-chan struct{}, cacheSyncs ...cache.InformerSynced) bool {
// WaitForInformerCacheSyncWithTimeout waits for the provided informer caches to be populated with all existing objects
// by their respective informer. This corresponds to a LIST operation on the corresponding resource types.
// WaitForInformerCacheSyncWithTimeout times out and returns false if the provided caches haven't all synchronized within types.InformerSyncTimeout
func WaitForInformerCacheSyncWithTimeout(controllerName string, stopCh <-chan struct{}, cacheSyncs ...cache.InformerSynced) bool {
return cache.WaitForNamedCacheSync(controllerName, GetChildStopChanWithTimeout(stopCh, types.InformerSyncTimeout), cacheSyncs...)
}

// WaitForHandlerSyncWithTimeout waits for the provided handlers to do a sync on all existing objects for the resource types they're
// watching. This corresponds to adding all existing objects. If that doesn't happen before the provided timeout,
// WaitForInformerCacheSyncWithTimeout times out and returns false.
func WaitForHandlerSyncWithTimeout(controllerName string, stopCh <-chan struct{}, timeout time.Duration, handlerSyncs ...cache.InformerSynced) bool {
return cache.WaitForNamedCacheSync(controllerName, GetChildStopChanWithTimeout(stopCh, timeout), handlerSyncs...)
}

0 comments on commit 87016e7

Please sign in to comment.