Skip to content

Commit

Permalink
Merge pull request #446 from wking/gracefully-release-leader-lease-4.5
Browse files Browse the repository at this point in the history
Bug 1872906: pkg/start: Release leader lease on graceful shutdown
  • Loading branch information
openshift-merge-robot committed Oct 7, 2020
2 parents 8bfbc20 + 65bcffd commit 2c849e5
Show file tree
Hide file tree
Showing 12 changed files with 298 additions and 181 deletions.
1 change: 1 addition & 0 deletions bootstrap/bootstrap-pod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ spec:
fieldRef:
fieldPath: spec.nodeName
hostNetwork: true
terminationGracePeriodSeconds: 130
volumes:
- name: kubeconfig
hostPath:
Expand Down
7 changes: 5 additions & 2 deletions cmd/start.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package main

import (
"context"

"github.com/spf13/cobra"
"k8s.io/klog"

Expand All @@ -16,11 +18,12 @@ func init() {
Long: "",
Run: func(cmd *cobra.Command, args []string) {
// To help debugging, immediately log version
klog.Infof("%s", version.String)
klog.Info(version.String)

if err := opts.Run(); err != nil {
if err := opts.Run(context.Background()); err != nil {
klog.Fatalf("error: %v", err)
}
klog.Infof("Graceful shutdown complete for %s.", version.String)
},
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ spec:
nodeSelector:
node-role.kubernetes.io/master: ""
priorityClassName: "system-cluster-critical"
terminationGracePeriodSeconds: 130
tolerations:
- key: "node-role.kubernetes.io/master"
operator: Exists
Expand Down
6 changes: 3 additions & 3 deletions lib/resourceread/apiext_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ func TestReadCustomResourceDefinitionOrDie(t *testing.T) {
args args
}{
{
name:"v1",
name: "v1",
args: args{
objBytes: []byte(`
apiVersion: apiextensions.k8s.io/v1
Expand Down Expand Up @@ -42,7 +42,7 @@ spec:
},
},
{
name:"v1beta1",
name: "v1beta1",
args: args{
objBytes: []byte(`
apiVersion: apiextensions.k8s.io/v1beta1
Expand Down Expand Up @@ -82,4 +82,4 @@ spec:
_ = ReadCustomResourceDefinitionOrDie(tt.args.objBytes)
})
}
}
}
9 changes: 5 additions & 4 deletions pkg/autoupdate/autoupdate.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (

"github.com/blang/semver"

"k8s.io/klog"
v1 "github.com/openshift/api/config/v1"
clientset "github.com/openshift/client-go/config/clientset/versioned"
"github.com/openshift/client-go/config/clientset/versioned/scheme"
Expand All @@ -23,6 +22,7 @@ import (
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
)

const (
Expand Down Expand Up @@ -87,23 +87,24 @@ func New(
}

// Run runs the autoupdate controller.
func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) {
func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) error {
defer utilruntime.HandleCrash()
defer ctrl.queue.ShutDown()

klog.Info("Starting AutoUpdateController")
defer klog.Info("Shutting down AutoUpdateController")

if !cache.WaitForCacheSync(stopCh, ctrl.cacheSynced...) {
klog.Info("Caches never synchronized")
return
return fmt.Errorf("caches never synchronized")
}

for i := 0; i < workers; i++ {
// FIXME: actually wait until these complete if the Context is canceled. And possibly add utilruntime.HandleCrash.
go wait.Until(ctrl.worker, time.Second, stopCh)
}

<-stopCh
return nil
}

func (ctrl *Controller) eventHandler() cache.ResourceEventHandler {
Expand Down
53 changes: 0 additions & 53 deletions pkg/cvo/availableupdates.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package cvo

import (
"crypto/tls"
"crypto/x509"
"fmt"
"net/url"
"runtime"
Expand All @@ -11,7 +10,6 @@ import (
"github.com/blang/semver"
"github.com/google/uuid"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog"

Expand Down Expand Up @@ -197,54 +195,3 @@ func calculateAvailableUpdatesStatus(clusterID string, proxyURL *url.URL, tlsCon
LastTransitionTime: metav1.Now(),
}
}

// getHTTPSProxyURL fetches the object named "cluster" from the proxy
// lister and returns its Spec.HTTPSProxy value parsed as a URL, along
// with the name held in Spec.TrustedCA.Name.
//
// The URL is nil and the name is empty (with a nil error) when the
// object is not found or when Spec.HTTPSProxy is empty. A non-nil
// error is returned when the lookup fails for any other reason or when
// the proxy value cannot be parsed as a URL.
func (optr *Operator) getHTTPSProxyURL() (*url.URL, string, error) {
	proxy, err := optr.proxyLister.Get("cluster")
	if errors.IsNotFound(err) {
		return nil, "", nil
	}
	if err != nil {
		return nil, "", err
	}

	// Taking the address of proxy.Spec can never yield nil, so only the
	// HTTPSProxy string itself needs to be checked.
	if proxy.Spec.HTTPSProxy == "" {
		return nil, "", nil
	}

	proxyURL, err := url.Parse(proxy.Spec.HTTPSProxy)
	if err != nil {
		return nil, "", err
	}
	return proxyURL, proxy.Spec.TrustedCA.Name, nil
}

// getTLSConfig fetches the object named cmNameRef from cmConfigLister
// and builds a TLS configuration whose root CAs are the system
// certificate pool (or an empty pool when the system pool is
// unavailable) plus the PEM certificates stored under the object's
// "ca-bundle.crt" data key.
//
// It returns (nil, nil) when that key is missing or empty, and a
// non-nil error when the lookup fails or no certificate could be
// parsed from the bundle.
func (optr *Operator) getTLSConfig(cmNameRef string) (*tls.Config, error) {
	cm, err := optr.cmConfigLister.Get(cmNameRef)
	if err != nil {
		return nil, err
	}

	// Nothing to trust beyond the defaults; skip building a pool at all.
	caBundle := cm.Data["ca-bundle.crt"]
	if caBundle == "" {
		return nil, nil
	}

	// The system pool is best-effort: the error is deliberately ignored
	// and an empty pool is used when no system pool is returned.
	certPool, _ := x509.SystemCertPool()
	if certPool == nil {
		certPool = x509.NewCertPool()
	}

	if ok := certPool.AppendCertsFromPEM([]byte(caBundle)); !ok {
		return nil, fmt.Errorf("unable to add ca-bundle.crt certificates")
	}

	return &tls.Config{RootCAs: certPool}, nil
}
14 changes: 4 additions & 10 deletions pkg/cvo/cvo.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,6 @@ func New(
proxyInformer configinformersv1.ProxyInformer,
client clientset.Interface,
kubeClient kubernetes.Interface,
enableMetrics bool,
exclude string,
) *Operator {
eventBroadcaster := record.NewBroadcaster()
Expand Down Expand Up @@ -214,11 +213,6 @@ func New(
// make sure this is initialized after all the listers are initialized
optr.upgradeableChecks = optr.defaultUpgradeableChecks()

if enableMetrics {
if err := optr.registerMetrics(coInformer.Informer()); err != nil {
panic(err)
}
}
return optr
}

Expand Down Expand Up @@ -321,8 +315,7 @@ func loadConfigMapVerifierDataFromUpdate(update *payload.Update, clientBuilder v
}

// Run runs the cluster version operator until ctx is canceled. Workers is ignored for now.
func (optr *Operator) Run(ctx context.Context, workers int) {
defer utilruntime.HandleCrash()
func (optr *Operator) Run(ctx context.Context, workers int) error {
defer optr.queue.ShutDown()
stopCh := ctx.Done()
workerStopCh := make(chan struct{})
Expand All @@ -331,8 +324,7 @@ func (optr *Operator) Run(ctx context.Context, workers int) {
defer klog.Info("Shutting down ClusterVersionOperator")

if !cache.WaitForCacheSync(stopCh, optr.cacheSynced...) {
klog.Info("Caches never synchronized")
return
return fmt.Errorf("caches never synchronized: %w", ctx.Err())
}

// trigger the first cluster version reconcile always
Expand Down Expand Up @@ -361,6 +353,8 @@ func (optr *Operator) Run(ctx context.Context, workers int) {
// stop the queue, then wait for the worker to exit
optr.queue.ShutDown()
<-workerStopCh

return nil
}

func (optr *Operator) queueKey() string {
Expand Down
2 changes: 1 addition & 1 deletion pkg/cvo/cvo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ import (
"k8s.io/apimachinery/pkg/util/diff"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/discovery"
"k8s.io/client-go/rest"
kfake "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/rest"
ktesting "k8s.io/client-go/testing"
"k8s.io/client-go/util/workqueue"
"k8s.io/klog"
Expand Down
61 changes: 61 additions & 0 deletions pkg/cvo/egress.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package cvo

import (
"crypto/tls"
"crypto/x509"
"fmt"
"net/url"

"k8s.io/apimachinery/pkg/api/errors"
)

// getHTTPSProxyURL fetches the object named "cluster" from the proxy
// lister and returns its Spec.HTTPSProxy value parsed as a URL, along
// with the name held in Spec.TrustedCA.Name.
//
// The URL is nil and the name is empty (with a nil error) when the
// object is not found or when Spec.HTTPSProxy is empty. A non-nil
// error is returned when the lookup fails for any other reason or when
// the proxy value cannot be parsed as a URL.
func (optr *Operator) getHTTPSProxyURL() (*url.URL, string, error) {
	proxy, err := optr.proxyLister.Get("cluster")
	if errors.IsNotFound(err) {
		return nil, "", nil
	}
	if err != nil {
		return nil, "", err
	}

	// Taking the address of proxy.Spec can never yield nil, so only the
	// HTTPSProxy string itself needs to be checked.
	if proxy.Spec.HTTPSProxy == "" {
		return nil, "", nil
	}

	proxyURL, err := url.Parse(proxy.Spec.HTTPSProxy)
	if err != nil {
		return nil, "", err
	}
	return proxyURL, proxy.Spec.TrustedCA.Name, nil
}

// getTLSConfig fetches the object named cmNameRef from cmConfigLister
// and builds a TLS configuration whose root CAs are the system
// certificate pool (or an empty pool when the system pool is
// unavailable) plus the PEM certificates stored under the object's
// "ca-bundle.crt" data key.
//
// It returns (nil, nil) when that key is missing or empty, and a
// non-nil error when the lookup fails or no certificate could be
// parsed from the bundle.
func (optr *Operator) getTLSConfig(cmNameRef string) (*tls.Config, error) {
	cm, err := optr.cmConfigLister.Get(cmNameRef)
	if err != nil {
		return nil, err
	}

	// Nothing to trust beyond the defaults; skip building a pool at all.
	caBundle := cm.Data["ca-bundle.crt"]
	if caBundle == "" {
		return nil, nil
	}

	// The system pool is best-effort: the error is deliberately ignored
	// and an empty pool is used when no system pool is returned.
	certPool, _ := x509.SystemCertPool()
	if certPool == nil {
		certPool = x509.NewCertPool()
	}

	if ok := certPool.AppendCertsFromPEM([]byte(caBundle)); !ok {
		return nil, fmt.Errorf("unable to add ca-bundle.crt certificates")
	}

	return &tls.Config{RootCAs: certPool}, nil
}
70 changes: 69 additions & 1 deletion pkg/cvo/metrics.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
package cvo

import (
"context"
"net"
"net/http"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
"github.com/openshift/cluster-version-operator/pkg/internal"
)

func (optr *Operator) registerMetrics(coInformer cache.SharedInformer) error {
// RegisterMetrics initializes metrics and registers them with the
// Prometheus implementation.
func (optr *Operator) RegisterMetrics(coInformer cache.SharedInformer) error {
m := newOperatorMetrics(optr)
coInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
UpdateFunc: m.clusterOperatorChanged,
Expand Down Expand Up @@ -86,6 +93,67 @@ version for 'cluster', or empty for 'initial'.
}
}

// RunMetrics starts an HTTP server on listenAddress that exposes the
// Prometheus handler at /metrics. It keeps serving until
// runContext.Done() fires or the server stops on its own, then asks the
// server to shut down, bounded by shutdownContext. The caller is
// expected to cancel runContext no later than shutdownContext.
//
// The first real failure (listen, serve, or shutdown) is returned;
// any later failure is logged instead. http.ErrServerClosed is treated
// as a normal exit and never reported.
func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress string) error {
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	server := &http.Server{
		Handler: mux,
	}

	// Buffered so the serving goroutine can always deliver its single
	// result and exit, even if nobody is receiving yet.
	serveResult := make(chan error, 1)
	go func() {
		listener, listenErr := net.Listen("tcp", listenAddress)
		if listenErr != nil {
			serveResult <- listenErr
			return
		}

		klog.Infof("Metrics port listening for HTTP on %v", listenAddress)

		serveResult <- server.Serve(listener)
	}()

	var firstErr error
	resultPending := true // the goroutine's result has not been received yet

	// Phase 1: wait for a shutdown request or for the server to stop by itself.
	select {
	case <-runContext.Done(): // clean shutdown
	case serveErr := <-serveResult: // crashed before a shutdown was requested
		resultPending = false
		if serveErr != nil && serveErr != http.ErrServerClosed {
			firstErr = serveErr
		}
	}

	// Phase 2: shut the server down in either case.
	if shutdownErr := server.Shutdown(shutdownContext); shutdownErr != nil {
		if firstErr == nil {
			firstErr = shutdownErr
		} else { // log the error we are discarding
			klog.Errorf("Failed to gracefully shut down metrics server: %s", shutdownErr)
		}
	}

	// Phase 3: collect the serving goroutine's result if phase 1 did not.
	if resultPending {
		if serveErr := <-serveResult; serveErr != nil && serveErr != http.ErrServerClosed {
			if firstErr == nil {
				firstErr = serveErr
			} else { // log the error we are discarding
				klog.Errorf("Failed to gracefully shut down metrics server: %s", serveErr)
			}
		}
	}

	return firstErr
}

type conditionKey struct {
Name string
Type string
Expand Down

0 comments on commit 2c849e5

Please sign in to comment.