Skip to content

Commit

Permalink
pkg/operator: add cluster backup upgrade controller
Browse files Browse the repository at this point in the history
Signed-off-by: Sam Batschelet <sbatsche@redhat.com>
  • Loading branch information
hexfusion committed Aug 30, 2021
1 parent ee6aea0 commit 9f68fc6
Show file tree
Hide file tree
Showing 14 changed files with 1,157 additions and 23 deletions.
76 changes: 76 additions & 0 deletions bindata/etcd/cluster-backup-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Pod "cluster-backup" in the openshift-etcd namespace.
# An init container first runs `cluster-etcd-operator verify backup-storage`;
# the main container then runs /usr/local/bin/cluster-backup.sh from the host.
# NOTE(review): no container `image` and no CLUSTER_BACKUP_PATH env var are set
# in this manifest - presumably both are injected by the operator before the
# pod is created; confirm against the controller that renders this asset.
apiVersion: v1
kind: Pod
metadata:
name: cluster-backup
namespace: openshift-etcd
labels:
app: cluster-backup
spec:
# Init container: gates the backup on the storage verification command.
initContainers:
- name: verify-storage
imagePullPolicy: IfNotPresent
terminationMessagePolicy: FallbackToLogsOnError
command: [ "cluster-etcd-operator", "verify", "backup-storage" ]
securityContext:
privileged: true
resources:
requests:
memory: 50Mi
cpu: 5m
volumeMounts:
- mountPath: /etc/kubernetes/cluster-backup
name: etc-kubernetes-cluster-backup
# Main container: invokes the host's cluster-backup.sh with --force.
containers:
- name: cluster-backup
imagePullPolicy: IfNotPresent
terminationMessagePolicy: FallbackToLogsOnError
command:
- /bin/sh
- -c
- |
#!/bin/sh
set -exuo pipefail
/usr/local/bin/cluster-backup.sh --force ${CLUSTER_BACKUP_PATH}
resources:
requests:
memory: 80Mi
cpu: 10m
securityContext:
privileged: true
volumeMounts:
- mountPath: /usr/local/bin
name: usr-local-bin
- mountPath: /etc/kubernetes/static-pod-resources
name: resources-dir
- mountPath: /etc/kubernetes/static-pod-certs
name: cert-dir
- mountPath: /etc/kubernetes/manifests
name: static-pod-dir
- mountPath: /etc/kubernetes/cluster-backup
name: etc-kubernetes-cluster-backup
priorityClassName: system-node-critical
# Scheduled onto master nodes only; the pod runs once and is never restarted.
nodeSelector:
node-role.kubernetes.io/master: ""
restartPolicy: Never
hostNetwork: true
tolerations:
- operator: "Exists"
# All volumes are hostPath mounts of directories on the node.
volumes:
- hostPath:
path: /usr/local/bin
name: usr-local-bin
- hostPath:
path: /etc/kubernetes/cluster-backup
name: etc-kubernetes-cluster-backup
- hostPath:
path: /etc/kubernetes/manifests
name: static-pod-dir
- hostPath:
path: /etc/kubernetes/static-pod-resources
name: resources-dir
- hostPath:
path: /etc/kubernetes/static-pod-resources/etcd-certs
name: cert-dir

4 changes: 4 additions & 0 deletions bindata/etcd/etcd-common-tools
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ export KUBECONFIG

# download etcdctl from upstream release assets
function dl_etcdctl {
if [ -x "$(command -v etcdctl)" ]; then
echo "etcdctl is already installed"
return
fi
local etcdimg=${ETCD_IMAGE}
local etcdctr=$(podman create ${etcdimg} --authfile=/var/lib/kubelet/config.json)
local etcdmnt=$(podman mount "${etcdctr}")
Expand Down
2 changes: 2 additions & 0 deletions cmd/cluster-etcd-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
goflag "flag"
"fmt"
"github.com/openshift/cluster-etcd-operator/pkg/cmd/verify"
"io/ioutil"
"math/rand"
"os"
Expand Down Expand Up @@ -64,6 +65,7 @@ func NewSSCSCommand() *cobra.Command {
cmd.AddCommand(certsyncpod.NewCertSyncControllerCommand(operator.CertConfigMaps, operator.CertSecrets))
cmd.AddCommand(waitforceo.NewWaitForCeoCommand(os.Stderr))
cmd.AddCommand(monitor.NewMonitorCommand(os.Stderr))
cmd.AddCommand(verify.NewVerifyCommand())

return cmd
}
227 changes: 227 additions & 0 deletions pkg/cmd/verify/backupstorage.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
package verify

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"strings"
	"syscall"
	"time"

	"github.com/openshift/cluster-etcd-operator/pkg/etcdcli"
	"github.com/spf13/cobra"
	"github.com/spf13/pflag"
	"go.etcd.io/etcd/api/v3/etcdserverpb"
	"go.etcd.io/etcd/client/pkg/v3/transport"
	clientv3 "go.etcd.io/etcd/client/v3"
	"golang.org/x/sys/unix"
	"google.golang.org/grpc"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/klog/v2"
)

const (
	// defaultEndpoint is the etcd client URL dialed when no endpoint is
	// supplied. etcd serves client traffic on port 2379.
	defaultEndpoint = "https://127.0.0.1:2379"
	// defaultBackupPath is the directory whose filesystem is checked for free
	// space. It matches the only volume the verify-storage init container
	// mounts in bindata/etcd/cluster-backup-pod.yaml, which invokes the command
	// without an explicit --backup-path.
	defaultBackupPath = "/etc/kubernetes/cluster-backup"
	// Default locations of the etcd client TLS material.
	defaultCertFilePath   = "/var/run/secrets/etcd-client/tls.crt"
	defaultKeyFilePath    = "/var/run/secrets/etcd-client/tls.key"
	defaultCaCertFilePath = "/var/run/configmaps/etcd-ca/ca-bundle.crt"

	// gRPC keepalive and dial settings used for the etcd client.
	keepaliveTime    = 30 * time.Second
	keepaliveTimeout = 10 * time.Second
	dialTimeout      = 20 * time.Second
)

// verifyBackupStorage carries the command line options of the
// "verify backup-storage" subcommand. Every field is populated from a flag
// registered in AddFlags.
type verifyBackupStorage struct {
	// endpoints is the raw value of the --endpoints flag.
	endpoints string
	// backupPath is the directory checked for available storage (--backup-path).
	backupPath string
	// etcd client TLS material: --cert-file, --key-file and --cacert-file.
	clientCertFile, clientKeyFile, clientCACertFile string
}

// NewVerifyBackupStorage returns the "backup-storage" subcommand. It checks the
// local filesystem and compares the available storage bytes with the estimated
// backup size as reported by the etcd endpoint status.
//
// The command terminates the process with a non-zero exit code (via
// klog.Fatal) when verification fails, so that callers such as an init
// container can gate on the result.
func NewVerifyBackupStorage() *cobra.Command {
	opts := &verifyBackupStorage{}
	cmd := &cobra.Command{
		Use:   "backup-storage",
		Short: "performs checks to ensure storage is adequate for backup state",
		Run: func(cmd *cobra.Command, args []string) {
			// Previously the wrapper around Run had an empty body, so the
			// verification never executed and the command always succeeded.
			if err := opts.Run(context.Background()); err != nil {
				klog.Fatal(err)
			}
		},
	}

	opts.AddFlags(cmd.Flags())
	return cmd
}

// shutdownSignals lists the OS signals that Run registers for via
// signal.Notify in order to cancel an in-flight verification.
var shutdownSignals = []os.Signal{os.Interrupt, syscall.SIGTERM}

// AddFlags registers the command line flags of the backup-storage command on
// fs and binds them to the receiver's fields. Defaults come from the
// package-level default* constants; pflag prints each default in the usage
// output, so the help strings do not repeat them.
func (v *verifyBackupStorage) AddFlags(fs *pflag.FlagSet) {
	fs.StringVar(&v.endpoints, "endpoints", defaultEndpoint, "Comma separated list of targets to perform health checks against.")
	fs.StringVar(&v.backupPath, "backup-path", defaultBackupPath, "path to verify storage requirements.")
	fs.StringVar(&v.clientCertFile, "cert-file", defaultCertFilePath, "etcd client certificate file.")
	fs.StringVar(&v.clientKeyFile, "key-file", defaultKeyFilePath, "etcd client key file.")
	fs.StringVar(&v.clientCACertFile, "cacert-file", defaultCaCertFilePath, "etcd client CA certificate file.")
}

// Run executes the storage verification, retrying once per second for up to
// ten seconds while the check reports non-fatal (retryable) failures.
//
// It returns nil when the backup path has adequate free space, and an error
// when the check fails fatally, the retry window elapses, or the context is
// cancelled (including by SIGINT/SIGTERM).
func (v *verifyBackupStorage) Run(ctx context.Context) error {
	defer utilruntime.HandleCrash()
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// handle teardown
	shutdownHandler := make(chan os.Signal, 2)
	signal.Notify(shutdownHandler, shutdownSignals...)
	// Unregister instead of closing the channel: the signal package may still
	// deliver to a registered channel, and a send on a closed channel panics.
	defer signal.Stop(shutdownHandler)
	go func() {
		select {
		case <-shutdownHandler:
			klog.Infof("Received SIGTERM or SIGINT signal, shutting down.")
			cancel()
		case <-ctx.Done():
			klog.Infof("Context has been cancelled, shutting down.")
		}
	}()

	waitDuration := 1 * time.Second
	timeoutDuration := 10 * time.Second

	// Perform reasonable retry on non-fatal errors
	return wait.Poll(
		waitDuration,
		timeoutDuration,
		func() (bool, error) {
			// Stop retrying as soon as shutdown has been requested.
			if err := ctx.Err(); err != nil {
				return false, err
			}
			return v.verifyBackupStorage(ctx)
		})
}

// verifyBackupStorage performs a single verification attempt and is shaped as
// a wait.Poll condition function.
//
// Failures talking to etcd (client creation, member list, status) are logged
// and reported as (false, nil) so the caller retries. Failures inspecting the
// backup path, or insufficient free space, are returned as errors and end the
// poll. Twice the reported database size is required to be available.
func (v *verifyBackupStorage) verifyBackupStorage(ctx context.Context) (bool, error) {
	tlsInfo := transport.TLSInfo{
		CertFile:      v.clientCertFile,
		KeyFile:       v.clientKeyFile,
		TrustedCAFile: v.clientCACertFile,
	}
	// Honor the --endpoints flag; it was previously parsed but never used.
	cli, err := newETCD3Client(ctx, tlsInfo, splitEndpoints(v.endpoints)...)
	if err != nil {
		klog.Warningf("failed to create client: %v", err)
		return false, nil
	}
	defer cli.Close()

	members, err := cli.MemberList(ctx)
	if err != nil {
		klog.Warningf("failed checking member list: %v", err)
		return false, nil
	}

	dbSizeBytes, err := getBackupSizeBytes(ctx, cli, members.Members)
	if err != nil {
		klog.Warningf("failed checking backup size: %v", err)
		return false, nil
	}

	fsAvailableBytes, err := getPathAvailableSpaceBytes(v.backupPath)
	if err != nil {
		return true, err
	}

	requiredBytes := 2 * dbSizeBytes
	if requiredBytes > fsAvailableBytes {
		return true, fmt.Errorf("available storage is not adequate for path: %q, required bytes: %d, available bytes %d", v.backupPath, requiredBytes, fsAvailableBytes)
	}

	klog.Infof("Path %s, required storage bytes: %d, available %d", v.backupPath, requiredBytes, fsAvailableBytes)
	return true, nil
}

// splitEndpoints turns a comma separated endpoint list into a slice, trimming
// surrounding whitespace and dropping empty entries. An empty input yields nil.
func splitEndpoints(list string) []string {
	var endpoints []string
	for _, ep := range strings.Split(list, ",") {
		if ep = strings.TrimSpace(ep); ep != "" {
			endpoints = append(endpoints, ep)
		}
	}
	return endpoints
}

// newETCD3Client builds an etcd v3 client secured with tlsInfo. The dial blocks
// until the underlying connection is up or dialTimeout elapses. When no
// endpoints are supplied the client connects to defaultEndpoint, which keeps
// existing two-argument callers working unchanged.
func newETCD3Client(ctx context.Context, tlsInfo transport.TLSInfo, endpoints ...string) (*clientv3.Client, error) {
	tlsConfig, err := tlsInfo.ClientConfig()
	if err != nil {
		return nil, err
	}
	if len(endpoints) == 0 {
		endpoints = []string{defaultEndpoint}
	}
	dialOptions := []grpc.DialOption{
		grpc.WithBlock(), // block until the underlying connection is up
	}

	cfg := clientv3.Config{
		DialTimeout:          dialTimeout,
		DialOptions:          dialOptions,
		DialKeepAliveTime:    keepaliveTime,
		DialKeepAliveTimeout: keepaliveTimeout,
		Endpoints:            endpoints,
		TLS:                  tlsConfig,
		Context:              ctx,
	}

	return clientv3.New(cfg)
}

// getBackupSizeBytes asks the etcd leader for DbSize as an approximation for
// how large the backup state will be. If the leader is not found among the
// members that respond, the largest DbSize reported by any member is used.
//
// Members that have not started, or that advertise no client URL, are skipped.
// An aggregate of the status errors is returned when no member responds; a
// generic error is returned when there was no member to query at all.
func getBackupSizeBytes(ctx context.Context, cli *clientv3.Client, members []*etcdserverpb.Member) (int64, error) {
	var (
		errs          []error
		largestDBSize int64
		responded     bool
	)
	for _, member := range members {
		// Guard the ClientURLs[0] access below against members without URLs.
		if !etcdcli.HasStarted(member) || len(member.ClientURLs) == 0 {
			continue
		}
		status, err := cli.Status(ctx, member.ClientURLs[0])
		if err != nil {
			errs = append(errs, err)
			continue
		}
		// best effort use leader: the responding member is the leader when the
		// leader ID it reports equals its own member ID.
		if status.Leader == status.Header.MemberId {
			return status.DbSize, nil
		}
		responded = true
		if status.DbSize > largestDBSize {
			largestDBSize = status.DbSize
		}
	}

	// If no leader responds use the largest of endpoints queried.
	if responded {
		return largestDBSize, nil
	}

	if len(errs) > 0 {
		return 0, utilerrors.NewAggregate(errs)
	}

	return 0, fmt.Errorf("endpoint status: DBSize check failed")
}

// getPathAvailableSpaceBytes returns the number of bytes available to
// unprivileged users on the filesystem containing path, computed from statfs
// as available blocks times block size.
//
// It returns an error when path does not exist, when statfs fails, or when the
// filesystem reports no available bytes.
func getPathAvailableSpaceBytes(path string) (int64, error) {
	// Verify path exists
	if _, err := os.Stat(path); err != nil {
		return 0, fmt.Errorf("verification of backup path failed: %w", err)
	}
	var stat unix.Statfs_t
	if err := unix.Statfs(path, &stat); err != nil {
		return 0, fmt.Errorf("filesystem status: %w", err)
	}

	// The integer types of Bavail and Bsize differ between architectures, so
	// convert both explicitly to keep this compiling on every platform.
	available := int64(stat.Bavail) * int64(stat.Bsize)
	if available == 0 {
		return 0, fmt.Errorf("filesystem status: no available bytes")
	}

	return available, nil
}
23 changes: 23 additions & 0 deletions pkg/cmd/verify/verify.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package verify

import (
"io"

"github.com/spf13/cobra"
)

// verifyStorageOpts groups an error output writer and a kubeconfig value.
// NOTE(review): nothing in the visible part of this package references this
// type - confirm whether it is still needed.
type verifyStorageOpts struct {
errOut io.Writer
kubeconfig string
}

// NewVerifyCommand returns the "verify" parent command, which groups the
// precondition checks. Invoked without a subcommand it does nothing; the
// actual checks live in its subcommands, currently only "backup-storage".
func NewVerifyCommand() *cobra.Command {
	// The parent command itself performs no work.
	noop := func(*cobra.Command, []string) {}

	verifyCmd := &cobra.Command{
		Use:   "verify",
		Short: "performs checks to verify preconditions and exit 0 on success",
		Run:   noop,
	}
	verifyCmd.AddCommand(NewVerifyBackupStorage())

	return verifyCmd
}

0 comments on commit 9f68fc6

Please sign in to comment.