Skip to content

Commit

Permalink
Account for large scale simultaneous renewal on UPI clusters
Browse files Browse the repository at this point in the history
If more than 100 machines need to renew their certificates at once on a 
UPI cluster today, we exhaust the capacity limit set by the maxPending 
check. To allow renewal in these cases we need to account for how many 
Nodes have joined the cluster before we block the approvals
  • Loading branch information
JoelSpeed committed Jan 20, 2022
1 parent 2153a9f commit 7baac23
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 15 deletions.
29 changes: 21 additions & 8 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,13 @@ func (m *CertificateApprover) Reconcile(ctx context.Context, req ctrl.Request) (
return reconcile.Result{}, fmt.Errorf("Failed to list machines: %w", err)
}

if offLimits := reconcileLimits(req.Name, machines, csrs); offLimits {
nodes := &corev1.NodeList{}
if err := m.NodeClient.List(ctx, nodes); err != nil {
klog.Errorf("%v: Failed to list Nodes: %v", req.Name, err)
return reconcile.Result{}, fmt.Errorf("Failed to get Nodes: %w", err)
}

if offLimits := reconcileLimits(req.Name, machines, nodes, csrs); offLimits {
// Stop all reconciliation
return reconcile.Result{}, nil
}
Expand All @@ -151,7 +157,7 @@ func (m *CertificateApprover) Reconcile(ctx context.Context, req ctrl.Request) (
// When an error occurs, we requeue and so update the limits on the
// next reconcile.
// Don't use a cached client here else we may not have up to date CSRs.
return reconcile.Result{}, reconcileLimitsUncached(m.NodeRestCfg, csr.Name, machines)
return reconcile.Result{}, reconcileLimitsUncached(m.NodeRestCfg, csr.Name, machines, nodes)
}
}

Expand All @@ -161,8 +167,8 @@ func (m *CertificateApprover) Reconcile(ctx context.Context, req ctrl.Request) (
}

// reconcileLimits will short circut logic if number of pending CSRs is exceeding limit
func reconcileLimits(csrName string, machines []machinehandlerpkg.Machine, csrs *certificatesv1.CertificateSigningRequestList) bool {
maxPending := getMaxPending(machines)
func reconcileLimits(csrName string, machines []machinehandlerpkg.Machine, nodes *corev1.NodeList, csrs *certificatesv1.CertificateSigningRequestList) bool {
maxPending := getMaxPending(machines, nodes)
atomic.StoreUint32(&MaxPendingCSRs, uint32(maxPending))
pending := recentlyPendingCSRs(csrs.Items)
atomic.StoreUint32(&PendingCSRs, uint32(pending))
Expand All @@ -177,7 +183,7 @@ func reconcileLimits(csrName string, machines []machinehandlerpkg.Machine, csrs
// reconcileLimitsUncached is used to update the limits using an uncached certificates list.
// This is used at the end of the approval process to ensure that the limits (and therefore)
// the metrics are always up to date.
func reconcileLimitsUncached(cfg *rest.Config, csrName string, machines []machinehandlerpkg.Machine) error {
func reconcileLimitsUncached(cfg *rest.Config, csrName string, machines []machinehandlerpkg.Machine, nodes *corev1.NodeList) error {
certClient, err := certificatesv1client.NewForConfig(cfg)
if err != nil {
return fmt.Errorf("could not initialise certificates client: %v", err)
Expand All @@ -188,7 +194,7 @@ func reconcileLimitsUncached(cfg *rest.Config, csrName string, machines []machin
return fmt.Errorf("could not list CSRs: %v", err)
}

reconcileLimits(csrName, machines, certificates)
reconcileLimits(csrName, machines, nodes, certificates)
return nil
}

Expand Down Expand Up @@ -287,6 +293,13 @@ func parseCSR(obj *certificatesv1.CertificateSigningRequest) (*x509.CertificateR
return x509.ParseCertificateRequest(block.Bytes)
}

func getMaxPending(machines []machinehandlerpkg.Machine) int {
return len(machines) + maxDiffBetweenPendingCSRsAndMachinesCount
func getMaxPending(machines []machinehandlerpkg.Machine, nodes *corev1.NodeList) int {
return max(len(machines), len(nodes.Items)) + maxDiffBetweenPendingCSRsAndMachinesCount
}

func max(x, y int) int {
if x < y {
return y
}
return x
}
45 changes: 38 additions & 7 deletions pkg/controller/csr_check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2885,19 +2885,50 @@ func creationTimestamp(delta time.Duration) metav1.Time {
}

func TestGetMaxPending(t *testing.T) {
ml := []machinehandlerpkg.Machine{
testCases := []struct {
name string
machines []machinehandlerpkg.Machine
nodes []corev1.Node
expectedMax int
}{
{
Status: machinehandlerpkg.MachineStatus{},
name: "with more machines than nodes",
machines: []machinehandlerpkg.Machine{
{},
{},
{},
},
nodes: []corev1.Node{
{},
{},
},
expectedMax: 3 + maxDiffBetweenPendingCSRsAndMachinesCount,
},
{
Status: machinehandlerpkg.MachineStatus{},
name: "with more nodes than machines",
machines: []machinehandlerpkg.Machine{
{},
{},
{},
},
nodes: []corev1.Node{
{},
{},
{},
{},
},
expectedMax: 4 + maxDiffBetweenPendingCSRsAndMachinesCount,
},
}

res := getMaxPending(ml)
expected := len(ml) + maxDiffBetweenPendingCSRsAndMachinesCount
if res != expected {
t.Errorf("getMaxPending returned incorrect value: %v, expect: %v", res, expected)
for _, tc := range testCases {
nodeList := &corev1.NodeList{
Items: tc.nodes,
}
res := getMaxPending(tc.machines, nodeList)
if res != tc.expectedMax {
t.Errorf("getMaxPending returned incorrect value: %v, expect: %v", res, tc.expectedMax)
}
}
}

Expand Down

0 comments on commit 7baac23

Please sign in to comment.