New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bug 1772192: Clean-up and refactor actuator #309
Bug 1772192: Clean-up and refactor actuator #309
Conversation
/retest |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is big, so quite hard to review, but I've added a bunch of comments throughout, I think the behaviour should remain the same
Have you run this and tested it in a real cluster yet?
@@ -365,3 +352,20 @@ func conditionFailed() awsproviderv1.AWSMachineProviderCondition { | |||
Reason: awsproviderv1.MachineCreationFailed, | |||
} | |||
} | |||
|
|||
func validateMachine(machine machinev1.Machine) error { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should add a comment on this method
return nil | ||
} | ||
|
||
func getClusterID(machine *machinev1.Machine) (string, bool) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should add a comment on this method
pkg/actuators/machine/actuator.go
Outdated
return a.handleMachineError(machine, err, deleteEventAction) | ||
} | ||
|
||
// Get all instances not terminated. | ||
existingInstances, err := a.getMachineInstances(machine) | ||
if err != nil { | ||
klog.Errorf("%s: error getting existing instances: %v", machine.Name, err) | ||
return err | ||
} | ||
existingLen := len(existingInstances) | ||
klog.Infof("%s: found %d existing instances for machine", machine.Name, existingLen) | ||
if existingLen == 0 { | ||
klog.Warningf("%s: no instances found to delete for machine", machine.Name) | ||
return nil | ||
} | ||
|
||
terminatingInstances, err := terminateInstances(client, existingInstances) | ||
if err != nil { | ||
return a.handleMachineError(machine, machinecontroller.DeleteMachine(err.Error()), noEventAction) | ||
} | ||
|
||
if len(terminatingInstances) == 1 { | ||
if terminatingInstances[0] != nil && terminatingInstances[0].CurrentState != nil && terminatingInstances[0].CurrentState.Name != nil { | ||
machineCopy := machine.DeepCopy() | ||
machineCopy.Annotations[machinecontroller.MachineInstanceStateAnnotationName] = aws.StringValue(terminatingInstances[0].CurrentState.Name) | ||
a.client.Update(context.Background(), machineCopy) | ||
} | ||
} | ||
|
||
a.eventRecorder.Eventf(machine, corev1.EventTypeNormal, "Deleted", "Deleted machine %v", machine.Name) | ||
|
||
return nil | ||
} | ||
|
||
// Update attempts to sync machine state with an existing instance. Today this just updates status | ||
// for details that may have changed. (IPs and hostnames) We do not currently support making any | ||
// changes to actual machines in AWS. Instead these will be replaced via MachineDeployments. | ||
func (a *Actuator) Update(context context.Context, machine *machinev1.Machine) error { | ||
klog.Infof("%s: updating machine", machine.Name) | ||
|
||
machineToBePatched := client.MergeFrom(machine.DeepCopy()) | ||
|
||
machineProviderConfig, err := providerConfigFromMachine(machine, a.codec) | ||
if err != nil { | ||
return a.handleMachineError(machine, machinecontroller.InvalidMachineConfiguration("error decoding MachineProviderConfig: %v", err), updateEventAction) | ||
} | ||
|
||
region := machineProviderConfig.Placement.Region | ||
klog.Infof("%s: obtaining EC2 client for region", machine.Name) | ||
credentialsSecretName := "" | ||
if machineProviderConfig.CredentialsSecret != nil { | ||
credentialsSecretName = machineProviderConfig.CredentialsSecret.Name | ||
} | ||
client, err := a.awsClientBuilder(a.client, credentialsSecretName, machine.Namespace, region) | ||
if err != nil { | ||
return a.handleMachineError(machine, err, updateEventAction) | ||
} | ||
// Get all instances not terminated. | ||
existingInstances, err := a.getMachineInstances(machine) | ||
if err != nil { | ||
klog.Errorf("%s: error getting existing instances: %v", machine.Name, err) | ||
return err | ||
} | ||
existingLen := len(existingInstances) | ||
klog.Infof("%s: found %d existing instances for machine", machine.Name, existingLen) | ||
|
||
// Parent controller should prevent this from ever happening by calling Exists and then Create, | ||
// but instance could be deleted between the two calls. | ||
if existingLen == 0 { | ||
if machine.Spec.ProviderID != nil && *machine.Spec.ProviderID != "" && (machine.Status.LastUpdated == nil || machine.Status.LastUpdated.Add(requeueAfterSeconds*time.Second).After(time.Now())) { | ||
klog.Infof("%s: Possible eventual-consistency discrepancy; returning an error to requeue", machine.Name) | ||
return &machinecontroller.RequeueAfterError{RequeueAfter: requeueAfterSeconds * time.Second} | ||
} | ||
|
||
klog.Warningf("%s: attempted to update machine but no instances found", machine.Name) | ||
|
||
a.handleMachineError(machine, machinecontroller.UpdateMachine("no instance found, reason unknown"), updateEventAction) | ||
|
||
// Update status to clear out machine details. | ||
if err := a.setStatus(machine, nil, conditionSuccess()); err != nil { | ||
return err | ||
} | ||
// This is an unrecoverable error condition. We should delay to | ||
// minimize unnecessary API calls. | ||
return &machinecontroller.RequeueAfterError{RequeueAfter: requeueAfterFatalSeconds * time.Second} | ||
} | ||
sortInstances(existingInstances) | ||
runningInstances := getRunningFromInstances(existingInstances) | ||
runningLen := len(runningInstances) | ||
var newestInstance *ec2.Instance | ||
if runningLen > 0 { | ||
// It would be very unusual to have more than one here, but it is | ||
// possible if someone manually provisions a machine with same tag name. | ||
klog.Infof("%s: found %d running instances for machine", machine.Name, runningLen) | ||
newestInstance = runningInstances[0] | ||
|
||
err = a.updateLoadBalancers(client, machineProviderConfig, newestInstance, machine.Name) | ||
if err != nil { | ||
a.handleMachineError(machine, machinecontroller.CreateMachine("Error updating load balancers: %v", err), updateEventAction) | ||
// Create creates a machine and is invoked by the machine controller. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be a double comment like this?
type machineScopeParams struct { | ||
context.Context | ||
|
||
apiReader runtimeclient.Reader |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be worth commenting on the params to explain what they are/why they are used
apiReader runtimeclient.Reader | |
// client reader that bypasses the manager's cache | |
apiReader runtimeclient.Reader |
func (s *machineScope) setProviderStatus(instance *ec2.Instance, condition awsproviderv1.AWSMachineProviderCondition) error { | ||
klog.Infof("%s: Updating status", s.machine.Name) | ||
|
||
// Save this, we need to check if it changed later. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any idea what this comment is about? I'm not sure I see it being checked later?
} | ||
|
||
func TestGetUserData(t *testing.T) { | ||
userDataSecretName := "vsphere-ignition" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
userDataSecretName := "vsphere-ignition" | |
userDataSecretName := "aws-ignition" |
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crds")}, | ||
} | ||
|
||
var err error |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't needed as you use :=
on the line below
|
||
// Create the machine | ||
gs.Expect(k8sClient.Create(ctx, machine)).To(Succeed()) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would remove this empty line to join the Create and Delete together to show they're related
pkg/actuators/machine/reconciler.go
Outdated
// Unable to determine if a machine is a master machine. | ||
// Yet, it's only used to delete stopped machines that are not masters. | ||
// So we can safely continue to create a new machine since in the worst case | ||
// we just don't delete any stopped machine. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this comment be within the if err !- nil
? Seems to be in an odd place to me
return r.requeueIfInstancePending(instance) | ||
} | ||
|
||
func (r *Reconciler) delete() error { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should add a comment to this method
@JoelSpeed Thanks for the review. This worked on my cluster, it passed all e2e tests. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking good, added a couple more suggestions
// client for interacting with AWS | ||
awsClient awsclient.Client | ||
// client reader that bypasses the manager's cache | ||
apiReader runtimeclient.Reader |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I might have missed it, but I'm not sure if this is used anywhere, might be worth getting rid of it if it isn't used? WDYT?
pkg/actuators/machine/utils.go
Outdated
clusterID, ok := machine.Labels[machinev1.MachineClusterIDLabel] | ||
// NOTE: This block can be removed after the label renaming transition to machine.openshift.io | ||
if !ok { | ||
clusterID, ok = machine.Labels["sigs.k8s.io/cluster-api-cluster"] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd suggest maybe making this a const at the top of the file
pkg/actuators/machine/reconciler.go
Outdated
return false, fmt.Errorf("failed to get node from machine %s", r.machine.Name) | ||
} | ||
|
||
if _, exists := node.Labels["node-role.kubernetes.io/master"]; exists { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Potentially put the label key into a const at the top?
pkg/actuators/machine/reconciler.go
Outdated
|
||
// We explicitly do NOT want to remove stopped masters. | ||
isMaster, err := r.isMaster() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit, would drop this empty new line to pair the if err
with the call to isMaster
return fmt.Errorf("%v: failed validating machine provider spec: %v", r.machine.GetName(), err) | ||
} | ||
|
||
// We explicitly do NOT want to remove stopped masters. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
all this logic 44 - 59 is not meaningful any more should be dropped in a follow up. Can you add a TODO?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍
func getClusterID(machine *machinev1.Machine) (string, bool) { | ||
clusterID, ok := machine.Labels[machinev1.MachineClusterIDLabel] | ||
// NOTE: This block can be removed after the label renaming transition to machine.openshift.io | ||
if !ok { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can go away in a follow up, no need to support upstreamMachineClusterIDLabel
anymore. Can you add a TODO?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍
@@ -1,1490 +0,0 @@ | |||
package machine |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this being moved somewhere else? otherwise this would just drop unit coverage
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, tests were partially distributed to machine scope and I'm planning to create a follow-up for bringing back tests for events we fire and etc. We also need the same tests for vSphere, its actuator is missing them too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before:
ok sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machine 9.638s coverage: 85.8% of statements
ok sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machineset 10.799s coverage: 85.7% of statements
? sigs.k8s.io/cluster-api-provider-aws/pkg/apis [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider [no test files]
ok sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider/v1beta1 0.252s coverage: 37.4% of statements
? sigs.k8s.io/cluster-api-provider-aws/pkg/client [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/client/fake [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/client/mock [no test files]
ok sigs.k8s.io/cluster-api-provider-aws/pkg/termination 13.309s coverage: 79.6% of statements
? sigs.k8s.io/cluster-api-provider-aws/pkg/version [no test files]
After
ok sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machine 7.199s coverage: 65.4% of statements
ok sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machineset 8.615s coverage: 88.1% of statements
? sigs.k8s.io/cluster-api-provider-aws/pkg/apis [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider/v1beta1 [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/client [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/client/fake [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/client/mock [no test files]
? sigs.k8s.io/cluster-api-provider-aws/pkg/version [no test files]
I'd like to help to move this forward, can we make sure we increase the 65.4% before merging?
/retest |
6 similar comments
/retest |
/retest |
/retest |
/retest |
/retest |
/retest |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couple of nits in tests, but otherwise LGTM I think, I may have missed some stuff though, probably worth another pair of eyes reviewing due to its size
}, | ||
LaunchTime: aws.Time(time.Now()), | ||
Placement: &ec2.Placement{ | ||
AvailabilityZone: &az, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For consistency
AvailabilityZone: &az, | |
AvailabilityZone: aws.String("us-east-1a"), |
"sigs.k8s.io/controller-runtime/pkg/manager" | ||
) | ||
|
||
const TestNamespace = "aws-test" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should probably be private
const TestNamespace = "aws-test" | |
const testNamespace = "aws-test" |
@JoelSpeed @enxebre This PR should be fine now |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/lgtm
/approve |
[APPROVALNOTIFIER] This PR is APPROVED This pull-request has been approved by: enxebre The full list of commands accepted by this bot can be found here. The pull request process is described here
Needs approval from an approver in each of these files:
Approvers can indicate their approval by writing |
/retest Please review the full test history for this PR and help us cut down flakes. |
1 similar comment
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest |
/retest |
1 similar comment
/retest |
/lgtm |
/retest Please review the full test history for this PR and help us cut down flakes. |
10 similar comments
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
/retest Please review the full test history for this PR and help us cut down flakes. |
@alexander-demichev: All pull requests linked via external trackers have merged: . Bugzilla bug 1772192 has been moved to the MODIFIED state. In response to this:
Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository. |
This PR introduces some clean-up and refactoring.
I'll make some follow-up PRs to remove unused code, unneeded aws-actuator in
bin
and improve/refactor unit tests.