
Bug 1772192: Clean-up and refactor actuator #309

Merged
merged 9 commits into openshift:master on Apr 6, 2020

Conversation

alexander-demicev

This PR introduces some clean-up and refactoring.

  1. Add a machine scope and reconciler
  2. Restructure the actuator to match the vSphere actuator
  3. Refactor provider spec and status serialization, again following the vSphere approach

I'll open follow-up PRs to remove unused code, drop the unneeded aws-actuator binary from bin, and improve/refactor the unit tests.

@alexander-demicev
Author

/retest


@JoelSpeed left a comment


This is big, so quite hard to review, but I've added a bunch of comments throughout. I think the behaviour should remain the same.

Have you run this and tested it in a real cluster yet?

@@ -365,3 +352,20 @@ func conditionFailed() awsproviderv1.AWSMachineProviderCondition {
		Reason: awsproviderv1.MachineCreationFailed,
	}
}

func validateMachine(machine machinev1.Machine) error {


Should add a comment on this method

return nil
}

func getClusterID(machine *machinev1.Machine) (string, bool) {


Should add a comment on this method
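For illustration, the kind of doc comment being asked for could look roughly like this (the comment wording and the final return line are a sketch, not taken from the PR):

// getClusterID returns the cluster ID recorded in the machine's labels and
// whether the label was found.
func getClusterID(machine *machinev1.Machine) (string, bool) {
	clusterID, ok := machine.Labels[machinev1.MachineClusterIDLabel]
	// NOTE: This block can be removed after the label renaming transition to machine.openshift.io
	if !ok {
		clusterID, ok = machine.Labels["sigs.k8s.io/cluster-api-cluster"]
	}
	return clusterID, ok
}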

Comment on lines 71 to 72
		return a.handleMachineError(machine, err, deleteEventAction)
	}

	// Get all instances not terminated.
	existingInstances, err := a.getMachineInstances(machine)
	if err != nil {
		klog.Errorf("%s: error getting existing instances: %v", machine.Name, err)
		return err
	}
	existingLen := len(existingInstances)
	klog.Infof("%s: found %d existing instances for machine", machine.Name, existingLen)
	if existingLen == 0 {
		klog.Warningf("%s: no instances found to delete for machine", machine.Name)
		return nil
	}

	terminatingInstances, err := terminateInstances(client, existingInstances)
	if err != nil {
		return a.handleMachineError(machine, machinecontroller.DeleteMachine(err.Error()), noEventAction)
	}

	if len(terminatingInstances) == 1 {
		if terminatingInstances[0] != nil && terminatingInstances[0].CurrentState != nil && terminatingInstances[0].CurrentState.Name != nil {
			machineCopy := machine.DeepCopy()
			machineCopy.Annotations[machinecontroller.MachineInstanceStateAnnotationName] = aws.StringValue(terminatingInstances[0].CurrentState.Name)
			a.client.Update(context.Background(), machineCopy)
		}
	}

	a.eventRecorder.Eventf(machine, corev1.EventTypeNormal, "Deleted", "Deleted machine %v", machine.Name)

	return nil
}

// Update attempts to sync machine state with an existing instance. Today this just updates status
// for details that may have changed. (IPs and hostnames) We do not currently support making any
// changes to actual machines in AWS. Instead these will be replaced via MachineDeployments.
func (a *Actuator) Update(context context.Context, machine *machinev1.Machine) error {
	klog.Infof("%s: updating machine", machine.Name)

	machineToBePatched := client.MergeFrom(machine.DeepCopy())

	machineProviderConfig, err := providerConfigFromMachine(machine, a.codec)
	if err != nil {
		return a.handleMachineError(machine, machinecontroller.InvalidMachineConfiguration("error decoding MachineProviderConfig: %v", err), updateEventAction)
	}

	region := machineProviderConfig.Placement.Region
	klog.Infof("%s: obtaining EC2 client for region", machine.Name)
	credentialsSecretName := ""
	if machineProviderConfig.CredentialsSecret != nil {
		credentialsSecretName = machineProviderConfig.CredentialsSecret.Name
	}
	client, err := a.awsClientBuilder(a.client, credentialsSecretName, machine.Namespace, region)
	if err != nil {
		return a.handleMachineError(machine, err, updateEventAction)
	}
	// Get all instances not terminated.
	existingInstances, err := a.getMachineInstances(machine)
	if err != nil {
		klog.Errorf("%s: error getting existing instances: %v", machine.Name, err)
		return err
	}
	existingLen := len(existingInstances)
	klog.Infof("%s: found %d existing instances for machine", machine.Name, existingLen)

	// Parent controller should prevent this from ever happening by calling Exists and then Create,
	// but instance could be deleted between the two calls.
	if existingLen == 0 {
		if machine.Spec.ProviderID != nil && *machine.Spec.ProviderID != "" && (machine.Status.LastUpdated == nil || machine.Status.LastUpdated.Add(requeueAfterSeconds*time.Second).After(time.Now())) {
			klog.Infof("%s: Possible eventual-consistency discrepancy; returning an error to requeue", machine.Name)
			return &machinecontroller.RequeueAfterError{RequeueAfter: requeueAfterSeconds * time.Second}
		}

		klog.Warningf("%s: attempted to update machine but no instances found", machine.Name)

		a.handleMachineError(machine, machinecontroller.UpdateMachine("no instance found, reason unknown"), updateEventAction)

		// Update status to clear out machine details.
		if err := a.setStatus(machine, nil, conditionSuccess()); err != nil {
			return err
		}
		// This is an unrecoverable error condition. We should delay to
		// minimize unnecessary API calls.
		return &machinecontroller.RequeueAfterError{RequeueAfter: requeueAfterFatalSeconds * time.Second}
	}
	sortInstances(existingInstances)
	runningInstances := getRunningFromInstances(existingInstances)
	runningLen := len(runningInstances)
	var newestInstance *ec2.Instance
	if runningLen > 0 {
		// It would be very unusual to have more than one here, but it is
		// possible if someone manually provisions a machine with same tag name.
		klog.Infof("%s: found %d running instances for machine", machine.Name, runningLen)
		newestInstance = runningInstances[0]

		err = a.updateLoadBalancers(client, machineProviderConfig, newestInstance, machine.Name)
		if err != nil {
			a.handleMachineError(machine, machinecontroller.CreateMachine("Error updating load balancers: %v", err), updateEventAction)
// Create creates a machine and is invoked by the machine controller.


Should this be a double comment like this?

type machineScopeParams struct {
	context.Context

	apiReader runtimeclient.Reader


Might be worth commenting on the params to explain what they are/why they are used

Suggested change
apiReader runtimeclient.Reader
// client reader that bypasses the manager's cache
apiReader runtimeclient.Reader

func (s *machineScope) setProviderStatus(instance *ec2.Instance, condition awsproviderv1.AWSMachineProviderCondition) error {
	klog.Infof("%s: Updating status", s.machine.Name)

	// Save this, we need to check if it changed later.


Any idea what this comment is about? I'm not sure I see it being checked later?

}

func TestGetUserData(t *testing.T) {
userDataSecretName := "vsphere-ignition"


Suggested change
userDataSecretName := "vsphere-ignition"
userDataSecretName := "aws-ignition"

		CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crds")},
	}

	var err error


This isn't needed as you use := on the line below
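In other words, roughly the following; the envtest setup names here (testEnv, cfg, t) are assumptions, since the next line isn't shown in this hunk:

	// := declares err itself, so a preceding `var err error` is redundant.
	cfg, err := testEnv.Start()
	if err != nil {
		t.Fatal(err)
	}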


	// Create the machine
	gs.Expect(k8sClient.Create(ctx, machine)).To(Succeed())


I would remove this empty line to join the Create and Delete together to show they're related
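That is, something along these lines; the Delete side isn't shown in this hunk, so the deferred cleanup below is only an illustration of the suggestion:

	// Create the machine and register its deletion right next to it, so the pairing is obvious.
	gs.Expect(k8sClient.Create(ctx, machine)).To(Succeed())
	defer func() {
		gs.Expect(k8sClient.Delete(ctx, machine)).To(Succeed())
	}()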

Comment on lines 45 to 48
// Unable to determine if a machine is a master machine.
// Yet, it's only used to delete stopped machines that are not masters.
// So we can safely continue to create a new machine since in the worst case
// we just don't delete any stopped machine.


Should this comment be within the if err != nil? It seems to be in an odd place to me.
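Presumably meaning placement like the sketch below, assuming the comment belongs to the isMaster error path (the klog call is illustrative, not from the PR):

	isMaster, err := r.isMaster()
	if err != nil {
		// Unable to determine if a machine is a master machine.
		// It's only used to delete stopped machines that are not masters, so we can
		// safely continue to create a new machine; in the worst case we just don't
		// delete any stopped machine.
		klog.Errorf("%s: failed to determine if machine is a master: %v", r.machine.Name, err)
	}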

return r.requeueIfInstancePending(instance)
}

func (r *Reconciler) delete() error {


Should add a comment to this method

@alexander-demicev
Author

@JoelSpeed Thanks for the review. This worked on my cluster; it passed all e2e tests.


@JoelSpeed left a comment


Looking good, added a couple more suggestions

	// client for interacting with AWS
	awsClient awsclient.Client
	// client reader that bypasses the manager's cache
	apiReader runtimeclient.Reader


I might have missed it, but I'm not sure this is used anywhere. Might be worth getting rid of it if it isn't used? WDYT?

	clusterID, ok := machine.Labels[machinev1.MachineClusterIDLabel]
	// NOTE: This block can be removed after the label renaming transition to machine.openshift.io
	if !ok {
		clusterID, ok = machine.Labels["sigs.k8s.io/cluster-api-cluster"]


I'd suggest maybe making this a const at the top of the file

		return false, fmt.Errorf("failed to get node from machine %s", r.machine.Name)
	}

	if _, exists := node.Labels["node-role.kubernetes.io/master"]; exists {


Potentially put the label key into a const at the top?
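Presumably something along these lines at the top of the file; the const names are only illustrative (upstreamMachineClusterIDLabel is the name mentioned later in this review):

const (
	// upstreamMachineClusterIDLabel is the legacy cluster ID label, kept only for the
	// label renaming transition to machine.openshift.io.
	upstreamMachineClusterIDLabel = "sigs.k8s.io/cluster-api-cluster"
	// masterNodeRoleLabel marks nodes that belong to the control plane.
	masterNodeRoleLabel = "node-role.kubernetes.io/master"
)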


// We explicitly do NOT want to remove stopped masters.
isMaster, err := r.isMaster()


Nit: I would drop this empty line to pair the if err with the call to isMaster.

@openshift-ci-robot added and then removed the needs-rebase label (Indicates a PR cannot be merged because it has merge conflicts with HEAD.) Mar 25, 2020
		return fmt.Errorf("%v: failed validating machine provider spec: %v", r.machine.GetName(), err)
	}

	// We explicitly do NOT want to remove stopped masters.
Member


All of this logic (lines 44-59) is not meaningful any more and should be dropped in a follow-up. Can you add a TODO?

Author


👍

func getClusterID(machine *machinev1.Machine) (string, bool) {
	clusterID, ok := machine.Labels[machinev1.MachineClusterIDLabel]
	// NOTE: This block can be removed after the label renaming transition to machine.openshift.io
	if !ok {
Member


This can go away in a follow-up; there is no need to support upstreamMachineClusterIDLabel anymore. Can you add a TODO?
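For example, a TODO along these lines (wording is illustrative):

	clusterID, ok := machine.Labels[machinev1.MachineClusterIDLabel]
	// TODO(alexander-demicev): remove this fallback in a follow-up; the legacy
	// sigs.k8s.io/cluster-api-cluster label no longer needs to be supported.
	if !ok {
		clusterID, ok = machine.Labels["sigs.k8s.io/cluster-api-cluster"]
	}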

Author


👍

@@ -1,1490 +0,0 @@
package machine
Member


Is this being moved somewhere else? Otherwise this would just drop unit coverage.

Author


Yes, the tests were partially distributed to the machine scope, and I'm planning a follow-up to bring back the tests for the events we fire, etc. We also need the same tests for vSphere; its actuator is missing them too.

Member


Before:

ok  	sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machine	9.638s	coverage: 85.8% of statements
ok  	sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machineset	10.799s	coverage: 85.7% of statements
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/apis	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider	[no test files]
ok  	sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider/v1beta1	0.252s	coverage: 37.4% of statements
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/client	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/client/fake	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/client/mock	[no test files]
ok  	sigs.k8s.io/cluster-api-provider-aws/pkg/termination	13.309s	coverage: 79.6% of statements
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/version	[no test files]

After:

ok  	sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machine	7.199s	coverage: 65.4% of statements
ok  	sigs.k8s.io/cluster-api-provider-aws/pkg/actuators/machineset	8.615s	coverage: 88.1% of statements
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/apis	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/apis/awsprovider/v1beta1	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/client	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/client/fake	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/client/mock	[no test files]
?   	sigs.k8s.io/cluster-api-provider-aws/pkg/version	[no test files]

I'd like to help move this forward; can we make sure we increase the 65.4% before merging?

@alexander-demicev
Author

/retest

6 similar comments


@JoelSpeed left a comment


Couple of nits in tests, but otherwise LGTM, I think. I may have missed some stuff though; probably worth another pair of eyes reviewing due to its size.

	},
	LaunchTime: aws.Time(time.Now()),
	Placement: &ec2.Placement{
		AvailabilityZone: &az,


For consistency

Suggested change
AvailabilityZone: &az,
AvailabilityZone: aws.String("us-east-1a"),

"sigs.k8s.io/controller-runtime/pkg/manager"
)

const TestNamespace = "aws-test"


This should probably be private

Suggested change
const TestNamespace = "aws-test"
const testNamespace = "aws-test"

@alexander-demicev
Author

@JoelSpeed @enxebre This PR should be fine now


@JoelSpeed left a comment


/lgtm

@openshift-ci-robot added the lgtm label (Indicates that a PR is ready to be merged.) Apr 2, 2020
@enxebre
Member

enxebre commented Apr 3, 2020

/approve
/retest

@openshift-ci-robot

[APPROVALNOTIFIER] This PR is APPROVED

This pull-request has been approved by: enxebre

The full list of commands accepted by this bot can be found here.

The pull request process is described here

Needs approval from an approver in each of these files:

Approvers can indicate their approval by writing /approve in a comment
Approvers can cancel approval by writing /approve cancel in a comment

@openshift-ci-robot added the approved label (Indicates a PR has been approved by an approver from all required OWNERS files.) Apr 3, 2020
@openshift-bot

/retest

Please review the full test history for this PR and help us cut down flakes.

1 similar comment

@openshift-ci-robot removed the lgtm label (Indicates that a PR is ready to be merged.) Apr 3, 2020
@alexander-demicev
Author

/retest

@alexander-demicev
Author

/retest

1 similar comment

@enxebre
Member

enxebre commented Apr 6, 2020

/lgtm

@openshift-ci-robot added the lgtm label (Indicates that a PR is ready to be merged.) Apr 6, 2020
@openshift-bot

/retest

Please review the full test history for this PR and help us cut down flakes.

10 similar comments

@openshift-merge-robot merged commit 3df1940 into openshift:master Apr 6, 2020
@alexander-demicev changed the title from "Clean-up and refactor actuator" to "Bug 1772192: Clean-up and refactor actuator" Apr 7, 2020
@openshift-ci-robot

@alexander-demichev: All pull requests linked via external trackers have merged: . Bugzilla bug 1772192 has been moved to the MODIFIED state.

In response to this:

Bug 1772192: Clean-up and refactor actuator

Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository.
