Add separate cluster 'provisioning' state
This new state contains the logic for creating the k8s operators
that are necessary to deploy Mattermost installations. This process
used to occur directly after cluster creation with no separation.
Now, clusters can be 'provisioned' again in the stable state, which
will update all operator resources to the newest version contained
in the provisioner manifests.

This change also does the following:
 - Refactors much of the k8s client library to support updates of
   previously-created k8s resources.
 - Fixes a unit test bug where we were not properly checking that
   the right cluster states were being returned for work by the
   supervisor.
 - Cleans up logging in various places.
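
As a quick usage sketch of the new flow (the `cloud` binary name is assumed from cmd/cloud, and the cluster ID is a placeholder), reprovisioning a stable cluster becomes a single command:

    cloud cluster provision --cluster <cluster-id>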
gabrieljackson committed Jun 21, 2019
1 parent 5b09172 commit 066d609
Showing 24 changed files with 671 additions and 311 deletions.
23 changes: 23 additions & 0 deletions cmd/cloud/cluster.go
@@ -19,6 +19,9 @@ func init() {
    clusterCreateCmd.Flags().String("zones", "us-east-1a", "The zones where the cluster will be deployed. Use commas to separate multiple zones.")
    clusterCreateCmd.Flags().Int("wait", 600, "The amount of seconds to wait for k8s to become fully ready before exiting. Set to 0 to exit immediately.")

    clusterProvisionCmd.Flags().String("cluster", "", "The id of the cluster to be provisioned.")
    clusterProvisionCmd.MarkFlagRequired("cluster")

    clusterUpgradeCmd.Flags().String("cluster", "", "The id of the cluster to be upgraded.")
    clusterUpgradeCmd.Flags().String("version", "latest", "The Kubernetes version to target.")
    clusterUpgradeCmd.Flags().Int("wait", 600, "The amount of seconds to wait for k8s to become fully ready before exiting. Set to 0 to exit immediately.")
@@ -35,6 +38,7 @@ func init() {
    clusterListCmd.Flags().Bool("include-deleted", false, "Whether to include deleted clusters.")

    clusterCmd.AddCommand(clusterCreateCmd)
    clusterCmd.AddCommand(clusterProvisionCmd)
    clusterCmd.AddCommand(clusterUpgradeCmd)
    clusterCmd.AddCommand(clusterDeleteCmd)
    clusterCmd.AddCommand(clusterGetCmd)
@@ -87,6 +91,25 @@ var clusterCreateCmd = &cobra.Command{
    },
}

var clusterProvisionCmd = &cobra.Command{
    Use:   "provision",
    Short: "Provision/Reprovision a cluster's k8s operators.",
    RunE: func(command *cobra.Command, args []string) error {
        command.SilenceUsage = true

        serverAddress, _ := command.Flags().GetString("server")
        client := api.NewClient(serverAddress)

        clusterID, _ := command.Flags().GetString("cluster")
        err := client.ProvisionCluster(clusterID)
        if err != nil {
            return errors.Wrap(err, "failed to provision cluster")
        }

        return nil
    },
}

var clusterUpgradeCmd = &cobra.Command{
    Use:   "upgrade",
    Short: "Upgrade k8s on a cluster.",
17 changes: 17 additions & 0 deletions internal/api/client.go
@@ -108,6 +108,23 @@ func (c *Client) RetryCreateCluster(clusterID string) error {
    }
}

// ProvisionCluster provisions k8s operators on a cluster from the configured provisioning server.
func (c *Client) ProvisionCluster(clusterID string) error {
    resp, err := c.doPost(c.buildURL("/api/cluster/%s/provision", clusterID), nil)
    if err != nil {
        return err
    }
    defer closeBody(resp)

    switch resp.StatusCode {
    case http.StatusAccepted:
        return nil

    default:
        return errors.Errorf("failed with status code %d", resp.StatusCode)
    }
}

// GetCluster fetches the specified cluster from the configured provisioning server.
func (c *Client) GetCluster(clusterID string) (*model.Cluster, error) {
    resp, err := c.doGet(c.buildURL("/api/cluster/%s", clusterID))
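For code inside this repository, the new method composes with the existing client constructor. A minimal sketch, assuming it runs within this module (internal packages are not importable externally) and that the module path is github.com/mattermost/mattermost-cloud; the server address and cluster ID are placeholders:

package main

import (
    "log"

    "github.com/mattermost/mattermost-cloud/internal/api"
)

func main() {
    // Point the client at a running provisioning server (address illustrative).
    client := api.NewClient("http://localhost:8075")

    // Cluster IDs are 26 alphanumeric characters, matching the API route pattern.
    if err := client.ProvisionCluster("abcdefghijklmnopqrstuvwxyz"); err != nil {
        log.Fatalf("failed to provision cluster: %v", err)
    }
}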
44 changes: 44 additions & 0 deletions internal/api/cluster.go
@@ -20,6 +20,7 @@ func initCluster(apiRouter *mux.Router, context *Context) {
    clusterRouter := apiRouter.PathPrefix("/cluster/{cluster:[A-Za-z0-9]{26}}").Subrouter()
    clusterRouter.Handle("", addContext(handleGetCluster)).Methods("GET")
    clusterRouter.Handle("", addContext(handleRetryCreateCluster)).Methods("POST")
    clusterRouter.Handle("/provision", addContext(handleProvisionCluster)).Methods("POST")
    clusterRouter.Handle("/kubernetes/{version}", addContext(handleUpgradeCluster)).Methods("PUT")
    clusterRouter.Handle("", addContext(handleDeleteCluster)).Methods("DELETE")
}
@@ -130,6 +131,49 @@ func handleRetryCreateCluster(c *Context, w http.ResponseWriter, r *http.Request) {
    outputJSON(c, w, cluster)
}

// handleProvisionCluster responds to POST /api/cluster/{cluster}/provision,
// provisioning k8s resources on a previously-created cluster.
func handleProvisionCluster(c *Context, w http.ResponseWriter, r *http.Request) {
    vars := mux.Vars(r)
    clusterID := vars["cluster"]
    c.Logger = c.Logger.WithField("cluster", clusterID)

    cluster, status, unlockOnce := lockCluster(c, clusterID)
    if status != 0 {
        w.WriteHeader(status)
        return
    }
    defer unlockOnce()

    switch cluster.State {
    case model.ClusterStateStable:
    case model.ClusterStateProvisioningFailed:
    case model.ClusterStateProvisioningRequested:
    default:
        c.Logger.Warnf("unable to provision cluster while in state %s", cluster.State)
        w.WriteHeader(http.StatusBadRequest)
        return
    }

    if cluster.State != model.ClusterStateProvisioningRequested {
        cluster.State = model.ClusterStateProvisioningRequested

        err := c.Store.UpdateCluster(cluster)
        if err != nil {
            c.Logger.WithError(err).Errorf("failed to mark cluster provisioning state")
            w.WriteHeader(http.StatusInternalServerError)
            return
        }
    }

    // Notify even if we didn't make changes, to expedite even the no-op operations above.
    unlockOnce()
    c.Supervisor.Do()

    w.WriteHeader(http.StatusAccepted)
    outputJSON(c, w, cluster)
}

// handleGetCluster responds to GET /api/cluster/{cluster}, returning the cluster in question.
func handleGetCluster(c *Context, w http.ResponseWriter, r *http.Request) {
    vars := mux.Vars(r)
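The route registered above also makes the operation easy to exercise directly over HTTP; on success the handler responds with 202 Accepted and the cluster JSON. A sketch with a placeholder server address and cluster ID:

    curl -X POST http://localhost:8075/api/cluster/<cluster-id>/provision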
122 changes: 122 additions & 0 deletions internal/api/cluster_test.go
@@ -409,6 +409,128 @@ func TestRetryCreateCluster(t *testing.T) {
    })
}

func TestProvisionCluster(t *testing.T) {
    logger := testlib.MakeLogger(t)
    sqlStore := store.MakeTestSQLStore(t, logger)

    router := mux.NewRouter()
    api.Register(router, &api.Context{
        Store:      sqlStore,
        Supervisor: &mockSupervisor{},
        Logger:     logger,
    })
    ts := httptest.NewServer(router)
    defer ts.Close()

    client := api.NewClient(ts.URL)

    cluster1, err := client.CreateCluster(&api.CreateClusterRequest{
        Provider: model.ProviderAWS,
        Size:     model.SizeAlef500,
        Zones:    []string{"zone"},
    })
    require.NoError(t, err)

    t.Run("unknown cluster", func(t *testing.T) {
        err := client.ProvisionCluster(model.NewID())
        require.EqualError(t, err, "failed with status code 404")
    })

    t.Run("while locked", func(t *testing.T) {
        cluster1.State = model.ClusterStateStable
        err = sqlStore.UpdateCluster(cluster1)
        require.NoError(t, err)

        lockerID := model.NewID()

        locked, err := sqlStore.LockCluster(cluster1.ID, lockerID)
        require.NoError(t, err)
        require.True(t, locked)
        defer func() {
            unlocked, err := sqlStore.UnlockCluster(cluster1.ID, lockerID, false)
            require.NoError(t, err)
            require.True(t, unlocked)
        }()

        err = client.ProvisionCluster(cluster1.ID)
        require.EqualError(t, err, "failed with status code 409")
    })

    t.Run("while provisioning", func(t *testing.T) {
        cluster1.State = model.ClusterStateProvisioningRequested
        err = sqlStore.UpdateCluster(cluster1)
        require.NoError(t, err)

        err = client.ProvisionCluster(cluster1.ID)
        require.NoError(t, err)

        cluster1, err = client.GetCluster(cluster1.ID)
        require.NoError(t, err)
        require.Equal(t, model.ClusterStateProvisioningRequested, cluster1.State)
    })

    t.Run("after provisioning failed", func(t *testing.T) {
        cluster1.State = model.ClusterStateProvisioningFailed
        err = sqlStore.UpdateCluster(cluster1)
        require.NoError(t, err)

        err = client.ProvisionCluster(cluster1.ID)
        require.NoError(t, err)

        cluster1, err = client.GetCluster(cluster1.ID)
        require.NoError(t, err)
        require.Equal(t, model.ClusterStateProvisioningRequested, cluster1.State)
    })

    t.Run("while upgrading", func(t *testing.T) {
        cluster1.State = model.ClusterStateUpgradeRequested
        err = sqlStore.UpdateCluster(cluster1)
        require.NoError(t, err)

        err = client.ProvisionCluster(cluster1.ID)
        require.EqualError(t, err, "failed with status code 400")

        cluster1, err = client.GetCluster(cluster1.ID)
        require.NoError(t, err)
        require.Equal(t, model.ClusterStateUpgradeRequested, cluster1.State)
    })

    t.Run("after upgrade failed", func(t *testing.T) {
        cluster1.State = model.ClusterStateUpgradeFailed
        err = sqlStore.UpdateCluster(cluster1)
        require.NoError(t, err)

        err = client.ProvisionCluster(cluster1.ID)
        require.EqualError(t, err, "failed with status code 400")

        cluster1, err = client.GetCluster(cluster1.ID)
        require.NoError(t, err)
        require.Equal(t, model.ClusterStateUpgradeFailed, cluster1.State)
    })

    t.Run("while stable", func(t *testing.T) {
        cluster1.State = model.ClusterStateStable
        err = sqlStore.UpdateCluster(cluster1)
        require.NoError(t, err)

        err = client.ProvisionCluster(cluster1.ID)
        require.NoError(t, err)

        cluster1, err = client.GetCluster(cluster1.ID)
        require.NoError(t, err)
        require.Equal(t, model.ClusterStateProvisioningRequested, cluster1.State)
    })

    t.Run("while deleting", func(t *testing.T) {
        cluster1.State = model.ClusterStateDeletionRequested
        err = sqlStore.UpdateCluster(cluster1)
        require.NoError(t, err)

        err = client.ProvisionCluster(cluster1.ID)
        require.EqualError(t, err, "failed with status code 400")
    })
}

func TestUpgradeCluster(t *testing.T) {
    logger := testlib.MakeLogger(t)
    sqlStore := store.MakeTestSQLStore(t, logger)
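These tests can be targeted with standard Go tooling (note that MakeTestSQLStore may require whatever test database the store helpers are configured for):

    go test ./internal/api -run TestProvisionCluster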
5 changes: 5 additions & 0 deletions internal/model/cluster.go
@@ -12,6 +12,11 @@ const (
    ClusterStateCreationRequested = "creation-requested"
    // ClusterStateCreationFailed is a cluster that failed creation.
    ClusterStateCreationFailed = "creation-failed"
    // ClusterStateProvisioningRequested is a cluster in the process of being
    // provisioned with operators.
    ClusterStateProvisioningRequested = "provisioning-requested"
    // ClusterStateProvisioningFailed is a cluster that failed provisioning.
    ClusterStateProvisioningFailed = "provisioning-failed"
    // ClusterStateDeletionRequested is a cluster in the process of being deleted.
    ClusterStateDeletionRequested = "deletion-requested"
    // ClusterStateDeletionFailed is a cluster that failed deletion.
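Read together with handleProvisionCluster above, these constants define exactly which states may enter provisioning. A sketch of that gate as a standalone predicate in the model package (illustrative only; this helper is not part of the commit):

// clusterCanBeProvisioned mirrors the state gate in handleProvisionCluster:
// provisioning may be (re)requested from stable, from a failed provision,
// or while a provision request is already pending.
func clusterCanBeProvisioned(state string) bool {
    switch state {
    case ClusterStateStable, ClusterStateProvisioningRequested, ClusterStateProvisioningFailed:
        return true
    default:
        return false
    }
}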