Skip to content

Commit

Permalink
go/worker/keymanager: Add and refine key manager worker metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
peternose committed May 8, 2023
1 parent 1899ef9 commit ad27710
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 15 deletions.
47 changes: 47 additions & 0 deletions .changelog/5196.feature.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,50 @@ one of the key manager enclaves must publish a proposal for the next
generation of the master secret, which must then be replicated by
the majority of enclaves. If the replication process is not completed
by the end of the epoch, the proposal can be replaced with a new one.

The following metrics have been added:

- `oasis_worker_keymanager_consensus_ephemeral_secret_epoch_number`
is the epoch number of the latest ephemeral secret.

- `oasis_worker_keymanager_consensus_master_secret_generation_number`
is the generation number of the latest master secret.

- `oasis_worker_keymanager_consensus_master_secret_rotation_epoch_number`
is the epoch number of the latest master secret rotation.

- `oasis_worker_keymanager_consensus_master_secret_proposal_generation_number`
is the generation number of the latest master secret proposal.

- `oasis_worker_keymanager_consensus_master_secret_proposal_epoch_number`
is the epoch number of the latest master secret proposal.

- `oasis_worker_keymanager_enclave_ephemeral_secret_epoch_number`
is the epoch number of the latest ephemeral secret loaded into the enclave.

- `oasis_worker_keymanager_enclave_master_secret_generation_number`
is the generation number of the latest master secret as seen by the enclave.

- `oasis_worker_keymanager_enclave_master_secret_proposal_generation_number`
is the generation number of the latest master secret proposal loaded
into the enclave.

- `oasis_worker_keymanager_enclave_master_secret_proposal_epoch_number`
is the epoch number of the latest master secret proposal loaded
into the enclave.

- `oasis_worker_keymanager_enclave_generated_master_secret_generation_number`
is the generation number of the latest master secret generated
by the enclave.

- `oasis_worker_keymanager_enclave_generated_master_secret_epoch_number`
is the epoch number of the latest master secret generated by the enclave.

- `oasis_worker_keymanager_enclave_generated_ephemeral_secret_epoch_number`
is the epoch number of the latest ephemeral secret generated by the enclave.

The `runtime` label has been added to the following metrics:

- `oasis_worker_keymanager_compute_runtime_count`,

- `oasis_worker_keymanager_policy_update_count`.
16 changes: 14 additions & 2 deletions docs/oasis-node/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,21 @@ oasis_worker_executor_liveness_live_ratio | Gauge | Ratio between live and total
oasis_worker_executor_liveness_live_rounds | Gauge | Number of live rounds in last epoch. | runtime | [worker/common/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/common/committee/node.go)
oasis_worker_executor_liveness_total_rounds | Gauge | Number of total rounds in last epoch. | runtime | [worker/common/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/common/committee/node.go)
oasis_worker_failed_round_count | Counter | Number of failed roothash rounds. | runtime | [worker/common/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/common/committee/node.go)
oasis_worker_keymanager_compute_runtime_count | Counter | Number of compute runtimes using the key manager. | | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_compute_runtime_count | Counter | Number of compute runtimes using the key manager. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_consensus_ephemeral_secret_epoch_number | Gauge | Epoch number of the latest ephemeral secret. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_consensus_master_secret_generation_number | Gauge | Generation number of the latest master secret. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_consensus_master_secret_proposal_epoch_number | Gauge | Epoch number of the latest master secret proposal. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_consensus_master_secret_proposal_generation_number | Gauge | Generation number of the latest master secret proposal. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_consensus_master_secret_rotation_epoch_number | Gauge | Epoch number of the latest master secret rotation. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_ephemeral_secret_epoch_number | Gauge | Epoch number of the latest ephemeral secret loaded into the enclave. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_generated_ephemeral_secret_epoch_number | Gauge | Epoch number of the latest ephemeral secret generated by the enclave. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_generated_master_secret_epoch_number | Gauge | Epoch number of the latest master secret generated by the enclave. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_generated_master_secret_generation_number | Gauge | Generation number of the latest master secret generated by the enclave. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_master_secret_generation_number | Gauge | Generation number of the latest master secret as seen by the enclave. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_master_secret_proposal_epoch_number | Gauge | Epoch number of the latest master secret proposal loaded into the enclave. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_master_secret_proposal_generation_number | Gauge | Generation number of the latest master secret proposal loaded into the enclave. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_enclave_rpc_count | Counter | Number of remote Enclave RPC requests via P2P. | method | [worker/keymanager/p2p](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/p2p/metrics.go)
oasis_worker_keymanager_policy_update_count | Counter | Number of key manager policy updates. | | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_keymanager_policy_update_count | Counter | Number of key manager policy updates. | runtime | [worker/keymanager](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/keymanager/metrics.go)
oasis_worker_node_registered | Gauge | Is oasis node registered (binary). | | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
oasis_worker_node_registration_eligible | Gauge | Is oasis node eligible for registration (binary). | | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
oasis_worker_node_status_frozen | Gauge | Is oasis node frozen (binary). | | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
Expand Down
2 changes: 2 additions & 0 deletions go/worker/keymanager/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,11 @@ func New(
w.privatePeers[peerID] = struct{}{}
}

// Parse runtime ID.
if err := w.runtimeID.UnmarshalHex(config.GlobalConfig.Keymanager.RuntimeID); err != nil {
return nil, fmt.Errorf("worker/keymanager: failed to parse runtime ID: %w", err)
}
w.runtimeLabel = w.runtimeID.String()

var err error
w.roleProvider, err = r.NewRuntimeRoleProvider(node.RoleKeyManager, w.runtimeID)
Expand Down
114 changes: 112 additions & 2 deletions go/worker/keymanager/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,133 @@ import (
)

var (
computeRuntimeCount = prometheus.NewCounter(
computeRuntimeCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "oasis_worker_keymanager_compute_runtime_count",
Help: "Number of compute runtimes using the key manager.",
},
[]string{"runtime"},
)

policyUpdateCount = prometheus.NewCounter(
policyUpdateCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "oasis_worker_keymanager_policy_update_count",
Help: "Number of key manager policy updates.",
},
[]string{"runtime"},
)

// consensusEphemeralSecretEpochNumber is a gauge, labeled by runtime, holding
// the epoch number of the latest ephemeral secret. The worker sets it (as a
// float64) from the secret's epoch when it handles a new ephemeral secret.
consensusEphemeralSecretEpochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_consensus_ephemeral_secret_epoch_number",
Help: "Epoch number of the latest ephemeral secret.",
},
[]string{"runtime"},
)

// consensusMasterSecretGenerationNumber is a gauge, labeled by runtime, holding
// the generation number of the latest master secret. The worker sets it (as a
// float64) from the key manager status generation when it handles a status
// update.
consensusMasterSecretGenerationNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_consensus_master_secret_generation_number",
Help: "Generation number of the latest master secret.",
},
[]string{"runtime"},
)

// consensusMasterSecretRotationEpochNumber is a gauge, labeled by runtime,
// holding the epoch number of the latest master secret rotation. The worker
// sets it (as a float64) from the key manager status rotation epoch when it
// handles a status update.
consensusMasterSecretRotationEpochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_consensus_master_secret_rotation_epoch_number",
Help: "Epoch number of the latest master secret rotation.",
},
[]string{"runtime"},
)

// consensusMasterSecretProposalEpochNumber is a gauge, labeled by runtime,
// holding the epoch number of the latest master secret proposal. The worker
// sets it (as a float64) from the proposal's epoch when it handles a new
// master secret.
consensusMasterSecretProposalEpochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_consensus_master_secret_proposal_epoch_number",
Help: "Epoch number of the latest master secret proposal.",
},
[]string{"runtime"},
)

// consensusMasterSecretProposalGenerationNumber is a gauge, labeled by
// runtime, holding the generation number of the latest master secret proposal.
// The worker sets it (as a float64) from the proposal's generation when it
// handles a new master secret.
consensusMasterSecretProposalGenerationNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_consensus_master_secret_proposal_generation_number",
Help: "Generation number of the latest master secret proposal.",
},
[]string{"runtime"},
)

// enclaveEphemeralSecretEpochNumber is a gauge, labeled by runtime, holding
// the epoch number of the latest ephemeral secret loaded into the enclave. It
// is only updated after loading the ephemeral secret did not return an error,
// using the epoch of the worker's cached ephemeral secret.
enclaveEphemeralSecretEpochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_enclave_ephemeral_secret_epoch_number",
Help: "Epoch number of the latest ephemeral secret loaded into the enclave.",
},
[]string{"runtime"},
)

// enclaveMasterSecretGenerationNumber is a gauge, labeled by runtime, holding
// the generation number of the latest master secret as seen by the enclave.
// The worker sets it (as a float64) from the key manager status generation
// while initializing the enclave.
enclaveMasterSecretGenerationNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_enclave_master_secret_generation_number",
Help: "Generation number of the latest master secret as seen by the enclave.",
},
[]string{"runtime"},
)

// enclaveMasterSecretProposalEpochNumber is a gauge, labeled by runtime,
// holding the epoch number of the latest master secret proposal loaded into
// the enclave. It is only updated after loading the master secret did not
// return an error, using the epoch of the worker's cached master secret.
enclaveMasterSecretProposalEpochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_enclave_master_secret_proposal_epoch_number",
Help: "Epoch number of the latest master secret proposal loaded into the enclave.",
},
[]string{"runtime"},
)

// enclaveMasterSecretProposalGenerationNumber is a gauge, labeled by runtime,
// holding the generation number of the latest master secret proposal loaded
// into the enclave. It is only updated after loading the master secret did not
// return an error, using the generation of the worker's cached master secret.
enclaveMasterSecretProposalGenerationNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_enclave_master_secret_proposal_generation_number",
Help: "Generation number of the latest master secret proposal loaded into the enclave.",
},
[]string{"runtime"},
)

// enclaveGeneratedMasterSecretEpochNumber is a gauge, labeled by runtime,
// holding the epoch number of the latest master secret generated by the
// enclave. The worker sets it (as a float64) from the signed secret in the
// enclave's response once master secret generation has not failed.
enclaveGeneratedMasterSecretEpochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_enclave_generated_master_secret_epoch_number",
Help: "Epoch number of the latest master secret generated by the enclave.",
},
[]string{"runtime"},
)

// enclaveGeneratedMasterSecretGenerationNumber is a gauge, labeled by runtime,
// holding the generation number of the latest master secret generated by the
// enclave. The worker sets it (as a float64) from the signed secret in the
// enclave's response once master secret generation has not failed.
enclaveGeneratedMasterSecretGenerationNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_enclave_generated_master_secret_generation_number",
Help: "Generation number of the latest master secret generated by the enclave.",
},
[]string{"runtime"},
)

// enclaveGeneratedEphemeralSecretEpochNumber is a gauge, labeled by runtime,
// holding the epoch number of the latest ephemeral secret generated by the
// enclave. The worker sets it (as a float64) from the signed secret in the
// enclave's response once ephemeral secret generation has not failed.
enclaveGeneratedEphemeralSecretEpochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_keymanager_enclave_generated_ephemeral_secret_epoch_number",
Help: "Epoch number of the latest ephemeral secret generated by the enclave.",
},
[]string{"runtime"},
)

// keymanagerWorkerCollectors lists the key manager worker's metric collectors:
// the two counters followed by the consensus-side and enclave-side gauges.
// NOTE(review): presumably these are registered with Prometheus exactly once,
// guarded by metricsOnce; the registration code is not visible here, so confirm.
// A collector missing from this slice would likely never be exported.
keymanagerWorkerCollectors = []prometheus.Collector{
computeRuntimeCount,
policyUpdateCount,
consensusEphemeralSecretEpochNumber,
consensusMasterSecretGenerationNumber,
consensusMasterSecretRotationEpochNumber,
consensusMasterSecretProposalEpochNumber,
consensusMasterSecretProposalGenerationNumber,
enclaveEphemeralSecretEpochNumber,
enclaveMasterSecretGenerationNumber,
enclaveMasterSecretProposalEpochNumber,
enclaveMasterSecretProposalGenerationNumber,
enclaveGeneratedMasterSecretEpochNumber,
enclaveGeneratedMasterSecretGenerationNumber,
enclaveGeneratedEphemeralSecretEpochNumber,
}

metricsOnce sync.Once
Expand Down
51 changes: 40 additions & 11 deletions go/worker/keymanager/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,9 @@ type Worker struct { // nolint: maligned
quitCh chan struct{}
initCh chan struct{}

runtime runtimeRegistry.Runtime
runtimeID common.Namespace
runtime runtimeRegistry.Runtime
runtimeID common.Namespace
runtimeLabel string

clientRuntimes map[common.Namespace]*clientRuntimeWatcher

Expand Down Expand Up @@ -383,14 +384,16 @@ func (w *Worker) initEnclave(kmStatus *api.Status, rtStatus *runtimeStatus) (*ap
"next_rsk", signedInitResp.InitResponse.NextRSK,
)

// Cache the key manager enclave status and the currently active policy.
w.Lock()
defer w.Unlock()

// Update metrics.
enclaveMasterSecretGenerationNumber.WithLabelValues(w.runtimeLabel).Set(float64(kmStatus.Generation))
if w.enclaveStatus == nil || !bytes.Equal(w.enclaveStatus.InitResponse.PolicyChecksum, signedInitResp.InitResponse.PolicyChecksum) {
policyUpdateCount.Inc()
policyUpdateCount.WithLabelValues(w.runtimeLabel).Inc()
}

// Cache the key manager enclave status and the currently active policy.
w.enclaveStatus = &signedInitResp
w.policy = kmStatus.Policy

Expand Down Expand Up @@ -554,7 +557,8 @@ func (w *Worker) startClientRuntimeWatcher(rt *registry.Runtime, kmStatus *api.S

w.addClientRuntimeWatcher(rt.ID, crw)

computeRuntimeCount.Inc()
// Update metrics.
computeRuntimeCount.WithLabelValues(w.runtimeLabel).Inc()

return nil
}
Expand Down Expand Up @@ -700,6 +704,11 @@ func (w *Worker) generateMasterSecret(runtimeID common.Namespace, generation uin
return err
}

// Update metrics.
enclaveGeneratedMasterSecretGenerationNumber.WithLabelValues(w.runtimeLabel).Set(float64(rsp.SignedSecret.Secret.Generation))
enclaveGeneratedMasterSecretEpochNumber.WithLabelValues(w.runtimeLabel).Set(float64(rsp.SignedSecret.Secret.Epoch))
w.setLastGeneratedMasterSecretGeneration(rsp.SignedSecret.Secret.Generation)

return err
}

Expand Down Expand Up @@ -771,6 +780,10 @@ func (w *Worker) generateEphemeralSecret(runtimeID common.Namespace, epoch beaco
return err
}

// Update metrics.
enclaveGeneratedEphemeralSecretEpochNumber.WithLabelValues(w.runtimeLabel).Set(float64(rsp.SignedSecret.Secret.Epoch))
w.setLastGeneratedEphemeralSecretEpoch(rsp.SignedSecret.Secret.Epoch)

return err
}

Expand Down Expand Up @@ -854,6 +867,11 @@ func (w *Worker) loadMasterSecret(sigSecret *api.SignedEncryptedMasterSecret) er
return fmt.Errorf("failed to load master secret: %w", err)
}

// Update metrics.
enclaveMasterSecretProposalGenerationNumber.WithLabelValues(w.runtimeLabel).Set(float64(w.mstSecret.Secret.Generation))
enclaveMasterSecretProposalEpochNumber.WithLabelValues(w.runtimeLabel).Set(float64(w.mstSecret.Secret.Epoch))
w.setLastLoadedMasterSecretGeneration(w.mstSecret.Secret.Generation)

return nil
}

Expand All @@ -874,6 +892,10 @@ func (w *Worker) loadEphemeralSecret(sigSecret *api.SignedEncryptedEphemeralSecr
return fmt.Errorf("failed to load ephemeral secret: %w", err)
}

// Update metrics.
enclaveEphemeralSecretEpochNumber.WithLabelValues(w.runtimeLabel).Set(float64(w.ephSecret.Secret.Epoch))
w.setLastLoadedEphemeralSecretEpoch(w.ephSecret.Secret.Epoch)

return nil
}

Expand Down Expand Up @@ -946,6 +968,10 @@ func (w *Worker) handleStatusUpdate(kmStatus *api.Status) {
"checksum", hex.EncodeToString(kmStatus.Checksum),
)

// Update metrics.
consensusMasterSecretGenerationNumber.WithLabelValues(w.runtimeLabel).Set(float64(kmStatus.Generation))
consensusMasterSecretRotationEpochNumber.WithLabelValues(w.runtimeLabel).Set(float64(kmStatus.RotationEpoch))

// Cache the latest status.
w.setStatus(kmStatus)
w.kmStatus = kmStatus
Expand Down Expand Up @@ -1140,6 +1166,11 @@ func (w *Worker) handleNewMasterSecret(secret *api.SignedEncryptedMasterSecret)
"checksum", hex.EncodeToString(secret.Secret.Secret.Checksum),
)

// Update metrics.
consensusMasterSecretProposalGenerationNumber.WithLabelValues(w.runtimeLabel).Set(float64(secret.Secret.Generation))
consensusMasterSecretProposalEpochNumber.WithLabelValues(w.runtimeLabel).Set(float64(secret.Secret.Epoch))

// Rearm master secret loading.
w.mstSecret = secret
w.loadMstSecRetry = 0

Expand Down Expand Up @@ -1179,8 +1210,6 @@ func (w *Worker) handleGenerateMasterSecret(height int64, epoch beacon.EpochTime
w.genMstSecDoneCh <- false
return
}

w.setLastGeneratedMasterSecretGeneration(nextGen)
w.genMstSecDoneCh <- true
}

Expand Down Expand Up @@ -1218,7 +1247,6 @@ func (w *Worker) handleLoadMasterSecret() {

// Disarm master secret loading.
w.loadMstSecRetry = math.MaxInt64
w.setLastLoadedMasterSecretGeneration(w.mstSecret.Secret.Generation)

// Announce that the enclave has replicated the proposal for the next master
// secret and is ready for rotation.
Expand All @@ -1234,6 +1262,10 @@ func (w *Worker) handleNewEphemeralSecret(secret *api.SignedEncryptedEphemeralSe
"epoch", secret.Secret.Epoch,
)

// Update metrics.
consensusEphemeralSecretEpochNumber.WithLabelValues(w.runtimeLabel).Set(float64(secret.Secret.Epoch))

// Rearm ephemeral secret loading.
w.ephSecret = secret
w.loadEphSecRetry = 0

Expand Down Expand Up @@ -1276,8 +1308,6 @@ func (w *Worker) handleGenerateEphemeralSecret(height int64, epoch beacon.EpochT
w.genEphSecDoneCh <- false
return
}

w.setLastGeneratedEphemeralSecretEpoch(nextEpoch)
w.genEphSecDoneCh <- true
}

Expand Down Expand Up @@ -1314,7 +1344,6 @@ func (w *Worker) handleLoadEphemeralSecret() {

// Disarm ephemeral secret loading.
w.loadEphSecRetry = math.MaxInt64
w.setLastLoadedEphemeralSecretEpoch(w.ephSecret.Secret.Epoch)
}

func (w *Worker) handleStop() {
Expand Down

0 comments on commit ad27710

Please sign in to comment.