Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions module/metrics/alsp.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ func NewAlspMetrics() *AlspMetrics {
return alsp
}

// OnClusterTopicMetricsCleanup drops the reported-misbehavior counter series whose channel label
// equals the given cluster topic. This keeps metric cardinality from growing without bound
// across epoch transitions.
func (a *AlspMetrics) OnClusterTopicMetricsCleanup(topic string) {
	// Select on the channel label only; no other label is constrained by this selector.
	byChannel := prometheus.Labels{LabelChannel: topic}
	a.reportedMisbehaviorCount.DeletePartialMatch(byChannel)
}

// OnMisbehaviorReported is called when a misbehavior is reported by the application layer to ALSP.
// An engine detecting a spamming-related misbehavior reports it to the ALSP module. It increases
// the counter vector of reported misbehavior.
Expand Down
10 changes: 10 additions & 0 deletions module/metrics/gossipsub_score.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,13 @@ func (g *GossipSubScoreMetrics) OnInvalidMessageDeliveredUpdated(topic channels.
func (g *GossipSubScoreMetrics) SetWarningStateCount(u uint) {
	// The gauge is set with a float64 value, so convert the unsigned count first.
	count := float64(u)
	g.warningStateGauge.Set(count)
}

// OnClusterTopicMetricsCleanup deletes the per-topic scoring series (time in mesh, mesh message
// delivery, first message delivery and invalid message delivery) recorded under the given cluster
// topic. It is intended to be called when the local node leaves a cluster topic, so that metric
// cardinality does not grow without bound across epoch transitions.
func (g *GossipSubScoreMetrics) OnClusterTopicMetricsCleanup(topic string) {
	// Every vector below is addressed with the topic as its only label value.
	deleters := []func(...string) bool{
		g.timeInMesh.DeleteLabelValues,
		g.meshMessageDelivery.DeleteLabelValues,
		g.firstMessageDelivery.DeleteLabelValues,
		g.invalidMessageDelivery.DeleteLabelValues,
	}
	for _, deleteSeries := range deleters {
		// The boolean result is discarded: cleanup is best-effort.
		deleteSeries(topic)
	}
}
23 changes: 18 additions & 5 deletions module/metrics/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,19 +277,32 @@ func (nc *NetworkCollector) DuplicateInboundMessagesDropped(topic, protocol, mes
// OnClusterTopicMetricsCleanup removes every metric series tied to the given cluster topic, both in
// the component collectors it delegates to and in the vectors owned by NetworkCollector itself.
// This prevents unbounded metric cardinality growth during epoch transitions, when collection nodes
// join new clusters and leave old ones. Only call this for cluster topics
// (sync-cluster-*, consensus-cluster-*).
func (nc *NetworkCollector) OnClusterTopicMetricsCleanup(topic string) {
	// Selector shared by all multi-label vectors below: match on the channel label alone.
	byChannel := prometheus.Labels{LabelChannel: topic}

	// Component collectors, each cleaning up its own per-topic series:
	//   LocalGossipSubRouterMetrics: localMeshSize, peerGraftTopicCount, peerPruneTopicCount
	nc.LocalGossipSubRouterMetrics.OnClusterTopicMetricsCleanup(topic)
	//   GossipSubRpcValidationInspectorMetrics: receivedIHaveMsgIDsHistogram
	nc.GossipSubRpcValidationInspectorMetrics.OnClusterTopicMetricsCleanup(topic)
	//   GossipSubScoreMetrics: timeInMesh, meshMessageDelivery, firstMessageDelivery, invalidMessageDelivery
	nc.GossipSubScoreMetrics.OnClusterTopicMetricsCleanup(topic)

	// Message size histograms and the duplicate-drop counter carry channel + protocol + message
	// labels, so they are removed by partial match on the channel.
	nc.inboundMessageSize.DeletePartialMatch(byChannel)
	nc.outboundMessageSize.DeletePartialMatch(byChannel)
	nc.duplicateMessagesDropped.DeletePartialMatch(byChannel)

	// Processing gauges and the inbound process time counter are labelled by channel only,
	// so the topic is the complete label value list.
	nc.numMessagesProcessing.DeleteLabelValues(topic)
	nc.numDirectMessagesSending.DeleteLabelValues(topic)
	nc.inboundProcessTime.DeleteLabelValues(topic)

	// Security counters carry role + message + channel + reason labels.
	nc.unAuthorizedMessagesCount.DeletePartialMatch(byChannel)
	nc.rateLimitedUnicastMessagesCount.DeletePartialMatch(byChannel)

	// ALSP misbehavior counter (channel + misbehavior labels) is handled by its own collector.
	nc.AlspMetrics.OnClusterTopicMetricsCleanup(topic)
}

func (nc *NetworkCollector) MessageAdded(priority int) {
Expand Down