From e570d2ee4225972db1c28a318a04211a65dd8e87 Mon Sep 17 00:00:00 2001 From: Pierre Besson Date: Wed, 2 Feb 2022 13:06:25 +0100 Subject: [PATCH] Substrate alerts rules update (#10642) * .maintain/monitoring: Update substrate prometheus alert rules * match the `substrate_` metrics prefix in alerts instead of `polkadot_`, following changes in #9543 * remove the filtering on polkadot|kusama domain for NumberOfFileDescriptorsHigh alert * .maintain/monitoring: Update substrate Grafana dashboards * match the `substrate_` metrics prefix instead of `polkadot_` in dashboards, following changes in #9543 * .maintain/monitoring: make the NumberOfFileDescriptorsHigh alert only apply for metrics tagged with 'chain' --- .../alerting-rules/alerting-rule-tests.yaml | 144 +++++++++--------- .../alerting-rules/alerting-rules.yaml | 50 +++--- .../substrate-networking.json | 2 +- .../substrate-service-tasks.json | 2 +- 4 files changed, 99 insertions(+), 99 deletions(-) diff --git a/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml b/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml index 7ad916f022154..df5e020d067ea 100644 --- a/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml +++ b/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml @@ -6,39 +6,39 @@ evaluation_interval: 1m tests: - interval: 1m input_series: - - series: 'polkadot_sub_libp2p_peers_count{ - job="polkadot", - pod="polkadot-abcdef01234-abcdef", - instance="polkadot-abcdef01234-abcdef", + - series: 'substrate_sub_libp2p_peers_count{ + job="substrate", + pod="substrate-abcdef01234-abcdef", + instance="substrate-abcdef01234-abcdef", }' values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 - - series: 'polkadot_sub_txpool_validations_scheduled{ - job="polkadot", - pod="polkadot-abcdef01234-abcdef", - instance="polkadot-abcdef01234-abcdef", + - series: 'substrate_sub_txpool_validations_scheduled{ + job="substrate", + pod="substrate-abcdef01234-abcdef", + instance="substrate-abcdef01234-abcdef", }' values: '11+1x10 22+2x30 10043x5' - - series: 'polkadot_sub_txpool_validations_finished{ - job="polkadot", - pod="polkadot-abcdef01234-abcdef", - instance="polkadot-abcdef01234-abcdef", + - series: 'substrate_sub_txpool_validations_finished{ + job="substrate", + pod="substrate-abcdef01234-abcdef", + instance="substrate-abcdef01234-abcdef", }' values: '0+1x42 42x5' - - series: 'polkadot_block_height{ - status="best", job="polkadot", - pod="polkadot-abcdef01234-abcdef", - instance="polkadot-abcdef01234-abcdef", + - series: 'substrate_block_height{ + status="best", job="substrate", + pod="substrate-abcdef01234-abcdef", + instance="substrate-abcdef01234-abcdef", }' values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ... - - series: 'polkadot_block_height{ + - series: 'substrate_block_height{ status="finalized", - job="polkadot", - pod="polkadot-abcdef01234-abcdef", - instance="polkadot-abcdef01234-abcdef", + job="substrate", + pod="substrate-abcdef01234-abcdef", + instance="substrate-abcdef01234-abcdef", }' values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ... @@ -56,13 +56,13 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate status: best exp_annotations: message: "Best block on instance - polkadot-abcdef01234-abcdef increases by less than 1 per + substrate-abcdef01234-abcdef increases by less than 1 per minute for more than 3 minutes." - eval_time: 14m @@ -70,23 +70,23 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate status: best exp_annotations: message: "Best block on instance - polkadot-abcdef01234-abcdef increases by less than 1 per + substrate-abcdef01234-abcdef increases by less than 1 per minute for more than 3 minutes." - exp_labels: severity: critical - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate status: best exp_annotations: message: "Best block on instance - polkadot-abcdef01234-abcdef increases by less than 1 per + substrate-abcdef01234-abcdef increases by less than 1 per minute for more than 10 minutes." ###################################################################### @@ -101,13 +101,13 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate status: finalized exp_annotations: message: "Finalized block on instance - polkadot-abcdef01234-abcdef increases by less than 1 per + substrate-abcdef01234-abcdef increases by less than 1 per minute for more than 3 minutes." - eval_time: 14m @@ -115,23 +115,23 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate status: finalized exp_annotations: message: "Finalized block on instance - polkadot-abcdef01234-abcdef increases by less than 1 per + substrate-abcdef01234-abcdef increases by less than 1 per minute for more than 3 minutes." - exp_labels: severity: critical - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate status: finalized exp_annotations: message: "Finalized block on instance - polkadot-abcdef01234-abcdef increases by less than 1 per + substrate-abcdef01234-abcdef increases by less than 1 per minute for more than 10 minutes." ###################################################################### @@ -152,12 +152,12 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate exp_annotations: message: "The transaction pool size on node - polkadot-abcdef01234-abcdef has been monotonically + substrate-abcdef01234-abcdef has been monotonically increasing for more than 10 minutes." - eval_time: 43m alertname: TransactionQueueSizeIncreasing @@ -167,21 +167,21 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate exp_annotations: message: "The transaction pool size on node - polkadot-abcdef01234-abcdef has been monotonically + substrate-abcdef01234-abcdef has been monotonically increasing for more than 10 minutes." - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate exp_annotations: message: "The transaction pool size on node - polkadot-abcdef01234-abcdef has been monotonically + substrate-abcdef01234-abcdef has been monotonically increasing for more than 30 minutes." - eval_time: 49m alertname: TransactionQueueSizeHigh @@ -191,12 +191,12 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate exp_annotations: message: "The transaction pool size on node - polkadot-abcdef01234-abcdef has been above 10_000 for more + substrate-abcdef01234-abcdef has been above 10_000 for more than 5 minutes." ###################################################################### @@ -211,11 +211,11 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate exp_annotations: - message: "The node polkadot-abcdef01234-abcdef has less + message: "The node substrate-abcdef01234-abcdef has less than 3 peers for more than 3 minutes" - eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 @@ -223,17 +223,17 @@ tests: exp_alerts: - exp_labels: severity: warning - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate exp_annotations: - message: "The node polkadot-abcdef01234-abcdef has less + message: "The node substrate-abcdef01234-abcdef has less than 3 peers for more than 3 minutes" - exp_labels: severity: critical - pod: polkadot-abcdef01234-abcdef - instance: polkadot-abcdef01234-abcdef - job: polkadot + pod: substrate-abcdef01234-abcdef + instance: substrate-abcdef01234-abcdef + job: substrate exp_annotations: - message: "The node polkadot-abcdef01234-abcdef has less + message: "The node substrate-abcdef01234-abcdef has less than 3 peers for more than 15 minutes" diff --git a/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/.maintain/monitoring/alerting-rules/alerting-rules.yaml index 2711610024330..4171f92f68fef 100644 --- a/.maintain/monitoring/alerting-rules/alerting-rules.yaml +++ b/.maintain/monitoring/alerting-rules/alerting-rules.yaml @@ -1,5 +1,5 @@ groups: -- name: polkadot.rules +- name: substrate.rules rules: ############################################################################## @@ -10,7 +10,7 @@ groups: annotations: message: 'Best block on instance {{ $labels.instance }} increases by less than 1 per minute for more than 3 minutes.' - expr: increase(polkadot_block_height{status="best"}[1m]) < 1 + expr: increase(substrate_block_height{status="best"}[1m]) < 1 for: 3m labels: severity: warning @@ -18,7 +18,7 @@ groups: annotations: message: 'Best block on instance {{ $labels.instance }} increases by less than 1 per minute for more than 10 minutes.' - expr: increase(polkadot_block_height{status="best"}[1m]) < 1 + expr: increase(substrate_block_height{status="best"}[1m]) < 1 for: 10m labels: severity: critical @@ -28,7 +28,7 @@ groups: ############################################################################## - alert: BlockFinalizationSlow - expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1 + expr: increase(substrate_block_height{status="finalized"}[1m]) < 1 for: 3m labels: severity: warning @@ -36,7 +36,7 @@ groups: message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute for more than 3 minutes.' - alert: BlockFinalizationSlow - expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1 + expr: increase(substrate_block_height{status="finalized"}[1m]) < 1 for: 10m labels: severity: critical @@ -47,8 +47,8 @@ groups: # Under the assumption of an average block production of 6 seconds, # "best" and "finalized" being more than 10 blocks apart would imply # more than a 1 minute delay between block production and finalization. - expr: '(polkadot_block_height{status="best"} - ignoring(status) - polkadot_block_height{status="finalized"}) > 10' + expr: '(substrate_block_height{status="best"} - ignoring(status) + substrate_block_height{status="finalized"}) > 10' for: 8m labels: severity: critical @@ -61,8 +61,8 @@ groups: ############################################################################## - alert: TransactionQueueSizeIncreasing - expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) - - increase(polkadot_sub_txpool_validations_finished[5m]) > 0' + expr: 'increase(substrate_sub_txpool_validations_scheduled[5m]) - + increase(substrate_sub_txpool_validations_finished[5m]) > 0' for: 10m labels: severity: warning @@ -70,8 +70,8 @@ groups: message: 'The transaction pool size on node {{ $labels.instance }} has been monotonically increasing for more than 10 minutes.' - alert: TransactionQueueSizeIncreasing - expr: 'increase(polkadot_sub_txpool_validations_scheduled[5m]) - - increase(polkadot_sub_txpool_validations_finished[5m]) > 0' + expr: 'increase(substrate_sub_txpool_validations_scheduled[5m]) - + increase(substrate_sub_txpool_validations_finished[5m]) > 0' for: 30m labels: severity: warning @@ -79,8 +79,8 @@ groups: message: 'The transaction pool size on node {{ $labels.instance }} has been monotonically increasing for more than 30 minutes.' - alert: TransactionQueueSizeHigh - expr: 'polkadot_sub_txpool_validations_scheduled - - polkadot_sub_txpool_validations_finished > 10000' + expr: 'substrate_sub_txpool_validations_scheduled - + substrate_sub_txpool_validations_finished > 10000' for: 5m labels: severity: warning @@ -93,7 +93,7 @@ groups: ############################################################################## - alert: NumberOfPeersLow - expr: polkadot_sub_libp2p_peers_count < 3 + expr: substrate_sub_libp2p_peers_count < 3 for: 3m labels: severity: warning @@ -101,7 +101,7 @@ groups: message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes' - alert: NumberOfPeersLow - expr: polkadot_sub_libp2p_peers_count < 3 + expr: substrate_sub_libp2p_peers_count < 3 for: 15m labels: severity: critical @@ -109,7 +109,7 @@ groups: message: 'The node {{ $labels.instance }} has less than 3 peers for more than 15 minutes' - alert: NoIncomingConnection - expr: increase(polkadot_sub_libp2p_incoming_connections_total[20m]) == 0 + expr: increase(substrate_sub_libp2p_incoming_connections_total[20m]) == 0 labels: severity: warning annotations: @@ -121,7 +121,7 @@ groups: ############################################################################## - alert: NumberOfFileDescriptorsHigh - expr: 'node_filefd_allocated{domain=~"kusama|polkadot"} > 10000' + expr: 'node_filefd_allocated{chain!=""} > 10000' for: 3m labels: severity: warning @@ -134,9 +134,9 @@ groups: ############################################################################## - alert: AuthorityDiscoveryDiscoveryFailureHigh - expr: 'polkadot_authority_discovery_handle_value_found_event_failure / + expr: 'substrate_authority_discovery_handle_value_found_event_failure / ignoring(name) - polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5' + substrate_authority_discovery_dht_event_received{name="value_found"} > 0.5' for: 2h labels: severity: warning @@ -147,9 +147,9 @@ groups: - alert: UnboundedChannelPersistentlyLarge expr: '( - (polkadot_unbounded_channel_len{action = "send"} - - ignoring(action) polkadot_unbounded_channel_len{action = "received"}) - or on(instance) polkadot_unbounded_channel_len{action = "send"} + (substrate_unbounded_channel_len{action = "send"} - + ignoring(action) substrate_unbounded_channel_len{action = "received"}) + or on(instance) substrate_unbounded_channel_len{action = "send"} ) >= 200' for: 5m labels: @@ -160,9 +160,9 @@ groups: - alert: UnboundedChannelVeryLarge expr: '( - (polkadot_unbounded_channel_len{action = "send"} - - ignoring(action) polkadot_unbounded_channel_len{action = "received"}) - or on(instance) polkadot_unbounded_channel_len{action = "send"} + (substrate_unbounded_channel_len{action = "send"} - + ignoring(action) substrate_unbounded_channel_len{action = "received"}) + or on(instance) substrate_unbounded_channel_len{action = "send"} ) > 15000' labels: severity: warning diff --git a/.maintain/monitoring/grafana-dashboards/substrate-networking.json b/.maintain/monitoring/grafana-dashboards/substrate-networking.json index 46942cf582fc6..abd675ed13ec3 100644 --- a/.maintain/monitoring/grafana-dashboards/substrate-networking.json +++ b/.maintain/monitoring/grafana-dashboards/substrate-networking.json @@ -4,7 +4,7 @@ "name": "VAR_METRIC_NAMESPACE", "type": "constant", "label": "Prefix of the metrics", - "value": "polkadot", + "value": "substrate", "description": "" } ], diff --git a/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json b/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json index 2f08ac7bb34c5..ce7e9f78cd8ae 100644 --- a/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json +++ b/.maintain/monitoring/grafana-dashboards/substrate-service-tasks.json @@ -4,7 +4,7 @@ "name": "VAR_METRIC_NAMESPACE", "type": "constant", "label": "Prefix of the metrics", - "value": "polkadot", + "value": "substrate", "description": "" } ],