Skip to content

Commit

Permalink
monitor and alert of component capacity
Browse files Browse the repository at this point in the history
  • Loading branch information
psycofdj committed Jul 15, 2021
1 parent 2c12ba1 commit e01b371
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 1 deletion.
14 changes: 13 additions & 1 deletion jobs/logservice_alerts/spec
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ properties:
# Tuning knobs for the NoLogs alert; both values are duration strings.
logservice_alerts.nologs.evaluation_time:
  description: "NoLogs alert evaluation time"
  default: 5m

logservice_alerts.nologs.span_time:
  description: "NoLogs triggers when 0 logs were received in the given interval"
  default: 5m
Expand All @@ -45,3 +44,16 @@ properties:
# Evaluation time (duration string) for the cache-related alert.
logservice_alerts.cache.evaluation_time:
  description: "Logs send without cache too high evaluation time"
  default: 10m

# Capacity-planning inputs consumed by the logs_capacity:* recording rules and
# the LogserviceCapacityTooLow alert in templates/logservice.alerts.yml.

# Per-instance throughput ceiling; multiplied by the number of usable
# instances to obtain logs_capacity:rate_max.
logservice_alerts.capacity.instance_max_rate:
  description: "Maximum number of log-per-second that an instance can handle"
  default: 3333

# Subtracted from the observed instance count before computing
# logs_capacity:rate_max.
logservice_alerts.capacity.rolling_instance:
  description: "Number of unavailable instances during update"
  default: 1

# Expressed in percent: the rules divide it by 100 and inflate the 24h peak
# rate by (1 + factor).
logservice_alerts.capacity.redundancy_factor:
  description: "Percent of additional log-per-second that must be reserved in case of loss of a redundant cluster"
  default: 50

# The alert fires when logs_capacity:rate_remaining drops below this value.
logservice_alerts.capacity.threshold:
  description: "Minimum remaining of log-per-second before triggering the alert"
  default: 0
46 changes: 46 additions & 0 deletions jobs/logservice_alerts/templates/logservice.alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,48 @@ groups:
expr: increase(logs_sent_duration_sum[10m])
- record: job:logs_sent_duration_count:increase10m
expr: increase(logs_sent_duration_count[10m])

# --- Capacity planning ------------------------------------------------------
# Per-instance rate of sent logs, averaged over the last 5 minutes.
- record: logs_capacity:rate_instance
  expr: sum(rate(logs_sent_total[5m])) by (instance)

# Rate of sent logs summed over every instance.
- record: logs_capacity:rate
  expr: sum(logs_capacity:rate_instance)

# Highest overall rate observed during the last 24 hours.
- record: logs_capacity:rate_max24h
  expr: max_over_time(logs_capacity:rate[24h])

# Number of instances currently exposing a per-instance rate.
- record: logs_capacity:instance_count
  expr: count(logs_capacity:rate_instance)

# Rate the deployment can sustain once the instances made unavailable by a
# rolling update are taken out: (instances - rolling) * per-instance maximum.
- record: logs_capacity:rate_max
  expr: |
    (logs_capacity:instance_count - <%= p('logservice_alerts.capacity.rolling_instance') %>)
    *
    <%= p('logservice_alerts.capacity.instance_max_rate') %>

# Redundancy margin, converted from the percent property to a ratio.
- record: logs_capacity:redundancy_factor
  expr: <%= p('logservice_alerts.capacity.redundancy_factor') %> / 100

# Alerting threshold exposed as a series so the alert can compare against it.
- record: logs_capacity:threshold
  expr: <%= p('logservice_alerts.capacity.threshold') %>

# Headroom left after reserving the 24h peak rate plus the redundancy margin.
- record: logs_capacity:rate_remaining
  expr: |
    logs_capacity:rate_max
    -
    (logs_capacity:rate_max24h * (1 + logs_capacity:redundancy_factor))
# Warns when the remaining capacity (logs_capacity:rate_remaining) has stayed
# below the configured threshold (logs_capacity:threshold) for 5 minutes.
- alert: LogserviceCapacityTooLow
  expr: logs_capacity:rate_remaining < logs_capacity:threshold
  for: 5m
  labels:
    service: logservice
    severity: warning
  annotations:
    summary: "Logservice doesn't have enough instances to process logs safely"
    description: |
      Logservice needs scale-up. It does not have enough instances to process all logs while
      simultaneously undergoing a cluster failure and a rolling update.
      Impact:
      - chances of losing application logs in case of cluster failure or maintenance
      Resolution:
      - contact administrator to scale up the component
- alert: LogserviceDurationTooHigh
expr: |
avg(
Expand Down Expand Up @@ -71,3 +113,7 @@ groups:
Please contact Cloud Foundry admin team
<% end %>

# Local Variables:
# ispell-local-dictionary: "american"
# End:

0 comments on commit e01b371

Please sign in to comment.