monitor and alert of component capacity

orange-cloudfoundry · Jul 15, 2021 · e01b371 · e01b371
1 parent 2c12ba1
commit e01b371
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 1 deletion.
diff --git a/jobs/logservice_alerts/spec b/jobs/logservice_alerts/spec
@@ -31,7 +31,6 @@ properties:
   logservice_alerts.nologs.evaluation_time:
     description: "NoLogs alert evaluation time"
     default: 5m
-
   logservice_alerts.nologs.span_time:
     description: "NoLogs triggers when 0 logs were received in the given interval"
     default: 5m
@@ -45,3 +44,16 @@ properties:
   logservice_alerts.cache.evaluation_time:
     description: "Logs send without cache too high evaluation time"
     default: 10m
+
+  logservice_alerts.capacity.instance_max_rate:
+    description: "Maximum number of logs per second that an instance can handle"
+    default: 3333
+  logservice_alerts.capacity.rolling_instance:
+    description: "Number of unavailable instances during update"
+    default: 1
+  logservice_alerts.capacity.redundancy_factor:
+    description: "Percentage of additional logs per second that must be reserved in case of loss of a redundant cluster"
+    default: 50
+  logservice_alerts.capacity.threshold:
+    description: "Minimum remaining capacity, in logs per second, below which the alert is triggered"
+    default: 0
diff --git a/jobs/logservice_alerts/templates/logservice.alerts.yml b/jobs/logservice_alerts/templates/logservice.alerts.yml
@@ -5,6 +5,48 @@ groups:
         expr: increase(logs_sent_duration_sum[10m])
       - record: job:logs_sent_duration_count:increase10m
         expr: increase(logs_sent_duration_count[10m])
+
+      - record: logs_capacity:rate_instance
+        expr: sum(rate(logs_sent_total[5m])) by (instance)
+      - record: logs_capacity:rate
+        expr: sum(logs_capacity:rate_instance)
+      - record: logs_capacity:rate_max24h
+        expr: max_over_time(logs_capacity:rate[24h])
+      - record: logs_capacity:instance_count
+        expr: count(logs_capacity:rate_instance)
+      - record: logs_capacity:rate_max
+        expr: |
+          (logs_capacity:instance_count - <%= p('logservice_alerts.capacity.rolling_instance') %>)
+          *
+          <%= p('logservice_alerts.capacity.instance_max_rate') %>
+      - record: logs_capacity:redundancy_factor
+        expr: <%= p('logservice_alerts.capacity.redundancy_factor') %> / 100
+      - record: logs_capacity:threshold
+        expr: <%= p('logservice_alerts.capacity.threshold') %>
+      - record: logs_capacity:rate_remaining
+        expr: |
+          logs_capacity:rate_max
+          -
+          (logs_capacity:rate_max24h * (1 + logs_capacity:redundancy_factor))
+      - alert: LogserviceCapacityTooLow
+        expr: logs_capacity:rate_remaining < logs_capacity:threshold
+        for: 5m
+        labels:
+          service: logservice
+          severity: warning
+        annotations:
+          summary: "Logservice doesn't have enough instances to process logs safely"
+          description: |
+            Logservice needs to be scaled up. It does not have enough instances to process all logs while
+            simultaneously undergoing a cluster failure and a rolling update.
+
+            Impact:
+            - risk of losing application logs in case of cluster failure or maintenance
+
+            Resolution:
+            - contact administrator to scale up the component
+
+
       - alert: LogserviceDurationTooHigh
         expr: |
           avg(
@@ -71,3 +113,7 @@ groups:
 
             Please contact Cloud Foundry admin team
 <% end %>
+
+# Local Variables:
+# ispell-local-dictionary: "american"
+# End: