diff --git a/health/health.c b/health/health.c
index 85d2a245815d14..ccd26031769fb8 100644
--- a/health/health.c
+++ b/health/health.c
@@ -308,26 +308,44 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
     int n_warn=0, n_crit=0;
     RRDCALC *rc;
     EVAL_EXPRESSION *expr=NULL;
+    BUFFER *warn_alarms, *crit_alarms;
+
+    warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
+    crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
 
     for(rc = host->alarms; rc ; rc = rc->next) {
         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
             continue;
 
-        if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
-            n_warn++;
-            if (ae->alarm_id == rc->id)
-                expr=rc->warning;
+        if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
+            if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
+                if (n_warn)
+                    buffer_strcat(warn_alarms, ",");
+                buffer_strcat(warn_alarms, rc->name);
+                buffer_strcat(warn_alarms, "=");
+                buffer_snprintf(warn_alarms, 11, "%ld", rc->last_status_change);
+                n_warn++;
+            } else if (ae->alarm_id == rc->id)
+                expr = rc->warning;
         } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
-            n_crit++;
-            if (ae->alarm_id == rc->id)
-                expr=rc->critical;
+            if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
+                if (n_crit)
+                    buffer_strcat(crit_alarms, ",");
+                buffer_strcat(crit_alarms, rc->name);
+                buffer_strcat(crit_alarms, "=");
+                buffer_snprintf(crit_alarms, 11, "%ld", rc->last_status_change);
+                n_crit++;
+            } else if (ae->alarm_id == rc->id)
+                expr = rc->critical;
         } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
             if (ae->alarm_id == rc->id)
-                expr=rc->warning;
+                expr = rc->warning;
         }
     }
 
-    snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d'",
+    char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0");
+
+    snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
               exec,
               recipient,
               host->registry_hostname,
@@ -352,7 +370,12 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
               (expr && expr->source)?expr->source:"NOSOURCE",
               (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
               n_warn,
-              n_crit
+              n_crit,
+              buffer_tostring(warn_alarms),
+              buffer_tostring(crit_alarms),
+              ae->classification?ae->classification:"Unknown",
+              edit_command,
+              localhost->registry_hostname
     );
 
     ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
@@ -363,6 +386,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
     ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
     enqueue_alarm_notify_in_progress(ae);
 
+    freez(edit_command);
+    buffer_free(warn_alarms);
+    buffer_free(crit_alarms);
+
     return; //health_alarm_wait_for_execution
 done:
     health_alarm_log_save(host, ae);
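For orientation: the five new arguments appended to the exec line above (`'%s' '%s' '%s' '%s' '%s'`) are consumed as `${25}`..`${29}` by alarm-notify.sh.in below. The warning and critical lists are flat, comma-separated `name=last_status_change` pairs built by the `buffer_strcat`/`buffer_snprintf` calls, with the timestamp as a Unix epoch. A minimal sketch of that format and of splitting it, using made-up alarm names and epochs (not values from a real agent):

```sh
#!/usr/bin/env bash
# Hypothetical value of the 25th argument (total_warn_alarms), as built by health.c:
# each entry is "<alarm name>=<last_status_change epoch>", joined with commas.
total_warn_alarms="disk_space_usage=1612345678,ram_in_use=1612345900"

# Split the list the same way the script does further down in this patch.
while read -d, -r pair; do
    IFS='=' read -r key val <<<"$pair"
    echo "alarm '${key}' raised at epoch ${val}"
done <<<"${total_warn_alarms},"   # trailing comma so the last pair is read too
```

The trailing comma mirrors the `<<<"$total_warn_alarms,"` herestring used by the script itself; without it `read -d,` would stop before the last pair.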
+urlencode "${value_string}" >/dev/null +url_value_string="${REPLY}" -redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}" +redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}&alarm_status=${status}&alarm_chart=${chart}&alarm_value=${url_value_string}" GOTOCLOUD=0 if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then @@ -2284,9 +2310,9 @@ fi if [ ${GOTOCLOUD} -eq 0 ]; then goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}" else - # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud - #goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" - goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}" + # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud + # Re-allow alarm redirection, for alarms 2.0, new template + goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" fi # the severity of the alarm @@ -2311,48 +2337,79 @@ alarm="${name//_/ } = ${value_string}" # the image of the alarm image="${images_base_url}/images/banner-icon-144x144.png" +# have a default email status, in case the following case does not catch it +status_email_subject="${status}" + # prepare the title based on status case "${status}" in CRITICAL) image="${images_base_url}/images/alert-128-red.png" + alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_critical.png" status_message="is critical" + status_email_subject="Critical" color="#ca414b" + rich_status_raised_for="Raised to critical, for ${non_clear_duration_txt}" + background_color="#FFEBEF" + border_color="#FF4136" + text_color="#FF4136" + action_text_color="#FFFFFF" ;; WARNING) image="${images_base_url}/images/alert-128-orange.png" + alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_warning.png" status_message="needs attention" + status_email_subject="Warning" color="#ffc107" + rich_status_raised_for="Raised to warning, for ${non_clear_duration_txt}" + background_color="#FFF8E1" + border_color="#FFC300" + text_color="#536775" + action_text_color="#35414A" ;; CLEAR) image="${images_base_url}/images/check-mark-2-128-green.png" + alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_recovered.png" status_message="recovered" + status_email_subject="Clear" color="#77ca6d" + rich_status_raised_for= + background_color="#E5F5E8" + border_color="#68C47D" + text_color="#00AB44" + action_text_color="#FFFFFF" ;; esac +# the html email subject +html_email_subject="${status_email_subject}, ${name} = ${value_string}, on ${host}" + if [ "${status}" = "CLEAR" ]; then severity="Recovered from ${old_status}" if [ ${non_clear_duration} -gt ${duration} ]; then raised_for="(alarm was raised for ${non_clear_duration_txt})" fi + rich_status_raised_for="Recovered from ${old_status,,}, ${raised_for}" # don't show the value when the status is CLEAR # for certain alarms, this value might not have any meaning alarm="${name//_/ } ${raised_for}" + 
html_email_subject="${status_email_subject}, ${name} ${raised_for}, on ${host}" elif { [ "${old_status}" = "WARNING" ] && [ "${status}" = "CRITICAL" ]; }; then severity="Escalated to ${status}" if [ ${non_clear_duration} -gt ${duration} ]; then raised_for="(alarm is raised for ${non_clear_duration_txt})" fi + rich_status_raised_for="Escalated to critical, ${raised_for}" elif { [ "${old_status}" = "CRITICAL" ] && [ "${status}" = "WARNING" ]; }; then severity="Demoted to ${status}" if [ ${non_clear_duration} -gt ${duration} ]; then raised_for="(alarm is raised for ${non_clear_duration_txt})" fi + rich_status_raised_for="Demoted to warning, ${raised_for}" else raised_for= @@ -2638,117 +2695,732 @@ EOF else +now=$(date "+%s") + +if [ -n "$total_warn_alarms" ]; then + while read -d, -r pair; do + IFS='=' read -r key val <<<"$pair" + + date_w=$(date --date=@${val} "${date_format}" 2>/dev/null) + [ -z "${date_w}" ] && date_w=$(date "${date_format}" 2>/dev/null) + [ -z "${date_w}" ] && date_w=$(date --date=@${val} 2>/dev/null) + [ -z "${date_w}" ] && date_w=$(date 2>/dev/null) + + elapsed=$((now - val)) + + duration4human ${elapsed} >/dev/null + elapsed_txt="${REPLY}" + + WARN_ALARMS+=" +
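A quick illustration of the subject lines the case block and the CLEAR branch above produce; the alarm name, value, host, and duration below are made up:

```sh
#!/usr/bin/env bash
# Hypothetical inputs (in the script these come from the alarm arguments and the case block above).
status_email_subject="Warning"
name="ram_in_use"
value_string="85.3%"
host="myhost"

html_email_subject="${status_email_subject}, ${name} = ${value_string}, on ${host}"
echo "${html_email_subject}"
# -> Warning, ram_in_use = 85.3%, on myhost

# When the alarm clears, the raised-for text replaces the value in the subject:
raised_for="(alarm was raised for 12 minutes)"
echo "Clear, ${name} ${raised_for}, on ${host}"
# -> Clear, ram_in_use (alarm was raised for 12 minutes), on myhost
```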
@@ -2638,117 +2695,732 @@ EOF
 else
 
+now=$(date "+%s")
+
+if [ -n "$total_warn_alarms" ]; then
+	while read -d, -r pair; do
+		IFS='=' read -r key val <<<"$pair"
+
+		date_w=$(date --date=@${val} "${date_format}" 2>/dev/null)
+		[ -z "${date_w}" ] && date_w=$(date "${date_format}" 2>/dev/null)
+		[ -z "${date_w}" ] && date_w=$(date --date=@${val} 2>/dev/null)
+		[ -z "${date_w}" ] && date_w=$(date 2>/dev/null)
+
+		elapsed=$((now - val))
+
+		duration4human ${elapsed} >/dev/null
+		elapsed_txt="${REPLY}"
+
+		WARN_ALARMS+="
[... about 20 added lines of HTML table markup were garbled during extraction; the fragment renders one row per warning alarm showing its name ${key}, the time it was raised ${date_w}, and a "Warning for ${elapsed_txt}" badge ...]
+		"
+
+	done <<<"$total_warn_alarms,"
+fi
+
+if [ -n "$total_crit_alarms" ]; then
+	while read -d, -r pair; do
+		IFS='=' read -r key val <<<"$pair"
+
+		date_c=$(date --date=@${val} "${date_format}" 2>/dev/null)
+		[ -z "${date_c}" ] && date_c=$(date "${date_format}" 2>/dev/null)
+		[ -z "${date_c}" ] && date_c=$(date --date=@${val} 2>/dev/null)
+		[ -z "${date_c}" ] && date_c=$(date 2>/dev/null)
+
+		elapsed=$((now - val))
+
+		duration4human ${elapsed} >/dev/null
+		elapsed_txt="${REPLY}"
+
+		CRIT_ALARMS+="
[... about 20 added lines of HTML table markup were garbled during extraction; the fragment renders one row per critical alarm showing its name ${key}, the time it was raised ${date_c}, and a "Critical for ${elapsed_txt}" badge ...]
+		"
+
+	done <<<"$total_crit_alarms,"
+fi
+
+if [ -n "$edit_command_line" ]; then
+	IFS='=' read -r edit_command line <<<"$edit_command_line"
+fi
+
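The `edit_command_line` split above relies on the `command=line` shape that health.c guarantees (falling back to `UNKNOWN=0` when the alarm has no source). A small sketch with a hypothetical edit command and line number; only the `=` separator and the fallback are taken from this patch:

```sh
#!/usr/bin/env bash
# Hypothetical 28th argument (edit_command_line). The part before the first '=' is the
# command to run, the part after it is the line number inside the configuration file.
edit_command_line="sudo /etc/netdata/edit-config health.d/ram.conf=12"

IFS='=' read -r edit_command line <<<"$edit_command_line"
echo "command: ${edit_command}"   # -> command: sudo /etc/netdata/edit-config health.d/ram.conf
echo "line:    ${line}"           # -> line:    12
```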
 IFS='' read -r -d '' email_html_part <<EOF
[... the remainder of this hunk, several hundred lines of HTML e-mail markup, was garbled during extraction; only its text content is recoverable.]
[Removed (the old table-based template): the "netdata notification" header, "${host} ${status_message}", a table with the rows ${chart} (Chart), ${alarm}${info_html} (Alarm), ${family} (Family), ${severity} (Severity), ${date} ${raised_for_html} (Time), ${calc_expression} (Evaluated Expression), ${calc_param_values} (Expression Variables), "The host has ${total_warnings} WARNING and ${total_critical} CRITICAL alarm(s) raised.", a "View Netdata" button, "The source of this alarm is line ${src}" / "(alarms are configurable, edit this file to adapt the alarm to your needs)", and the footer "Sent by netdata, the real-time performance and health monitoring, on ${host}.".]
[Added (the new template): the Netdata logo, a "Notification" header, ${name}, "on ${host}", ${value_string}, "Details: ${info}", a "GO TO CHART" button, a details table with Chart: ${chart}, Family: ${family}, ${rich_status_raised_for}, "On ${date}", "By: ${host}", "Global time: ${date_utc}", Classification: ${classification}, Role: ${roles}, a "Want to know more about this alert?" section ("Discuss and troubleshoot with others on the Netdata community forums"), a "Need to configure this alert?" section ("Edit this alert's configuration file by logging into $s_host and running the following command:", ${edit_command}, "The alarm to edit is at line {${line}}"), "The node has ${total_warnings} warning and ${total_critical} critical additional active alert(s)", the ${CRIT_ALARMS} and ${WARN_ALARMS} fragments built above, and the footer "© Netdata 2021 - The real-time performance and health monitoring".]
 EOF
 
 	send_email <
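Not part of the patch, but handy while reviewing the new template: the notification script has a test mode that fires a WARNING, CRITICAL, and CLEAR notification through the configured methods. The install path below assumes a default package install; adjust to your environment:

```sh
# Switch to the netdata user, then send test notifications
# (an optional trailing argument selects the recipient role, default "sysadmin").
sudo su -s /bin/bash netdata
/usr/libexec/netdata/plugins.d/alarm-notify.sh test
```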