diff --git a/health/health.c b/health/health.c
index 85d2a245815d14..ccd26031769fb8 100644
--- a/health/health.c
+++ b/health/health.c
@@ -308,26 +308,44 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
int n_warn=0, n_crit=0;
RRDCALC *rc;
EVAL_EXPRESSION *expr=NULL;
+ BUFFER *warn_alarms, *crit_alarms;
+
+ warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
+ crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
for(rc = host->alarms; rc ; rc = rc->next) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
- if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- n_warn++;
- if (ae->alarm_id == rc->id)
- expr=rc->warning;
+ if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
+ if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
+ if (n_warn)
+ buffer_strcat(warn_alarms, ",");
+ buffer_strcat(warn_alarms, rc->name);
+ buffer_strcat(warn_alarms, "=");
+ buffer_snprintf(warn_alarms, 11, "%ld", rc->last_status_change);
+ n_warn++;
+ } else if (ae->alarm_id == rc->id)
+ expr = rc->warning;
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- n_crit++;
- if (ae->alarm_id == rc->id)
- expr=rc->critical;
+ if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
+ if (n_crit)
+ buffer_strcat(crit_alarms, ",");
+ buffer_strcat(crit_alarms, rc->name);
+ buffer_strcat(crit_alarms, "=");
+ buffer_snprintf(crit_alarms, 11, "%ld", rc->last_status_change);
+ n_crit++;
+ } else if (ae->alarm_id == rc->id)
+ expr = rc->critical;
} else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
if (ae->alarm_id == rc->id)
- expr=rc->warning;
+ expr = rc->warning;
}
}
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d'",
+ char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0");
+
+ snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
exec,
recipient,
host->registry_hostname,
@@ -352,7 +370,12 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
(expr && expr->source)?expr->source:"NOSOURCE",
(expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
n_warn,
- n_crit
+ n_crit,
+ buffer_tostring(warn_alarms),
+ buffer_tostring(crit_alarms),
+ ae->classification?ae->classification:"Unknown",
+ edit_command,
+ localhost->registry_hostname
);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
@@ -363,6 +386,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
enqueue_alarm_notify_in_progress(ae);
+ freez(edit_command);
+ buffer_free(warn_alarms);
+ buffer_free(crit_alarms);
+
return; //health_alarm_wait_for_execution
done:
health_alarm_log_save(host, ae);
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 9a3a80ad6c9aa4..98f84a71d35bb8 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -239,6 +239,11 @@ else
calc_param_values="${22}" # the values of the parameters in the expression, at the time of the evaluation
total_warnings="${23}" # Total number of alarms in WARNING state
total_critical="${24}" # Total number of alarms in CRITICAL state
+ total_warn_alarms="${25}" # List of alarms in warning state
+ total_crit_alarms="${26}" # List of alarms in critical state
+ classification="${27}" # The class field from .conf files
+ edit_command_line="${28}" # The command to edit the alarm, with the line number
+ sender_host="${29}" # The host sending this notification
fi
# -----------------------------------------------------------------------------
@@ -252,6 +257,17 @@ else
host="${args_host}"
fi
+# -----------------------------------------------------------------------------
+# Do the same for sender_host (find a suitable hostname to use, if netdata did not supply a hostname)
+
+if [ -z ${sender_host} ]; then
+ this_host=$(hostname -s 2>/dev/null)
+ s_host="${this_host}"
+ sender_host="${this_host}"
+else
+ s_host="${sender_host}"
+fi
+
# -----------------------------------------------------------------------------
# screen statuses we don't need to send a notification
@@ -810,6 +826,14 @@ date=$(date --date=@${when} "${date_format}" 2>/dev/null)
[ -z "${date}" ] && date=$(date --date=@${when} 2>/dev/null)
[ -z "${date}" ] && date=$(date 2>/dev/null)
+# -----------------------------------------------------------------------------
+# get the date, in UTC, at which the alarm happened
+
+date_utc=$(date --date=@${when} "${date_format}" -u 2>/dev/null)
+[ -z "${date_utc}" ] && date_utc=$(date -u "${date_format}" 2>/dev/null)
+[ -z "${date_utc}" ] && date_utc=$(date -u --date=@${when} 2>/dev/null)
+[ -z "${date_utc}" ] && date_utc=$(date -u 2>/dev/null)
+
# ----------------------------------------------------------------------------
# prepare some extra headers if we've been asked to thread e-mails
if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then
@@ -2266,8 +2290,10 @@ urlencode "${family}" >/dev/null
url_family="${REPLY}"
urlencode "${name}" >/dev/null
url_name="${REPLY}"
+urlencode "${value_string}" >/dev/null
+url_value_string="${REPLY}"
-redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}"
+redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}&alarm_status=${status}&alarm_chart=${chart}&alarm_value=${url_value_string}"
GOTOCLOUD=0
if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then
@@ -2284,9 +2310,9 @@ fi
if [ ${GOTOCLOUD} -eq 0 ]; then
goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}"
else
- # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud
- #goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
- goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}"
+        # Alarm redirection was temporarily disabled while the cloud endpoint did not exist (see #9487).
+        # It is re-enabled here for alarms 2.0 with the new template.
+ goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
fi
# the severity of the alarm
@@ -2311,48 +2337,79 @@ alarm="${name//_/ } = ${value_string}"
# the image of the alarm
image="${images_base_url}/images/banner-icon-144x144.png"
+# default status text for the email subject, in case the case statement below does not match
+status_email_subject="${status}"
+
# prepare the title based on status
case "${status}" in
CRITICAL)
image="${images_base_url}/images/alert-128-red.png"
+ alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_critical.png"
status_message="is critical"
+ status_email_subject="Critical"
color="#ca414b"
+ rich_status_raised_for="Raised to critical, for ${non_clear_duration_txt}"
+ background_color="#FFEBEF"
+ border_color="#FF4136"
+ text_color="#FF4136"
+ action_text_color="#FFFFFF"
;;
WARNING)
image="${images_base_url}/images/alert-128-orange.png"
+ alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_warning.png"
status_message="needs attention"
+ status_email_subject="Warning"
color="#ffc107"
+ rich_status_raised_for="Raised to warning, for ${non_clear_duration_txt}"
+ background_color="#FFF8E1"
+ border_color="#FFC300"
+ text_color="#536775"
+ action_text_color="#35414A"
;;
CLEAR)
image="${images_base_url}/images/check-mark-2-128-green.png"
+ alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_recovered.png"
status_message="recovered"
+ status_email_subject="Clear"
color="#77ca6d"
+ rich_status_raised_for=
+ background_color="#E5F5E8"
+ border_color="#68C47D"
+ text_color="#00AB44"
+ action_text_color="#FFFFFF"
;;
esac
+# the html email subject
+html_email_subject="${status_email_subject}, ${name} = ${value_string}, on ${host}"
+
if [ "${status}" = "CLEAR" ]; then
severity="Recovered from ${old_status}"
if [ ${non_clear_duration} -gt ${duration} ]; then
raised_for="(alarm was raised for ${non_clear_duration_txt})"
fi
+ rich_status_raised_for="Recovered from ${old_status,,}, ${raised_for}"
# don't show the value when the status is CLEAR
# for certain alarms, this value might not have any meaning
alarm="${name//_/ } ${raised_for}"
+ html_email_subject="${status_email_subject}, ${name} ${raised_for}, on ${host}"
elif { [ "${old_status}" = "WARNING" ] && [ "${status}" = "CRITICAL" ]; }; then
severity="Escalated to ${status}"
if [ ${non_clear_duration} -gt ${duration} ]; then
raised_for="(alarm is raised for ${non_clear_duration_txt})"
fi
+ rich_status_raised_for="Escalated to critical, ${raised_for}"
elif { [ "${old_status}" = "CRITICAL" ] && [ "${status}" = "WARNING" ]; }; then
severity="Demoted to ${status}"
if [ ${non_clear_duration} -gt ${duration} ]; then
raised_for="(alarm is raised for ${non_clear_duration_txt})"
fi
+ rich_status_raised_for="Demoted to warning, ${raised_for}"
else
raised_for=
@@ -2638,117 +2695,732 @@ EOF
else
+now=$(date "+%s")
+
+if [ -n "$total_warn_alarms" ]; then
+ while read -d, -r pair; do
+ IFS='=' read -r key val <<<"$pair"
+
+ date_w=$(date --date=@${val} "${date_format}" 2>/dev/null)
+ [ -z "${date_w}" ] && date_w=$(date "${date_format}" 2>/dev/null)
+ [ -z "${date_w}" ] && date_w=$(date --date=@${val} 2>/dev/null)
+ [ -z "${date_w}" ] && date_w=$(date 2>/dev/null)
+
+ elapsed=$((now - val))
+
+ duration4human ${elapsed} >/dev/null
+ elapsed_txt="${REPLY}"
+
+ WARN_ALARMS+="
+
+
+
+
+
+
+
+
+
+
+
+ ${key}
+ |
+
+
+
+ ${date_w}
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Warning for ${elapsed_txt}
+
+ |
+
+
+
+ |
+
+
+
+
+
+ |
+
+
+
+
+ "
+
+ done <<<"$total_warn_alarms,"
+fi
+
+if [ -n "$total_crit_alarms" ]; then
+ while read -d, -r pair; do
+ IFS='=' read -r key val <<<"$pair"
+
+ date_c=$(date --date=@${val} "${date_format}" 2>/dev/null)
+ [ -z "${date_c}" ] && date_c=$(date "${date_format}" 2>/dev/null)
+ [ -z "${date_c}" ] && date_c=$(date --date=@${val} 2>/dev/null)
+ [ -z "${date_c}" ] && date_c=$(date 2>/dev/null)
+
+ elapsed=$((now - val))
+
+ duration4human ${elapsed} >/dev/null
+ elapsed_txt="${REPLY}"
+
+ CRIT_ALARMS+="
+
+
+
+
+
+
+
+
+
+
+
+ ${key}
+ |
+
+
+
+ ${date_c}
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Critical for ${elapsed_txt}
+
+ |
+
+
+
+ |
+
+
+
+
+
+ |
+
+
+
+
+ "
+
+ done <<<"$total_crit_alarms,"
+fi
+
+if [ -n "$edit_command_line" ]; then
+ IFS='=' read -r edit_command line <<<"$edit_command_line"
+fi
+
IFS='' read -r -d '' email_html_part <
-
-
-
-
-
- |
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
- netdata notification
- |
+
+
+ |
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
-
- ${host} ${status_message}
+ |
+ Notification
+ |
+
+
+
+ |
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ |
+
+
+
+ |
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${value_string}
+
+ |
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Details: ${info}
+ |
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Chart:
+ ${chart}
+ |
+
+
+
+ Family:
+ ${family}
+ |
+
+
+
+ ${rich_status_raised_for}
+ |
+
+
+
+
+
+
+ |
+
+
+
+ On
+ ${date}
+ |
+
+
+
+ By:
+ ${host}
+ |
+
+
+
+ Global time:
+ ${date_utc}
+ |
+
+
+
+
+
+
+ |
+
+
+
+ Classification:
+ ${classification}
+ |
+
+
+
+ Role:
+ ${roles}
+ |
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ |
+
+
+
+ |
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Want to know more about this alert?
+ |
+
+
+
+
+ |
+
+
+ |
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
- ${chart}
- Chart
- |
-
-
-
- ${alarm}${info_html}
- Alarm
- |
-
-
-
- ${family}
- Family
- |
-
-
-
- ${severity}
- Severity
- |
-
-
- ${date}
- ${raised_for_html} Time
- |
-
-
-
- ${calc_expression}
- Evaluated Expression
- |
-
-
-
- ${calc_param_values}
- Expression Variables
- |
-
-
-
- The host has ${total_warnings} WARNING and ${total_critical} CRITICAL alarm(s) raised.
- |
-
-
-
-
- View Netdata
- |
-
-
- The source of this alarm is line ${src} (alarms are configurable, edit this file to adapt the alarm to your needs)
- |
-
-
- Sent by
- netdata, the real-time performance and health monitoring, on ${host} .
- |
-
-
-
-
+ |
+
+
+
+
+
+ |
+
+
+
+ |
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Need to configure this alert?
+ |
+
+
+
+ Edit this alert's configuration file by logging into $s_host and running the following command:
+ |
+
+
+
+ ${edit_command}
+ The alarm to edit is at line {${line}}
+ |
+
+
+
+ |
+
+
+
+
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The node has
+ ${total_warnings} warning
+ and
+ ${total_critical} critical
+ additional active alert(s)
|
+
+
+
+
+
+ |
+
+
+
+
+ ${CRIT_ALARMS}
+ ${WARN_ALARMS}
+
+ |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ © Netdata 2021 - The real-time performance and health monitoring
+ |
-
-
+ |
+ |
+
+
+
+
+
-
-
-
+ |
+
+
+
+
+
EOF
send_email <