Skip to content

Commit

Permalink
Prometheus metric endpoint and OpenTelemetry stats (aws#172)
Browse files Browse the repository at this point in the history
* adding/testing prometheus flags for enabling and port.

* adding prometheus and OT metrics

* adding constant for default prometheus port with doc

* adding events metrics, and refactoring

* rewording, fixing tests

* removing events metrics counter, refactoring nodeName

* extends helm chart and doc with prometheus values

* ignoring binary, removing unused metric

* adding e2e test with prometheus enabled, better naming for cli args

* refactoring error event metrics

* removing metrics in func not used

* fixing env for prometheus port

* adding license and improving documentation

* adding metrics endpoint consideration in the readme

* change type port to int

* adding e2e tests for metrics verifying them
  • Loading branch information
Manuel Alonso committed Jun 18, 2020
1 parent 3012867 commit 011a797
Show file tree
Hide file tree
Showing 12 changed files with 398 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -12,6 +12,9 @@
# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Output of the go build for the cmd binary
/node-termination-handler

### Go Patch ###
/vendor/
/Godeps/
Expand Down
27 changes: 21 additions & 6 deletions cmd/node-termination-handler.go
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/aws/aws-node-termination-handler/pkg/interruptionevent"
"github.com/aws/aws-node-termination-handler/pkg/interruptioneventstore"
"github.com/aws/aws-node-termination-handler/pkg/node"
"github.com/aws/aws-node-termination-handler/pkg/observability"
"github.com/aws/aws-node-termination-handler/pkg/webhook"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
Expand Down Expand Up @@ -64,6 +65,11 @@ func main() {
log.Logger = zerolog.New(os.Stderr).With().Timestamp().Logger()
}

metrics, err := observability.InitMetrics(nthConfig.EnablePrometheus, nthConfig.PrometheusPort)
if err != nil {
log.Fatal().Err(err).Msg("Unable to instantiate observability metrics,")
}

imds := ec2metadata.New(nthConfig.MetadataURL, nthConfig.MetadataTries)

interruptionEventStore := interruptioneventstore.New(nthConfig)
Expand Down Expand Up @@ -96,6 +102,7 @@ func main() {
err := monitorFn(interruptionChan, cancelChan, imds)
if err != nil {
log.Log().Msgf("There was a problem monitoring for %s events: %v", eventType, err)
metrics.ErrorEventsInc(eventType)
}
}
}(fn, eventType)
Expand All @@ -105,7 +112,7 @@ func main() {
log.Log().Msg("Started watching for interruption events")
log.Log().Msg("Kubernetes AWS Node Termination Handler has started successfully!")

go watchForCancellationEvents(cancelChan, interruptionEventStore, node, nodeMetadata)
go watchForCancellationEvents(cancelChan, interruptionEventStore, node, nodeMetadata, metrics)
log.Log().Msg("Started watching for event cancellations")

for range time.NewTicker(1 * time.Second).C {
Expand All @@ -114,7 +121,7 @@ func main() {
// Exit interruption loop if a SIGTERM is received or the channel is closed
break
default:
drainOrCordonIfNecessary(interruptionEventStore, *node, nthConfig, nodeMetadata)
drainOrCordonIfNecessary(interruptionEventStore, *node, nthConfig, nodeMetadata, metrics)
}
}
log.Log().Msg("AWS Node Termination Handler is shutting down")
Expand Down Expand Up @@ -148,7 +155,7 @@ func watchForInterruptionEvents(interruptionChan <-chan interruptionevent.Interr
}
}

func watchForCancellationEvents(cancelChan <-chan interruptionevent.InterruptionEvent, interruptionEventStore *interruptioneventstore.Store, node *node.Node, nodeMetadata ec2metadata.NodeMetadata) {
func watchForCancellationEvents(cancelChan <-chan interruptionevent.InterruptionEvent, interruptionEventStore *interruptioneventstore.Store, node *node.Node, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics) {
for {
interruptionEvent := <-cancelChan
log.Log().Msgf("Got cancel event from channel %+v %+v", nodeMetadata, interruptionEvent)
Expand All @@ -159,6 +166,8 @@ func watchForCancellationEvents(cancelChan <-chan interruptionevent.Interruption
if err != nil {
log.Log().Msgf("Uncordoning the node failed: %v", err)
}
metrics.NodeActionsInc("uncordon", node.GetName(), err)

node.RemoveNTHLabels()
node.RemoveNTHTaints()
} else {
Expand All @@ -167,29 +176,35 @@ func watchForCancellationEvents(cancelChan <-chan interruptionevent.Interruption
}
}

func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata) {
func drainOrCordonIfNecessary(interruptionEventStore *interruptioneventstore.Store, node node.Node, nthConfig config.Config, nodeMetadata ec2metadata.NodeMetadata, metrics observability.Metrics) {
if drainEvent, ok := interruptionEventStore.GetActiveEvent(); ok {
nodeName := node.GetName()
if drainEvent.PreDrainTask != nil {
err := drainEvent.PreDrainTask(*drainEvent, node)
if err != nil {
log.Log().Msgf("There was a problem executing the pre-drain task: %v", err)
}
metrics.NodeActionsInc("pre-dain", nodeName, err)
}

if nthConfig.CordonOnly {
err := node.Cordon()
if err != nil {
log.Log().Msgf("There was a problem while trying to cordon the node: %v", err)
os.Exit(1)
}
log.Log().Msgf("Node %q successfully cordoned.", nthConfig.NodeName)
log.Log().Msgf("Node %q successfully cordoned.", nodeName)
metrics.NodeActionsInc("cordon", nodeName, err)
} else {
err := node.CordonAndDrain()
if err != nil {
log.Log().Msgf("There was a problem while trying to cordon and drain the node: %v", err)
os.Exit(1)
}
log.Log().Msgf("Node %q successfully cordoned and drained.", nthConfig.NodeName)
log.Log().Msgf("Node %q successfully cordoned and drained.", nodeName)
metrics.NodeActionsInc("cordon-and-drain", nodeName, err)
}

interruptionEventStore.MarkAllAsDrained()
if nthConfig.WebhookURL != "" {
webhook.Post(nodeMetadata, drainEvent, nthConfig)
Expand Down
11 changes: 9 additions & 2 deletions config/helm/aws-node-termination-handler/README.md
Expand Up @@ -60,7 +60,7 @@ Parameter | Description | Default
`ignoreDaemonsSets` | Causes kubectl to skip daemon set managed pods | `true`
`instanceMetadataURL` | The URL of EC2 instance metadata. This shouldn't need to be changed unless you are testing. | `http://169.254.169.254:80`
`webhookURL` | Posts event data to URL upon instance interruption action | ``
`webhookProxy` | Uses the specified HTTP(S) proxy for sending webhooks | ``
`webhookProxy` | Uses the specified HTTP(S) proxy for sending webhooks | ``
`webhookHeaders` | Replaces the default webhook headers. | `{"Content-type":"application/json"}`
`webhookTemplate` | Replaces the default webhook message template. | `{"text":"[NTH][Instance Interruption] EventID: {{ .EventID }} - Kind: {{ .Kind }} - Description: {{ .Description }} - State: {{ .State }} - Start Time: {{ .StartTime }}"}`
`dryRun` | If true, only log if a node would be drained | `false`
Expand All @@ -84,6 +84,13 @@ Parameter | Description | Default
`serviceAccount.annotations` | Specifies the annotations for ServiceAccount | `{}`
`procUptimeFile` | (Used for Testing) Specify the uptime file | `/proc/uptime`
`securityContext.runAsUserID` | User ID to run the container | `1000`
`securityContext.runAsGroupID` | Group ID to run the container | `1000`
`securityContext.runAsGroupID` | Group ID to run the container | `1000`
`nodeSelectorTermsOs` | Operating System Node Selector Key | `beta.kubernetes.io/os`
`nodeSelectorTermsArch` | CPU Architecture Node Selector Key | `beta.kubernetes.io/arch`
`enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false`
`prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092`

## Metrics endpoint consideration
If the prometheus server is enabled, then because NTH is a daemonset with `host_networking=true`, nothing else will be able to bind to `:9092` (or whichever port is configured) in the root network namespace,
since the server listens on all interfaces.
Therefore, a firewall/security group must be configured on the nodes to block external access to the `/metrics` endpoint.
Expand Up @@ -49,7 +49,7 @@ spec:
- arm64
serviceAccountName: {{ template "aws-node-termination-handler.serviceAccountName" . }}
hostNetwork: true
dnsPolicy: {{ .Values.dnsPolicy }}
dnsPolicy: {{ .Values.dnsPolicy }}
containers:
- name: {{ include "aws-node-termination-handler.name" . }}
image: {{ .Values.image.repository}}:{{ .Values.image.tag }}
Expand Down Expand Up @@ -115,6 +115,10 @@ spec:
value: {{ .Values.jsonLogging | quote }}
- name: WEBHOOK_PROXY
value: {{ .Values.webhookProxy | quote }}
- name: ENABLE_PROMETHEUS_SERVER
value: {{ .Values.enablePrometheusServer | quote }}
- name: PROMETHEUS_SERVER_PORT
value: {{ .Values.prometheusServerPort | quote }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- with .Values.nodeSelector }}
Expand Down
5 changes: 4 additions & 1 deletion config/helm/aws-node-termination-handler/values.yaml
Expand Up @@ -78,7 +78,10 @@ nodeSelector: {}
nodeSelectorTermsOs: ""
nodeSelectorTermsArch: ""

tolerations:
enablePrometheusServer: false
prometheusServerPort: "9092"

tolerations:
- operator: "Exists"

affinity: {}
Expand Down
4 changes: 4 additions & 0 deletions go.mod
Expand Up @@ -3,7 +3,11 @@ module github.com/aws/aws-node-termination-handler
go 1.14

require (
github.com/prometheus/client_golang v1.5.0 // indirect
github.com/rs/zerolog v1.18.0
go.opentelemetry.io/contrib/instrumentation/runtime v0.6.1
go.opentelemetry.io/otel v0.6.0
go.opentelemetry.io/otel/exporters/metric/prometheus v0.6.0
golang.org/x/time v0.0.0-20190921001708-c4c64cad1fd0 // indirect
k8s.io/api v0.0.0-20191010143144-fbf594f18f80
k8s.io/apimachinery v0.0.0-20191016060620-86f2f1b9c076
Expand Down

0 comments on commit 011a797

Please sign in to comment.