Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prometheus metrics output to docker #25820

Merged
merged 1 commit into from Oct 27, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions cmd/dockerd/daemon.go
Expand Up @@ -248,6 +248,15 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) {
return fmt.Errorf("Error starting daemon: %v", err)
}

if cli.Config.MetricsAddress != "" {
if !d.HasExperimental() {
return fmt.Errorf("metrics-addr is only supported when experimental is enabled")
}
if err := startMetricsServer(cli.Config.MetricsAddress); err != nil {
return err
}
}

name, _ := os.Hostname()

c, err := cluster.New(cluster.Config{
Expand Down
27 changes: 27 additions & 0 deletions cmd/dockerd/metrics.go
@@ -0,0 +1,27 @@
package main

import (
"net"
"net/http"

"github.com/Sirupsen/logrus"
metrics "github.com/docker/go-metrics"
)

// startMetricsServer exposes the go-metrics handler over HTTP at addr.
//
// It first calls allocateDaemonPort(addr) and then opens a TCP listener on
// addr; an error from either step is returned unchanged. On success the
// "/metrics" route is served from a background goroutine and nil is returned
// immediately; an error returned later by http.Serve is only logged.
func startMetricsServer(addr string) error {
	if portErr := allocateDaemonPort(addr); portErr != nil {
		return portErr
	}

	listener, listenErr := net.Listen("tcp", addr)
	if listenErr != nil {
		return listenErr
	}

	router := http.NewServeMux()
	router.Handle("/metrics", metrics.Handler())

	// Serve in the background so daemon startup is not blocked.
	go func() {
		serveErr := http.Serve(listener, router)
		if serveErr != nil {
			logrus.Errorf("serve metrics api: %s", serveErr)
		}
	}()

	return nil
}
14 changes: 12 additions & 2 deletions daemon/changes.go
@@ -1,15 +1,25 @@
package daemon

import "github.com/docker/docker/pkg/archive"
import (
"time"

"github.com/docker/docker/pkg/archive"
)

// ContainerChanges returns a list of container fs changes.
//
// The container is looked up by name and its lock is held while the changes
// are read from its RWLayer. On success the time elapsed since entry is
// reported through containerActions.WithValues("changes").UpdateSince; calls
// that fail return the error without reporting a duration.
func (daemon *Daemon) ContainerChanges(name string) ([]archive.Change, error) {
	start := time.Now()
	container, err := daemon.GetContainer(name)
	if err != nil {
		return nil, err
	}

	container.Lock()
	defer container.Unlock()
	// Capture the result instead of returning it directly so the duration
	// can be recorded before returning; a direct return here would leave
	// the metric update unreachable.
	c, err := container.RWLayer.Changes()
	if err != nil {
		return nil, err
	}
	containerActions.WithValues("changes").UpdateSince(start)
	return c, nil
}
2 changes: 2 additions & 0 deletions daemon/commit.go
Expand Up @@ -120,6 +120,7 @@ func merge(userConf, imageConf *containertypes.Config) error {
// Commit creates a new filesystem image from the current state of a container.
// The image can optionally be tagged into a repository.
func (daemon *Daemon) Commit(name string, c *backend.ContainerCommitConfig) (string, error) {
start := time.Now()
container, err := daemon.GetContainer(name)
if err != nil {
return "", err
Expand Down Expand Up @@ -244,6 +245,7 @@ func (daemon *Daemon) Commit(name string, c *backend.ContainerCommitConfig) (str
"comment": c.Comment,
}
daemon.LogContainerEventWithAttributes(container, "commit", attributes)
containerActions.WithValues("commit").UpdateSince(start)
return id.String(), nil
}

Expand Down
3 changes: 3 additions & 0 deletions daemon/config.go
Expand Up @@ -146,6 +146,7 @@ type CommonConfig struct {
// given to the /swarm/init endpoint and no advertise address is
// specified.
SwarmDefaultAdvertiseAddr string `json:"swarm-default-advertise-addr"`
MetricsAddress string `json:"metrics-addr"`

LogConfig
bridgeConfig // bridgeConfig holds bridge network specific configuration.
Expand Down Expand Up @@ -191,6 +192,8 @@ func (config *Config) InstallCommonFlags(flags *pflag.FlagSet) {
flags.StringVar(&config.SwarmDefaultAdvertiseAddr, "swarm-default-advertise-addr", "", "Set default address or interface for swarm advertised address")
flags.BoolVar(&config.Experimental, "experimental", false, "Enable experimental features")

flags.StringVar(&config.MetricsAddress, "metrics-addr", "", "Set default address and port to serve the metrics api on")

config.MaxConcurrentDownloads = &maxConcurrentDownloads
config.MaxConcurrentUploads = &maxConcurrentUploads
}
Expand Down
23 changes: 18 additions & 5 deletions daemon/container_operations.go
Expand Up @@ -8,6 +8,7 @@ import (
"path"
"runtime"
"strings"
"time"

"github.com/Sirupsen/logrus"
derr "github.com/docker/docker/api/errors"
Expand Down Expand Up @@ -284,8 +285,11 @@ func (daemon *Daemon) updateEndpointNetworkSettings(container *container.Contain
// UpdateNetwork is used to update the container's network (e.g. when linked containers
// get removed/unlinked).
func (daemon *Daemon) updateNetwork(container *container.Container) error {
ctrl := daemon.netController
sid := container.NetworkSettings.SandboxID
var (
start = time.Now()
ctrl = daemon.netController
sid = container.NetworkSettings.SandboxID
)

sb, err := ctrl.SandboxByID(sid)
if err != nil {
Expand Down Expand Up @@ -319,6 +323,8 @@ func (daemon *Daemon) updateNetwork(container *container.Container) error {
return fmt.Errorf("Update network failed: Failure in refresh sandbox %s: %v", sid, err)
}

networkActions.WithValues("update").UpdateSince(start)

return nil
}

Expand Down Expand Up @@ -452,6 +458,7 @@ func (daemon *Daemon) updateContainerNetworkSettings(container *container.Contai
}

func (daemon *Daemon) allocateNetwork(container *container.Container) error {
start := time.Now()
controller := daemon.netController

if daemon.netController == nil {
Expand Down Expand Up @@ -503,7 +510,11 @@ func (daemon *Daemon) allocateNetwork(container *container.Container) error {
}
}

return container.WriteHostConfig()
if err := container.WriteHostConfig(); err != nil {
return err
}
networkActions.WithValues("allocate").UpdateSince(start)
return nil
}

func (daemon *Daemon) getNetworkSandbox(container *container.Container) libnetwork.Sandbox {
Expand Down Expand Up @@ -613,16 +624,15 @@ func (daemon *Daemon) updateNetworkConfig(container *container.Container, n libn
}

func (daemon *Daemon) connectToNetwork(container *container.Container, idOrName string, endpointConfig *networktypes.EndpointSettings, updateSettings bool) (err error) {
start := time.Now()
if container.HostConfig.NetworkMode.IsContainer() {
return runconfig.ErrConflictSharedNetwork
}

if containertypes.NetworkMode(idOrName).IsBridge() &&
daemon.configStore.DisableBridge {
container.Config.NetworkDisabled = true
return nil
}

if endpointConfig == nil {
endpointConfig = &networktypes.EndpointSettings{}
}
Expand Down Expand Up @@ -714,6 +724,7 @@ func (daemon *Daemon) connectToNetwork(container *container.Container, idOrName
container.NetworkSettings.Ports = getPortMapInfo(sb)

daemon.LogNetworkEventWithAttributes(n, "connect", map[string]string{"container": container.ID})
networkActions.WithValues("connect").UpdateSince(start)
return nil
}

Expand Down Expand Up @@ -835,6 +846,7 @@ func (daemon *Daemon) getNetworkedContainer(containerID, connectedContainerID st
}

func (daemon *Daemon) releaseNetwork(container *container.Container) {
start := time.Now()
if daemon.netController == nil {
return
}
Expand Down Expand Up @@ -885,6 +897,7 @@ func (daemon *Daemon) releaseNetwork(container *container.Container) {
}
daemon.LogNetworkEventWithAttributes(nw, "disconnect", attributes)
}
networkActions.WithValues("release").UpdateSince(start)
}

func errRemovalContainer(containerID string) error {
Expand Down
4 changes: 3 additions & 1 deletion daemon/create.go
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"net"
"strings"
"time"

"github.com/Sirupsen/logrus"
"github.com/docker/docker/api/errors"
Expand Down Expand Up @@ -31,6 +32,7 @@ func (daemon *Daemon) ContainerCreate(params types.ContainerCreateConfig, valida
}

func (daemon *Daemon) containerCreate(params types.ContainerCreateConfig, managed bool, validateHostname bool) (types.ContainerCreateResponse, error) {
start := time.Now()
if params.Config == nil {
return types.ContainerCreateResponse{}, fmt.Errorf("Config cannot be empty in order to create a container")
}
Expand All @@ -57,7 +59,7 @@ func (daemon *Daemon) containerCreate(params types.ContainerCreateConfig, manage
if err != nil {
return types.ContainerCreateResponse{Warnings: warnings}, daemon.imageNotExistToErrcode(err)
}

containerActions.WithValues("create").UpdateSince(start)
return types.ContainerCreateResponse{ID: container.ID, Warnings: warnings}, nil
}

Expand Down
16 changes: 15 additions & 1 deletion daemon/daemon.go
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/docker/docker/container"
"github.com/docker/docker/daemon/events"
"github.com/docker/docker/daemon/exec"
"github.com/docker/docker/dockerversion"
"github.com/docker/libnetwork/cluster"
// register graph drivers
_ "github.com/docker/docker/daemon/graphdriver/register"
Expand Down Expand Up @@ -684,12 +685,25 @@ func NewDaemon(config *Config, registryService registry.Service, containerdRemot
return nil, err
}

// FIXME: this method never returns an error
info, _ := d.SystemInfo()

engineVersion.WithValues(
dockerversion.Version,
dockerversion.GitCommit,
info.Architecture,
info.Driver,
info.KernelVersion,
info.OperatingSystem,
).Set(1)
engineCpus.Set(float64(info.NCPU))
engineMemory.Set(float64(info.MemTotal))

return d, nil
}

func (daemon *Daemon) shutdownContainer(c *container.Container) error {
stopTimeout := c.StopTimeout()

// TODO(windows): Handle docker restart with paused containers
if c.IsPaused() {
// To terminate a process in freezer cgroup, we should send
Expand Down
7 changes: 6 additions & 1 deletion daemon/delete.go
Expand Up @@ -5,6 +5,7 @@ import (
"os"
"path"
"strings"
"time"

"github.com/Sirupsen/logrus"
"github.com/docker/docker/api/errors"
Expand All @@ -19,6 +20,7 @@ import (
// fails. If the remove succeeds, the container name is released, and
// network links are removed.
func (daemon *Daemon) ContainerRm(name string, config *types.ContainerRmConfig) error {
start := time.Now()
container, err := daemon.GetContainer(name)
if err != nil {
return err
Expand All @@ -40,7 +42,10 @@ func (daemon *Daemon) ContainerRm(name string, config *types.ContainerRmConfig)
return daemon.rmLink(container, name)
}

return daemon.cleanupContainer(container, config.ForceRemove, config.RemoveVolume)
err = daemon.cleanupContainer(container, config.ForceRemove, config.RemoveVolume)
containerActions.WithValues("delete").UpdateSince(start)

return err
}

func (daemon *Daemon) rmLink(container *container.Container, name string) error {
Expand Down
4 changes: 4 additions & 0 deletions daemon/events/events.go
Expand Up @@ -33,6 +33,7 @@ func New() *Events {
// of interface{}, so you need type assertion), and a function to call
// to stop the stream of events.
func (e *Events) Subscribe() ([]eventtypes.Message, chan interface{}, func()) {
eventSubscribers.Inc()
e.mu.Lock()
current := make([]eventtypes.Message, len(e.events))
copy(current, e.events)
Expand All @@ -49,6 +50,7 @@ func (e *Events) Subscribe() ([]eventtypes.Message, chan interface{}, func()) {
// last events, a channel in which you can expect new events (in form
// of interface{}, so you need type assertion).
func (e *Events) SubscribeTopic(since, until time.Time, ef *Filter) ([]eventtypes.Message, chan interface{}) {
eventSubscribers.Inc()
e.mu.Lock()

var topic func(m interface{}) bool
Expand All @@ -72,12 +74,14 @@ func (e *Events) SubscribeTopic(since, until time.Time, ef *Filter) ([]eventtype

// Evict removes the listener channel l from the pubsub publisher.
// The eventSubscribers gauge is decremented before the eviction.
// NOTE(review): the gauge is decremented unconditionally — presumably every
// call pairs with an earlier Subscribe/SubscribeTopic; confirm against callers.
func (e *Events) Evict(l chan interface{}) {
eventSubscribers.Dec()
e.pub.Evict(l)
}

// Log broadcasts event to listeners. Each listener has 100 millisecond for
// receiving event or it will be skipped.
func (e *Events) Log(action, eventType string, actor eventtypes.Actor) {
eventsCounter.Inc()
now := time.Now().UTC()
jm := eventtypes.Message{
Action: action,
Expand Down
15 changes: 15 additions & 0 deletions daemon/events/metrics.go
@@ -0,0 +1,15 @@
package events
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Creating a separate file for metrics is a smell; metrics should be treated like logging and live in the same file as what they're instrumenting.

Similarly, the fact that the same metric is being updated in multiple files is a smell, indicating that you're not instrumenting at the right place in your code. Either you shouldn't be using a label, or you should move the instrumentation up the call stack so each metric only lives in one file.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the inferred pungency here?

Are you saying that metrics should be representative of the call tree? How do we reconcile implementation versus the exported API that metrics invariably become?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> Are you saying that metrics should be representative of the call tree?

I'm saying that metrics should usually be right beside what they're instrumenting, and that a given metric should almost always live in exactly one file.

For this particular file I think it should be merged into events.go. This saves someone trying to understand the code having to jump around files.

My stance is based on years of trying to debug code that looked just like this. It gets really frustrating when you're already trying to understand a bug in code spread across tens of functions in tens of files, to then have to open up another set of files to find out what metrics are available, and yet another set of files to see which confusing behaviour this team's metrics wrapper has.

> How do we reconcile implementation versus the exported API that metrics invariably become?

I'd expect metrics to change over time as changes are made to the code. A handful are likely to end up highly depended on, so some future compromise might be required if the choice is made to make them an official API.

For Prometheus itself our stability guarantees don't include the metrics, as that'd be too constraining and effectively prevent significant development as any major internal change would likely become a breaking change to metrics. Given Docker is under rapid development, you may wish to take a similar stance.


import "github.com/docker/go-metrics"

var (
eventsCounter metrics.Counter
eventSubscribers metrics.Gauge
)

// init builds the "engine"/"daemon" metrics namespace, creates the
// package-level events counter and subscribers gauge in it, and registers
// the namespace with the go-metrics package.
func init() {
	namespace := metrics.NewNamespace("engine", "daemon", nil)

	// Incremented once per call to Events.Log.
	eventsCounter = namespace.NewCounter("events", "The number of events logged")

	// Incremented on subscribe and decremented on evict.
	eventSubscribers = namespace.NewGauge(
		"events_subscribers",
		"The number of current subscribers to events",
		metrics.Total,
	)

	metrics.Register(namespace)
}
3 changes: 3 additions & 0 deletions daemon/health.go
Expand Up @@ -60,6 +60,7 @@ type cmdProbe struct {
// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any)
func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {

cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
if p.shell {
if runtime.GOOS != "windows" {
Expand Down Expand Up @@ -157,8 +158,10 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe)
ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
results := make(chan *types.HealthcheckResult)
go func() {
healthChecksCounter.Inc()
result, err := probe.run(ctx, d, c)
if err != nil {
healthChecksFailedCounter.Inc()
logrus.Warnf("Health check for container %s error: %v", c.ID, err)
results <- &types.HealthcheckResult{
ExitCode: -1,
Expand Down
10 changes: 9 additions & 1 deletion daemon/image_delete.go
Expand Up @@ -3,6 +3,7 @@ package daemon
import (
"fmt"
"strings"
"time"

"github.com/docker/docker/api/errors"
"github.com/docker/docker/api/types"
Expand Down Expand Up @@ -61,6 +62,7 @@ const (
// package. This would require that we no longer need the daemon to determine
// whether images are being used by a stopped or running container.
func (daemon *Daemon) ImageDelete(imageRef string, force, prune bool) ([]types.ImageDelete, error) {
start := time.Now()
records := []types.ImageDelete{}

imgID, err := daemon.GetImageID(imageRef)
Expand Down Expand Up @@ -168,7 +170,13 @@ func (daemon *Daemon) ImageDelete(imageRef string, force, prune bool) ([]types.I
}
}

return records, daemon.imageDeleteHelper(imgID, &records, force, prune, removedRepositoryRef)
if err := daemon.imageDeleteHelper(imgID, &records, force, prune, removedRepositoryRef); err != nil {
return nil, err
}

imageActions.WithValues("delete").UpdateSince(start)

return records, nil
}

// isSingleReference returns true when all references are from one repository
Expand Down