package systemd

import (
	"fmt"
	"regexp"

	"github.com/openshift/origin/pkg/diagnostics/log"
	"github.com/openshift/origin/pkg/diagnostics/types"
)
type logEntry struct {
	Message   string `json:"MESSAGE"`
	TimeStamp string `json:"__REALTIME_TIMESTAMP"` // epoch + ns
}
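
// For reference, `journalctl -o json` emits one JSON object per line; only the
// two fields mapped above are decoded here. A hypothetical, abbreviated entry
// looks roughly like:
//
//	{"MESSAGE":"Starting Origin Master","__REALTIME_TIMESTAMP":"1436283600000000", ...}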

// logMatcher provides a regex for scanning log messages and parameters to interpret them when found.
type logMatcher struct {
	Regexp         *regexp.Regexp
	Level          log.Level
	Id             string
	Interpretation string // log with above level+id if it's simple
	KeepAfterMatch bool   // usually note only first matched entry, ignore rest
	// Interpret, if provided, runs custom logic against a match. Its return bool is the equivalent of KeepAfterMatch.
	Interpret func(
		entry *logEntry,
		matches []string,
		r types.DiagnosticResult,
	) bool
}

type unitSpec struct {
	Names       []string
	StartMatch  *regexp.Regexp // regex to look for in log messages indicating startup
	LogMatchers []logMatcher   // suspect log patterns to check for - checked in order
}
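
// exampleApplyMatchers is an illustrative sketch, not part of the original
// diagnostics flow: it shows how one decoded journald entry could be run
// through a unitSpec's LogMatchers. The real scanner lives elsewhere in the
// diagnostics package and presumably also dispatches on each matcher's Level
// and honors KeepAfterMatch; this sketch only exercises the Warn reporting
// path that the matchers defined below already use.
func exampleApplyMatchers(spec *unitSpec, entry *logEntry, r types.DiagnosticResult) {
	for _, lm := range spec.LogMatchers {
		matches := lm.Regexp.FindStringSubmatch(entry.Message)
		if matches == nil {
			continue // this matcher does not apply to the entry
		}
		if lm.Interpret != nil {
			// custom logic decides what to report; its return value would
			// control whether later matches of this pattern are still noted
			lm.Interpret(entry, matches, r)
		} else {
			// simple case: report the canned interpretation
			r.Warn(lm.Id, nil, fmt.Sprintf("Found journald log message:\n %s\n%s", entry.Message, lm.Interpretation))
		}
	}
}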

//
// -------- These are things that feed into the diagnostics definitions -----------
//

// Reusable log matchers:
var badImageTemplate = logMatcher{
	Regexp: regexp.MustCompile(`Unable to find an image for .* due to an error processing the format: the key .* is not recognized`),
	Level:  log.InfoLevel,
	Interpretation: `
This error indicates the openshift command was given the --images flag
with an invalid format variable. Valid formats can include (literally)
${component} and ${version}; any others will cause this error.
This could be a typo or you might be intending to hardcode something,
such as a version which should be specified as e.g. v3.0, not ${v3.0}.
Note that the --images flag may be supplied via the master, node, or
"openshift admin registry/router" invocations and should usually
be the same for each.`,
}

// captures for logMatcher Interpret functions to store state between matches
var tlsClientErrorSeen map[string]bool

// TODO, these should probably be split into interfaces similar to Diagnostic
// Specify what units we can check and what to look for and say about it
var unitLogSpecs = []*unitSpec{
	{
		Names:      []string{"origin-master", "atomic-openshift-master"},
		StartMatch: regexp.MustCompile("Starting \\w+ Master"),
		LogMatchers: []logMatcher{
			badImageTemplate,
			{
				Regexp:         regexp.MustCompile("Unable to decode an event from the watch stream: local error: unexpected message"),
				Level:          log.InfoLevel,
				Id:             "DS2003",
				Interpretation: "You can safely ignore this message.",
			},
			{
				// TODO: don't rely on ipv4 format, should be ipv6 "soon"
				Regexp: regexp.MustCompile("http: TLS handshake error from ([\\d.]+):\\d+: remote error: bad certificate"),
				Level:  log.WarnLevel,
				Interpret: func(entry *logEntry, matches []string, r types.DiagnosticResult) bool {
					client := matches[1]
					prelude := fmt.Sprintf("Found master journald log message:\n %s\n", entry.Message)
					if tlsClientErrorSeen == nil { // first time this message was seen
						tlsClientErrorSeen = map[string]bool{client: true}
						// TODO: too generic, adjust message depending on subnet of the "from" address
						r.Warn("DS2001", nil, prelude+`
This error indicates that a client attempted to connect to the master
HTTPS API server but broke off the connection because the master's
certificate is not validated by a certificate authority (CA) acceptable
to the client. There are a number of ways this can occur, some more
problematic than others.

At this time, the master API certificate is signed by a private CA
(created the first time the master runs) and clients should have a copy of
that CA certificate in order to validate connections to the master. Most
likely, either:
1. the master has generated a new CA (e.g. after the administrator
deleted the old one) and the client has a copy of the old CA cert, or
2. the client hasn't been configured with a private CA at all (or the
wrong one), or
3. the client is attempting to reach the master at a URL that isn't
covered by the master's server certificate, e.g. a public-facing
name or IP that isn't known to the master automatically; this may
need to be specified with the --public-master flag on the master
in order to generate a new server certificate including it.

Clients of the master may include users, nodes, and infrastructure
components running as containers. Check the "from" IP address in the
log message:
* If it is from a SDN IP, it is likely from an infrastructure
component. Check pod logs and recreate it with the correct CA cert.
Routers and registries won't work properly with the wrong CA.
* If it is from a node IP, the client is likely a node. Check the
node logs and reconfigure with the correct CA cert.
Nodes will be unable to create pods until this is corrected.
* If it is from an external IP, it is likely from a user (CLI, browser,
etc.). Command line clients should be configured with the correct
CA cert; browsers can also add CA certs but it is usually easier
to just have them accept the server certificate on the first visit
(so this message may simply indicate that the master generated a new
server certificate, e.g. to add a different --public-master, and a
browser hasn't accepted it yet and is still attempting API calls;
try logging out of the console and back in again).`)
					} else if !tlsClientErrorSeen[client] {
						tlsClientErrorSeen[client] = true
						r.Warn("DS2002", nil, prelude+`This message was diagnosed above, but for a different client address.`)
					} // else, it's a repeat, don't mention it
					return true // show once for every client failing to connect, not just the first
				},
			},
		},
	},
	{
		Names:      []string{"origin-node", "atomic-openshift-node"},
		StartMatch: regexp.MustCompile("Starting \\w+ Node"), //systemd puts this out; could change
		LogMatchers: []logMatcher{
			badImageTemplate,
			{
				Regexp: regexp.MustCompile(`Unable to register.*"system:anonymous"`),
				Level:  log.ErrorLevel,
				Id:     "DS2004",
				Interpretation: `
This node could not register with the master API because it lacks
the proper credentials. Nodes should specify a client certificate in
order to identify themselves to the master. This message typically means
that either no client key/cert was supplied, or it is not validated
by the certificate authority (CA) the master uses. You should supply
a correct client key and certificate in the .kubeconfig specified in
node-config.yaml

This host will not function as a node until this is resolved. Pods
scheduled for this node will remain in pending or unknown state forever.`,
			},
			{
				Regexp: regexp.MustCompile("Could not find an allocated subnet for"),
				Level:  log.WarnLevel,
				Id:     "DS2005",
				Interpretation: `
This warning occurs when the node is trying to request the
SDN subnet it should be configured with according to the master,
but either can't connect to it or has not yet been assigned a subnet.

This can occur before the master becomes fully available and defines a
record for the node to use; the node will wait until that occurs,
so the presence of this message in the node log isn't necessarily a
problem as long as the SDN is actually working, but this message may
help indicate the problem if it is not working.

If the master is available and this log message persists, then it may
be a sign of a different misconfiguration. Check the master's URL in
the node kubeconfig.
* Is the protocol http? It should be https.
* Can you reach the address and port from the node using curl -k?
`,
			},
		},
	},
	{
		Names:      []string{"docker"},
		StartMatch: regexp.MustCompile(`Starting Docker`), // RHEL Docker at least
		LogMatchers: []logMatcher{
			{
				Regexp: regexp.MustCompile(`Usage: docker \[OPTIONS\] COMMAND`),
				Level:  log.ErrorLevel,
				Id:     "DS2006",
				Interpretation: `
This indicates that docker failed to parse its command line
successfully, so it just printed a standard usage message and exited.

Its command line is built from variables in /etc/sysconfig/docker
(which may be overridden by variables in /etc/sysconfig/openshift-sdn-node)
so check there for problems.

The node will not run on this host until this is resolved.`,
			},
			{
				Regexp: regexp.MustCompile(`^Unable to open the database file: unable to open database file$`),
				Level:  log.ErrorLevel,
				Id:     "DS2007",
				Interpretation: `
This indicates that docker failed to record its state to its database.
The most likely reason is that it is out of disk space. It is also
possible for other device or permissions problems to be at fault.

Sometimes this is due to excess completed containers not being cleaned
up. You can delete all completed containers with this command (running
containers will not be deleted):

# docker rm $(docker ps -qa)

Whatever the reason, docker will not function in this state.
The node will not run on this host until this is resolved.`,
			},
			{
				Regexp: regexp.MustCompile(`no space left on device$`),
				Level:  log.ErrorLevel,
				Id:     "DS2008",
				Interpretation: `
This indicates that docker has run out of space for container volumes
or metadata (by default, stored in /var/lib/docker, but configurable).
docker will not function in this state. It requires that disk space be
added to the relevant filesystem or files deleted to make space.

Sometimes this is due to excess completed containers not being cleaned
up. You can delete all completed containers with this command (running
containers will not be deleted):

# docker rm $(docker ps -qa)

The node will not run on this host until this is resolved.`,
			},
			{ // generic error seen - do this last
				Regexp: regexp.MustCompile(`\slevel="fatal"\s`),
				Level:  log.ErrorLevel,
				Id:     "DS2009",
				Interpretation: `
This is not a known problem, but it is causing Docker to crash,
so the node will not run on this host until it is resolved.`,
			},
		},
	},
}
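
// Below is a purely illustrative (hypothetical) usage sketch showing how
// entries from `journalctl -o json` output could be decoded and fed to
// exampleApplyMatchers above. It is left as a comment because decoding would
// need bufio and encoding/json, which this file does not import:
//
//	scanner := bufio.NewScanner(journalOutput) // journalOutput: an io.Reader over `journalctl -u <unit> -o json`
//	for scanner.Scan() {
//		entry := logEntry{}
//		if err := json.Unmarshal(scanner.Bytes(), &entry); err != nil {
//			continue // not a JSON journal line; skip it
//		}
//		exampleApplyMatchers(spec, &entry, r) // spec and r supplied by the caller
//	}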