Skip to content

Commit

Permalink
Merge pull request #480 from p0lyn0mial/release-4.6-sar-tr-no-stack-o…
Browse files Browse the repository at this point in the history
…n-panic

Bug 1894916: Panic output due to timeouts in kube apiserver
  • Loading branch information
openshift-merge-robot committed Jan 28, 2021
2 parents c7377c1 + b929fa7 commit 24907ce
Show file tree
Hide file tree
Showing 13 changed files with 327 additions and 25 deletions.
4 changes: 4 additions & 0 deletions cmd/cloud-controller-manager/app/options/options_test.go
Expand Up @@ -111,10 +111,12 @@ func TestDefaultFlags(t *testing.T) {
ExtraHeaderPrefixes: []string{"x-remote-extra-"},
},
RemoteKubeConfigFileOptional: true,
ClientTimeout: 10 * time.Second,
},
Authorization: &apiserveroptions.DelegatingAuthorizationOptions{
AllowCacheTTL: 10 * time.Second,
DenyCacheTTL: 10 * time.Second,
ClientTimeout: 10 * time.Second,
RemoteKubeConfigFileOptional: true,
AlwaysAllowPaths: []string{"/healthz"}, // note: this does not match /healthz/ or
},
Expand Down Expand Up @@ -243,10 +245,12 @@ func TestAddFlags(t *testing.T) {
ExtraHeaderPrefixes: []string{"x-remote-extra-"},
},
RemoteKubeConfigFileOptional: true,
ClientTimeout: 10 * time.Second,
},
Authorization: &apiserveroptions.DelegatingAuthorizationOptions{
AllowCacheTTL: 10 * time.Second,
DenyCacheTTL: 10 * time.Second,
ClientTimeout: 10 * time.Second,
RemoteKubeConfigFileOptional: true,
AlwaysAllowPaths: []string{"/healthz"}, // note: this does not match /healthz/ or
},
Expand Down
2 changes: 1 addition & 1 deletion cmd/controller-manager/app/serve.go
Expand Up @@ -48,7 +48,7 @@ func BuildHandlerChain(apiHandler http.Handler, authorizationInfo *apiserver.Aut
}
handler = genericapifilters.WithRequestInfo(handler, requestInfoResolver)
handler = genericapifilters.WithCacheControl(handler)
handler = genericfilters.WithPanicRecovery(handler, nil)
handler = genericfilters.WithPanicRecovery(handler, nil, requestInfoResolver)

return handler
}
Expand Down
6 changes: 4 additions & 2 deletions cmd/kube-controller-manager/app/options/options_test.go
Expand Up @@ -399,8 +399,9 @@ func TestAddFlags(t *testing.T) {
BindNetwork: "tcp",
}).WithLoopback(),
Authentication: &apiserveroptions.DelegatingAuthenticationOptions{
CacheTTL: 10 * time.Second,
ClientCert: apiserveroptions.ClientCertAuthenticationOptions{},
CacheTTL: 10 * time.Second,
ClientTimeout: 10 * time.Second,
ClientCert: apiserveroptions.ClientCertAuthenticationOptions{},
RequestHeader: apiserveroptions.RequestHeaderAuthenticationOptions{
UsernameHeaders: []string{"x-remote-user"},
GroupHeaders: []string{"x-remote-group"},
Expand All @@ -411,6 +412,7 @@ func TestAddFlags(t *testing.T) {
Authorization: &apiserveroptions.DelegatingAuthorizationOptions{
AllowCacheTTL: 10 * time.Second,
DenyCacheTTL: 10 * time.Second,
ClientTimeout: 10 * time.Second,
RemoteKubeConfigFileOptional: true,
AlwaysAllowPaths: []string{"/healthz"}, // note: this does not match /healthz/ or /healthz/*
},
Expand Down
2 changes: 1 addition & 1 deletion cmd/kube-scheduler/app/server.go
Expand Up @@ -237,7 +237,7 @@ func buildHandlerChain(handler http.Handler, authn authenticator.Request, authz
handler = genericapifilters.WithAuthentication(handler, authn, failedHandler, nil)
handler = genericapifilters.WithRequestInfo(handler, requestInfoResolver)
handler = genericapifilters.WithCacheControl(handler)
handler = genericfilters.WithPanicRecovery(handler, nil)
handler = genericfilters.WithPanicRecovery(handler, nil, requestInfoResolver)

return handler
}
Expand Down
5 changes: 3 additions & 2 deletions pkg/kubeapiserver/server/insecure_handler.go
Expand Up @@ -26,6 +26,7 @@ import (

// BuildInsecureHandlerChain sets up the server to listen to http. Should be removed.
func BuildInsecureHandlerChain(apiHandler http.Handler, c *server.Config) http.Handler {
requestInfoResolver := server.NewRequestInfoResolver(c)
handler := apiHandler
// Temporarily disable APIPriorityAndFairness during development
// so that /debug/pprof works even while this feature is totally
Expand All @@ -40,10 +41,10 @@ func BuildInsecureHandlerChain(apiHandler http.Handler, c *server.Config) http.H
handler = genericfilters.WithCORS(handler, c.CorsAllowedOriginList, nil, nil, nil, "true")
handler = genericfilters.WithTimeoutForNonLongRunningRequests(handler, c.LongRunningFunc, c.RequestTimeout)
handler = genericfilters.WithWaitGroup(handler, c.LongRunningFunc, c.HandlerChainWaitGroup)
handler = genericapifilters.WithRequestInfo(handler, server.NewRequestInfoResolver(c))
handler = genericapifilters.WithRequestInfo(handler, requestInfoResolver)
handler = genericapifilters.WithWarningRecorder(handler)
handler = genericapifilters.WithCacheControl(handler)
handler = genericfilters.WithPanicRecovery(handler, nil)
handler = genericfilters.WithPanicRecovery(handler, nil, requestInfoResolver)

return handler
}
27 changes: 27 additions & 0 deletions staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go
Expand Up @@ -185,6 +185,16 @@ var (
},
[]string{"verb", "group", "version", "resource", "subresource", "scope", "component", "code"},
)
// requestAbortsTotal is a number of aborted requests with http.ErrAbortHandler
requestAbortsTotal = compbasemetrics.NewCounterVec(
&compbasemetrics.CounterOpts{
Name: "apiserver_request_aborts_total",
Help: "Number of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope",
StabilityLevel: compbasemetrics.ALPHA,
},
[]string{"verb", "group", "version", "resource", "subresource", "scope"},
)

kubectlExeRegexp = regexp.MustCompile(`^.*((?i:kubectl\.exe))`)

metrics = []resettableCollector{
Expand All @@ -201,6 +211,7 @@ var (
currentInflightRequests,
currentInqueueRequests,
requestTerminationsTotal,
requestAbortsTotal,
}

// these are the known (e.g. whitelisted/known) content types which we will report for
Expand Down Expand Up @@ -290,6 +301,22 @@ func UpdateInflightRequestMetrics(phase string, nonmutating, mutating int) {
}
}

// RecordRequestAbort records that the request was aborted possibly due to a timeout.
func RecordRequestAbort(req *http.Request, requestInfo *request.RequestInfo) {
if requestInfo == nil {
requestInfo = &request.RequestInfo{Verb: req.Method, Path: req.URL.Path}
}

scope := CleanScope(requestInfo)
reportedVerb := cleanVerb(canonicalVerb(strings.ToUpper(req.Method), scope), req)
resource := requestInfo.Resource
subresource := requestInfo.Subresource
group := requestInfo.APIGroup
version := requestInfo.APIVersion

requestAbortsTotal.WithLabelValues(reportedVerb, group, version, resource, subresource, scope).Inc()
}

// RecordRequestTermination records that the request was terminated early as part of a resource
// preservation or apiserver self-defense mechanism (e.g. timeouts, maxinflight throttling,
// proxyHandler errors). RecordRequestTermination should only be called zero or one times
Expand Down
2 changes: 1 addition & 1 deletion staging/src/k8s.io/apiserver/pkg/server/config.go
Expand Up @@ -777,7 +777,7 @@ func DefaultBuildHandlerChain(apiHandler http.Handler, c *Config) http.Handler {
handler = genericapifilters.WithWarningRecorder(handler)
handler = genericapifilters.WithCacheControl(handler)
handler = genericapifilters.WithRequestReceivedTimestamp(handler)
handler = genericfilters.WithPanicRecovery(handler, c.IsTerminating)
handler = genericfilters.WithPanicRecovery(handler, c.IsTerminating, c.RequestInfoResolver)
return handler
}

Expand Down
1 change: 1 addition & 0 deletions staging/src/k8s.io/apiserver/pkg/server/filters/BUILD
Expand Up @@ -40,6 +40,7 @@ go_test(
"//staging/src/k8s.io/component-base/metrics/legacyregistry:go_default_library",
"//staging/src/k8s.io/component-base/metrics/testutil:go_default_library",
"//vendor/golang.org/x/net/http2:go_default_library",
"//vendor/k8s.io/klog/v2:go_default_library",
],
)

Expand Down
22 changes: 11 additions & 11 deletions staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go
Expand Up @@ -33,8 +33,6 @@ import (
apirequest "k8s.io/apiserver/pkg/endpoints/request"
)

var errConnKilled = fmt.Errorf("killing connection/stream because serving request timed out and response had been started")

// WithTimeoutForNonLongRunningRequests times out non-long-running requests after the time given by timeout.
func WithTimeoutForNonLongRunningRequests(handler http.Handler, longRunning apirequest.LongRunningRequestCheck, timeout time.Duration) http.Handler {
if longRunning == nil {
Expand Down Expand Up @@ -246,15 +244,17 @@ func (tw *baseTimeoutWriter) timeout(err *apierrors.StatusError) {
// no way to timeout the HTTP request at the point. We have to shutdown
// the connection for HTTP1 or reset stream for HTTP2.
//
// Note from: Brad Fitzpatrick
// if the ServeHTTP goroutine panics, that will do the best possible thing for both
// HTTP/1 and HTTP/2. In HTTP/1, assuming you're replying with at least HTTP/1.1 and
// you've already flushed the headers so it's using HTTP chunking, it'll kill the TCP
// connection immediately without a proper 0-byte EOF chunk, so the peer will recognize
// the response as bogus. In HTTP/2 the server will just RST_STREAM the stream, leaving
// the TCP connection open, but resetting the stream to the peer so it'll have an error,
// like the HTTP/1 case.
panic(errConnKilled)
// Note from the golang's docs:
// If ServeHTTP panics, the server (the caller of ServeHTTP) assumes
// that the effect of the panic was isolated to the active request.
// It recovers the panic, logs a stack trace to the server error log,
// and either closes the network connection or sends an HTTP/2
// RST_STREAM, depending on the HTTP protocol. To abort a handler so
// the client sees an interrupted response but the server doesn't log
// an error, panic with the value ErrAbortHandler.
//
// We are throwing http.ErrAbortHandler deliberately so that a client is notified and to suppress a not helpful stacktrace in the logs
panic(http.ErrAbortHandler)
}
}

Expand Down

0 comments on commit 24907ce

Please sign in to comment.