Merge pull request #76 from simonpasquier/ocpbugs-18971-take-2

OCPBUGS-18971: limit number of simultaneous client requests
openshift · Oct 9, 2023 · f5d8b0a · f5d8b0a
2 parents d217980 + 4bebc51
commit f5d8b0a
Show file tree

Hide file tree

Showing 3 changed files with 207 additions and 1 deletion.
diff --git a/pkg/client/api.go b/pkg/client/api.go
@@ -128,11 +128,13 @@ func (c *httpAPIClient) Do(ctx context.Context, verb, endpoint string, query url
 
 // NewGenericAPIClient builds a new generic Prometheus API client for the given base URL and HTTP Client.
 func NewGenericAPIClient(client *http.Client, baseURL *url.URL, headers http.Header) GenericAPIClient {
-	return &httpAPIClient{
+	c := &httpAPIClient{
 		client:  client,
 		baseURL: baseURL,
 		headers: headers,
 	}
+
+	// Hand out the limiter-wrapped client so that every caller shares the
+	// same cap on concurrent requests.
+	return newRequestLimiter(c)
 }
 
 const (

diff --git a/pkg/client/limiter.go b/pkg/client/limiter.go
@@ -0,0 +1,86 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package client
+
+import (
+	"context"
+	"net/url"
+
+	"k8s.io/component-base/metrics"
+	"k8s.io/component-base/metrics/legacyregistry"
+)
+
+// maxConcurrentRequests is the maximum number of requests that a client may
+// have in flight at the same time.
+//
+// Without a limit, the adapter could flood the Prometheus API with requests
+// when many pods are running in the cluster, because a query for pod metrics
+// across all namespaces translates into (2 x the number of namespaces)
+// queries to the Prometheus API.
+// In the worst case, the Prometheus pods can overflow the backlog of pending
+// connections allowed by the kernel (e.g. SOMAXCONN), leading to timed-out
+// requests from other clients. In particular, it can cause the Kubelet
+// liveness probes to fail and trigger Prometheus pod restarts.
+// The number has been chosen from empirical data.
+const maxConcurrentRequests = 100
+
+// inflightRequests is the gauge exposed as
+// prometheus_adapter_prometheus_client_inflight_requests; the request limiter
+// increments it when a request starts and decrements it when it finishes.
+var inflightRequests = metrics.NewGauge(&metrics.GaugeOpts{
+	Namespace: "prometheus_adapter",
+	Subsystem: "prometheus_client",
+	Name:      "inflight_requests",
+	Help:      "Number of inflight requests to the Prometheus service",
+})
+
+// maxRequests is the gauge exposed as
+// prometheus_adapter_prometheus_client_max_requests; it is set once at init
+// time to the configured concurrency limit.
+var maxRequests = metrics.NewGauge(&metrics.GaugeOpts{
+	Namespace: "prometheus_adapter",
+	Subsystem: "prometheus_client",
+	Name:      "max_requests",
+	Help:      "Maximum number of requests to the Prometheus service",
+})
+
+// init registers both client gauges with the legacy registry and publishes
+// the concurrency limit through the max_requests gauge.
+func init() {
+	// NOTE(review): registration is deliberately kept ahead of Set;
+	// component-base gauges presumably only record values once they have been
+	// registered — confirm before reordering these two statements.
+	legacyregistry.MustRegister(inflightRequests, maxRequests)
+	maxRequests.Set(maxConcurrentRequests)
+}
+
+// requestLimitClient wraps a GenericAPIClient and caps the number of requests
+// that can run concurrently through it.
+type requestLimitClient struct {
+	// c is the wrapped client that performs the actual requests.
+	c        GenericAPIClient
+	// inflight is used as a counting semaphore: a request sends a token before
+	// it starts and receives one back when it is done, so the channel
+	// capacity is the concurrency limit.
+	inflight chan struct{}
+}
+
+// newRequestLimiter returns a client that forwards requests to c while
+// allowing at most maxConcurrentRequests of them to be in flight at once.
+func newRequestLimiter(c GenericAPIClient) GenericAPIClient {
+	slots := make(chan struct{}, maxConcurrentRequests)
+
+	limiter := requestLimitClient{c: c, inflight: slots}
+	return &limiter
+}
+
+// Do forwards the request to the wrapped client once a concurrency slot is
+// available. While all slots are taken the call blocks; if ctx is done before
+// a slot frees up, it returns an empty APIResponse together with ctx.Err()
+// and the wrapped client is never called.
+func (c *requestLimitClient) Do(ctx context.Context, verb, endpoint string, query url.Values) (APIResponse, error) {
+	// Wait for either a free slot or the end of the caller's context.
+	select {
+	case <-ctx.Done():
+		return APIResponse{}, ctx.Err()
+	case c.inflight <- struct{}{}:
+	}
+
+	// A slot is held from here on: account for it and hand it back on return.
+	inflightRequests.Inc()
+	defer func() {
+		inflightRequests.Dec()
+		<-c.inflight
+	}()
+
+	return c.c.Do(ctx, verb, endpoint, query)
+}
diff --git a/pkg/client/limiter_test.go b/pkg/client/limiter_test.go
@@ -0,0 +1,118 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package client
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// TestRequestLimitClient checks that the client built by NewGenericAPIClient
+// never sends more than maxConcurrentRequests requests at once: additional
+// requests wait on the client side, and a waiting request whose context
+// expires fails without ever reaching the server.
+func TestRequestLimitClient(t *testing.T) {
+	var (
+		ctx     = context.Background()
+		total   atomic.Int64
+		unblock = make(chan struct{})
+	)
+
+	srvCtx, srvCancel := context.WithCancel(ctx)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		total.Add(1)
+
+		w.Write([]byte("{}"))
+		if r.URL.Path == "/nonblocking" {
+			return
+		}
+
+		// Requests will be blocked until the test closes the unblock channel or the test fails.
+		select {
+		case <-unblock:
+		case <-srvCtx.Done():
+		}
+	}))
+	defer func() {
+		// Release any handler that is still blocked before closing the server,
+		// otherwise srv.Close() would wait for them forever on test failure.
+		srvCancel()
+		srv.Close()
+	}()
+
+	u, err := url.Parse(srv.URL)
+	if err != nil {
+		t.Fatalf("parsing test server URL %q: %s", srv.URL, err)
+	}
+
+	// Make as many requests as the max allowed number + 1.
+	var (
+		wg      sync.WaitGroup
+		errChan = make(chan error, maxConcurrentRequests+1)
+		c       = NewGenericAPIClient(&http.Client{}, u, nil)
+		do      = func(i int) {
+			defer wg.Done()
+
+			_, err := c.Do(ctx, "GET", "/", nil)
+			if err != nil {
+				err = fmt.Errorf("request #%d: %w", i, err)
+			}
+			errChan <- err
+		}
+	)
+	for i := 0; i < maxConcurrentRequests; i++ {
+		wg.Add(1)
+		go do(i)
+	}
+
+	// Wait for the first maxConcurrentRequests requests to hit the server.
+	// Poll with a short sleep rather than spinning, and give up after a
+	// deadline so that a regression fails the test instead of hanging it.
+	deadline := time.Now().Add(10 * time.Second)
+	for total.Load() != maxConcurrentRequests {
+		if time.Now().After(deadline) {
+			t.Fatalf("timed out waiting for %d requests on the server side, got %d", maxConcurrentRequests, total.Load())
+		}
+		time.Sleep(time.Millisecond)
+	}
+
+	// Make one more request which should be blocked at the client level.
+	wg.Add(1)
+	go do(maxConcurrentRequests)
+
+	// Make one more request which should be canceled before hitting the server.
+	ctx2, cancel := context.WithTimeout(ctx, time.Second)
+	defer cancel() // Release the timer even when the test bails out early.
+	_, err = c.Do(ctx2, "GET", "/nonblocking", nil)
+	switch {
+	case err == nil:
+		t.Fatal("expected request to fail")
+	case ctx2.Err() == nil:
+		t.Fatal("expected request to timeout")
+	}
+
+	if total.Load() != maxConcurrentRequests {
+		t.Fatalf("expected %d requests on the server side, got %d", maxConcurrentRequests, total.Load())
+	}
+
+	// Release all inflight requests.
+	close(unblock)
+
+	// Wait for all requests to complete.
+	wg.Wait()
+
+	// Check that no error was returned.
+	close(errChan)
+	for err := range errChan {
+		if err != nil {
+			t.Fatalf("unexpected error: %s", err)
+		}
+	}
+}