test/e2e/instrumentation/logging/stackdriver/soak.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package stackdriver

import (
	"fmt"
	"math"
	"time"

	"k8s.io/kubernetes/test/e2e/framework"
	instrumentation "k8s.io/kubernetes/test/e2e/instrumentation/common"
	"k8s.io/kubernetes/test/e2e/instrumentation/logging/utils"

	"github.com/onsi/ginkgo"
)

// Tunables for the Stackdriver logging soak test.
const (
	// lastPodIngestionSlack is how long to wait for the logging agent to
	// ingest the logs of the last pod.
	lastPodIngestionSlack = 5 * time.Minute
	// maxAllowedRestartsPerHour is how many fluentd container restarts per
	// hour are tolerated. One restart per hour is acceptable for now,
	// provided it does not lose too many logs.
	maxAllowedRestartsPerHour = 1.0
	// maxAllowedLostFraction is the share of log lines that may be lost while
	// the result is still considered acceptable.
	maxAllowedLostFraction = 0.01
)

// Soak test for Stackdriver cluster-level logging. It repeatedly starts
// load-generating logging pods, one per ready schedulable node in every run,
// waits until the ingestion checker is satisfied (or the timeout expires) and
// finally passes the allowed number of logging agent restarts to
// utils.EnsureLoggingAgentRestartsCount.
var _ = instrumentation.SIGDescribe("Cluster level logging implemented by Stackdriver [Feature:StackdriverLogging] [Soak]", func() {
	f := framework.NewDefaultFramework("sd-logging-load")

	ginkgo.It("should ingest logs from applications running for a prolonged amount of time", func() {
		withLogProviderForScope(f, podsScope, func(p *sdLogProvider) {
			nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items
			maxPodCount := 10
			jobDuration := 30 * time.Minute
			linesPerPodPerSecond := 100
			// TODO(instrumentation): Increase to 21 hrs
			testDuration := 3 * time.Hour
			ingestionInterval := 1 * time.Minute
			ingestionTimeout := testDuration + 30*time.Minute
			// Restart budget: test length in hours times the hourly allowance,
			// rounded up.
			allowedRestarts := int(math.Ceil(float64(testDuration) /
				float64(time.Hour) * maxAllowedRestartsPerHour))

			// A new run is started every jobDuration/maxPodCount. Assuming each
			// pod runs for jobDuration, about maxPodCount pods per node are
			// active at the same time.
			podRunDelay := time.Duration(int64(jobDuration) / int64(maxPodCount))
			// The number of runs is chosen so that the last run starts at
			// testDuration - jobDuration.
			podRunCount := maxPodCount*(int(testDuration/jobDuration)-1) + 1
			linesPerPod := linesPerPodPerSecond * int(jobDuration.Seconds())

			// pods is a flat array of all pods to be run and to expect in Stackdriver.
			pods := make([]utils.FiniteLoggingPod, 0, podRunCount*len(nodes))
			// podsByRun is a two-dimensional array of pods, first dimension is the run
			// index, the second dimension is the node index. Since we want to create
			// an equal load on all nodes, for the same run we have one pod per node.
			podsByRun := make([][]utils.FiniteLoggingPod, 0, podRunCount)
			for runIdx := 0; runIdx < podRunCount; runIdx++ {
				podsInRun := make([]utils.FiniteLoggingPod, 0, len(nodes))
				for nodeIdx, node := range nodes {
					podName := fmt.Sprintf("job-logs-generator-%d-%d-%d-%d", maxPodCount, linesPerPod, runIdx, nodeIdx)
					pod := utils.NewLoadLoggingPod(podName, node.Name, linesPerPod, jobDuration)
					pods = append(pods, pod)
					podsInRun = append(podsInRun, pod)
				}
				podsByRun = append(podsByRun, podsInRun)
			}

			// stop is closed when this callback returns or unwinds, so the
			// goroutine below does not keep starting pods after the test body
			// is done.
			stop := make(chan struct{})
			defer close(stop)

			ginkgo.By("Running short-living pods")
			go func() {
				t := time.NewTicker(podRunDelay)
				defer t.Stop()
				for runIdx := 0; runIdx < podRunCount; runIdx++ {
					// Do not begin another run once the test body has finished.
					select {
					case <-stop:
						return
					default:
					}
					// Starting one pod on each node.
					for _, pod := range podsByRun[runIdx] {
						// A pod that fails to start is only logged; the
						// remaining pods are still started.
						if err := pod.Start(f); err != nil {
							framework.Logf("Failed to start pod: %v", err)
						}
					}
					// Wait for the next tick, or leave early when stopped.
					select {
					case <-t.C:
					case <-stop:
						return
					}
				}
			}()

			checker := utils.NewFullIngestionPodLogChecker(p, maxAllowedLostFraction, pods...)
			err := utils.WaitForLogs(checker, ingestionInterval, ingestionTimeout)
			framework.ExpectNoError(err)

			utils.EnsureLoggingAgentRestartsCount(f, p.LoggingAgentName(), allowedRestarts)
		})
	})
})