-
Notifications
You must be signed in to change notification settings - Fork 0
/
summary.go
295 lines (253 loc) · 9.81 KB
/
summary.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
// Copyright 2018 The Kubernetes Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package summary
import (
"context"
"fmt"
"math"
"time"
"github.com/golang/glog"
"github.com/kubernetes-incubator/metrics-server/pkg/sources"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/labels"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
v1listers "k8s.io/client-go/listers/core/v1"
stats "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
)
var (
	// summaryRequestLatency records, per node, how long Summary API
	// requests to the Kubelet take, in seconds.
	summaryRequestLatency = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "metrics_server",
			Subsystem: "kubelet_summary",
			Name:      "request_duration_seconds",
			Help:      "The Kubelet summary request latencies in seconds.",
			// TODO(directxman12): it would be nice to calculate these buckets off of scrape duration,
			// like we do elsewhere, but we're not passed the scrape duration at this level.
			Buckets: prometheus.DefBuckets,
		},
		[]string{"node"},
	)
	// scrapeTotal counts Summary API scrape attempts, labeled by whether
	// the scrape succeeded ("true"/"false").
	scrapeTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "metrics_server",
			Subsystem: "kubelet_summary",
			Name:      "scrapes_total",
			Help:      "Total number of attempted Summary API scrapes done by Metrics Server",
		},
		[]string{"success"},
	)
)
// init registers this package's metrics with the default Prometheus registry.
func init() {
	prometheus.MustRegister(summaryRequestLatency)
	prometheus.MustRegister(scrapeTotal)
}
// NodeInfo contains the information needed to identify and connect to a particular node
// (node name and preferred address).
type NodeInfo struct {
	// Name is the node's name as reported by the node lister.
	Name string
	// ConnectAddress is the resolved address used to reach the node's Kubelet.
	ConnectAddress string
}
// Kubelet-provided metrics for pod and system container.
type summaryMetricsSource struct {
	node          NodeInfo         // which node to scrape, and how to reach it
	kubeletClient KubeletInterface // client used to fetch Summary API stats
}
// NewSummaryMetricsSource builds a MetricSource that collects node, pod, and
// container metrics from the given node's Kubelet Summary API.
func NewSummaryMetricsSource(node NodeInfo, client KubeletInterface) sources.MetricSource {
	src := &summaryMetricsSource{
		node:          node,
		kubeletClient: client,
	}
	return src
}
// Name returns a human-readable identifier for this source (same as String).
func (src *summaryMetricsSource) Name() string {
	return src.String()
}
// String identifies this source as a kubelet_summary scrape of its node.
func (src *summaryMetricsSource) String() string {
	return "kubelet_summary:" + src.node.Name
}
// Collect fetches Summary API stats from this source's node and decodes them
// into a MetricsBatch. Points with partial or invalid data (a pod, or the
// node itself) are discarded so that zero values are never reported as real
// metrics; the per-point errors are returned as an aggregate alongside
// whatever usable data was decoded.
func (src *summaryMetricsSource) Collect(ctx context.Context) (*sources.MetricsBatch, error) {
	summary, err := func() (*stats.Summary, error) {
		startTime := time.Now()
		// NB: the Observe call must be wrapped in a closure -- a deferred
		// call's arguments are evaluated when the defer statement executes,
		// so a bare `defer ...Observe(time.Since(startTime)...)` would
		// always record a near-zero latency.
		defer func() {
			summaryRequestLatency.WithLabelValues(src.node.Name).Observe(time.Since(startTime).Seconds())
		}()
		return src.kubeletClient.GetSummary(ctx, src.node.ConnectAddress)
	}()
	if err != nil {
		scrapeTotal.WithLabelValues("false").Inc()
		return nil, fmt.Errorf("unable to fetch metrics from Kubelet %s (%s): %v", src.node.Name, src.node.ConnectAddress, err)
	}
	scrapeTotal.WithLabelValues("true").Inc()

	res := &sources.MetricsBatch{
		Nodes: make([]sources.NodeMetricsPoint, 1),
		Pods:  make([]sources.PodMetricsPoint, len(summary.Pods)),
	}

	var errs []error
	errs = append(errs, src.decodeNodeStats(&summary.Node, &res.Nodes[0])...)
	if len(errs) != 0 {
		// if we had errors providing node metrics, discard the data point
		// so that we don't incorrectly report metric values as zero.
		// NB: truncate to zero length -- `[:1]` would keep the partially
		// populated point in the result.
		res.Nodes = res.Nodes[:0]
	}

	num := 0
	for _, pod := range summary.Pods {
		podErrs := src.decodePodStats(&pod, &res.Pods[num])
		errs = append(errs, podErrs...)
		if len(podErrs) != 0 {
			// NB: we explicitly want to discard pods with partial results, since
			// the horizontal pod autoscaler takes special action when a pod is missing
			// metrics (and zero CPU or memory does not count as "missing metrics")

			// we don't care if we reuse slots in the result array,
			// because they get completely overwritten in decodePodStats
			continue
		}
		num++
	}
	res.Pods = res.Pods[:num]

	return res, utilerrors.NewAggregate(errs)
}
// decodeNodeStats converts raw Kubelet node stats into a NodeMetricsPoint,
// completely overwriting *target. It returns one error for each metric that
// could not be extracted; a missing timestamp invalidates the whole point.
func (src *summaryMetricsSource) decodeNodeStats(nodeStats *stats.NodeStats, target *sources.NodeMetricsPoint) []error {
	scrapeTime, err := getScrapeTime(nodeStats.CPU, nodeStats.Memory)
	if err != nil {
		// without a usable timestamp, assume the data is bad in general
		return []error{fmt.Errorf("unable to get valid timestamp for metric point for node %q, discarding data: %v", src.node.ConnectAddress, err)}
	}

	// overwrite the whole point so no stale values survive
	*target = sources.NodeMetricsPoint{
		Name: src.node.Name,
		MetricsPoint: sources.MetricsPoint{
			Timestamp: scrapeTime,
		},
	}

	var errs []error
	if cpuErr := decodeCPU(&target.CpuUsage, nodeStats.CPU); cpuErr != nil {
		errs = append(errs, fmt.Errorf("unable to get CPU for node %q, discarding data: %v", src.node.ConnectAddress, cpuErr))
	}
	if memErr := decodeMemory(&target.MemoryUsage, nodeStats.Memory); memErr != nil {
		errs = append(errs, fmt.Errorf("unable to get memory for node %q, discarding data: %v", src.node.ConnectAddress, memErr))
	}
	return errs
}
// decodePodStats converts raw Kubelet pod stats into a PodMetricsPoint,
// completely overwriting *target. It returns one error per container metric
// that could not be extracted; containers with errors are left as zero-valued
// slots in target.Containers (callers discard pods with any errors).
func (src *summaryMetricsSource) decodePodStats(podStats *stats.PodStats, target *sources.PodMetricsPoint) []error {
	// completely overwrite data in the target
	*target = sources.PodMetricsPoint{
		Name:       podStats.PodRef.Name,
		Namespace:  podStats.PodRef.Namespace,
		Containers: make([]sources.ContainerMetricsPoint, len(podStats.Containers)),
	}

	var errs []error
	for i := range podStats.Containers {
		container := &podStats.Containers[i]

		scrapeTime, err := getScrapeTime(container.CPU, container.Memory)
		if err != nil {
			// without a usable timestamp, assume the data is bad in general
			errs = append(errs, fmt.Errorf("unable to get a valid timestamp for metric point for container %q in pod %s/%s on node %q, discarding data: %v", container.Name, target.Namespace, target.Name, src.node.ConnectAddress, err))
			continue
		}

		point := sources.ContainerMetricsPoint{
			Name: container.Name,
			MetricsPoint: sources.MetricsPoint{
				Timestamp: scrapeTime,
			},
		}
		if cpuErr := decodeCPU(&point.CpuUsage, container.CPU); cpuErr != nil {
			errs = append(errs, fmt.Errorf("unable to get CPU for container %q in pod %s/%s on node %q, discarding data: %v", container.Name, target.Namespace, target.Name, src.node.ConnectAddress, cpuErr))
		}
		if memErr := decodeMemory(&point.MemoryUsage, container.Memory); memErr != nil {
			errs = append(errs, fmt.Errorf("unable to get memory for container %q in pod %s/%s on node %q: %v, discarding data", container.Name, target.Namespace, target.Name, src.node.ConnectAddress, memErr))
		}

		target.Containers[i] = point
	}
	return errs
}
// decodeCPU stores the CPU usage from cpuStats into target as a scaled
// Quantity (nanocores, scale -9). It reports an error when the stats or the
// usage field are absent, leaving target untouched.
func decodeCPU(target *resource.Quantity, cpuStats *stats.CPUStats) error {
	if cpuStats == nil || cpuStats.UsageNanoCores == nil {
		return fmt.Errorf("missing cpu usage metric")
	}

	nanoCores := *cpuStats.UsageNanoCores
	*target = *uint64Quantity(nanoCores, -9)
	return nil
}
// decodeMemory stores the working-set bytes from memStats into target as a
// binary-SI Quantity. It reports an error when the stats or the working-set
// field are absent, leaving target untouched.
func decodeMemory(target *resource.Quantity, memStats *stats.MemoryStats) error {
	if memStats == nil || memStats.WorkingSetBytes == nil {
		return fmt.Errorf("missing memory usage metric")
	}

	workingSet := *memStats.WorkingSetBytes
	*target = *uint64Quantity(workingSet, 0)
	target.Format = resource.BinarySI
	return nil
}
// getScrapeTime returns the earlier of the CPU and memory stat timestamps,
// so callers can tell whether a data point was tainted by pod initialization.
// It reports an error when neither stat carries a non-zero timestamp.
func getScrapeTime(cpu *stats.CPUStats, memory *stats.MemoryStats) (time.Time, error) {
	var earliest time.Time
	found := false

	if cpu != nil && !cpu.Time.IsZero() {
		earliest = cpu.Time.Time
		found = true
	}
	if memory != nil && !memory.Time.IsZero() && (!found || memory.Time.Time.Before(earliest)) {
		earliest = memory.Time.Time
		found = true
	}

	if !found {
		return time.Time{}, fmt.Errorf("no non-zero timestamp on either CPU or memory")
	}
	return earliest, nil
}
// uint64Quantity converts a uint64 into a Quantity, which only has constructors
// that work with int64 (except for parse, which requires costly round-trips to string).
// We lose precision until we fit in an int64 if greater than the max int64 value.
func uint64Quantity(val uint64, scale resource.Scale) *resource.Quantity {
	// easy path -- we can safely fit val into an int64
	if val <= math.MaxInt64 {
		return resource.NewScaledQuantity(int64(val), scale)
	}

	// NB: log message previously misspelled "losing" as "loosing"
	glog.V(1).Infof("unexpectedly large resource value %v, losing precision to fit in scaled resource.Quantity", val)

	// otherwise, lose a decimal order-of-magnitude of precision,
	// so we can fit into a scaled quantity
	return resource.NewScaledQuantity(int64(val/10), resource.Scale(1)+scale)
}
// summaryProvider produces one summary-API metric source per listed node.
type summaryProvider struct {
	nodeLister    v1listers.NodeLister // lists cluster nodes to scrape
	kubeletClient KubeletInterface     // shared client handed to each source
	addrResolver  NodeAddressResolver  // resolves each node's connect address
}
// GetMetricSources lists all nodes and produces one Summary API metric source
// per node whose connection information could be resolved. Nodes that fail to
// resolve are skipped, and their errors are returned as an aggregate alongside
// the usable sources.
func (p *summaryProvider) GetMetricSources() ([]sources.MetricSource, error) {
	// NB: named `srcs` (not `sources`) so the imported `sources` package is
	// not shadowed inside this function.
	srcs := []sources.MetricSource{}
	nodes, err := p.nodeLister.List(labels.Everything())
	if err != nil {
		return nil, fmt.Errorf("unable to list nodes: %v", err)
	}

	var errs []error
	for _, node := range nodes {
		info, err := p.getNodeInfo(node)
		if err != nil {
			errs = append(errs, fmt.Errorf("unable to extract connection information for node %q: %v", node.Name, err))
			continue
		}
		srcs = append(srcs, NewSummaryMetricsSource(info, p.kubeletClient))
	}
	return srcs, utilerrors.NewAggregate(errs)
}
// getNodeInfo resolves a node's preferred Kubelet address and packages it
// together with the node's name for use by a metric source.
func (p *summaryProvider) getNodeInfo(node *corev1.Node) (NodeInfo, error) {
	addr, err := p.addrResolver.NodeAddress(node)
	if err != nil {
		return NodeInfo{}, err
	}
	return NodeInfo{
		Name:           node.Name,
		ConnectAddress: addr,
	}, nil
}
// NewSummaryProvider builds a MetricSourceProvider that emits one Kubelet
// Summary API source per node known to the given lister.
func NewSummaryProvider(nodeLister v1listers.NodeLister, kubeletClient KubeletInterface, addrResolver NodeAddressResolver) sources.MetricSourceProvider {
	provider := &summaryProvider{
		nodeLister:    nodeLister,
		kubeletClient: kubeletClient,
		addrResolver:  addrResolver,
	}
	return provider
}