Bug 1811834: Sync jsonnet dependencies #722

Merged: 5 commits, Mar 30, 2020
334 changes: 62 additions & 272 deletions assets/grafana/dashboard-definitions.yaml

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions assets/node-exporter/daemonset.yaml
@@ -2,17 +2,20 @@ apiVersion: apps/v1
 kind: DaemonSet
 metadata:
   labels:
-    app: node-exporter
+    app.kubernetes.io/name: node-exporter
+    app.kubernetes.io/version: v0.18.1
   name: node-exporter
   namespace: openshift-monitoring
 spec:
   selector:
     matchLabels:
-      app: node-exporter
+      app.kubernetes.io/name: node-exporter
+      app.kubernetes.io/version: v0.18.1
   template:
     metadata:
       labels:
-        app: node-exporter
+        app.kubernetes.io/name: node-exporter
+        app.kubernetes.io/version: v0.18.1
     spec:
       containers:
       - args:
8 changes: 5 additions & 3 deletions assets/node-exporter/service-monitor.yaml
@@ -2,7 +2,8 @@ apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
   labels:
-    k8s-app: node-exporter
+    app.kubernetes.io/name: node-exporter
+    app.kubernetes.io/version: v0.18.1
   name: node-exporter
   namespace: openshift-monitoring
 spec:
@@ -22,7 +23,8 @@ spec:
       caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
       insecureSkipVerify: false
       serverName: server-name-replaced-at-runtime
-  jobLabel: k8s-app
+  jobLabel: app.kubernetes.io/name
   selector:
     matchLabels:
-      k8s-app: node-exporter
+      app.kubernetes.io/name: node-exporter
+      app.kubernetes.io/version: v0.18.1
6 changes: 4 additions & 2 deletions assets/node-exporter/service.yaml
@@ -4,7 +4,8 @@ metadata:
   annotations:
     service.alpha.openshift.io/serving-cert-secret-name: node-exporter-tls
   labels:
-    k8s-app: node-exporter
+    app.kubernetes.io/name: node-exporter
+    app.kubernetes.io/version: v0.18.1
   name: node-exporter
   namespace: openshift-monitoring
 spec:
@@ -14,4 +15,5 @@ spec:
     port: 9100
     targetPort: https
   selector:
-    app: node-exporter
+    app.kubernetes.io/name: node-exporter
+    app.kubernetes.io/version: v0.18.1
61 changes: 44 additions & 17 deletions assets/prometheus-k8s/rules.yaml
@@ -182,22 +182,22 @@ spec:
   - name: kube-apiserver.rules
     rules:
     - expr: |
-        sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
+        sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
         /
-        sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod)
+        sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
       record: cluster:apiserver_request_duration_seconds:mean5m
     - expr: |
-        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
+        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.99"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
+        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.9"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
     - expr: |
-        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) without(instance, pod))
+        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
       labels:
         quantile: "0.5"
       record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
@@ -804,6 +804,44 @@ spec:
       for: 1h
       labels:
         severity: warning
+    - alert: NodeHighNumberConntrackEntriesUsed
+      annotations:
+        description: '{{ $value | humanizePercentage }} of conntrack entries are used'
+        summary: Number of conntrack are getting close to the limit
+      expr: |
+        (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+      labels:
+        severity: warning
+    - alert: NodeClockSkewDetected
+      annotations:
+        message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
+          Ensure NTP is configured correctly on this host.
+        summary: Clock skew detected.
+      expr: |
+        (
+          node_timex_offset_seconds > 0.05
+        and
+          deriv(node_timex_offset_seconds[5m]) >= 0
+        )
+        or
+        (
+          node_timex_offset_seconds < -0.05
+        and
+          deriv(node_timex_offset_seconds[5m]) <= 0
+        )
+      for: 10m
+      labels:
+        severity: warning
+    - alert: NodeClockNotSynchronising
+      annotations:
+        message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP
+          is configured on this host.
+        summary: Clock not synchronising.
+      expr: |
+        min_over_time(node_timex_sync_status[5m]) == 0

Contributor: We don't need the job label selector here, like we had for ClockSkewDetected?

Contributor Author: That metric comes only from node_exporter, which is deployed as a DaemonSet, so I don't see why we would need it.

+      for: 10m
+      labels:
+        severity: warning
   - name: kubernetes-apps
     rules:
     - alert: KubePodCrashLooping
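
A note on the NodeClockNotSynchronising thread above: the reviewer asked whether the expression should carry a job label selector, as the removed ClockSkewDetected rule did (see the node-time group deleted further down in this file). If it were added, the expression would presumably read as in the sketch below. This is only an illustration of the suggestion, not what was merged; the merged rule omits the selector because node_timex_sync_status is exported only by node-exporter's DaemonSet.

    # Hypothetical variant with an explicit job selector (not part of this PR):
    expr: |
      min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
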
@@ -1286,7 +1324,7 @@ spec:
         message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
           }} of its Pod capacity.
       expr: |
-        max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
+        max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
       for: 15m
       labels:
         severity: warning
@@ -1602,17 +1640,6 @@ spec:
       expr: vector(1)
       labels:
         severity: none
-  - name: node-time
-    rules:
-    - alert: ClockSkewDetected
-      annotations:
-        message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod
-          }}. Ensure NTP is configured correctly on this host.
-      expr: |
-        abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05
-      for: 2m
-      labels:
-        severity: warning
   - name: node-network
     rules:
     - alert: NodeNetworkInterfaceFlapping
12 changes: 12 additions & 0 deletions assets/prometheus-operator-user-workload/cluster-role.yaml
@@ -87,3 +87,15 @@ rules:
   - get
   - list
   - watch
+- apiGroups:

Contributor: Do we know why this was added for prometheus-operator in kube-prom? I can't seem to find it in the prometheus-operator repo itself (https://github.com/coreos/prometheus-operator/blob/master/example/rbac/prometheus-operator/prometheus-operator-cluster-role.yaml); should it be updated there?

Contributor Author: Without those, kube-rbac-proxy didn't work and metrics couldn't be scraped.

Contributor: Okay, that explains it, thanks!

+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
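
For context on the RBAC thread above: kube-rbac-proxy guards the operator's /metrics endpoint by validating the scraper's bearer token with a TokenReview and then checking its permissions with a SubjectAccessReview, which is why the ClusterRole needs create on both resources. The sketch below is illustrative only; the user and request attributes are assumptions, not taken from this PR.

    # Roughly the objects kube-rbac-proxy creates per scrape (illustrative sketch)
    apiVersion: authentication.k8s.io/v1
    kind: TokenReview
    spec:
      token: <bearer token presented by the scraping Prometheus>
    ---
    apiVersion: authorization.k8s.io/v1
    kind: SubjectAccessReview
    spec:
      # identity resolved from the TokenReview result, e.g. a service account
      user: system:serviceaccount:openshift-monitoring:prometheus-k8s
      nonResourceAttributes:
        path: /metrics
        verb: get
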
26 changes: 26 additions & 0 deletions assets/prometheus-operator-user-workload/deployment.yaml
@@ -43,6 +43,28 @@ spec:
             memory: 60Mi
         securityContext: {}
         terminationMessagePolicy: FallbackToLogsOnError
+      - args:
+        - --logtostderr
+        - --secure-listen-address=:8443
+        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
+        - --upstream=http://127.0.0.1:8080/
+        - --tls-cert-file=/etc/tls/private/tls.crt
+        - --tls-private-key-file=/etc/tls/private/tls.key
+        image: quay.io/coreos/kube-rbac-proxy:v0.4.1
+        name: kube-rbac-proxy
+        ports:
+        - containerPort: 8443
+          name: https
+        resources:
+          requests:
+            cpu: 1m
+            memory: 40Mi
+        securityContext: {}
+        terminationMessagePolicy: FallbackToLogsOnError
+        volumeMounts:
+        - mountPath: /etc/tls/private
+          name: prometheus-operator-user-workload-tls
+          readOnly: false
       nodeSelector:
         beta.kubernetes.io/os: linux
         node-role.kubernetes.io/master: ""
@@ -53,3 +75,7 @@ spec:
       - effect: NoSchedule
         key: node-role.kubernetes.io/master
         operator: Exists
+      volumes:
+      - name: prometheus-operator-user-workload-tls
+        secret:
+          secretName: prometheus-operator-user-workload-tls
9 changes: 7 additions & 2 deletions assets/prometheus-operator-user-workload/service-monitor.yaml
@@ -9,8 +9,13 @@ metadata:
   namespace: openshift-user-workload-monitoring
 spec:
   endpoints:
-  - honorLabels: true
-    port: http
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    port: https
+    scheme: https
+    tlsConfig:
+      caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
+      serverName: server-name-replaced-at-runtime
   selector:
     matchLabels:
       app.kubernetes.io/component: controller
8 changes: 5 additions & 3 deletions assets/prometheus-operator-user-workload/service.yaml
@@ -1,6 +1,8 @@
 apiVersion: v1
 kind: Service
 metadata:
+  annotations:
+    service.alpha.openshift.io/serving-cert-secret-name: prometheus-operator-user-workload-tls
   labels:
     app.kubernetes.io/component: controller
     app.kubernetes.io/name: prometheus-operator
@@ -10,9 +12,9 @@ metadata:
 spec:
   clusterIP: None
   ports:
-  - name: http
-    port: 8080
-    targetPort: http
+  - name: https
+    port: 8443
+    targetPort: https
   selector:
     app.kubernetes.io/component: controller
     app.kubernetes.io/name: prometheus-operator
12 changes: 12 additions & 0 deletions assets/prometheus-operator/cluster-role.yaml
@@ -87,3 +87,15 @@ rules:
   - get
   - list
   - watch
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
28 changes: 27 additions & 1 deletion assets/prometheus-operator/deployment.yaml
@@ -28,8 +28,8 @@ spec:
         - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.38.0
         - --namespaces=openshift-monitoring
         - --prometheus-instance-namespaces=openshift-monitoring
-        - --alertmanager-instance-namespaces=openshift-monitoring
         - --thanos-ruler-instance-namespaces=openshift-monitoring
+        - --alertmanager-instance-namespaces=openshift-monitoring
        - --config-reloader-cpu=0
         - --config-reloader-memory=0
         image: quay.io/coreos/prometheus-operator:v0.38.0
@@ -43,6 +43,28 @@ spec:
             memory: 60Mi
         securityContext: {}
         terminationMessagePolicy: FallbackToLogsOnError
+      - args:
+        - --logtostderr
+        - --secure-listen-address=:8443
+        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
+        - --upstream=http://127.0.0.1:8080/
+        - --tls-cert-file=/etc/tls/private/tls.crt
+        - --tls-private-key-file=/etc/tls/private/tls.key
+        image: quay.io/coreos/kube-rbac-proxy:v0.4.1
+        name: kube-rbac-proxy
+        ports:
+        - containerPort: 8443
+          name: https
+        resources:
+          requests:
+            cpu: 1m
+            memory: 40Mi
+        securityContext: {}
+        terminationMessagePolicy: FallbackToLogsOnError
+        volumeMounts:
+        - mountPath: /etc/tls/private
+          name: prometheus-operator-tls
+          readOnly: false
       nodeSelector:
         beta.kubernetes.io/os: linux
         node-role.kubernetes.io/master: ""
@@ -53,3 +75,7 @@ spec:
       - effect: NoSchedule
         key: node-role.kubernetes.io/master
         operator: Exists
+      volumes:
+      - name: prometheus-operator-tls
+        secret:
+          secretName: prometheus-operator-tls
9 changes: 7 additions & 2 deletions assets/prometheus-operator/service-monitor.yaml
@@ -9,8 +9,13 @@ metadata:
   namespace: openshift-monitoring
 spec:
   endpoints:
-  - honorLabels: true
-    port: http
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token

Contributor: Why do we have 2 endpoint entries?

Contributor Author: Good catch! Fixing.

+    honorLabels: true
+    port: https
+    scheme: https
+    tlsConfig:
+      caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
+      serverName: server-name-replaced-at-runtime
   selector:
     matchLabels:
       app.kubernetes.io/component: controller
8 changes: 5 additions & 3 deletions assets/prometheus-operator/service.yaml
@@ -1,6 +1,8 @@
 apiVersion: v1
 kind: Service
 metadata:
+  annotations:
+    service.alpha.openshift.io/serving-cert-secret-name: prometheus-operator-tls
   labels:
     app.kubernetes.io/component: controller
     app.kubernetes.io/name: prometheus-operator
@@ -10,9 +12,9 @@ metadata:
 spec:
   clusterIP: None
   ports:
-  - name: http
-    port: 8080
-    targetPort: http
+  - name: https
+    port: 8443
+    targetPort: https
   selector:
     app.kubernetes.io/component: controller
     app.kubernetes.io/name: prometheus-operator