Secure metrics endpoint with oauth-proxy
The Prometheus server can't access ES metrics, because ES uses certs
signed by a self-signed CA, which Prometheus doesn't recognize.

Instead, a proxy is put in front of the metrics endpoint; it presents
certs signed by a CA that can be accessed through serving-certs-ca-bundle.
Josef Karasek committed Feb 27, 2019
1 parent 2263e57 commit f5ab6c1
Showing 16 changed files with 435 additions and 1,117 deletions.
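As background for the diffs below, here is a minimal sketch of the oauth-proxy sidecar pattern the commit message describes: the proxy sits in front of the Elasticsearch metrics endpoint and presents a service serving certificate instead of the self-signed ES certificate. The container name, image tag, port, flags, and paths are illustrative assumptions, not copied from this commit:

# Hypothetical sidecar container added to the Elasticsearch pod spec.
# tls.crt/tls.key come from a secret populated by the service CA
# (service.alpha.openshift.io/serving-cert-secret-name annotation).
- name: metrics-proxy
  image: quay.io/openshift/origin-oauth-proxy:latest
  args:
  - --provider=openshift
  - --https-address=:60001              # port Prometheus will scrape
  - --upstream=https://localhost:9200   # ES metrics behind the proxy
  - --tls-cert=/etc/proxy/secrets/tls.crt
  - --tls-key=/etc/proxy/secrets/tls.key
  ports:
  - containerPort: 60001
    name: metrics
  volumeMounts:
  - name: metrics-proxy-tls
    mountPath: /etc/proxy/secrets

Prometheus can then trust the proxy's certificate because it chains to the cluster service CA, which is what the serving-certs-ca-bundle mentioned in the commit message distributes.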
19 changes: 9 additions & 10 deletions files/prometheus_alerts.yml
@@ -4,7 +4,7 @@
"rules":
- "alert": "ElasticsearchClusterNotHealthy"
"annotations":
"description": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
"message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
"summary": "Cluster health status is RED"
"expr": |
sum by (cluster) (es_cluster_status == 2)
@@ -13,7 +13,7 @@
"severity": "critical"
- "alert": "ElasticsearchClusterNotHealthy"
"annotations":
"description": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
"message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
"summary": "Cluster health status is YELLOW"
"expr": |
sum by (cluster) (es_cluster_status == 1)
@@ -22,7 +22,7 @@
"severity": "warning"
- "alert": "ElasticsearchBulkRequestsRejectionJumps"
"annotations":
"description": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
"message": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
"summary": "High Bulk Rejection Ratio - {{ $value }}%"
"expr": |
round( bulk:reject_ratio:rate2m * 100, 0.001 ) > 5
@@ -31,7 +31,7 @@
"severity": "warning"
- "alert": "ElasticsearchNodeDiskWatermarkReached"
"annotations":
"description": "Disk Low Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
"message": "Disk Low Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
"summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%"
"expr": |
sum by (cluster, instance, node) (
@@ -47,7 +47,7 @@
"severity": "alert"
- "alert": "ElasticsearchNodeDiskWatermarkReached"
"annotations":
"description": "Disk High Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
"message": "Disk High Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
"summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%"
"expr": |
sum by (cluster, instance, node) (
@@ -63,7 +63,7 @@
"severity": "high"
- "alert": "ElasticsearchNodeDiskLowForSegmentMerges"
"annotations":
"description": "Free disk at {{ $labels.node }} node in {{ $labels.cluster }} cluster may be low for optimal segment merges"
"message": "Free disk at {{ $labels.node }} node in {{ $labels.cluster }} cluster may be low for optimal segment merges"
"summary": "Free disk may be low for optimal segment merges"
"expr": |
sum by (cluster, instance, node) (es_fs_path_free_bytes) /
@@ -74,7 +74,7 @@
"severity": "warning"
- "alert": "ElasticsearchJVMHeapUseHigh"
"annotations":
"description": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
"message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
"summary": "JVM Heap usage on the node is high"
"expr": |
sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75
@@ -83,7 +83,7 @@
"severity": "alert"
- "alert": "AggregatedLoggingSystemCPUHigh"
"annotations":
"description": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"summary": "System CPU usage is high"
"expr": |
sum by (cluster, instance, node) (es_os_cpu_percent) > 90
@@ -92,11 +92,10 @@
"severity": "alert"
- "alert": "ElasticsearchProcessCPUHigh"
"annotations":
"description": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"summary": "ES process CPU usage is high"
"expr": |
sum by (cluster, instance, node) (es_process_cpu_percent) > 90
"for": "1m"
"labels":
"severity": "alert"

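The file above is plain Prometheus rule syntax; on a cluster running the Prometheus Operator (whose prometheusrule.crd.yaml is installed by hack/deploy-setup.sh below), rules like these are typically wrapped in a PrometheusRule object. A hypothetical wrapper for the first alert follows; the object name, namespace, and group name are illustrative, and the 2m duration is inferred from the alert text rather than shown in the hunk:

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: elasticsearch-prometheus-rules   # illustrative name
  namespace: openshift-logging           # illustrative namespace
spec:
  groups:
  - name: logging_elasticsearch.alerts   # illustrative group name
    rules:
    - alert: ElasticsearchClusterNotHealthy
      annotations:
        message: "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
        summary: "Cluster health status is RED"
      expr: |
        sum by (cluster) (es_cluster_status == 2)
      for: 2m                             # inferred from the alert text
      labels:
        severity: critical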
4 changes: 3 additions & 1 deletion hack/cr.yaml
@@ -9,9 +9,11 @@ spec:
image: quay.io/openshift/origin-logging-elasticsearch5:latest
resources:
limits:
+cpu: 500m
memory: 1Gi
requests:
-memory: 512Mi
+cpu: 500m
+memory: 1Gi
nodes:
- nodeCount: 1
roles:
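Read together, the flattened hunk above gives the container identical requests and limits. The resulting resources stanza in hack/cr.yaml, reconstructed from the hunk, is:

resources:
  limits:
    cpu: 500m
    memory: 1Gi
  requests:
    cpu: 500m
    memory: 1Gi

With requests equal to limits (and assuming every container in the pod does the same), the pod gets the Guaranteed QoS class, making it less likely to be evicted under node pressure.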
17 changes: 15 additions & 2 deletions hack/deploy-setup.sh
@@ -7,6 +7,20 @@ set -euxo pipefail

source "$(dirname $0)/common"

+if [[ -z `oc get project ${NAMESPACE} 2> /dev/null` ]] ; then
+cat <<EOF | oc create -f -
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ${NAMESPACE}
+  annotations:
+    openshift.io/node-selector: ""
+  labels:
+    openshift.io/cluster-logging: "true"
+    openshift.io/cluster-monitoring: "true"
+EOF
+fi

load_manifest() {
local repo=$1
local namespace=${2:-}
@@ -23,12 +37,11 @@
popd
}

-oc create namespace ${NAMESPACE} ||:
load_manifest ${repo_dir} ${NAMESPACE}

#hack openshift-monitoring
pushd vendor/github.com/coreos/prometheus-operator/example/prometheus-operator-crd
-for file in prometheusrule.crd.yaml servicemonitor.crd.yaml; do
+for file in prometheusrule.crd.yaml servicemonitor.crd.yaml; do
oc create -n ${NAMESPACE} -f ${file} ||:
done
popd
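The servicemonitor.crd.yaml installed above is what lets Prometheus discover the proxied metrics endpoint. A hypothetical ServiceMonitor matching the approach in the commit message, scraping over HTTPS and validating the proxy's certificate against the service CA bundle, might look like the following; the service name, label selector, metrics path, port name, and mount paths are assumptions, not taken from this commit:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: monitor-elasticsearch-cluster    # illustrative name
  namespace: openshift-logging            # illustrative namespace
spec:
  selector:
    matchLabels:
      cluster-name: elasticsearch         # label on the metrics Service
  endpoints:
  - port: metrics
    path: /_prometheus/metrics            # illustrative exporter path
    scheme: https
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    tlsConfig:
      # CA bundle mounted into the Prometheus pods; this is the
      # serving-certs-ca-bundle the commit message refers to.
      caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
      serverName: elasticsearch-metrics.openshift-logging.svc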
