Secure metrics endpoint with oauth-proxy #76

Merged
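
This PR puts the Elasticsearch metrics endpoint behind an oauth-proxy sidecar so that Prometheus must authenticate before scraping. The sidecar definition itself is outside the hunks shown below, so the following is only a rough sketch of the general pattern; the container name, image tag, ports, service account, and certificate paths are illustrative placeholders, not values taken from this PR:

```yaml
# Illustrative sketch, not copied from this PR: an oauth-proxy sidecar that
# terminates TLS and authenticates requests before forwarding them to the
# pod-local, unauthenticated metrics port.
- name: metrics-proxy                                   # hypothetical container name
  image: quay.io/openshift/origin-oauth-proxy:latest    # illustrative image reference
  args:
    - --provider=openshift
    - --https-address=:60000                 # externally scraped, TLS-secured port (example value)
    - --upstream=http://localhost:60001      # plain metrics endpoint, reachable only inside the pod (example value)
    - --openshift-service-account=elasticsearch   # assumes the operand's service account name
    - --tls-cert=/etc/proxy/secrets/tls.crt       # example mount path
    - --tls-key=/etc/proxy/secrets/tls.key
    - --cookie-secret-file=/etc/proxy/secrets/session_secret
  ports:
    - containerPort: 60000
      name: metrics
```
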
19 changes: 9 additions & 10 deletions files/prometheus_alerts.yml
@@ -4,7 +4,7 @@
"rules":
- "alert": "ElasticsearchClusterNotHealthy"
"annotations":
"description": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
"message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
"summary": "Cluster health status is RED"
"expr": |
sum by (cluster) (es_cluster_status == 2)
@@ -13,7 +13,7 @@
"severity": "critical"
- "alert": "ElasticsearchClusterNotHealthy"
"annotations":
"description": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
"message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
"summary": "Cluster health status is YELLOW"
"expr": |
sum by (cluster) (es_cluster_status == 1)
@@ -22,7 +22,7 @@
"severity": "warning"
- "alert": "ElasticsearchBulkRequestsRejectionJumps"
"annotations":
"description": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
"message": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
"summary": "High Bulk Rejection Ratio - {{ $value }}%"
"expr": |
round( bulk:reject_ratio:rate2m * 100, 0.001 ) > 5
@@ -31,7 +31,7 @@
"severity": "warning"
- "alert": "ElasticsearchNodeDiskWatermarkReached"
"annotations":
"description": "Disk Low Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
"message": "Disk Low Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
"summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%"
"expr": |
sum by (cluster, instance, node) (
@@ -47,7 +47,7 @@
"severity": "alert"
- "alert": "ElasticsearchNodeDiskWatermarkReached"
"annotations":
"description": "Disk High Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
"message": "Disk High Watermark Reached at {{ $labels.node }} node in {{ $labels.cluster }} cluster. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
"summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%"
"expr": |
sum by (cluster, instance, node) (
@@ -63,7 +63,7 @@
"severity": "high"
- "alert": "ElasticsearchNodeDiskLowForSegmentMerges"
"annotations":
"description": "Free disk at {{ $labels.node }} node in {{ $labels.cluster }} cluster may be low for optimal segment merges"
"message": "Free disk at {{ $labels.node }} node in {{ $labels.cluster }} cluster may be low for optimal segment merges"
"summary": "Free disk may be low for optimal segment merges"
"expr": |
sum by (cluster, instance, node) (es_fs_path_free_bytes) /
@@ -74,7 +74,7 @@
"severity": "warning"
- "alert": "ElasticsearchJVMHeapUseHigh"
"annotations":
"description": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
"message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
"summary": "JVM Heap usage on the node is high"
"expr": |
sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75
@@ -83,7 +83,7 @@
"severity": "alert"
- "alert": "AggregatedLoggingSystemCPUHigh"
"annotations":
"description": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"summary": "System CPU usage is high"
"expr": |
sum by (cluster, instance, node) (es_os_cpu_percent) > 90
@@ -92,11 +92,10 @@
"severity": "alert"
- "alert": "ElasticsearchProcessCPUHigh"
"annotations":
"description": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%"
"summary": "ES process CPU usage is high"
"expr": |
sum by (cluster, instance, node) (es_process_cpu_percent) > 90
"for": "1m"
"labels":
"severity": "alert"

4 changes: 3 additions & 1 deletion hack/cr.yaml
@@ -9,9 +9,11 @@ spec:
image: quay.io/openshift/origin-logging-elasticsearch5:latest
resources:
limits:
+cpu: 500m
memory: 1Gi
requests:
-memory: 512Mi
+cpu: 500m
+memory: 1Gi
nodes:
- nodeCount: 1
roles:
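
For reference, the resulting resources stanza in hack/cr.yaml now pins both CPU and memory on limits and requests (indentation restored from the hunk above; the surrounding fields are unchanged):

```yaml
resources:
  limits:
    cpu: 500m
    memory: 1Gi
  requests:
    cpu: 500m
    memory: 1Gi
```
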
17 changes: 15 additions & 2 deletions hack/deploy-setup.sh
@@ -7,6 +7,20 @@ set -euxo pipefail

source "$(dirname $0)/common"

+if [[ -z `oc get project ${NAMESPACE} 2> /dev/null` ]] ; then
+  cat <<EOF | oc create -f -
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ${NAMESPACE}
+  annotations:
+    openshift.io/node-selector: ""
+  labels:
+    openshift.io/cluster-logging: "true"
+    openshift.io/cluster-monitoring: "true"
+EOF
+fi
+
load_manifest() {
local repo=$1
local namespace=${2:-}
@@ -23,12 +37,11 @@ load_manifest() {
popd
}

-oc create namespace ${NAMESPACE} ||:
load_manifest ${repo_dir} ${NAMESPACE}

#hack openshift-monitoring
pushd vendor/github.com/coreos/prometheus-operator/example/prometheus-operator-crd
-for file in prometheusrule.crd.yaml servicemonitor.crd.yaml; do
+for file in prometheusrule.crd.yaml servicemonitor.crd.yaml; do
oc create -n ${NAMESPACE} -f ${file} ||:
done
popd
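
Once hack/deploy-setup.sh has run, the namespace bootstrap added above and the CRD creation can be sanity-checked with a couple of `oc` queries; the CRD names below are the standard prometheus-operator ones and are an assumption, since they are not spelled out in this diff:

```bash
# Confirm the namespace exists and carries the logging/monitoring labels set above.
oc get namespace "${NAMESPACE}" --show-labels

# Confirm the prometheus-operator CRDs were registered
# (standard names assumed, not taken from this diff).
oc get crd prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com
```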