diff --git a/openshift/template-monitoring-postgres.yaml b/openshift/template-monitoring-postgres.yaml
new file mode 100644
index 0000000000..20169963cf
--- /dev/null
+++ b/openshift/template-monitoring-postgres.yaml
@@ -0,0 +1,345 @@
+# OpenShift Template that deploys a Prometheus postgres_exporter: a ConfigMap
+# with custom queries, a Service, and a Deployment whose database connection
+# settings are read from the assisted-installer-rds secret.
+apiVersion: template.openshift.io/v1
+kind: Template
+metadata:
+  name: assisted-installer-monitoring-postgres
+objects:
+# Custom queries file; the Deployment mounts config.yaml and passes it to the
+# exporter through --extend.query-path.
+- apiVersion: v1
+  kind: ConfigMap
+  metadata:
+    name: assisted-installer-prometheus-postgres-exporter-queries
+    labels:
+      app: assisted-installer-prometheus-postgres-exporter
+  data:
+    # NOTE(review): only config.yaml is mounted by the Deployment (subPath), so
+    # this key appears unused by the exporter - confirm whether it is needed.
+    allow-snippet-annotations: "false"
+    config.yaml: |
+      # The query's column alias must match the key listed under `metrics`;
+      # otherwise the declared usage/description is not applied to the column.
+      assisted_installer_events:
+        query: "SELECT COALESCE(MAX(id), 0) AS max_event_id FROM events"
+        metrics:
+          - max_event_id:
+              usage: "COUNTER"
+              description: "Numeric ID for events. Equivalent to a ever-increasing counter"
+
+      pg_replication:
+        query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
+        master: true
+        metrics:
+          - lag:
+              usage: "GAUGE"
+              description: "Replication lag behind master in seconds"
+
+      pg_postmaster:
+        query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
+        master: true
+        metrics:
+          - start_time_seconds:
+              usage: "GAUGE"
+              description: "Time at which postmaster started"
+
+      pg_stat_user_tables:
+        query: |
+          SELECT
+            current_database() datname,
+            schemaname,
+            relname,
+            seq_scan,
+            seq_tup_read,
+            idx_scan,
+            idx_tup_fetch,
+            n_tup_ins,
+            n_tup_upd,
+            n_tup_del,
+            n_tup_hot_upd,
+            n_live_tup,
+            n_dead_tup,
+            n_mod_since_analyze,
+            COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
+            COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
+            COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
+            COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
+            vacuum_count,
+            autovacuum_count,
+            analyze_count,
+            autoanalyze_count
+          FROM
+            pg_stat_user_tables
+        metrics:
+          - datname:
+              usage: "LABEL"
+              description: "Name of current database"
+          - schemaname:
+              usage: "LABEL"
+              description: "Name of the schema that this table is in"
+          - relname:
+              usage: "LABEL"
+              description: "Name of this table"
+          - seq_scan:
+              usage: "COUNTER"
+              description: "Number of sequential scans initiated on this table"
+          - seq_tup_read:
+              usage: "COUNTER"
+              description: "Number of live rows fetched by sequential scans"
+          - idx_scan:
+              usage: "COUNTER"
+              description: "Number of index scans initiated on this table"
+          - idx_tup_fetch:
+              usage: "COUNTER"
+              description: "Number of live rows fetched by index scans"
+          - n_tup_ins:
+              usage: "COUNTER"
+              description: "Number of rows inserted"
+          - n_tup_upd:
+              usage: "COUNTER"
+              description: "Number of rows updated"
+          - n_tup_del:
+              usage: "COUNTER"
+              description: "Number of rows deleted"
+          - n_tup_hot_upd:
+              usage: "COUNTER"
+              description: "Number of rows HOT updated (i.e., with no separate index update required)"
+          - n_live_tup:
+              usage: "GAUGE"
+              description: "Estimated number of live rows"
+          - n_dead_tup:
+              usage: "GAUGE"
+              description: "Estimated number of dead rows"
+          - n_mod_since_analyze:
+              usage: "GAUGE"
+              description: "Estimated number of rows changed since last analyze"
+          - last_vacuum:
+              usage: "GAUGE"
+              description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
+          - last_autovacuum:
+              usage: "GAUGE"
+              description: "Last time at which this table was vacuumed by the autovacuum daemon"
+          - last_analyze:
+              usage: "GAUGE"
+              description: "Last time at which this table was manually analyzed"
+          - last_autoanalyze:
+              usage: "GAUGE"
+              description: "Last time at which this table was analyzed by the autovacuum daemon"
+          - vacuum_count:
+              usage: "COUNTER"
+              description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
+          - autovacuum_count:
+              usage: "COUNTER"
+              description: "Number of times this table has been vacuumed by the autovacuum daemon"
+          - analyze_count:
+              usage: "COUNTER"
+              description: "Number of times this table has been manually analyzed"
+          - autoanalyze_count:
+              usage: "COUNTER"
+              description: "Number of times this table has been analyzed by the autovacuum daemon"
+
+      pg_statio_user_tables:
+        query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
+        metrics:
+          - datname:
+              usage: "LABEL"
+              description: "Name of current database"
+          - schemaname:
+              usage: "LABEL"
+              description: "Name of the schema that this table is in"
+          - relname:
+              usage: "LABEL"
+              description: "Name of this table"
+          - heap_blks_read:
+              usage: "COUNTER"
+              description: "Number of disk blocks read from this table"
+          - heap_blks_hit:
+              usage: "COUNTER"
+              description: "Number of buffer hits in this table"
+          - idx_blks_read:
+              usage: "COUNTER"
+              description: "Number of disk blocks read from all indexes on this table"
+          - idx_blks_hit:
+              usage: "COUNTER"
+              description: "Number of buffer hits in all indexes on this table"
+          - toast_blks_read:
+              usage: "COUNTER"
+              description: "Number of disk blocks read from this table's TOAST table (if any)"
+          - toast_blks_hit:
+              usage: "COUNTER"
+              description: "Number of buffer hits in this table's TOAST table (if any)"
+          - tidx_blks_read:
+              usage: "COUNTER"
+              description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
+          - tidx_blks_hit:
+              usage: "COUNTER"
+              description: "Number of buffer hits in this table's TOAST table indexes (if any)"
+
+      pg_database:
+        query: "SELECT pg_database.datname, pg_database_size(pg_database.datname) as size_bytes FROM pg_database"
+        master: true
+        cache_seconds: 30
+        metrics:
+          - datname:
+              usage: "LABEL"
+              description: "Name of the database"
+          - size_bytes:
+              usage: "GAUGE"
+              description: "Disk space used by the database"
+
+
+      pg_stat_activity_idle:
+        query: |
+          WITH
+            -- Sum and count of idle-session durations per application.
+            metrics AS (
+              SELECT
+                application_name,
+                SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_seconds_sum,
+                COUNT(*) AS process_seconds_count
+              FROM pg_stat_activity
+              WHERE state = 'idle'
+              GROUP BY application_name
+            ),
+            -- Cumulative session counts per upper bound "le" (seconds), limited to
+            -- idle sessions so the buckets agree with the sum/count above.
+            buckets AS (
+              SELECT
+                application_name,
+                le,
+                SUM(
+                  CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
+                    THEN 1
+                    ELSE 0
+                  END
+                )::bigint AS bucket
+              FROM
+                pg_stat_activity
+                CROSS JOIN UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
+              WHERE state = 'idle'
+              GROUP BY application_name, le
+              ORDER BY application_name, le
+            )
+          SELECT
+            application_name,
+            process_seconds_sum,
+            process_seconds_count,
+            ARRAY_AGG(le) AS process_seconds,
+            ARRAY_AGG(bucket) AS process_seconds_bucket
+          FROM metrics INNER JOIN buckets USING (application_name)
+          GROUP BY application_name, process_seconds_sum, process_seconds_count
+        metrics:
+          - application_name:
+              usage: "LABEL"
+              description: "Application Name"
+          - process_seconds:
+              usage: "HISTOGRAM"
+              description: "Idle time of server processes"
+# Service exposing the exporter's HTTP port (9187) under the port name "http".
+- apiVersion: v1
+  kind: Service
+  metadata:
+    name: assisted-installer-prometheus-postgres-exporter
+    labels:
+      app: assisted-installer-prometheus-postgres-exporter
+  spec:
+    type: ClusterIP
+    ports:
+    - port: 9187
+      targetPort: 9187
+      protocol: TCP
+      name: http
+    selector:
+      app: assisted-installer-prometheus-postgres-exporter
+# Single-replica exporter Deployment; the queries file is mounted at /etc/config.yaml.
+- apiVersion: apps/v1
+  kind: Deployment
+  metadata:
+    name: assisted-installer-prometheus-postgres-exporter
+    labels:
+      app: assisted-installer-prometheus-postgres-exporter
+  spec:
+    replicas: 1
+    selector:
+      matchLabels:
+        app: assisted-installer-prometheus-postgres-exporter
+    template:
+      metadata:
+        labels:
+          app: assisted-installer-prometheus-postgres-exporter
+      spec:
+        containers:
+        - name: prometheus-postgres-exporter
+          args:
+          - "--extend.query-path=/etc/config.yaml"
+          env:
+          - name: DB_HOST
+            valueFrom:
+              secretKeyRef:
+                name: assisted-installer-rds
+                key: db.host
+          # NOTE(review): sslmode=disable means the exporter connects without TLS;
+          # confirm this is intended for the target database.
+          - name: DATA_SOURCE_URI
+            value: "$(DB_HOST):5432/?sslmode=disable"
+          - name: DATA_SOURCE_USER
+            valueFrom:
+              secretKeyRef:
+                name: assisted-installer-rds
+                key: db.user
+          - name: DATA_SOURCE_PASS
+            valueFrom:
+              secretKeyRef:
+                name: assisted-installer-rds
+                key: db.password
+          image: "${PG_EXPORTER_IMAGE}"
+          imagePullPolicy: ${PG_EXPOTER_PULL_POLICY}
+          ports:
+          - name: http
+            containerPort: 9187
+            protocol: TCP
+          livenessProbe:
+            initialDelaySeconds: 0
+            timeoutSeconds: 1
+            httpGet:
+              path: /
+              port: http
+          readinessProbe:
+            initialDelaySeconds: 0
+            timeoutSeconds: 1
+            httpGet:
+              path: /
+              port: http
+          resources:
+            limits:
+              cpu: ${PG_EXPORTER_CPU_LIMIT}
+              memory: ${PG_EXPORTER_MEMORY_LIMIT}
+            requests:
+              cpu: ${PG_EXPORTER_CPU_REQUEST}
+              memory: ${PG_EXPORTER_MEMORY_REQUEST}
+
+          volumeMounts:
+          - name: queries
+            mountPath: /etc/config.yaml
+            subPath: config.yaml
+        volumes:
+        - configMap:
+            defaultMode: 420
+            name: assisted-installer-prometheus-postgres-exporter-queries
+          name: queries
+parameters:
+- name: PG_EXPORTER_CPU_REQUEST
+  value: "1m"
+- name: PG_EXPORTER_CPU_LIMIT
+  value: "1"
+- name: PG_EXPORTER_MEMORY_REQUEST
+  value: "16Mi"
+- name: PG_EXPORTER_MEMORY_LIMIT
+  value: "64Mi"
+- name: PG_EXPORTER_IMAGE
+  value: "quay.io/prometheuscommunity/postgres-exporter:v0.10.1"
+# NOTE: "EXPOTER" is a typo for "EXPORTER"; the name is kept as-is because it
+# is part of this template's parameter interface.
+- name: PG_EXPOTER_PULL_POLICY
+  value: IfNotPresent
diff --git a/openshift/template-monitoring.yaml b/openshift/template-monitoring.yaml
index ecfe5afe1e..6e5b5d392b 100644
--- a/openshift/template-monitoring.yaml
+++ b/openshift/template-monitoring.yaml
@@ -3,7 +3,7 @@ parameters:
 - name: NAMESPACE
   value: ''
   required: true
-apiVersion: v1
+apiVersion: template.openshift.io/v1
 kind: Template
 metadata:
   name: assisted-installer
@@ -26,3 +26,22 @@ objects:
     selector:
       matchLabels:
         app: assisted-service
+# Scrapes the postgres exporter Service (port "http", path /metrics) every 30s.
+- apiVersion: monitoring.coreos.com/v1
+  kind: ServiceMonitor
+  metadata:
+    labels:
+      prometheus: app-sre
+    name: servicemonitor-assisted-installer-postgres
+  spec:
+    endpoints:
+    - interval: 30s
+      path: /metrics
+      port: http
+      scheme: http
+    namespaceSelector:
+      matchNames:
+      - ${NAMESPACE}
+    selector:
+      matchLabels:
+        app: assisted-installer-prometheus-postgres-exporter