Skip to content
This repository has been archived by the owner on Jun 12, 2024. It is now read-only.

Commit

Permalink
log diag to file; add scores and network traffic rate analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
pnickolov committed Sep 23, 2021
1 parent 3c6796e commit 8320c5a
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 23 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
# Analysis sample output (temp)
samples/

# log & output files used in dev/test
*.log
*.out

# goreleaser files
dist/

Expand Down
27 changes: 16 additions & 11 deletions app/model/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ type AppContainer struct {
}

// AppMetrics holds the app-level measurements that the analysis consumes.
type AppMetrics struct {
AverageReplicas float64 `yaml:"average_replicas"`
CpuUtilization float64 `yaml:"cpu_saturation"` // aka Saturation, %
MemoryUtilization float64 `yaml:"memory_saturation"`
// TODO: add network traffic, esp. indication of traffic
AverageReplicas float64 `yaml:"average_replicas"` // averaged over the evaluated time range
CpuUtilization float64 `yaml:"cpu_saturation"` // aka Saturation, in percent, can be 0 or >100
MemoryUtilization float64 `yaml:"memory_saturation"` // aka Saturation, in percent, can be 0 or >100
PacketReceiveRate float64 `yaml:"packet_receive_rate"` // packets received per second
PacketTransmitRate float64 `yaml:"packet_transmit_rate"` // packets transmitted per second
RequestRate float64 `yaml:"request_rate"` // requests per second; when 0, pre-analysis estimates it from the packet rates
}

// AppFlag identifies a yes/no property recorded for an app during analysis;
// it is the key type of the AppAnalysis.Flags map.
type AppFlag int
Expand All @@ -75,13 +77,16 @@ func (f AppFlag) String() string {
}

// AppAnalysis holds the results produced by analyzing a single app.
type AppAnalysis struct {
Rating int `yaml:"rating"` // how suitable for optimization
Confidence int `yaml:"confidence"` // how confident is the rating
MainContainer string `yaml:"main_container"` // container to optimize or empty if not identified
Flags map[AppFlag]bool `yaml:"flags"` // flags
Opportunities []string `yaml:"opportunities"` // list of optimization opportunities
Cautions []string `yaml:"cautions"` // list of concerns/cautions
Blockers []string `yaml:"blockers"` // list of blockers preventing optimization
Rating int `yaml:"rating"` // how suitable for optimization
Confidence int `yaml:"confidence"` // how confident is the rating
MainContainer string `yaml:"main_container"` // container to optimize or empty if not identified
// EfficiencyScore is a weighted blend of CPU and memory utilization, rounded to an integer.
EfficiencyScore int `yaml:"efficiency_score"`
// ReliabilityScore and PerformanceScore are left out of the YAML output when zero.
// NOTE(review): no code visible here assigns them yet - confirm where they get computed.
ReliabilityScore int `yaml:"reliability_score,omitempty"`
PerformanceScore int `yaml:"performance_score,omitempty"`
Flags map[AppFlag]bool `yaml:"flags"` // flags
Opportunities []string `yaml:"opportunities"` // list of optimization opportunities
Cautions []string `yaml:"cautions"` // list of concerns/cautions
Blockers []string `yaml:"blockers"` // list of blockers preventing optimization
}

type App struct {
Expand Down
39 changes: 38 additions & 1 deletion cmd/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ import (
opsmath "opsani-ignite/math"
)

// Weights used to blend CPU and memory utilization into the single efficiency
// score. They sum to 1.0, so the score stays on the same percent scale as the
// utilization values it is computed from.
const (
CPU_WEIGHT = 0.6 // share of the efficiency score contributed by CPU utilization
MEM_WEIGHT = 0.4 // share of the efficiency score contributed by memory utilization
)

type ResourceUtilizationRating struct {
UtilizationFloor float64
RatingBump int
Expand Down Expand Up @@ -274,13 +279,27 @@ func preAnalyzeApp(app *appmodel.App) {
}
}

// validate or determine QoS
computedQos := computePodQoS(app)
if app.Settings.QosClass == "" {
app.Settings.QosClass = computedQos
} else if app.Settings.QosClass != computedQos {
log.Warnf("Computed QoS class %q does not match discovered QoS class %q for app %v; assuming the latter",
computedQos, app.Settings.QosClass, app.Metadata)
}

// validate or determine request rate
// Notes:
// - packet rate can be used as a proxy to requests per second
// - bidirectional traffic is required to consider traffic as requests/replies
computedRps := 0.0
if app.Metrics.PacketReceiveRate > 0 && app.Metrics.PacketTransmitRate > 0 {
computedRps = app.Metrics.PacketReceiveRate // packets received ≈ requests
}
if app.Metrics.RequestRate == 0 {
app.Metrics.RequestRate = opsmath.MagicRound(computedRps)
}

}

func efficiencyImprovementEstimate(app *appmodel.App) string {
Expand Down Expand Up @@ -337,7 +356,7 @@ func analyzeApp(app *appmodel.App) {
o.Blockers = append(o.Blockers, msg)
}

// analyze utilization
// analyze resource utilization
o.Flags[appmodel.F_UTILIZATION] = app.Metrics.CpuUtilization > 0 && app.Metrics.MemoryUtilization > 0
utilBump := utilizationCombinedRating(app.Metrics.CpuUtilization, app.Metrics.MemoryUtilization)
if utilBump != 0 {
Expand All @@ -359,6 +378,24 @@ func analyzeApp(app *appmodel.App) {
}
}

// compute scores
o.EfficiencyScore = int(math.Round(app.Metrics.CpuUtilization*CPU_WEIGHT + app.Metrics.MemoryUtilization*MEM_WEIGHT))

// analyze request rate
if app.Metrics.RequestRate == 0 {
o.Blockers = append(o.Blockers, "No requests are being processed")
o.Flags[appmodel.F_TRAFFIC] = false
} else if app.Metrics.RequestRate < 2 {
o.Cautions = append(o.Cautions, "Low request rate")
o.Rating -= 10
// note: don't set traffic flag
} else {
o.Flags[appmodel.F_TRAFFIC] = true
if app.Metrics.RequestRate > 100 {
o.Rating += 10 // low confidence as we don't know if traffic is served or originated
}
}

// analyze replica count
if app.Metrics.AverageReplicas <= 1 {
o.Rating -= 20
Expand Down
12 changes: 11 additions & 1 deletion cmd/ignite.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ func isQualifiedApp(app *appmodel.App) bool {
}

func runIgnite(cmd *cobra.Command, args []string) {
// set up logging
logFile, err := os.OpenFile("opsani-ignite.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
log.Fatalf("error opening log file: %v", err)
}
defer logFile.Close()
log.SetOutput(logFile)
if showDebug {
log.SetLevel(log.TraceLevel)
} else if suppressWarnings {
Expand All @@ -60,6 +67,7 @@ func runIgnite(cmd *cobra.Command, args []string) {
}

log.Printf("Using Prometheus API at %q\n", promUri)
fmt.Fprintf(os.Stderr, "Using Prometheus API at %q\n", promUri)

// Create root context
ctx := context.Background()
Expand Down Expand Up @@ -106,6 +114,7 @@ func runIgnite(cmd *cobra.Command, args []string) {
if qualified == 0 && deployment == "" { // if a deployment is specified, it will be shown anyway
showAllApps = true
log.Infof("No highly rated applications found. Showing all applications")
fmt.Fprintf(os.Stderr, "No highly rated applications found. Showing all applications")
}
}

Expand All @@ -124,7 +133,8 @@ func runIgnite(cmd *cobra.Command, args []string) {
}
display.WriteOut(table)
if skipped > 0 {
log.Infof("%v applications were not shown due to low rating. Use --show-all to see all apps", skipped)
log.Infof("%v applications were not shown due to low rating", skipped)
fmt.Fprintf(os.Stderr, "%v applications were not shown due to low rating. Use --show-all to see all apps", skipped)
}

}
9 changes: 6 additions & 3 deletions cmd/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ func (table *AppTable) outputTableHeader() {
const RIGHT = tablewriter.ALIGN_RIGHT
const LEFT = tablewriter.ALIGN_LEFT

table.t.SetHeader([]string{"Namespace", "Deployment", "QoS Class", "Instances", "CPU", "Mem", "Opportunity", "Flags"})
table.t.SetColumnAlignment([]int{LEFT, LEFT, LEFT, RIGHT, RIGHT, RIGHT, LEFT, LEFT})
table.t.SetHeader([]string{"Efficiency\nScore", "Namespace", "Deployment", "QoS Class", "Instances", "CPU", "Mem", "Opportunity", "Flags"})
table.t.SetColumnAlignment([]int{RIGHT, LEFT, LEFT, LEFT, RIGHT, RIGHT, RIGHT, LEFT, LEFT})
table.t.SetFooter([]string{})
table.t.SetCenterSeparator("")
table.t.SetColumnSeparator("")
Expand All @@ -106,6 +106,7 @@ func (table *AppTable) outputTableHeader() {
func (table *AppTable) outputTableApp(app *appmodel.App) {
reason, color := appOpportunityAndColor(app)
rowValues := []string{
fmt.Sprintf("%3d", app.Analysis.EfficiencyScore),
app.Metadata.Namespace,
app.Metadata.Workload,
app.Settings.QosClass,
Expand Down Expand Up @@ -146,6 +147,7 @@ func (table *AppTable) outputDetailApp(app *appmodel.App) {
table.t.Rich([]string{"Main Container", app.Analysis.MainContainer}, nil)
table.t.Rich([]string{"Pod QoS Class", app.Settings.QosClass}, nil)

table.t.Rich([]string{"Efficiency Score", fmt.Sprintf("%4d", app.Analysis.EfficiencyScore)}, appColors)
table.t.Rich([]string{"Rating", fmt.Sprintf("%4d%%", app.Analysis.Rating)}, appColors)
table.t.Rich([]string{"Confidence", fmt.Sprintf("%4d%%", app.Analysis.Confidence)}, appColors)

Expand All @@ -161,10 +163,11 @@ func (table *AppTable) outputDetailApp(app *appmodel.App) {
}

//table.Rich(blank, nil)
table.t.Rich([]string{"Average Replica Count", fmt.Sprintf("%3.1g", app.Metrics.AverageReplicas)}, nil)
table.t.Rich([]string{"Average Replica Count", fmt.Sprintf("%3.1f", app.Metrics.AverageReplicas)}, nil)
table.t.Rich([]string{"Container Count", fmt.Sprintf("%3d", len(app.Containers))}, nil)
table.t.Rich([]string{"CPU Utilization", fmt.Sprintf("%3.0f%%", app.Metrics.CpuUtilization)}, nil)
table.t.Rich([]string{"Memory Utilization", fmt.Sprintf("%3.0f%%", app.Metrics.MemoryUtilization)}, nil)
table.t.Rich([]string{"Network Traffic (approx.)", fmt.Sprintf("%3.1f qps", app.Metrics.RequestRate)}, nil)
table.t.Rich([]string{"Opsani Flags", flagsString(app.Analysis.Flags)}, nil)

table.t.Rich(blank, nil)
Expand Down
19 changes: 16 additions & 3 deletions sources/prometheus/containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"context"
"fmt"
appmodel "opsani-ignite/app/model"
opsmath "opsani-ignite/math"
"reflect"
"strings"
"text/template"
Expand Down Expand Up @@ -281,6 +282,10 @@ func collectContainersInfo(ctx context.Context, promApi v1.API, app *appmodel.Ap
app.Containers = append(app.Containers, container)
}

// Get restart counts
warnings, err = getContainersUse(ctx, promApi, app, timeRange, containerRestartsTemplate, &selectors, "", "RestartCount")
allWarnings = handleWarnErr(allWarnings, warnings, err, app, "restart counts")

// --- Get resource specifications

// Get resource requests
Expand Down Expand Up @@ -309,9 +314,17 @@ func collectContainersInfo(ctx context.Context, promApi v1.API, app *appmodel.Ap
warnings, err = getContainersUse(ctx, promApi, app, timeRange, containerCpuSecondsThrottledTemplate, &selectors, "Cpu", "SecondsThrottled")
allWarnings = handleWarnErr(allWarnings, warnings, err, app, "CPU throttling")

// Get restart counts
warnings, err = getContainersUse(ctx, promApi, app, timeRange, containerRestartsTemplate, &selectors, "", "RestartCount")
allWarnings = handleWarnErr(allWarnings, warnings, err, app, "restart counts")
// Get network traffic stats (pod-level, not container-level)
rxRate, warnings, err := getRangedMetric(ctx, promApi, app, timeRange, containerRxPacketsTemplate, &selectors)
allWarnings = handleWarnErr(allWarnings, warnings, err, app, "Received packets rate")
txRate, warnings, err := getRangedMetric(ctx, promApi, app, timeRange, containerTxPacketsTemplate, &selectors)
allWarnings = handleWarnErr(allWarnings, warnings, err, app, "Received packets rate")
if rxRate != nil {
app.Metrics.PacketReceiveRate = opsmath.MagicRound(*rxRate)
}
if txRate != nil {
app.Metrics.PacketTransmitRate = opsmath.MagicRound(*txRate)
}

log.Tracef("App %v has %v container(s): %v", app.Metadata, len(app.Containers), app.Containers)

Expand Down
19 changes: 15 additions & 4 deletions sources/prometheus/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
)

var replicaCountTemplate *template.Template
var containerRestartsTemplate *template.Template
var cpuUtilizationTemplate *template.Template
var memoryUtilizationTemplate *template.Template
var containerInfoTemplate *template.Template
Expand All @@ -15,7 +16,8 @@ var containerMemoryUseTemplate *template.Template
var containerCpuSaturationTemplate *template.Template
var containerMemorySaturationTemplate *template.Template
var containerCpuSecondsThrottledTemplate *template.Template
var containerRestartsTemplate *template.Template
var containerRxPacketsTemplate *template.Template
var containerTxPacketsTemplate *template.Template

// Useful References:
//
Expand All @@ -29,9 +31,14 @@ var containerRestartsTemplate *template.Template
// https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/resource_management_guide/sec-cpu

func initializeTemplates() {
// replica count (averaged over the range)
replicaCountTemplate = template.Must(template.New("prometheusPodAverageReplicas").Parse(
`kube_deployment_status_replicas{namespace="{{ .Namespace }}", deployment="{{ .Workload }}"}`))

// container restarts
containerRestartsTemplate = template.Must(template.New("prometheusRestartsTemplate").Parse(
`avg by (container) (kube_pod_container_status_restarts_total{ {{ .PodSelector }} })`))

// old style, pod-aggregated (but may be less precise)
cpuUtilizationTemplate = template.Must(template.New("prometheusPodCpuUtilization").Parse(
`avg(sum by (pod, container) (rate(container_cpu_usage_seconds_total{ {{ .PodSelector }} }[60s]) * 1024 * 60) / on (pod, container) (container_spec_cpu_shares{ {{ .PodSelector }} }) / 60 * 100)`))
Expand Down Expand Up @@ -67,7 +74,11 @@ func initializeTemplates() {
// container Memory-specifics
// TODO (e.g., oom kill count, maybe from kube_pod_container_status_terminated_reason)

// container restarts
containerRestartsTemplate = template.Must(template.New("prometheusRestartsTemplate").Parse(
`avg by (container) (kube_pod_container_status_restarts_total{ {{ .PodSelector }} })`))
// container network traffic
// note: network stats are per pod (container="POD"), not per container
containerRxPacketsTemplate = template.Must(template.New("prometheusContainerRxPacketsTemplate").Parse(
`avg (rate(container_network_receive_packets_total{ {{ .PodSelector }} }[5m]))`))
containerTxPacketsTemplate = template.Must(template.New("prometheusContainerTxPacketsTemplate").Parse(
`avg (rate(container_network_transmit_packets_total{ {{ .PodSelector }} }[5m]))`))

}

0 comments on commit 8320c5a

Please sign in to comment.