added reliability risk analysis

opsani · Oct 7, 2021 · 4e6de25 · 4e6de25
1 parent 8320c5a
commit 4e6de25
Show file tree

Hide file tree

Showing 5 changed files with 257 additions and 67 deletions.
diff --git a/app/model/model.go b/app/model/model.go
@@ -1,5 +1,7 @@
 package model
 
+import "fmt"
+
 const (
 	QOS_GUARANTEED = "guaranteed"
 	QOS_BURSTABLE  = "burstable"
@@ -51,42 +53,67 @@ type AppContainer struct {
 }
 
 type AppMetrics struct {
-	AverageReplicas    float64 `yaml:"average_replicas"`     // averaged over the evaluated time range
-	CpuUtilization     float64 `yaml:"cpu_saturation"`       // aka Saturation, in percent, can be 0 or >100
-	MemoryUtilization  float64 `yaml:"memory_saturation"`    // aka Saturation, in percent, can be 0 or >100
-	PacketReceiveRate  float64 `yaml:"packet_receive_rate"`  // per second
-	PacketTransmitRate float64 `yaml:"packet_transmit_rate"` // per second
-	RequestRate        float64 `yaml:"request_rate"`         // per second
+	AverageReplicas     float64 `yaml:"average_replicas"`      // averaged over the evaluated time range
+	CpuUtilization      float64 `yaml:"cpu_saturation"`        // aka Saturation, in percent, can be 0 or >100
+	MemoryUtilization   float64 `yaml:"memory_saturation"`     // aka Saturation, in percent, can be 0 or >100
+	CpuSecondsThrottled float64 `yaml:"cpu_seconds_throttled"` // sum of seconds throttled/second across all containers
+	PacketReceiveRate   float64 `yaml:"packet_receive_rate"`   // per second
+	PacketTransmitRate  float64 `yaml:"packet_transmit_rate"`  // per second
+	RequestRate         float64 `yaml:"request_rate"`          // per second
 }
 
 type AppFlag int
 
 const (
-	F_WRITEABLE_VOLUME = iota
+	F_MAIN_CONTAINER = iota
+	F_WRITEABLE_VOLUME
 	F_RESOURCE_SPEC
-	F_SINGLE_REPLICA
-	F_MANY_REPLICAS
-	F_TRAFFIC
+	F_RESOURCE_LIMITS
+	F_RESOURCE_GUARANTEED
 	F_UTILIZATION
 	F_BURST
-	F_MAIN_CONTAINER
+	F_TRAFFIC
+	F_SINGLE_REPLICA
+	F_MANY_REPLICAS
 )
 
 func (f AppFlag) String() string {
-	return []string{"V", "R", "S", "M", "T", "U", "B", "C"}[f]
+	return []string{"C", "V", "R", "L", "G", "U", "B", "T", "S", "M"}[f]
+}
+
+// MarshalYAML returns the flag's String form as the value to be marshaled.
+func (f AppFlag) MarshalYAML() (interface{}, error) {
+	text := f.String()
+	return text, nil
+}
+
+// RiskLevel grades the reliability risk of an app. Levels are declared in
+// ascending order of severity, so a numerically larger level is a higher risk.
+type RiskLevel int
+
+// Risk levels, from "not assessed" up to the most severe. The constants are
+// left untyped so existing uses as either RiskLevel or plain int keep compiling.
+const (
+	RISK_UNKNOWN = iota
+	RISK_NONE
+	RISK_LOW
+	RISK_MEDIUM
+	RISK_HIGH
+	RISK_CRITICAL
+)
+
+// String returns the display name of the risk level ("-" for RISK_UNKNOWN).
+// A value outside the declared range yields "RiskLevel(<n>)" rather than
+// panicking on an out-of-range index.
+func (r RiskLevel) String() string {
+	names := [...]string{"-", "None", "Low", "Medium", "High", "Critical"}
+	if r < 0 || int(r) >= len(names) {
+		return fmt.Sprintf("RiskLevel(%d)", int(r))
+	}
+	return names[r]
+}
+
+// MarshalYAML returns the level's String form as the value to be marshaled.
+func (r RiskLevel) MarshalYAML() (interface{}, error) {
+	return r.String(), nil
 }
 
+// AppAnalysis holds the outcome of analyzing an app: its rating and
+// confidence, the identified main container, the computed scores, the
+// analysis flags, and the lists of opportunities, cautions and blockers.
 type AppAnalysis struct {
-	Rating           int              `yaml:"rating"`         // how suitable for optimization
-	Confidence       int              `yaml:"confidence"`     // how confident is the rating
-	MainContainer    string           `yaml:"main_container"` // container to optimize or empty if not identified
-	EfficiencyScore  int              `yaml:"efficiency_score"`
-	ReliabilityScore int              `yaml:"reliability_score,omitempty"`
-	PerformanceScore int              `yaml:"performance_score,omitempty"`
-	Flags            map[AppFlag]bool `yaml:"flags"`         // flags
-	Opportunities    []string         `yaml:"opportunities"` // list of optimization opportunities
-	Cautions         []string         `yaml:"cautions"`      // list of concerns/cautions
-	Blockers         []string         `yaml:"blockers"`      // list of blockers prevention optimization
+	Rating          int              `yaml:"rating"`         // how suitable for optimization
+	Confidence      int              `yaml:"confidence"`     // how confident is the rating
+	MainContainer   string           `yaml:"main_container"` // container to optimize or empty if not identified
+	EfficiencyScore *int             `yaml:"efficiency_score"`
+	ReliabilityRisk *RiskLevel       `yaml:"reliability_risk"`
+	Flags           map[AppFlag]bool `yaml:"flags"`         // flags
+	Opportunities   []string         `yaml:"opportunities"` // list of optimization opportunities
+	Cautions        []string         `yaml:"cautions"`      // list of concerns/cautions
+	Blockers        []string         `yaml:"blockers"`      // list of blockers preventing optimization
 }
 
 type App struct {
@@ -109,3 +136,16 @@ func (app *App) ContainerIndexByName(name string) (index int, ok bool) {
 	}
 	return
 }
+
+// Score2String formats an optional score for display: "n/a" when the score
+// is absent (nil), otherwise the value it points to.
+func Score2String(s *int) string {
+	if s != nil {
+		return fmt.Sprint(*s)
+	}
+	return "n/a"
+}
+// Risk2String formats an optional risk level for display: "n/a" when the
+// level is absent (nil), otherwise its String form.
+func Risk2String(r *RiskLevel) string {
+	if r != nil {
+		return (*r).String()
+	}
+	return "n/a"
+}
diff --git a/cmd/analysis.go b/cmd/analysis.go
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"math"
 	"sort"
+	"strings"
 
 	log "github.com/sirupsen/logrus"
 
@@ -226,6 +227,16 @@ func computePodQoS(app *appmodel.App) string {
 	}
 }
 
+// resourcesLimited reports whether every container of the app has both a
+// non-zero CPU limit and a non-zero memory limit. An app without containers
+// is reported as limited, since no container violates the condition.
+func resourcesLimited(app *appmodel.App) bool {
+	for idx := range app.Containers {
+		container := &app.Containers[idx]
+		hasCpuLimit := container.Cpu.Limit != 0
+		hasMemLimit := container.Memory.Limit != 0
+		if !(hasCpuLimit && hasMemLimit) {
+			return false
+		}
+	}
+	return true
+}
+
 func resourcesExplicitlyDefined(app *appmodel.App) (bool, string) {
 	// select the main container
 	if app.Analysis.MainContainer == "" {
@@ -322,6 +333,49 @@ func efficiencyImprovementEstimate(app *appmodel.App) string {
 	}
 }
 
+// bumpRisk returns the higher of the prior risk level and the given level.
+// A nil prior means no level has been set yet, so the given level wins.
+// The result is either prior itself or a pointer to a fresh copy of level.
+func bumpRisk(prior *appmodel.RiskLevel, level appmodel.RiskLevel) *appmodel.RiskLevel {
+	if prior != nil && *prior >= level {
+		return prior
+	}
+	return &level
+}
+
+// riskAssessment derives the reliability risk level of an app from its pod
+// QoS class and its resource metrics (CPU/memory utilization and CPU
+// throttling). It returns the highest level triggered together with one
+// caution message per contributing factor. The returned level is never nil:
+// when nothing raises the risk it is RISK_LOW.
+func riskAssessment(app *appmodel.App) (*appmodel.RiskLevel, []string) {
+	var risk *appmodel.RiskLevel
+	msg := []string{}
+
+	// QoS class: anything other than Guaranteed adds risk
+	qos := app.Settings.QosClass
+	switch {
+	case qos == appmodel.QOS_BESTEFFORT:
+		risk = bumpRisk(risk, appmodel.RISK_HIGH)
+		msg = append(msg, "Pod QoS class is Best Effort")
+	case qos != appmodel.QOS_GUARANTEED:
+		risk = bumpRisk(risk, appmodel.RISK_MEDIUM)
+		msg = append(msg, fmt.Sprintf("Pod QOS class is %v", strings.Title(qos)))
+	}
+
+	// resource pressure: tiers are checked from most to least severe and only
+	// the first matching tier is reported
+	cpu := app.Metrics.CpuUtilization
+	mem := app.Metrics.MemoryUtilization
+	throttled := app.Metrics.CpuSecondsThrottled
+	switch {
+	case cpu >= 200 || mem >= 200 || throttled >= 0.7:
+		risk = bumpRisk(risk, appmodel.RISK_HIGH)
+		msg = append(msg, "Resource utilization significantly exceeds allocation")
+	case cpu > 120 || mem > 120 || throttled > 0.25:
+		// NOTE(review): this tier assigns the same level as the tier above;
+		// confirm whether the two were meant to map to different levels.
+		risk = bumpRisk(risk, appmodel.RISK_HIGH)
+		msg = append(msg, "Resource utilization exceeds allocation")
+	case cpu > 90 || mem > 90 || throttled > 0.1:
+		risk = bumpRisk(risk, appmodel.RISK_MEDIUM)
+		msg = append(msg, "Resource utilization close to allocation")
+	}
+
+	// nothing above fired: default to low risk
+	risk = bumpRisk(risk, appmodel.RISK_LOW)
+	return risk, msg
+}
+
 func analyzeApp(app *appmodel.App) {
 	// finalize basis and prepare for analysis
 	preAnalyzeApp(app)
@@ -348,7 +402,17 @@ func analyzeApp(app *appmodel.App) {
 		o.Flags[appmodel.F_WRITEABLE_VOLUME] = false
 	}
 
-	// missing resource specification (main container has no QoS)
+	// resource specification flags
+	if app.Settings.QosClass == appmodel.QOS_GUARANTEED {
+		o.Flags[appmodel.F_RESOURCE_GUARANTEED] = true
+	} else {
+		o.Flags[appmodel.F_RESOURCE_GUARANTEED] = false
+	}
+	if resourcesLimited(app) {
+		o.Flags[appmodel.F_RESOURCE_LIMITS] = true
+	} else {
+		o.Flags[appmodel.F_RESOURCE_LIMITS] = false
+	}
 	if resGood, msg := resourcesExplicitlyDefined(app); resGood {
 		o.Flags[appmodel.F_RESOURCE_SPEC] = true
 	} else {
@@ -378,8 +442,20 @@ func analyzeApp(app *appmodel.App) {
 		}
 	}
 
-	// compute scores
-	o.EfficiencyScore = int(math.Round(app.Metrics.CpuUtilization*CPU_WEIGHT + app.Metrics.MemoryUtilization*MEM_WEIGHT))
+	// compute efficiency score
+	if app.Metrics.MemoryUtilization == 0 {
+		o.EfficiencyScore = nil // something is wrong - this app likely not functioning or we don't have metrics
+	} else if app.Metrics.CpuUtilization == 0 {
+		// idle apps are inefficient by definition
+		score := 0
+		o.EfficiencyScore = &score
+	} else {
+		cpuSat := opsmath.Min(app.Metrics.CpuUtilization, 100)    // cap utilization for efficiency calc
+		memSat := opsmath.Min(app.Metrics.MemoryUtilization, 100) // " "
+		// score can be assigned only if the app is not bursting
+		score := int(math.Round(cpuSat*CPU_WEIGHT + memSat*MEM_WEIGHT))
+		o.EfficiencyScore = &score
+	}
 
 	// analyze request rate
 	if app.Metrics.RequestRate == 0 {
@@ -408,13 +484,20 @@ func analyzeApp(app *appmodel.App) {
 		o.Confidence += 30
 		o.Flags[appmodel.F_SINGLE_REPLICA] = false
 		o.Flags[appmodel.F_MANY_REPLICAS] = true
-	} else if app.Metrics.AverageReplicas >= 3 {
-		o.Rating += 10
-		o.Confidence += 10
+	} else {
+		if app.Metrics.AverageReplicas > 3 {
+			o.Rating += 10
+			o.Confidence += 10
+		}
 		o.Flags[appmodel.F_SINGLE_REPLICA] = false
 		o.Flags[appmodel.F_MANY_REPLICAS] = false
 	}
 
+	// perform risk assessment
+	riskCautions := []string{}
+	o.ReliabilityRisk, riskCautions = riskAssessment(app)
+	o.Cautions = append(o.Cautions, riskCautions...)
+
 	// finalize blockers
 	if len(o.Blockers) > 0 {
 		o.Rating = -100

diff --git a/cmd/output.go b/cmd/output.go
@@ -89,12 +89,54 @@ func flagsString(flags map[appmodel.AppFlag]bool) (ret string) {
 	return
 }
 
-func (table *AppTable) outputTableHeader() {
-	const RIGHT = tablewriter.ALIGN_RIGHT
+// riskColor maps a risk level to the tablewriter foreground color used to
+// display it: green for low, yellow for medium, red for high and critical.
+// A nil pointer and every other level yield 0, the neutral color.
+func riskColor(r *appmodel.RiskLevel) int {
+	if r == nil {
+		return 0 // neutral
+	}
+	switch *r {
+	case appmodel.RISK_LOW:
+		return tablewriter.FgGreenColor
+	case appmodel.RISK_MEDIUM:
+		return tablewriter.FgYellowColor
+	case appmodel.RISK_HIGH, appmodel.RISK_CRITICAL:
+		// critical is the most severe level, so it must not fall back to neutral
+		return tablewriter.FgRedColor
+	}
+	return 0 // neutral
+}
+
+// HeaderInfo describes one column of the apps table.
+type HeaderInfo struct {
+	// Title is the column header text; it may contain a newline to wrap.
+	Title     string
+	// Alignment is the column alignment, one of the tablewriter ALIGN_* values.
+	Alignment int
+}
+
+// getHeadersInfo returns the columns of the apps table in display order,
+// each with its header title and alignment.
+func getHeadersInfo() []HeaderInfo {
 	const LEFT = tablewriter.ALIGN_LEFT
+	const CENTER = tablewriter.ALIGN_CENTER
+	const RIGHT = tablewriter.ALIGN_RIGHT
+
+	return []HeaderInfo{
+		{Title: "Namespace", Alignment: LEFT},
+		{Title: "Deployment", Alignment: LEFT},
+		{Title: "Efficiency\nScore", Alignment: RIGHT},
+		{Title: "Reliability\nRisk", Alignment: CENTER},
+		{Title: "Instances", Alignment: RIGHT},
+		{Title: "CPU", Alignment: RIGHT},
+		{Title: "Mem", Alignment: RIGHT},
+		{Title: "Opportunity", Alignment: LEFT},
+		{Title: "Flags", Alignment: LEFT},
+	}
+}
 
-	table.t.SetHeader([]string{"Efficiency\nScore", "Namespace", "Deployment", "QoS Class", "Instances", "CPU", "Mem", "Opportunity", "Flags"})
-	table.t.SetColumnAlignment([]int{RIGHT, LEFT, LEFT, LEFT, RIGHT, RIGHT, RIGHT, LEFT, LEFT})
+func (table *AppTable) outputTableHeader() {
+	var headers []string
+	var alignments []int
+	for _, header := range getHeadersInfo() {
+		headers = append(headers, header.Title)
+		alignments = append(alignments, header.Alignment)
+	}
+	table.t.SetHeader(headers)
+	table.t.SetColumnAlignment(alignments)
 	table.t.SetFooter([]string{})
 	table.t.SetCenterSeparator("")
 	table.t.SetColumnSeparator("")
@@ -106,10 +148,10 @@ func (table *AppTable) outputTableHeader() {
 func (table *AppTable) outputTableApp(app *appmodel.App) {
 	reason, color := appOpportunityAndColor(app)
 	rowValues := []string{
-		fmt.Sprintf("%3d", app.Analysis.EfficiencyScore),
 		app.Metadata.Namespace,
 		app.Metadata.Workload,
-		app.Settings.QosClass,
+		fmt.Sprintf("%3v", appmodel.Score2String(app.Analysis.EfficiencyScore)),
+		fmt.Sprintf("%v", appmodel.Risk2String(app.Analysis.ReliabilityRisk)),
 		fmt.Sprintf("%.0fx%d", app.Metrics.AverageReplicas, len(app.Containers)),
 		fmt.Sprintf("%.0f%%", app.Metrics.CpuUtilization),
 		fmt.Sprintf("%.0f%%", app.Metrics.MemoryUtilization),
@@ -140,14 +182,16 @@ func (table *AppTable) outputDetailApp(app *appmodel.App) {
 	opportunityColors := []tablewriter.Colors{[]int{0}, []int{tablewriter.FgGreenColor}}
 	cautionColors := []tablewriter.Colors{[]int{0}, []int{tablewriter.FgYellowColor}}
 	blockerColors := []tablewriter.Colors{[]int{0}, []int{tablewriter.FgRedColor}}
+	riskColors := []tablewriter.Colors{[]int{0}, []int{riskColor(app.Analysis.ReliabilityRisk)}}
 
 	table.t.Rich([]string{"Namespace", app.Metadata.Namespace}, nil)
 	table.t.Rich([]string{"Deployment", app.Metadata.Workload}, nil)
 	table.t.Rich([]string{"Kind", fmt.Sprintf("%v (%v)", app.Metadata.WorkloadKind, app.Metadata.WorkloadApiVersion)}, nil)
 	table.t.Rich([]string{"Main Container", app.Analysis.MainContainer}, nil)
 	table.t.Rich([]string{"Pod QoS Class", app.Settings.QosClass}, nil)
 
-	table.t.Rich([]string{"Efficiency Score", fmt.Sprintf("%4d", app.Analysis.EfficiencyScore)}, appColors)
+	table.t.Rich([]string{"Efficiency Score", fmt.Sprintf("%4v", appmodel.Score2String(app.Analysis.EfficiencyScore))}, appColors)
+	table.t.Rich([]string{"Reliability Risk", fmt.Sprintf("%v", appmodel.Risk2String(app.Analysis.ReliabilityRisk))}, riskColors)
 	table.t.Rich([]string{"Rating", fmt.Sprintf("%4d%%", app.Analysis.Rating)}, appColors)
 	table.t.Rich([]string{"Confidence", fmt.Sprintf("%4d%%", app.Analysis.Confidence)}, appColors)
 

diff --git a/math/stats.go b/math/stats.go
@@ -0,0 +1,51 @@
+/*
+Copyright © 2021 Opsani <support@opsani.com>
+This file is part of https://github.com/opsani/opsani-ignite
+*/
+
+package math
+
+import (
+	m "math"
+)
+
+// Min returns the smallest of the given samples, ignoring NaN and infinite
+// values. It returns NaN when there are no samples or none of them is valid.
+func Min(samples ...float64) float64 {
+	lowest := m.NaN() // NaN marks "no valid sample seen yet"
+	for _, sample := range samples {
+		switch {
+		case m.IsNaN(sample), m.IsInf(sample, 0):
+			// not a usable value; skip it
+		case m.IsNaN(lowest), sample < lowest:
+			lowest = sample
+		}
+	}
+	return lowest
+}
+
+// Sum returns the total of the given samples, ignoring NaN and infinite
+// values. It returns 0 when there are no valid samples.
+func Sum(samples ...float64) float64 {
+	var acc float64
+	for i := range samples {
+		if v := samples[i]; !m.IsNaN(v) && !m.IsInf(v, 0) {
+			acc += v
+		}
+	}
+	return acc
+}
+
+// Avg returns the arithmetic mean of the given samples, ignoring NaN and
+// infinite values. It returns 0 when there are no valid samples.
+func Avg(samples ...float64) float64 {
+	total := 0.0
+	count := 0
+	for _, val := range samples {
+		if m.IsNaN(val) || m.IsInf(val, 0) {
+			continue
+		}
+		total += val
+		count++
+	}
+
+	if count == 0 {
+		return 0.0
+	}
+	// divide by the number of samples actually summed, not by len(samples):
+	// skipped NaN/Inf entries must not dilute the mean
+	return total / float64(count)
+}