Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVSHAS-9189] Scan gets stuck in scheduling after the controller is shut down and restarted #155

Merged
merged 7 commits into from
Aug 1, 2024
18 changes: 16 additions & 2 deletions scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ var cveDB *common.CveDB
// ctrlCaps is overwritten in scannerRegister with the capabilities returned by a successful GetCaps call.
var ctrlCaps share.ControllerCaps
var scanTasker *Tasker
var selfID string
// isGetCapsActivate is set in scannerRegister: true when the controller's GetCaps call succeeds,
// false when it returns an error. connectController starts periodCheckHealth only when it is true.
var isGetCapsActivate bool

func dbRead(path string, maxRetry int, output string) map[string]*share.ScanVulnerability {
dbFile := path + share.DefaultCVEDBName
Expand Down Expand Up @@ -98,12 +99,14 @@ func dbRead(path string, maxRetry int, output string) map[string]*share.ScanVuln
}
}

func connectController(path, advIP, joinIP, selfID string, advPort uint32, joinPort uint16) {
func connectController(path, advIP, joinIP, selfID string, advPort uint32, joinPort uint16, doneCh chan bool) {
cb := &clientCallback{
shutCh: make(chan interface{}, 1),
ignoreShutdown: true,
}

var healthCheckCh chan struct{}

for {
// forever retry
dbData := dbRead(path, 0, "")
Expand All @@ -124,6 +127,17 @@ func connectController(path, advIP, joinIP, selfID string, advPort uint32, joinP
scanner.CVEDB = nil
dbData = make(map[string]*share.ScanVulnerability) // zero size

if healthCheckCh != nil {
close(healthCheckCh)
}

healthCheckCh = make(chan struct{})
// Check if the gRPC HealthCheck API is active (indicated by isGetCapsActivate being true).
// If active, initiate periodic health checks by launching a goroutine to monitor the health status of the specified service.
if isGetCapsActivate {
go periodCheckHealth(joinIP, joinPort, &scanner, cb, healthCheckCh, doneCh)
}

// start responding shutdown notice
cb.ignoreShutdown = false
<-cb.shutCh
Expand Down Expand Up @@ -416,7 +430,7 @@ func main() {

// Use the original address, which is the service name, so when controller changes,
// new IP can be resolved
go connectController(*dbPath, *adv, *join, selfID, (uint32)(*advPort), (uint16)(*joinPort))
go connectController(*dbPath, *adv, *join, selfID, (uint32)(*advPort), (uint16)(*joinPort), done)
<-done

log.Info("Exiting ...")
Expand Down
53 changes: 53 additions & 0 deletions server.go
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please revert to the original variable names. Thanks.

Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ import (
"github.com/neuvector/scanner/cvetools"
)

// Tuning values for the periodic controller health check performed by periodCheckHealth.
const (
period = 20 // Interval in minutes between health checks; periodCheckHealth also reuses this value as the delay in seconds between retries
retryMax = 3 // Maximum number of health-check attempts per tick before the scanner signals the done channel
)

// createEnforcerScanServiceWrapper builds an enforcer scan service client on
// top of conn and hands it back as a cluster.Service.
func createEnforcerScanServiceWrapper(conn *grpc.ClientConn) cluster.Service {
	var svc cluster.Service = share.NewEnforcerScanServiceClient(conn)
	return svc
}
Expand Down Expand Up @@ -365,8 +370,10 @@ func scannerRegister(joinIP string, joinPort uint16, data *share.ScannerRegister

caps, err := client.GetCaps(ctx, &share.RPCVoid{})
if err != nil {
isGetCapsActivate = false
downgradeCriticalSeverityInCVEDB(data)
} else {
isGetCapsActivate = true
ctrlCaps = *caps
if !caps.CriticalVul {
downgradeCriticalSeverityInCVEDB(data)
Expand Down Expand Up @@ -404,3 +411,49 @@ func scannerDeregister(joinIP string, joinPort uint16, id string) error {
}
return nil
}

// getScannerAvailable calls the controller's HealthCheck RPC at joinIP:joinPort
// with the scanner's registration data and returns the reply.
//
// When no controller client can be obtained, the failure is logged and a
// ScannerAvailable with Visible set to false is returned together with an
// error. Otherwise the reply and error from HealthCheck are returned as-is.
// The RPC itself is bounded by a 60-second timeout.
func getScannerAvailable(joinIP string, joinPort uint16, data *share.ScannerRegisterData, cb cluster.GRPCCallback) (*share.ScannerAvailable, error) {
	ctrlClient, err := getControllerServiceClient(joinIP, joinPort, cb)
	if err != nil {
		log.WithFields(log.Fields{"error": err}).Error("Failed to find ctrl client")
		notVisible := &share.ScannerAvailable{Visible: false}
		return notVisible, errors.New("Failed to connect to controller")
	}

	const rpcTimeout = 60 * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), rpcTimeout)
	defer cancel()

	return ctrlClient.HealthCheck(ctx, data)
}

// periodCheckHealth periodically calls the controller's HealthCheck API (through
// getScannerAvailable) to verify that the controller is alive and still reports
// this scanner as visible.
//
// Every `period` minutes it makes up to retryMax attempts, waiting `period`
// seconds between attempts. If none of them reports the scanner as visible, it
// logs an error, sends true on done, and returns.
//
// Closing healthCheckCh stops the checker. The close is honoured while waiting
// for the next tick, while waiting between retries, and before/while signalling
// done, so a checker that has been superseded does not request a shutdown.
// (A HealthCheck RPC that is already in flight is not interrupted; it is bounded
// by getScannerAvailable's own timeout.)
//
// Parameters:
//   - joinIP, joinPort: controller address passed to getScannerAvailable.
//   - data: scanner registration data sent with each health check.
//   - cb: gRPC callback passed to getScannerAvailable.
//   - healthCheckCh: closed by the caller to stop this goroutine.
//   - done: receives true when the health check has failed retryMax times.
func periodCheckHealth(joinIP string, joinPort uint16, data *share.ScannerRegisterData, cb *clientCallback, healthCheckCh chan struct{}, done chan bool) {
	// period is deliberately used with two units, matching the existing
	// behaviour: minutes between checks, seconds between retries.
	checkInterval := time.Duration(period) * time.Minute
	retryDelay := time.Duration(period) * time.Second

	ticker := time.NewTicker(checkInterval)
	defer ticker.Stop()

	for {
		select {
		case <-healthCheckCh:
			return
		case <-ticker.C:
		}

		visible := false
		for attempt := 0; attempt < retryMax && !visible; attempt++ {
			if attempt > 0 {
				// Delay before retrying, but stay responsive to cancellation
				// instead of blocking in time.Sleep.
				retryTimer := time.NewTimer(retryDelay)
				select {
				case <-healthCheckCh:
					retryTimer.Stop()
					return
				case <-retryTimer.C:
				}
			}

			scannerAvailable, errHealthCheck := getScannerAvailable(joinIP, joinPort, data, cb)
			if errHealthCheck != nil {
				log.WithFields(log.Fields{"joinIP": joinIP, "joinPort": joinPort, "errHealthCheck": errHealthCheck}).Debug("periodCheckHealth has error")
				continue
			}
			// Guard against a nil reply so a misbehaving client cannot panic the checker.
			visible = scannerAvailable != nil && scannerAvailable.Visible
		}
		if visible {
			continue
		}

		// If this checker was cancelled while the last RPC was in flight, its
		// verdict is stale and must not trigger a shutdown.
		select {
		case <-healthCheckCh:
			return
		default:
		}

		log.WithFields(log.Fields{"joinIP": joinIP, "joinPort": joinPort, "retryMax": retryMax}).Error("The scanner is not in the controller, restart the scanner pod.")

		// Signal once and exit. Selecting on healthCheckCh keeps the goroutine
		// from blocking forever if nobody is receiving on done.
		select {
		case done <- true:
		case <-healthCheckCh:
		}
		return
	}
}
Loading