Skip to content

Commit

Permalink
fix: restart loop in windows service (#703)
Browse files Browse the repository at this point in the history
  • Loading branch information
cristianciutea authored Aug 25, 2021
1 parent 71fbe2a commit d2db1a3
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 46 deletions.
19 changes: 1 addition & 18 deletions cmd/newrelic-infra-service/newrelic-infra-service.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"os"

"github.com/newrelic/infrastructure-agent/internal/agent/service"
"github.com/newrelic/infrastructure-agent/internal/os/api"
"github.com/newrelic/infrastructure-agent/pkg/log"
)

Expand All @@ -17,8 +16,7 @@ func main() {
log.Info("Creating service...")

// Create a native service wrapper for the agent and start it up.
exitCodeC := make(chan int, 1)
agentSvc, err := service.New(exitCodeC, os.Args...)
agentSvc, err := service.New(os.Args...)

if err != nil {
log.WithError(err).Error("Initializing service manager support...")
Expand All @@ -28,19 +26,4 @@ func main() {
if err = agentSvc.Run(); err != nil {
log.WithError(err).Warn("Service exiting abnormally.")
}

err = service.WaitForExitOrTimeout(exitCodeC)
if err == nil {
log.Info("Service exited.")
return
}

// This might not be an error: child may have already exited.
if errCode, ok := err.(*api.ExitCodeErr); ok {
log.WithError(err).Warn("Service exiting with child process status code.")
os.Exit(errCode.ExitCode())
}

log.WithError(err).Warn("Service exiting with child process error message.")
os.Exit(1)
}
59 changes: 31 additions & 28 deletions internal/agent/service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@ type Service struct {
daemon daemon
}

func New(exitCodeC chan int, arg ...string) (service.Service, error) {
func New(arg ...string) (service.Service, error) {
svc := &Service{
daemon: daemon{
args: arg,
exitCodeC: exitCodeC,
exited: new(atomBool),
args: arg,
exitCodeC: make(chan int, 1),
exited: new(atomBool),
stopRequested: new(atomBool),
},
}

Expand All @@ -49,11 +50,12 @@ func New(exitCodeC chan int, arg ...string) (service.Service, error) {
}

type daemon struct {
sync.Mutex // daemon can be accessed from different routines.
args []string
cmd *exec.Cmd
exitCodeC chan int // exit status handed off to the service for its own exit
exited *atomBool
sync.Mutex // daemon can be accessed from different routines.
args []string
cmd *exec.Cmd
exitCodeC chan int // exit status handed off to the service for its own exit
exited *atomBool
stopRequested *atomBool
}

type atomBool struct {
Expand All @@ -69,10 +71,7 @@ func (b *atomBool) Set(value bool) {
}

func (b *atomBool) Get() bool {
if atomic.LoadInt32(&(b.flag)) != 0 {
return true
}
return false
return atomic.LoadInt32(&(b.flag)) != 0
}

// GetCommandPath returns the absolute path of the agent binary that should be run.
Expand Down Expand Up @@ -107,24 +106,28 @@ func WaitForExitOrTimeout(exitCode <-chan int) error {
}
}

// There are 2 scenarios in how the child process can exit:
// 1. OS requested service stop
// 2. child process decided to exit on it's own.
// In the second scenario, there's no other way to make interface.Run to stop
// we have to call os.Exit.
func (d *daemon) exitWithChildStatus(s service.Service, exitCode int) {
log.WithField("exit_code", exitCode).
Info("child process exited")
var err error
var st service.Status
if st, err = s.Status(); err == nil && st == service.StatusRunning {
d.exited.Set(true)

d.exited.Set(true)

// OS requested service stop.
if d.stopRequested.Get() {
d.exitCodeC <- exitCode
log.Debug("signaling service stop...")
// TODO: investigate this, seems that stopping the service will affect
// the service recovery policy on both Windows / Linux
//s.Stop()
}
if st != service.StatusUnknown {
log.WithError(err).Debug("retrieving service status")
} else { // child process decided to exit on it's own.

// If newrelic-infra-service was not started manually from cmd line (interactive mode).
if !service.Interactive() && exitCode == 0 {
s.Stop()
return
}
// In case of error we have to call os.Exit and rely on os service recovery policy.
os.Exit(exitCode)
}
// interface.Stop cannot be called as service is not running
// and there's no other way to make interface.Run to stop.
// Case for foreground execution, AKA interactive mode.
os.Exit(exitCode)
}
8 changes: 8 additions & 0 deletions internal/agent/service/service_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ func (svc *Service) Stop(_ service.Service) (err error) {

log.Info("Service is stopping. waiting for agent process to terminate...")

svc.daemon.Lock()
defer svc.daemon.Unlock()

svc.daemon.stopRequested.Set(true)

err = svc.daemon.cmd.Process.Signal(signals.GracefulStop)
if err != nil {
log.WithError(err).Debug("Failed to send graceful stop signal to process.")
Expand All @@ -48,9 +53,11 @@ func (svc *Service) Shutdown(_ service.Service) (err error) {

func (d *daemon) run(s service.Service) {
for {
d.Lock()
d.cmd = exec.Command(GetCommandPath(d.args[0]), d.args[1:]...)
d.cmd.Stdout = os.Stdout
d.cmd.Stderr = os.Stderr
d.Unlock()

exitCode := api.CheckExitCode(d.cmd.Run())

Expand All @@ -60,6 +67,7 @@ func (d *daemon) run(s service.Service) {
continue
default:
d.exitWithChildStatus(s, exitCode)
return
}
}
}
11 changes: 11 additions & 0 deletions internal/agent/service/service_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,16 @@ func (svc *Service) Start(s service.Service) (err error) {
// - if we start and stop the service immediately, the agent may not stop properly
// and so we have to kill it forcefully
func (svc *Service) Stop(s service.Service) (err error) {
if svc.daemon.exited.Get() {
return nil
}
log.Info("service is stopping. notifying agent process.")

svc.daemon.Lock()
defer svc.daemon.Unlock()

svc.daemon.stopRequested.Set(true)

// notify the agent to gracefully stop
windows.PostNotificationMessage(windows.GetPipeName(svcName), ipc.Stop)

Expand All @@ -40,11 +45,16 @@ func (svc *Service) Stop(s service.Service) (err error) {
// - if we start the service and shutdown the host immediately, the agent may not stop properly
// and so we have to kill it forcefully
func (svc *Service) Shutdown(s service.Service) (err error) {
if svc.daemon.exited.Get() {
return nil
}
log.Debug("host is shutting down. notifying agent process.")

svc.daemon.Lock()
defer svc.daemon.Unlock()

svc.daemon.stopRequested.Set(true)

// notify the agent to update the shutdown status and then stop gracefully
windows.PostNotificationMessage(windows.GetPipeName(svcName), ipc.Shutdown)

Expand All @@ -68,6 +78,7 @@ func (d *daemon) run(s service.Service) {
continue
default:
d.exitWithChildStatus(s, exitCode)
return
}
}
}
Expand Down

0 comments on commit d2db1a3

Please sign in to comment.