/
cmd_ctl_watchdog.go
130 lines (110 loc) · 3.3 KB
/
cmd_ctl_watchdog.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// Copyright 2015 Keybase, Inc. All rights reserved. Use of
// this source code is governed by the included BSD license.
package client
import (
"fmt"
"os"
"syscall"
"github.com/keybase/cli"
"github.com/keybase/client/go/libcmdline"
"github.com/keybase/client/go/libkb"
keybase1 "github.com/keybase/client/go/protocol/keybase1"
)
// numRestartsDefault is both the declared default of the "num-restarts"
// flag and the value ParseArgv substitutes when that flag reads as zero.
const numRestartsDefault = 10
// CmdWatchdog is the command object registered for the "watchdog"
// subcommand. Its Run method spawns the service and respawns it after
// unsuccessful exits.
type CmdWatchdog struct {
libkb.Contextified
// restarts bounds Run's respawn loop; it is filled in by ParseArgv from
// the "num-restarts" flag.
restarts int
}
// ParseArgv reads the "num-restarts" flag from ctx into c.restarts.
//
// Any non-positive value is replaced by numRestartsDefault. Zero was already
// treated as "use the default"; negative values must be too, because Run's
// loop condition is restartcount <= c.restarts and a negative limit would
// make Run give up before spawning the service even once.
//
// It always returns nil.
func (c *CmdWatchdog) ParseArgv(ctx *cli.Context) error {
	c.restarts = ctx.Int("num-restarts")
	if c.restarts <= 0 {
		c.restarts = numRestartsDefault
	}
	return nil
}
// checkAlreadyRunning reports whether something is answering on the service
// socket. It returns true only when libkb.NewSocket succeeds and DialSocket
// hands back a non-nil connection, which is closed again before returning.
// Every failure along the way is reported as false.
func (c *CmdWatchdog) checkAlreadyRunning() bool {
	sock, sockErr := libkb.NewSocket(c.G())
	if sockErr != nil {
		return false
	}

	// The dial error is intentionally dropped: a usable connection is the
	// only signal this probe cares about.
	conn, _ := sock.DialSocket()
	if conn == nil {
		return false
	}

	conn.Close()
	return true
}
// Run starts the service and watches over it until it goes away, which
// means one of:
//   - crash
//   - system shutdown
//   - uninstall
//   - legitimate stoppage (ctl stop)
//   - legitimate stoppage (ctl restart)
//
// Each pass of the loop spawns the service, waits for that process, and
// inspects how it ended:
//   - wait error, or the process state does not report an exit: return the
//     wait error (which may be nil)
//   - successful exit: treated as a legitimate shutdown, return nil
//   - failing exit while a service already answers on the socket: return
//     an error
//   - failing exit with the special restart exit code: respawn without
//     counting it against the limit
//   - any other failing exit: respawn and count it
//
// The loop runs c.restarts+1 times (one extra for the initial spawn); once
// it is exhausted Run returns an error.
func (c *CmdWatchdog) Run() (err error) {
	for attempt := 0; attempt <= c.restarts; attempt++ {
		// spawnServer forks the service and reports its pid, which is what
		// we block on below.
		var servicePid int
		servicePid, err = spawnServer(c.G(), c.G().Env.GetCommandLine(), keybase1.ForkType_WATCHDOG)
		if err != nil {
			return err
		}

		proc, findErr := os.FindProcess(servicePid)
		if findErr != nil {
			c.G().Log.Warning("Watchdog can't find %d, exiting", servicePid)
			return findErr
		}

		// Blocking wait on the service process.
		state, waitErr := proc.Wait()
		if waitErr != nil || !state.Exited() {
			c.G().Log.Warning("Watchdog ends service wait with no error or exit")
			return waitErr
		}

		if state.Success() {
			// apparently legitimate shutdown
			return nil
		}

		if c.checkAlreadyRunning() {
			return fmt.Errorf("Watchdog Service already running before watchdog - quitting.")
		}

		waitStatus := state.Sys().(syscall.WaitStatus)
		c.G().Log.Warning("watched service crash with status %v, count %d", waitStatus, attempt)

		restartRequested := waitStatus.ExitStatus() == int(keybase1.ExitCode_RESTART)
		if restartRequested {
			// Some third process issued a restart command; undo the
			// upcoming increment so it does not count against our limit.
			attempt--
		}
	}

	return fmt.Errorf("Watchdog observed %d crashes in a row. NOT reforking.", c.restarts)
}
// NewCmdWatchdog builds the cli.Command for the "watchdog" subcommand.
//
// The command accepts an integer "n, num-restarts" flag whose default is
// numRestartsDefault. When invoked, its action creates a CmdWatchdog bound
// to g, hands it to cl via ChooseCommand, and then sets cl's fork command to
// libcmdline.NoFork and its log forwarding to libcmdline.LogForwardNone.
func NewCmdWatchdog(cl *libcmdline.CommandLine, g *libkb.GlobalContext) cli.Command {
	restartsFlag := cli.IntFlag{
		Name:  "n, num-restarts",
		Value: numRestartsDefault,
		Usage: "specify the number of retries before giving up",
	}

	// The command object is created per invocation, inside the action.
	action := func(ctx *cli.Context) {
		watchdog := &CmdWatchdog{Contextified: libkb.NewContextified(g)}
		cl.ChooseCommand(watchdog, "watchdog", ctx)
		cl.SetForkCmd(libcmdline.NoFork)
		cl.SetLogForward(libcmdline.LogForwardNone)
	}

	return cli.Command{
		Name:   "watchdog",
		Usage:  "Start, watch and prop up the background service",
		Action: action,
		Flags:  []cli.Flag{restartsFlag},
	}
}
// GetUsage returns an empty libkb.Usage literal; no fields are set.
func (c *CmdWatchdog) GetUsage() libkb.Usage {
	usage := libkb.Usage{}
	return usage
}