/
kill.go
153 lines (127 loc) · 4.29 KB
/
kill.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package debug
import (
"errors"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall"
"time"
"github.com/spf13/cobra"
"github.com/spf13/viper"
cfg "github.com/mihongtech/tendermint/config"
"github.com/mihongtech/tendermint/libs/cli"
rpchttp "github.com/mihongtech/tendermint/rpc/client/http"
)
// killCmd is the cobra command definition for "kill". It requires exactly two
// positional arguments, the PID of the process to kill and the path of the
// compressed archive to write, and delegates the work to killCmdHandler.
//
// NOTE(review): the example in the help text assumes this command is
// registered under the "debug" parent command; confirm against the code that
// wires it up.
var killCmd = &cobra.Command{
	Use:   "kill [pid] [compressed-output-file]",
	Short: "Kill a Tendermint process while aggregating and packaging debugging data",
	Long: `Kill a Tendermint process while also aggregating Tendermint process data
such as the latest node state, including consensus and networking state,
go-routine state, and the node's WAL and config information. This aggregated data
is packaged into a compressed archive.
Example:
$ tendermint debug kill 34255 /path/to/tm-debug.zip`,
	Args: cobra.ExactArgs(2),
	RunE: killCmdHandler,
}
// killCmdHandler is the RunE callback of killCmd.
//
// args[0] is parsed as the unsigned decimal PID of the process to kill and
// args[1] is the path of the archive to produce. The handler builds an RPC
// client for nodeRPCAddr, resolves the node configuration from the home flag,
// and then runs a fixed sequence of collection steps against a temporary
// directory: node status, network info, consensus state, WAL, configuration,
// killing the process, and finally zipping the directory into the output
// file. Each step is announced through the logger, and the first step that
// fails aborts the sequence and has its error returned unchanged. The
// temporary directory is removed when the handler returns.
func killCmdHandler(cmd *cobra.Command, args []string) error {
	targetPID, parseErr := strconv.ParseUint(args[0], 10, 64)
	if parseErr != nil {
		return parseErr
	}

	archivePath := args[1]
	if len(archivePath) == 0 {
		return errors.New("invalid output file")
	}

	client, clientErr := rpchttp.New(nodeRPCAddr, "/websocket")
	if clientErr != nil {
		return fmt.Errorf("failed to create new http client: %w", clientErr)
	}

	homeDir := viper.GetString(cli.HomeFlag)
	nodeCfg := cfg.DefaultConfig().SetRoot(homeDir)
	cfg.EnsureRoot(nodeCfg.RootDir)

	// Everything that ends up in the archive is first staged in a scratch
	// directory, which is discarded once the handler is done.
	stagingDir, dirErr := ioutil.TempDir(os.TempDir(), "tendermint_debug_tmp")
	if dirErr != nil {
		return fmt.Errorf("failed to create temporary directory: %w", dirErr)
	}
	defer os.RemoveAll(stagingDir)

	// The steps run strictly in this order; the process is killed only after
	// all RPC dumps and file copies have succeeded.
	steps := []struct {
		announce string
		run      func() error
	}{
		{"getting node status...", func() error {
			return dumpStatus(client, stagingDir, "status.json")
		}},
		{"getting node network info...", func() error {
			return dumpNetInfo(client, stagingDir, "net_info.json")
		}},
		{"getting node consensus state...", func() error {
			return dumpConsensusState(client, stagingDir, "consensus_state.json")
		}},
		{"copying node WAL...", func() error {
			return copyWAL(nodeCfg, stagingDir)
		}},
		{"copying node configuration...", func() error {
			return copyConfig(homeDir, stagingDir)
		}},
		{"killing Tendermint process", func() error {
			return killProc(targetPID, stagingDir)
		}},
		{"archiving and compressing debug directory...", func() error {
			return zipDir(stagingDir, archivePath)
		}},
	}

	for _, step := range steps {
		logger.Info(step.announce)
		if err := step.run(); err != nil {
			return err
		}
	}

	return nil
}
// killProc attempts to kill the Tendermint process with the given PID using
// an ABORT signal, which should result in a goroutine stacktrace. The PID's
// STDERR is tailed and piped to the file "stacktrace.out" under the directory
// dir.
//
// The tailing is done with `tail -f /proc/<pid>/fd/2`, so this only works on
// UNIX systems that expose such a /proc filesystem.
//
// An error is returned if pid is zero or too large to be a process ID, if the
// output file cannot be created, if the tail command cannot be started, or if
// waiting on the tail command fails for a reason other than its exit status.
// Failures to find or signal the target process, or to stop the tail command,
// are only reported on STDERR and do not produce an error.
func killProc(pid uint64, dir string) error {
	// The PID is converted to a signed int below. Reject values that would be
	// truncated or become negative in that conversion: a negative PID given
	// to kill(2) addresses a process group instead of a single process.
	const maxPID = 1<<31 - 1
	if pid == 0 || pid > maxPID {
		return fmt.Errorf("invalid pid %d", pid)
	}

	// pipe STDERR output from tailing the Tendermint process to a file
	cmd := exec.Command("tail", "-f", fmt.Sprintf("/proc/%d/fd/2", pid)) // nolint: gosec

	outFile, err := os.Create(filepath.Join(dir, "stacktrace.out"))
	if err != nil {
		return err
	}
	defer outFile.Close()

	cmd.Stdout = outFile
	cmd.Stderr = outFile

	if err := cmd.Start(); err != nil {
		return err
	}

	// Wait for the tail command in the background so that the wait can be
	// bounded below. The channel is buffered, so this goroutine always
	// finishes once the tail command has exited.
	waitErr := make(chan error, 1)
	go func() { waitErr <- cmd.Wait() }()

	// Killing the Tendermint process with the '-ABRT|-6' signal will result in
	// a goroutine stacktrace.
	if proc, err := os.FindProcess(int(pid)); err != nil {
		fmt.Fprintf(os.Stderr, "failed to find PID to kill Tendermint process: %s\n", err)
	} else if err := proc.Signal(syscall.SIGABRT); err != nil {
		fmt.Fprintf(os.Stderr, "failed to kill Tendermint process: %s\n", err)
	}

	// Allow some time for the Tendermint process to be killed and for its
	// output to be captured, then stop the tail command.
	//
	// TODO: We should 'wait' for a kill to succeed (e.g. poll for PID until it
	// cannot be found). Regardless, this should be ample time.
	select {
	case err = <-waitErr:
		// The tail command exited on its own.
	case <-time.After(5 * time.Second):
		if killErr := cmd.Process.Kill(); killErr != nil {
			fmt.Fprintf(os.Stderr, "failed to kill Tendermint process output redirection: %s\n", killErr)
		}
		err = <-waitErr
	}

	// An exit error is expected here, since the tail command is normally
	// terminated by the manual kill above; only report other failures.
	var exitErr *exec.ExitError
	if err != nil && !errors.As(err, &exitErr) {
		return err
	}

	return nil
}