-
Notifications
You must be signed in to change notification settings - Fork 427
/
task_struct_resolver.cc
298 lines (241 loc) · 10.1 KB
/
task_struct_resolver.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/*
* Copyright 2018- The Pixie Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "src/stirling/bpf_tools/task_struct_resolver.h"
#include <poll.h>
#include <memory>
#include <string>
#include <vector>
#include "src/common/base/base.h"
#include "src/common/system/proc_parser.h"
#include "src/stirling/bpf_tools/bcc_wrapper.h"
#include "src/stirling/bpf_tools/macros.h"
#include "src/stirling/utils/proc_path_tools.h"
#include "src/stirling/bpf_tools/bcc_bpf_intf/types.h"
// Creates a string_view, named bcc_script, to the BPF code (task_struct_mem_read) loaded into
// memory. bcc_script is what gets passed to BCCWrapper::InitBPFProgram() further below.
BPF_SRC_STRVIEW(bcc_script, task_struct_mem_read);
// A function which we will uprobe on, to trigger our BPF code.
// The function itself is irrelevant (its body does nothing), but it must not be optimized away:
// the uprobe is attached at this function's address, and the probe is fired by calling it.
// NO_OPT_ATTR is applied for that reason.
// We declare this with C linkage (extern "C") so it has a simple symbol name.
extern "C" {
NO_OPT_ATTR void StirlingProbeTrigger() { return; }
}
namespace px {
namespace stirling {
namespace utils {
using ::px::stirling::bpf_tools::BPFProbeAttachType;
using ::px::stirling::bpf_tools::UProbeSpec;
namespace {
// Converts a value in nanoseconds into clock ticks, the same way Linux does.
// This lets us express PID start times in the unit that /proc/<pid>/stat reports.
// The conversion is an integer division, so partial ticks are truncated.
uint64_t pl_nsec_to_clock_t(uint64_t x) {
  constexpr uint64_t kNanosPerSecond = 1000000000ULL;
  constexpr uint64_t kTicksPerSecond = 100;  // USER_HZ
  constexpr uint64_t kNanosPerTick = kNanosPerSecond / kTicksPerSecond;
  return x / kNanosPerTick;
}
// Helper that fills in a caller-owned TaskStructOffsets struct, one field at a time.
// It keeps a little bookkeeping so that invalid or ambiguous scan results are rejected.
// An offset value of 0 is treated as "not found yet".
class TaskStructOffsetsManager {
 public:
  explicit TaskStructOffsetsManager(TaskStructOffsets* offsets) : offsets_(offsets) {}

  // Records a candidate offset for real_start_time/start_boottime.
  // A second candidate is accepted only when it sits exactly one 64-bit word after the first
  // (and the first was the only match so far); any other repeat is reported as ambiguous.
  Status SetRealStartTimeOffset(uint64_t offset) {
    const bool have_earlier_match = (offsets_->real_start_time_offset != 0);
    if (have_earlier_match) {
      // The Linux task struct has two time fields which appear back-to-back:
      //   u64 start_time;      // Monotonic time in nsecs
      //   u64 start_boottime;  // Boot based time in nsecs
      // If the machine has never been suspended, the two values are expected to coincide,
      // which shows up as a duplicate match in adjacent words. In that case we want the
      // second one, so a single earlier match located in the immediately preceding word
      // is tolerated and simply overwritten below.
      const uint64_t preceding_word = offset - sizeof(uint64_t);
      const bool is_expected_duplicate =
          (num_start_time_matches_ == 1) && (offsets_->real_start_time_offset == preceding_word);
      if (!is_expected_duplicate) {
        return error::Internal(
            "Location of real_start_time is ambiguous. Found multiple possible offsets. "
            "[previous=$0 current=$1]",
            offsets_->real_start_time_offset, offset);
      }
    }

    ++num_start_time_matches_;
    offsets_->real_start_time_offset = offset;
    return Status::OK();
  }

  // Records the offset for group_leader. Any second candidate is reported as ambiguous.
  Status SetGroupLeaderOffset(uint64_t offset) {
    const bool have_earlier_match = (offsets_->group_leader_offset != 0);
    if (have_earlier_match) {
      return error::Internal(
          "Location of group_leader is ambiguous. Found multiple possible offsets. "
          "[previous=$0 current=$1]",
          offsets_->group_leader_offset, offset);
    }

    offsets_->group_leader_offset = offset;
    return Status::OK();
  }

  // Returns OK only if both offsets have been set (i.e. are non-zero).
  Status CheckPopulated() {
    if (offsets_->real_start_time_offset == 0) {
      return error::Internal("Could not find offset for real_start_time/start_boottime.");
    }
    if (offsets_->group_leader_offset == 0) {
      return error::Internal("Could not find offset for group_leader.");
    }
    return Status::OK();
  }

 private:
  // Not owned; the struct being populated.
  TaskStructOffsets* offsets_;
  // Number of accepted real_start_time candidates so far.
  int num_start_time_matches_ = 0;
};
// Scans the raw task struct buffer, one 64-bit word at a time, for two known values:
//  - proc_pid_start_time locates the real_start_time/start_boottime member
//    (the member's name changed across linux versions). Each word is converted from
//    nanoseconds to clock ticks before being compared.
//  - task_struct_addr locates a pointer to self, which indicates the group_leader member.
//    This works since we are tracing a single-threaded program, so the main thread's leader
//    is itself.
// Returns the populated offsets, or an error if a field is missing or its location is ambiguous.
StatusOr<TaskStructOffsets> ScanBufferForFields(const struct buf& buf,
                                                const uint64_t proc_pid_start_time,
                                                const uint64_t task_struct_addr) {
  VLOG(1) << absl::Substitute("task_struct_address = $0", task_struct_addr);
  VLOG(1) << absl::Substitute("/proc/self/stat:start_time = $0", proc_pid_start_time);

  TaskStructOffsets task_struct_offsets;
  TaskStructOffsetsManager offsets_manager(&task_struct_offsets);

  constexpr int kWordBytes = static_cast<int>(sizeof(uint64_t));

  // Byte offset of the word currently under inspection.
  int byte_offset = 0;
  for (const auto& word : buf.u64words) {
    const bool matches_start_time = (pl_nsec_to_clock_t(word) == proc_pid_start_time);
    if (matches_start_time) {
      VLOG(1) << absl::Substitute("[offset = $0] Found real_start_time", byte_offset);
      PL_RETURN_IF_ERROR(offsets_manager.SetRealStartTimeOffset(byte_offset));
    }

    const bool matches_self_pointer = (word == task_struct_addr);
    if (matches_self_pointer) {
      VLOG(1) << absl::Substitute("[offset = $0] Found group_leader.", byte_offset);
      PL_RETURN_IF_ERROR(offsets_manager.SetGroupLeaderOffset(byte_offset));
    }

    byte_offset += kWordBytes;
  }

  PL_RETURN_IF_ERROR(offsets_manager.CheckPopulated());
  return task_struct_offsets;
}
} // namespace
// Resolves the task struct offsets from within the current process:
//  1. Reads our own start time from /proc.
//  2. Deploys a BPF program attached (as a uprobe) to StirlingProbeTrigger() in our own binary.
//  3. Calls StirlingProbeTrigger() to fire the probe.
//  4. Reads the task struct address and the raw task struct buffer back from BPF maps.
//  5. Scans that buffer for the known values to determine the field offsets.
StatusOr<TaskStructOffsets> ResolveTaskStructOffsetsCore() {
  // Get the PID start time from /proc.
  PL_ASSIGN_OR_RETURN(uint64_t self_start_time,
                      ::px::system::GetPIDStartTimeTicks("/proc/self"));

  PL_ASSIGN_OR_RETURN(std::filesystem::path self_binary, GetSelfPath());

  // The probe is specified by address rather than by symbol,
  // so that the uprobe can still attach even if debug symbols are stripped.
  UProbeSpec trigger_probe{.binary_path = self_binary,
                           .symbol = {},  // Keep GCC happy.
                           .address = reinterpret_cast<uint64_t>(&StirlingProbeTrigger),
                           .attach_type = BPFProbeAttachType::kEntry,
                           .probe_fn = "task_struct_probe"};

  // Deploy the BPF program.
  // Important! Must tell BCCWrapper that we don't need linux headers, otherwise we may
  // enter an infinite loop if BCCWrapper tries to run the TaskStructResolver again.
  auto bcc = std::make_unique<px::stirling::bpf_tools::BCCWrapper>();
  std::vector<std::string> cflags;
  bool requires_linux_headers = false;
  PL_RETURN_IF_ERROR(bcc->InitBPFProgram(bcc_script, cflags, requires_linux_headers));
  PL_RETURN_IF_ERROR(bcc->AttachUProbe(trigger_probe));

  // Trigger our uprobe.
  StirlingProbeTrigger();

  // Retrieve the task struct address from BPF map.
  uint64_t task_struct_addr;
  if (ebpf::StatusTuple addr_status =
          bcc->GetArrayTable<uint64_t>("task_struct_address_map").get_value(0, task_struct_addr);
      !addr_status.ok()) {
    return error::Internal("Failed to read task_struct_address_map");
  }

  // Retrieve the raw memory buffer of the task struct.
  struct buf task_struct_mem;
  if (ebpf::StatusTuple buf_status =
          bcc->GetArrayTable<struct buf>("task_struct_buf").get_value(0, task_struct_mem);
      !buf_status.ok()) {
    return error::Internal("Failed to read task_struct_buf");
  }

  // Analyze the raw data buffer for the patterns we are looking for.
  return ScanBufferForFields(task_struct_mem, self_start_time, task_struct_addr);
}
namespace {
// Reads exactly one TaskStructOffsets object from `fd` into `*result`.
// Waits up to 5 seconds for data to become available; returns an error on timeout,
// on a poll failure, or if the read does not deliver the full object.
Status ReadFromChild(int fd, TaskStructOffsets* result) {
  constexpr int kTimeoutMillis = 5000;

  // We don't expect to fail to receive data from the child,
  // but poll first so that we can't block indefinitely in read().
  struct pollfd read_end;
  read_end.fd = fd;
  read_end.events = POLLIN;

  const int num_ready = poll(&read_end, 1, kTimeoutMillis);
  if (num_ready <= 0) {
    // 0 means timeout, negative means poll() itself failed.
    return error::Internal("Failed to receive data from child.");
  }

  const ssize_t bytes_read = read(fd, result, sizeof(*result));
  const bool got_whole_object = (bytes_read == static_cast<ssize_t>(sizeof(*result)));
  if (!got_whole_object) {
    return error::Internal("Failed to receive data from child.");
  }

  return Status::OK();
}
// Writes the raw bytes of `result` to `fd` with a single write() call.
// Returns an error unless the whole object was written.
Status WriteToParent(int fd, const TaskStructOffsets& result) {
  const ssize_t expected_bytes = static_cast<ssize_t>(sizeof(result));
  const ssize_t bytes_written = write(fd, &result, sizeof(result));

  // We don't expect a short or failed write on this pipe, but check just in case.
  const bool wrote_everything = (bytes_written == expected_bytes);
  if (!wrote_everything) {
    return error::Internal("Failed to write data to parent.");
  }
  return Status::OK();
}
} // namespace
// Resolves the task struct offsets by running ResolveTaskStructOffsetsCore() in a forked
// child process, and transferring the result back to the parent through a pipe.
//
// Since a StatusOr can't be transferred through the pipe, the child sends a
// default-constructed TaskStructOffsets (kSentinelValue) to signal failure.
//
// Returns the resolved offsets, or an error if the pipe or fork could not be created,
// if no complete result was received from the child, or if the child reported failure.
// Both pipe descriptors are closed on every return path.
//
// NOTE(review): the child process is not reaped (no waitpid) by this function;
// confirm whether that is handled elsewhere.
StatusOr<TaskStructOffsets> ResolveTaskStructOffsets() {
  const TaskStructOffsets kSentinelValue;

  // Create pipe descriptors: fd[0] is the read end, fd[1] is the write end.
  int fd[2];
  int retval = pipe(fd);
  if (retval == -1) {
    return error::Internal("Resolution failed. Unable to create pipe: $0", std::strerror(errno));
  }

  pid_t child_pid = fork();
  if (child_pid == -1) {
    // fork() failed, so there is no child that will ever write to the pipe.
    // Save errno first, since close() may overwrite it.
    const int fork_errno = errno;
    close(fd[0]);
    close(fd[1]);
    return error::Internal("Resolution failed. Unable to fork: $0", std::strerror(fork_errno));
  }

  if (child_pid != 0) {
    // Parent process: Wait for results from child.

    // The parent never writes. Closing our copy of the write end means that if the child
    // exits without writing, the read end reports end-of-file instead of waiting out the
    // full poll timeout.
    close(fd[1]);

    // Read data from child (bounded by the timeout inside ReadFromChild).
    TaskStructOffsets result;
    Status read_status = ReadFromChild(fd[0], &result);

    // Done with the pipe, regardless of whether the read succeeded.
    close(fd[0]);

    PL_RETURN_IF_ERROR(read_status);

    // We can't transfer StatusOr through the pipe,
    // so we have to check manually.
    if (result == kSentinelValue) {
      return error::Internal(
          "Resolution failed in subprocess. Check subprocess logs for the error.");
    }

    return result;
  }

  // Child process: Run ResolveTaskStructOffsetsCore(),
  // and send the result to the parent through the pipe.
  // On error, we send kSentinelValue.

  // The child never reads.
  close(fd[0]);

  StatusOr<TaskStructOffsets> result_status = ResolveTaskStructOffsetsCore();
  LOG_IF(ERROR, !result_status.ok()) << result_status.ToString();

  TaskStructOffsets result = result_status.ValueOr(kSentinelValue);

  // Send the value on the write-descriptor.
  Status s = WriteToParent(fd[1], result);
  LOG_IF(ERROR, !s.ok()) << s.ToString();

  close(fd[1]);

  exit(0);
}
} // namespace utils
} // namespace stirling
} // namespace px