// +build ignore
// ^^ this is a golang build tag meant to exclude this C file from compilation
// by the CGO compiler
//
// SPDX-License-Identifier: GPL-2.0-only
// Copyright 2022 The Parca Authors
#include <common.h>
#include <hash.h>
#include <vmlinux.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "shared.h"
#include "go_runtime.h"
/*================================ CONSTANTS =================================*/
// Programs.
#define NATIVE_UNWINDER_PROGRAM_ID 0
#define RUBY_UNWINDER_PROGRAM_ID 1
#define PYTHON_UNWINDER_PROGRAM_ID 2
#define JAVA_UNWINDER_PROGRAM_ID 3
#if __TARGET_ARCH_x86
// Number of frames to walk per tail call iteration.
#define MAX_STACK_DEPTH_PER_PROGRAM 7
// Number of BPF tail calls that will be attempted.
#define MAX_TAIL_CALLS 19
#endif
#if __TARGET_ARCH_arm64
// Number of frames to walk per tail call iteration.
#define MAX_STACK_DEPTH_PER_PROGRAM 5
// Number of BPF tail calls that will be attempted.
#define MAX_TAIL_CALLS 26
#endif
// Ensure that bpf_perf_prog_read_value() used to clear the addresses will fail as
// the size won't be the expected one. On failure, this helper will zero the buffer.
_Static_assert(sizeof(stack_trace_t) != sizeof(struct bpf_perf_event_value), "stack size must be different to the valid argument");
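// A minimal sketch of the zeroing trick this assert protects (the call site and
// variable names here are illustrative, not necessarily the ones used further
// down in this file):
//
//   // Passing a buffer whose size differs from sizeof(struct bpf_perf_event_value)
//   // makes the helper fail and memset the whole buffer to zero, which is cheaper
//   // than clearing MAX_STACK_DEPTH addresses one by one.
//   bpf_perf_prog_read_value(ctx, (struct bpf_perf_event_value *)&unwind_state->stack, sizeof(stack_trace_t));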
// Maximum number of frames.
_Static_assert(MAX_TAIL_CALLS * MAX_STACK_DEPTH_PER_PROGRAM >= MAX_STACK_DEPTH, "enough iterations to traverse the whole stack");
// Number of unique stacks.
#define MAX_STACK_TRACES_ENTRIES 64000
// Maximum number of processes we are willing to track.
#define MAX_PROCESSES 5000
// Binary search iterations for DWARF-based stack walking.
// 2^19 can bisect up to 524,288 entries.
#define MAX_UNWIND_INFO_BINARY_SEARCH_DEPTH 19
// Size of the unwind table, in rows.
// 250k * sizeof(stack_unwind_row_t) is ~3.5MB on x86_64 (14-byte rows) and 4MB on arm64 (16-byte rows).
#define MAX_UNWIND_TABLE_SIZE (250 * 1000)
_Static_assert(1 << MAX_UNWIND_INFO_BINARY_SEARCH_DEPTH >= MAX_UNWIND_TABLE_SIZE, "unwind table is big enough");
// Unwind tables that can't fit in the remaining space of the current shard
// are broken up into chunks of up to `MAX_UNWIND_TABLE_SIZE` rows.
#define MAX_UNWIND_TABLE_CHUNKS 30
// Maximum memory mappings per process.
#define MAX_MAPPINGS_PER_PROCESS 2000
#define MAX_MAPPINGS_BINARY_SEARCH_DEPTH 12
_Static_assert(1 << MAX_MAPPINGS_BINARY_SEARCH_DEPTH >= MAX_MAPPINGS_PER_PROCESS, "mappings array is big enough");
// Values for dwarf expressions.
#define DWARF_EXPRESSION_UNKNOWN 0
#define DWARF_EXPRESSION_PLT1 1
#define DWARF_EXPRESSION_PLT2 2
// Values for the unwind table's CFA type.
#define CFA_TYPE_RBP 1
#define CFA_TYPE_RSP 2
#define CFA_TYPE_EXPRESSION 3
// Special values.
#define CFA_TYPE_END_OF_FDE_MARKER 4
// Values for the unwind table's frame pointer type.
#define RBP_TYPE_UNCHANGED 0
#define RBP_TYPE_OFFSET 1
#define RBP_TYPE_REGISTER 2
#define RBP_TYPE_EXPRESSION 3
// Special values.
#define RBP_TYPE_UNDEFINED_RETURN_ADDRESS 4
// Binary search error codes.
#define BINARY_SEARCH_DEFAULT 0xFABADAFABADAULL
#define BINARY_SEARCH_SHOULD_NEVER_HAPPEN 0xDEADBEEFDEADBEEFULL
#define BINARY_SEARCH_EXHAUSTED_ITERATIONS 0xBADFADBADFADBADULL
#define BINARY_SEARCH_NOT_FOUND(var) (var == BINARY_SEARCH_DEFAULT)
#define BINARY_SEARCH_FAILED(var) (var == BINARY_SEARCH_SHOULD_NEVER_HAPPEN || var == BINARY_SEARCH_EXHAUSTED_ITERATIONS)
#define REQUEST_UNWIND_INFORMATION 0
#define REQUEST_PROCESS_MAPPINGS 1
#define REQUEST_REFRESH_PROCINFO 2
#define REQUEST_READ 3
#define ENABLE_STATS_PRINTING false
enum runtime_unwinder_type {
RUNTIME_UNWINDER_TYPE_UNDEFINED = 0,
RUNTIME_UNWINDER_TYPE_RUBY = 1,
RUNTIME_UNWINDER_TYPE_PYTHON = 2,
RUNTIME_UNWINDER_TYPE_JAVA = 3,
RUNTIME_UNWINDER_TYPE_GO = 4,
};
enum find_unwind_table_return {
FIND_UNWIND_SUCCESS = 1,
FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN = 2,
FIND_UNWIND_MAPPING_EXHAUSTED_SEARCH = 3,
FIND_UNWIND_MAPPING_NOT_FOUND = 4,
FIND_UNWIND_CHUNK_NOT_FOUND = 5,
FIND_UNWIND_JITTED = 100,
FIND_UNWIND_SPECIAL = 200,
};
struct unwinder_config_t {
bool filter_processes;
bool verbose_logging;
bool mixed_stack_enabled;
bool python_enabled;
bool ruby_enabled;
bool java_enabled;
bool collect_trace_id;
/* 1 byte of padding */
bool _padding;
u32 rate_limit_unwind_info;
u32 rate_limit_process_mappings;
u32 rate_limit_refresh_process_info;
u32 rate_limit_reads;
};
struct unwinder_stats_t {
u64 total_entries;
u64 total_runs;
u64 total_samples;
u64 success_dwarf;
u64 error_truncated;
u64 error_unsupported_expression;
u64 error_unsupported_frame_pointer_action;
u64 error_unsupported_cfa_register;
u64 error_catchall;
u64 error_should_never_happen;
u64 error_pc_not_covered;
u64 error_pc_not_covered_jit;
u64 error_jit_unupdated_mapping;
u64 error_jit_mixed_mode_disabled; // JIT error because mixed-mode unwinding is disabled
u64 success_jit_frame;
u64 success_jit_to_dwarf;
u64 success_dwarf_to_jit;
u64 success_dwarf_reach_bottom;
u64 success_jit_reach_bottom;
u64 event_request_unwind_information;
u64 event_request_process_mappings;
u64 event_request_refresh_process_info;
u64 event_request_read;
u64 total_zero_pids;
u64 total_kthreads;
u64 total_filter_misses;
};
const volatile struct unwinder_config_t unwinder_config = {};
/*============================== MACROS =====================================*/
#define BPF_MAP(_name, _type, _key_type, _value_type, _max_entries) \
struct { \
__uint(type, _type); \
__uint(max_entries, _max_entries); \
__type(key, _key_type); \
__type(value, _value_type); \
} _name SEC(".maps");
#define BPF_HASH(_name, _key_type, _value_type, _max_entries) BPF_MAP(_name, BPF_MAP_TYPE_HASH, _key_type, _value_type, _max_entries);
#define LOG(fmt, ...) \
({ \
if (unwinder_config.verbose_logging) { \
bpf_printk("native: " fmt, ##__VA_ARGS__); \
} \
})
/*============================= INTERNAL STRUCTS ============================*/
// Unwind tables are split into chunks, and each chunk
// maps to a range of unwind rows within a shard.
typedef struct {
u64 low_pc;
u64 high_pc;
u64 shard_index;
u64 low_index;
u64 high_index;
} chunk_info_t;
// Unwind table shards for an executable mapping.
typedef struct {
chunk_info_t chunks[MAX_UNWIND_TABLE_CHUNKS];
} unwind_info_chunks_t;
// Represents an executable mapping.
typedef struct {
u64 load_address;
u64 begin;
u64 end;
u64 executable_id;
u64 type;
} mapping_t;
// Executable mappings for a process.
typedef struct {
u64 should_use_fp_by_default;
u64 is_jit_compiler;
u64 unwinder_type;
u64 len;
mapping_t mappings[MAX_MAPPINGS_PER_PROCESS];
} process_info_t;
// A row in the stack unwinding table (the lr_offset field is Arm64-only).
typedef struct __attribute__((packed)) {
u64 pc;
#if __TARGET_ARCH_arm64
s16 lr_offset;
#endif
u8 cfa_type;
u8 rbp_type;
s16 cfa_offset;
s16 rbp_offset;
} stack_unwind_row_t;
#if __TARGET_ARCH_arm64
_Static_assert(sizeof(stack_unwind_row_t) == 16, "unwind row has the expected size");
#endif
#if __TARGET_ARCH_x86
_Static_assert(sizeof(stack_unwind_row_t) == 14, "unwind row has the expected size");
#endif
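// For reference, the packed layout adds up as: u64 pc (8) + u8 cfa_type (1) +
// u8 rbp_type (1) + s16 cfa_offset (2) + s16 rbp_offset (2) = 14 bytes on x86,
// plus s16 lr_offset (2) = 16 bytes on arm64.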
// Unwinding table representation.
typedef struct {
stack_unwind_row_t rows[MAX_UNWIND_TABLE_SIZE];
} stack_unwind_table_t;
#define RUNTIME_INFO_TAG_GO 0
typedef struct {
union {
struct go_runtime_offsets go;
} inner;
u8 tag;
} runtime_info_t;
typedef struct {
u32 pc_not_covered;
u32 no_unwind_info;
u32 missed_filter;
u32 mapping_not_found;
u32 chunk_not_found;
u32 null_unwind_table;
u32 table_not_found;
u32 rbp_failed;
u32 ra_failed;
u32 unsupported_fp_action;
u32 unsupported_cfa;
u32 truncated;
u32 previous_rsp_zero;
u32 previous_rip_zero;
u32 previous_rbp_zero;
u32 internal_error;
} unwind_failed_reasons_t;
/*================================ MAPS =====================================*/
BPF_HASH(debug_threads_ids, int, u8, 1); // Table size will be updated in userspace.
BPF_HASH(process_info, int, process_info_t, MAX_PROCESSES);
BPF_HASH(unwind_info_chunks, u64, unwind_info_chunks_t,
5 * 1000); // Mapping of executable ID to unwind info chunks.
BPF_HASH(unwind_tables, u64, stack_unwind_table_t,
5); // Table size will be updated in userspace.
BPF_HASH(pid_to_runtime_info, int, runtime_info_t, MAX_PROCESSES);
typedef struct {
u8 type;
int pid;
} pid_event_t;
_Static_assert(sizeof(pid_event_t) == 8, "event payload expected to be 64 bits");
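// (1 byte for `type`, 3 bytes of implicit padding, 4 bytes for `pid` = 8 bytes.)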
BPF_HASH(events_count, pid_event_t, u32, MAX_PROCESSES);
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct unwinder_stats_t);
} percpu_stats SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 4);
__type(key, u32);
__type(value, u32);
} programs SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
__uint(max_entries, 8192);
} events SEC(".maps");
BPF_HASH(unwind_failed_reasons, pid_t, unwind_failed_reasons_t, MAX_PROCESSES)
#define BUMP_UNWIND_FAILED_COUNT(_pid, _reason) \
({ \
pid_t pid = _pid; \
unwind_failed_reasons_t zero = {0}; \
unwind_failed_reasons_t *p_failed_reasons = bpf_map_lookup_or_try_init(&unwind_failed_reasons, &pid, &zero); \
if (p_failed_reasons) { \
__sync_fetch_and_add(&p_failed_reasons->_reason, 1); \
} \
})
/*=========================== HELPER FUNCTIONS ==============================*/
#define DEFINE_COUNTER(__func__name) \
static void bump_unwind_##__func__name() { \
u32 zero = 0; \
struct unwinder_stats_t *unwinder_stats = bpf_map_lookup_elem(&percpu_stats, &zero); \
if (unwinder_stats != NULL) { \
unwinder_stats->__func__name++; \
} \
}
DEFINE_COUNTER(total_entries);
DEFINE_COUNTER(total_runs);
DEFINE_COUNTER(total_samples);
DEFINE_COUNTER(success_dwarf);
DEFINE_COUNTER(error_truncated);
DEFINE_COUNTER(error_unsupported_expression);
DEFINE_COUNTER(error_unsupported_frame_pointer_action);
DEFINE_COUNTER(error_unsupported_cfa_register);
DEFINE_COUNTER(error_catchall);
DEFINE_COUNTER(error_should_never_happen);
DEFINE_COUNTER(error_pc_not_covered);
DEFINE_COUNTER(error_pc_not_covered_jit);
DEFINE_COUNTER(error_jit_unupdated_mapping);
DEFINE_COUNTER(error_jit_mixed_mode_disabled);
DEFINE_COUNTER(success_jit_frame);
DEFINE_COUNTER(success_jit_to_dwarf);
DEFINE_COUNTER(success_dwarf_to_jit);
DEFINE_COUNTER(success_dwarf_reach_bottom);
// DEFINE_COUNTER(success_jit_reach_bottom);
DEFINE_COUNTER(event_request_unwind_information);
DEFINE_COUNTER(event_request_process_mappings);
DEFINE_COUNTER(event_request_refresh_process_info);
DEFINE_COUNTER(event_request_read)
DEFINE_COUNTER(total_zero_pids);
DEFINE_COUNTER(total_kthreads);
DEFINE_COUNTER(total_filter_misses);
// Hack to thwart the verifier's detection of variable bounds.
//
// In recent kernels (6.8 and above) the verifier has gotten smarter
// in its tracking of variable bounds. For example, after an if statement like
// `if (v1 < v2)`,
// if it already had computed bounds for v2, it can infer bounds
// for v1 in each side of the branch (and vice versa). This means it can verify more
// programs successfully, which doesn't matter to us because our program was
// verified successfully before. Unfortunately it has a downside which
// _does_ matter to us: it increases the number of unique verifier states,
// which can cause the same instructions to be explored many times, especially
// in cases where a value is carried through a loop and possibly has
// multiple sets of different bounds on each iteration of the loop, leading to
// a combinatorial explosion. This causes us to blow out the kernel's budget of
// maximum number of instructions verified on program load (currently 1M).
//
// `opaquify32` is a no-op; thus `opaquify32(x)` has the same value as `x`.
// However, the verifier is fortunately not smart enough to realize this,
// and will not realize the result has the same bounds as `x`, subverting the feature
// described above.
//
// For further discussion, see:
// https://lore.kernel.org/bpf/874jci5l3f.fsf@taipei.mail-host-address-is-not-set/
static __always_inline u32 opaquify32(u32 val) {
// We use inline asm to make sure clang doesn't optimize it out
asm volatile(
"%0 ^= 0xffffffff\n"
"%0 ^= 0xffffffff\n"
: "+r"(val)
);
return val;
}
// like opaquify32, but for u64.
static __always_inline u64 opaquify64(u64 val) {
asm volatile(
"%0 ^= 0xffffffffffffffff\n"
"%0 ^= 0xffffffffffffffff\n"
: "+r"(val)
);
return val;
}
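// A minimal usage sketch, mirroring the binary searches below: feeding loop
// variables through opaquify* right before they are used as array indices keeps
// the verifier from carrying per-iteration bounds across the loop:
//
//   u32 mid = (left + right) / 2;
//   mid = opaquify32(mid); // same value, but the verifier "forgets" its bounds
//   if (mid >= MAX_UNWIND_TABLE_SIZE)
//     return BINARY_SEARCH_SHOULD_NEVER_HAPPEN;
//   // ... index table->rows[mid] ...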
static void unwind_print_stats() {
// Do not use the LOG macro, always print the stats.
u32 zero = 0;
struct unwinder_stats_t *unwinder_stats = bpf_map_lookup_elem(&percpu_stats, &zero);
if (unwinder_stats == NULL) {
return;
}
bpf_printk("[[ stats for cpu %d ]]", (int)bpf_get_smp_processor_id());
bpf_printk("\tdwarf_success=%lu", unwinder_stats->success_dwarf);
bpf_printk("\tunsup_expression=%lu", unwinder_stats->error_unsupported_expression);
bpf_printk("\tunsup_frame=%lu", unwinder_stats->error_unsupported_frame_pointer_action);
bpf_printk("\ttruncated=%lu", unwinder_stats->error_truncated);
bpf_printk("\tunsup_cfa_reg=%lu", unwinder_stats->error_unsupported_cfa_register);
bpf_printk("\tcatchall=%lu", unwinder_stats->error_catchall);
bpf_printk("\tnever=%lu", unwinder_stats->error_should_never_happen);
bpf_printk("\tunsup_jit=%lu", unwinder_stats->error_jit_unupdated_mapping);
bpf_printk("\tunsup_jit_mixed_mode_disabled=%lu", unwinder_stats->error_jit_mixed_mode_disabled);
bpf_printk("\tjit_frame=%lu", unwinder_stats->success_jit_frame);
bpf_printk("\tjit_to_dwarf_switch=%lu", unwinder_stats->success_jit_to_dwarf);
bpf_printk("\tdwarf_to_jit_switch=%lu", unwinder_stats->success_dwarf_to_jit);
bpf_printk("\treached_bottom_frame_dwarf=%lu", unwinder_stats->success_dwarf_reach_bottom);
bpf_printk("\treached_bottom_frame_jit=%lu", unwinder_stats->success_jit_reach_bottom);
bpf_printk("\ttotal_entries_counter=%lu", unwinder_stats->total_entries);
bpf_printk("\ttotal_runs_counter=%lu", unwinder_stats->total_runs);
bpf_printk("\ttotal_samples_counter=%lu", unwinder_stats->total_samples);
bpf_printk("\t(not_covered=%lu)", unwinder_stats->error_pc_not_covered);
bpf_printk("\t(not_covered_jit=%lu)", unwinder_stats->error_pc_not_covered_jit);
bpf_printk("\t(total_zero_pids=%lu)", unwinder_stats->total_zero_pids);
bpf_printk("\t(total_kthreads=%lu)", unwinder_stats->total_kthreads);
bpf_printk("\t(total_filter_misses=%lu)", unwinder_stats->total_filter_misses);
bpf_printk("");
}
static void bump_samples() {
u32 zero = 0;
struct unwinder_stats_t *unwinder_stats = bpf_map_lookup_elem(&percpu_stats, &zero);
if (unwinder_stats == NULL) {
return;
}
if (ENABLE_STATS_PRINTING && unwinder_stats->total_samples % 50 == 0) {
unwind_print_stats();
}
bump_unwind_total_samples();
}
/*================================= EVENTS ==================================*/
static __always_inline bool event_rate_limited(pid_event_t event_id, int rate) {
u32 zero = 0;
u32 *val = bpf_map_lookup_or_try_init(&events_count, &event_id, &zero);
if (val) {
if (*val >= rate) {
return true;
}
__sync_fetch_and_add(val, 1);
}
// Even if we got here because the map is full, let's not rate-limit this event.
return false;
}
static __always_inline void request_unwind_information(struct bpf_perf_event_data *ctx, int user_pid) {
char comm[20];
bpf_get_current_comm(comm, 20);
LOG("[debug] requesting unwind info for PID: %d, comm: %s ctx IP: %llx", user_pid, comm, PT_REGS_IP(&ctx->regs));
pid_event_t payload = {REQUEST_UNWIND_INFORMATION, user_pid};
if (event_rate_limited(payload, unwinder_config.rate_limit_unwind_info)) {
return;
}
bump_unwind_event_request_unwind_information();
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &payload, sizeof(u64));
}
static __always_inline void request_process_mappings(struct bpf_perf_event_data *ctx, int user_pid) {
pid_event_t payload = {REQUEST_PROCESS_MAPPINGS, user_pid};
if (event_rate_limited(payload, unwinder_config.rate_limit_process_mappings)) {
return;
}
bump_unwind_event_request_process_mappings();
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &payload, sizeof(u64));
}
static __always_inline void request_refresh_process_info(struct bpf_perf_event_data *ctx, int user_pid) {
pid_event_t payload = {REQUEST_REFRESH_PROCINFO, user_pid};
if (event_rate_limited(payload, unwinder_config.rate_limit_process_mappings)) {
return;
}
bump_unwind_event_request_refresh_process_info();
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &payload, sizeof(u64));
}
static __always_inline void request_read(struct bpf_perf_event_data *ctx, int user_pid, u64 addr) {
typedef struct {
u8 type;
u32 pid;
u64 addr;
} payload_t;
_Static_assert(sizeof(payload_t) == 16, "request_read_addr payload expected to be 128 bits");
// `event_rate_limited` can fail open in case the map is already full.
// We want to have `rate_limit_reads == 0` act as a kill switch where we can be sure
// to NEVER try to read process memory from the agent, so let's just bail early in that case.
if (!unwinder_config.rate_limit_reads) {
return;
}
payload_t payload = {REQUEST_READ, user_pid, addr};
pid_event_t payload_for_rate_limiting = {REQUEST_READ, user_pid};
if (event_rate_limited(payload_for_rate_limiting, unwinder_config.rate_limit_reads)) {
return;
}
bump_unwind_event_request_read();
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &payload, sizeof(payload));
}
// Binary search the executable mappings to find the one that covers a given pc.
static u64 find_mapping(process_info_t *proc_info, u64 pc) {
u64 left = 0;
u64 right = proc_info->len;
u64 found = BINARY_SEARCH_DEFAULT;
// Find the mapping.
for (int i = 0; i < MAX_MAPPINGS_BINARY_SEARCH_DEPTH; i++) {
u32 mid = (left + right) / 2;
if (left >= right) {
return found;
}
mid = opaquify32(mid);
left = opaquify64(left);
right = opaquify64(right);
if (mid < 0 || mid >= MAX_MAPPINGS_PER_PROCESS) {
LOG("\t.should never happen");
return BINARY_SEARCH_SHOULD_NEVER_HAPPEN;
}
if (proc_info->mappings[mid].begin <= pc) {
found = mid;
left = mid + 1;
} else {
right = mid;
}
}
return BINARY_SEARCH_EXHAUSTED_ITERATIONS;
}
// Binary search the unwind table to find the row index containing the unwind
// information for a given program counter (pc).
static u64 find_offset_for_pc(stack_unwind_table_t *table, u64 pc, u64 left, u64 right) {
u64 found = BINARY_SEARCH_DEFAULT;
for (int i = 0; i < MAX_UNWIND_INFO_BINARY_SEARCH_DEPTH; i++) {
// TODO(javierhonduco): ensure that this condition is right as we use
// unsigned values...
if (left >= right) {
LOG("\t.done");
return found;
}
u32 mid = (left + right) / 2;
mid = opaquify32(mid);
left = opaquify32(left);
right = opaquify32(right);
// Appease the verifier.
if (mid < 0 || mid >= MAX_UNWIND_TABLE_SIZE) {
LOG("\t.should never happen, mid: %lu, max: %lu", mid, MAX_UNWIND_TABLE_SIZE);
bump_unwind_error_should_never_happen();
return BINARY_SEARCH_SHOULD_NEVER_HAPPEN;
}
// Debug logs.
// LOG("\t-> fetched PC %llx, target PC %llx (iteration %d/%d, mid: %d, left:%d, right:%d)", table->rows[mid].pc, pc, i,
// MAX_UNWIND_INFO_BINARY_SEARCH_DEPTH, mid, left, right);
if (table->rows[mid].pc <= pc) {
found = mid;
left = mid + 1;
} else {
right = mid;
}
// Debug logs.
// LOG("\t<- fetched PC %llx, target PC %llx (iteration %d/%d, mid:
// --, left:%d, right:%d)", ctx->table->rows[mid].pc, ctx->pc, index,
// MAX_UNWIND_INFO_BINARY_SEARCH_DEPTH, ctx->left, ctx->right);
}
return BINARY_SEARCH_EXHAUSTED_ITERATIONS;
}
// Finds whether a process should be unwound using the unwind
// tables.
static __always_inline bool has_unwind_information(pid_t per_process_id) {
process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &per_process_id);
if (proc_info) {
return true;
}
return false;
}
static __always_inline bool is_debug_enabled_for_thread(int per_thread_id) {
void *val = bpf_map_lookup_elem(&debug_threads_ids, &per_thread_id);
if (val) {
return true;
}
return false;
}
// Finds the shard information for a given pid and program counter. Optionally,
// an offset can be passed that will be filled in with the mapping's load
// address.
static __always_inline enum find_unwind_table_return find_unwind_table(chunk_info_t **chunk_info, pid_t pid, u64 pc, u64 *offset) {
process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &pid);
// Appease the verifier.
if (proc_info == NULL) {
LOG("[error] should never happen");
return FIND_UNWIND_MAPPING_SHOULD_NEVER_HAPPEN;
}
u64 executable_id = 0;
u64 load_address = 0;
u64 type = 0;
u64 index = find_mapping(proc_info, pc);
if (index == BINARY_SEARCH_DEFAULT) {
return FIND_UNWIND_MAPPING_NOT_FOUND;
}
if (index < 0 || index >= MAX_MAPPINGS_PER_PROCESS) {
return -1;
}
bool found = proc_info->mappings[index].begin <= pc && pc <= proc_info->mappings[index].end;
if (!found) {
LOG("[warn] :((( no mapping for ip=%llx", pc);
return FIND_UNWIND_MAPPING_NOT_FOUND;
}
// "type" here is set in userspace in our `proc_info` map to indicate JITed and special sections,
// It is not something we get from procfs.
executable_id = proc_info->mappings[index].executable_id;
load_address = proc_info->mappings[index].load_address;
type = proc_info->mappings[index].type;
if (offset != NULL) {
*offset = load_address;
}
if (type == 1) {
return FIND_UNWIND_JITTED;
}
if (type == 2) {
return FIND_UNWIND_SPECIAL;
}
LOG("~about to check shards found=%d", found);
LOG("~checking shards now");
// Find the chunk where this unwind table lives.
// Each chunk maps to exactly one shard.
unwind_info_chunks_t *chunks = bpf_map_lookup_elem(&unwind_info_chunks, &executable_id);
if (chunks == NULL) {
LOG("[info] chunks is null for executable %llu", executable_id);
return FIND_UNWIND_CHUNK_NOT_FOUND;
}
u64 adjusted_pc = pc - load_address;
for (int i = 0; i < MAX_UNWIND_TABLE_CHUNKS; i++) {
// Reached last chunk.
if (chunks->chunks[i].low_pc == 0) {
break;
}
if (chunks->chunks[i].low_pc <= adjusted_pc && adjusted_pc <= chunks->chunks[i].high_pc) {
LOG("[info] found chunk");
*chunk_info = &chunks->chunks[i];
return FIND_UNWIND_SUCCESS;
}
}
LOG("[error] could not find chunk for adjusted ip=0x%llx", adjusted_pc);
return FIND_UNWIND_CHUNK_NOT_FOUND;
}
// Kernel addresses have the top bits set.
static __always_inline bool in_kernel(u64 ip) {
return ip & (1UL << 63);
}
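// For example, a typical x86_64 kernel text address such as 0xffffffff81000000
// has bit 63 set, while user-space addresses (e.g. 0x00007f1234567000) do not.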
// A kthread's mm is not set.
//
// We don't check the return value of `retrieve_task_registers` in its
// caller due to the verifier not liking that code.
static __always_inline bool is_kthread() {
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
if (task == NULL) {
return false;
}
void *mm;
int err = bpf_probe_read_kernel(&mm, 8, &task->mm);
if (err) {
LOG("[warn] bpf_probe_read_kernel failed with %d", err);
return false;
}
return mm == NULL;
}
// avoid R0 invalid mem access 'scalar'
// Port of `task_pt_regs` in BPF.
#if __TARGET_ARCH_arm64
static __always_inline bool retrieve_task_registers(u64 *ip, u64 *sp, u64 *bp, u64 *lr) {
if (ip == NULL || sp == NULL || bp == NULL || lr == NULL) {
return false;
}
#elif __TARGET_ARCH_x86
static __always_inline bool retrieve_task_registers(u64 *ip, u64 *sp, u64 *bp) {
if (ip == NULL || sp == NULL || bp == NULL) {
return false;
}
#else
#error "Unsupported platform"
#endif
int err;
void *stack;
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
if (task == NULL) {
return false;
}
if (is_kthread()) {
return false;
}
err = bpf_probe_read_kernel(&stack, 8, &task->stack);
if (err) {
LOG("[warn] bpf_probe_read_kernel failed with %d", err);
return false;
}
void *ptr = stack + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
struct pt_regs *regs = ((struct pt_regs *)ptr) - 1;
*ip = PT_REGS_IP_CORE(regs);
*sp = PT_REGS_SP_CORE(regs);
*bp = PT_REGS_FP_CORE(regs);
#if __TARGET_ARCH_arm64
*lr = PT_REGS_RET_CORE(regs);
#endif
return true;
}
static __always_inline void unwind_using_kernel_provided_unwinder(struct bpf_perf_event_data *ctx, unwind_state_t *unwind_state, int user_or_kernel) {
long ret = bpf_get_stack(ctx, unwind_state->stack.addresses, MAX_STACK_DEPTH * sizeof(u64), user_or_kernel);
if (ret < 0) {
LOG("[error] bpf_get_stack (%d) failed: %d", ret, user_or_kernel);
return;
}
unwind_state->stack.len = ret / sizeof(u64);
}
static __always_inline void unwind_kernel_stack(struct bpf_perf_event_data *ctx, unwind_state_t *unwind_state) {
unwind_using_kernel_provided_unwinder(ctx, unwind_state, 0);
}
// Aggregate the given stacktrace.
static __always_inline void add_stack(struct bpf_perf_event_data *ctx, u64 pid_tgid, unwind_state_t *unwind_state) {
stack_count_key_t *stack_key = &unwind_state->stack_key;
int per_process_id = pid_tgid >> 32;
int per_thread_id = pid_tgid;
stack_key->pid = per_process_id;
stack_key->tgid = per_thread_id;
if (unwinder_config.collect_trace_id) {
runtime_info_t *runtime_info = bpf_map_lookup_elem(&pid_to_runtime_info, &per_process_id);
if (runtime_info && runtime_info->tag == RUNTIME_INFO_TAG_GO) {
get_trace_id(ctx, &runtime_info->inner.go, stack_key->trace_id);
}
}
// Hash and add user stack.
u64 user_stack_id = hash_stack(&unwind_state->stack, 0);
stack_key->user_stack_id = user_stack_id;
int err = bpf_map_update_elem(&stack_traces, &user_stack_id, &unwind_state->stack, BPF_ANY);
if (err != 0) {
LOG("[error] failed to update user stack with %d", err);
}
// Hash and add kernel stack.
unwind_kernel_stack(ctx, unwind_state);
u64 kernel_stack_id = hash_stack(&unwind_state->stack, 0);
stack_key->kernel_stack_id = kernel_stack_id;
err = bpf_map_update_elem(&stack_traces, &kernel_stack_id, &unwind_state->stack, BPF_ANY);
if (err != 0) {
LOG("[error] failed to update kernel stack with %d", err);
}
request_process_mappings(ctx, per_process_id);
// Continue unwinding runtimes, if any.
switch (unwind_state->unwinder_type) {
case RUNTIME_UNWINDER_TYPE_UNDEFINED:
case RUNTIME_UNWINDER_TYPE_GO:
// Most programs aren't "runtimes", this can be rather verbose.
// LOG("[debug] per_process_id: %d not a runtime", per_process_id);
aggregate_stacks();
break;
case RUNTIME_UNWINDER_TYPE_RUBY:
if (!unwinder_config.ruby_enabled) {
LOG("[debug] Ruby unwinder (rbperf) is disabled");
aggregate_stacks();
break;
}
LOG("[debug] tail-call to Ruby unwinder (rbperf)");
bpf_tail_call(ctx, &programs, RUBY_UNWINDER_PROGRAM_ID);
break;
case RUNTIME_UNWINDER_TYPE_PYTHON:
if (!unwinder_config.python_enabled) {
LOG("[debug] Python unwinder (pyperf) is disabled");
aggregate_stacks();
break;
}
LOG("[debug] tail-call to Python unwinder (pyperf)");
bpf_tail_call(ctx, &programs, PYTHON_UNWINDER_PROGRAM_ID);
break;
case RUNTIME_UNWINDER_TYPE_JAVA:
if (!unwinder_config.java_enabled) {
LOG("[debug] Java unwinder (jvm) is disabled");
aggregate_stacks();
break;
}
LOG("[debug] tail-call to Java unwinder (jvm)");
bpf_tail_call(ctx, &programs, JAVA_UNWINDER_PROGRAM_ID);
break;
default:
LOG("[error] bad runtime unwinder type value: %d", unwind_state->unwinder_type);
break;
}
}
static __always_inline void add_frame(unwind_state_t *unwind_state, u64 frame) {
u64 len = unwind_state->stack.len;
if (len >= 0 && len < MAX_STACK_DEPTH) {
unwind_state->stack.addresses[len] = frame;
unwind_state->stack.len++;
}
}
static __always_inline u64 canonicalize_addr(u64 addr) {
#if __TARGET_ARCH_arm64
// aarch64 has a 48-bit address space; one bit (bit 55, the one tested below)
// indicates whether a pointer points into kernel or user space.
// The remaining 15 high bits can be used for various other purposes.
// Before reading from an address, it needs to be canonicalized by setting
// the higher-order bits to all ones or all zeros for kernel and user space,
// respectively.
return (addr & (1ull << 55)) ? (addr | 0xFFFF000000000000) : (addr & 0x0000FFFFFFFFFFFF);
#else
return addr;
#endif
}
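// For example (illustrative values): a tagged kernel pointer such as
// 0x3effffc012345678 has bit 55 set and canonicalizes to 0xffffffc012345678,
// while a tagged user pointer such as 0x5a007f1234567000 has bit 55 clear and
// canonicalizes to 0x00007f1234567000.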
SEC("perf_event")
int native_unwind(struct bpf_perf_event_data *ctx) {
u64 pid_tgid = bpf_get_current_pid_tgid();
int per_process_id = pid_tgid >> 32;
int err = 0;
bool reached_bottom_of_stack = false;
u32 zero = 0;
bool dwarf_to_jit = false;
unwind_state_t *unwind_state = bpf_map_lookup_elem(&heap, &zero);
if (unwind_state == NULL) {
LOG("unwind_state is NULL, should not happen");
return 1;
}
process_info_t *proc_info = bpf_map_lookup_elem(&process_info, &per_process_id);
if (proc_info == NULL) {
LOG("[error] should never happen");
return 1;
}
for (int i = 0; i < MAX_STACK_DEPTH_PER_PROGRAM; i++) {
LOG("## frame: %d", unwind_state->stack.len);
LOG("\tcurrent pc: %llx", unwind_state->ip);
LOG("\tcurrent sp: %llx", unwind_state->sp);
LOG("\tcurrent bp: %llx", unwind_state->bp);
u64 offset = 0;
chunk_info_t *chunk_info = NULL;
enum find_unwind_table_return unwind_table_result = find_unwind_table(&chunk_info, per_process_id, unwind_state->ip, &offset);
if (unwind_table_result == FIND_UNWIND_JITTED) {
LOG("[debug] Unwinding JITed stacks");
unwind_state->unwinding_jit = true;
if (dwarf_to_jit) {
dwarf_to_jit = false;
bump_unwind_success_dwarf_to_jit();
}
bump_unwind_success_jit_frame();
unwind_state->use_fp = true;
goto unwind_with_frame_pointers;
} else if (unwind_table_result == FIND_UNWIND_SPECIAL) {
LOG("vDSO mapping, trying with frame pointers");
runtime_info_t *runtime_info = bpf_map_lookup_elem(&pid_to_runtime_info, &per_process_id);
if (runtime_info && runtime_info->tag == RUNTIME_INFO_TAG_GO) {
u64 sp = 0;
u64 pc = 0;
bool success = get_go_vdso_state(ctx, &runtime_info->inner.go, &sp, &pc);
if (!success) {
LOG("[error] failed to read Go vdso state");
} else if (sp && pc) {
LOG("[info] got vdso state: sp=0x%lx, pc=0x%lx", sp, pc);
unwind_state->vdso_sp = sp;
unwind_state->vdso_pc = pc;
}
}
unwind_state->use_fp = true;
goto unwind_with_frame_pointers;
} else if (unwind_table_result == FIND_UNWIND_MAPPING_NOT_FOUND) {
LOG("[warn] mapping not found");
request_refresh_process_info(ctx, per_process_id);
BUMP_UNWIND_FAILED_COUNT(per_process_id, mapping_not_found);
return 1;
} else if (unwind_table_result == FIND_UNWIND_CHUNK_NOT_FOUND) {
if (proc_info->should_use_fp_by_default) {
LOG("[info] chunk not found, trying with frame pointers");
unwind_state->use_fp = true;
goto unwind_with_frame_pointers;
}
LOG("[info] chunk not found but fp unwinding not allowed");
BUMP_UNWIND_FAILED_COUNT(per_process_id, chunk_not_found);
return 1;
} else if (chunk_info == NULL) {
LOG("[debug] chunks is null");
reached_bottom_of_stack = true;
break;
}
stack_unwind_table_t *unwind_table = bpf_map_lookup_elem(&unwind_tables, &chunk_info->shard_index);
if (unwind_table == NULL) {
LOG("unwind table is null :( for shard %llu", chunk_info->shard_index);
BUMP_UNWIND_FAILED_COUNT(per_process_id, null_unwind_table);
return 0;
}
LOG("le offset: %llx", offset);
u64 left = chunk_info->low_index;
u64 right = chunk_info->high_index;
LOG("========== left %llu right %llu", left, right);
u64 table_idx = find_offset_for_pc(unwind_table, unwind_state->ip - offset, left, right);
if (BINARY_SEARCH_NOT_FOUND(table_idx) || BINARY_SEARCH_FAILED(table_idx)) {
LOG("[error] binary search failed with %llx", table_idx);
BUMP_UNWIND_FAILED_COUNT(per_process_id, table_not_found);
return 1;
}
LOG("\t=> table_index: %d", table_idx);
LOG("\t=> adjusted pc: %llx", unwind_state->ip - offset);
// Appease the verifier.
if (table_idx < 0 || table_idx >= MAX_UNWIND_TABLE_SIZE) {
LOG("\t[error] this should never happen table_idx");
bump_unwind_error_should_never_happen();
BUMP_UNWIND_FAILED_COUNT(per_process_id, internal_error);
return 1;
}
// The lr offset is filled in from userspace and is only present as an unwind table field on Arm64.
#if __TARGET_ARCH_arm64
s16 found_lr_offset = unwind_table->rows[table_idx].lr_offset;
#endif