Skip to content

Commit

Permalink
Implement an 'aggs' per-CPU BPF map to store aggregation data
Browse files Browse the repository at this point in the history
Aggregation data consists of data items (all uint64_t) that are updated
when an aggregation function is executed.  This is quite different from
other data recording actions that append data to an output buffer.

We need support for concurrency control to allow reading the data during
non-atomic updates.  We use a latch mechanism, i.e. a multiversion
concurrency control mechanism, to satisfy these requirements.

Simply put, the per-CPU aggregation data area stores a sequence id that
tracks the generation of the data updates.  Since only one aggregation
can be updated at a time on a single CPU, one latch is sufficient for
all aggregations (per CPU-buffer).  We also allocate two copies of each
aggregation, which we will call A and B.

When we update the aggregation data, we first modify A and then we modify
B, ensuring that while one copy is being modified, the reader is directed
to read from the other.

Signed-off-by: Kris Van Hees <kris.van.hees@oracle.com>
Reviewed-by: Eugene Loh <eugene.loh@oracle.com>
  • Loading branch information
kvanhees committed Nov 30, 2020
1 parent 1ca381c commit 287e4bf
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 22 deletions.
20 changes: 19 additions & 1 deletion libdtrace/dt_bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ set_task_offsets(dtrace_hdl_t *dtp)
* - state: DTrace session state, used to communicate state between BPF
* programs and userspace. The content of the map is defined in
* dt_state.h.
* - aggs: Aggregation data buffer map, associated with each CPU. The
* map is implemented as a global per-CPU map with a singleton
* element (key 0). Every aggregation is stored with two copies
* of its data to provide a lockless latch-based mechanism for
* atomic reading and writing.
* - buffers: Perf event output buffer map, associating a perf event output
* buffer with each CPU. The map is indexed by CPU id.
* - cpuinfo: CPU information map, associating a cpuinfo_t structure with
Expand Down Expand Up @@ -191,7 +196,7 @@ set_task_offsets(dtrace_hdl_t *dtp)
int
dt_bpf_gmap_create(dtrace_hdl_t *dtp)
{
int gvarc, tvarc;
int gvarc, tvarc, aggsz;
int ci_mapfd;
uint32_t key = 0;

Expand All @@ -202,6 +207,9 @@ dt_bpf_gmap_create(dtrace_hdl_t *dtp)
/* Mark global maps creation as completed. */
dt_gmap_done = 1;

/* Determine the aggregation buffer size. */
aggsz = dt_idhash_nextoff(dtp->dt_aggs, 1, 0);

/* Determine the number of global and TLS variables. */
gvarc = dt_idhash_peekid(dtp->dt_globals) - DIF_VAR_OTHER_UBASE;
tvarc = dt_idhash_peekid(dtp->dt_tls) - DIF_VAR_OTHER_UBASE;
Expand All @@ -214,6 +222,16 @@ dt_bpf_gmap_create(dtrace_hdl_t *dtp)
if (dtp->dt_stmap_fd == -1)
return -1; /* dt_errno is set for us */

/*
* If there is aggregation data to be collected, we need to add a
* uint64_t to the map value size to hold a latch sequence number (seq)
* for concurrent access to the data.
*/
if (aggsz > 0 &&
create_gmap(dtp, "aggs", BPF_MAP_TYPE_PERCPU_ARRAY,
sizeof(uint32_t), sizeof(uint64_t) + aggsz, 1) == -1)
return -1; /* dt_errno is set for us */

if (create_gmap(dtp, "buffers", BPF_MAP_TYPE_PERF_EVENT_ARRAY,
sizeof(uint32_t), sizeof(uint32_t),
dtp->dt_conf.num_online_cpus) == -1)
Expand Down
16 changes: 9 additions & 7 deletions libdtrace/dt_dctx.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ typedef struct dt_dctx {
dt_activity_t *act; /* pointer to activity state */
dt_mstate_t *mst; /* DTrace machine state */
char *buf; /* Output buffer scratch memory */
char *agg; /* Aggregation data */
} dt_dctx_t;

#define DCTX_CTX offsetof(dt_dctx_t, ctx)
#define DCTX_ACT offsetof(dt_dctx_t, act)
#define DCTX_MST offsetof(dt_dctx_t, mst)
#define DCTX_BUF offsetof(dt_dctx_t, buf)
#define DCTX_AGG offsetof(dt_dctx_t, agg)
#define DCTX_SIZE ((int16_t)sizeof(dt_dctx_t))

/*
Expand Down Expand Up @@ -92,23 +94,23 @@ typedef struct dt_dctx {
* +----------------+
* SCRATCH_BASE = -512 | Scratch Memory |
* +----------------+
* LVAR_END = LVAR(n) = -256 | LVAR n | (n = DT_LVAR_MAX = 18)
* LVAR_END = LVAR(n) = -256 | LVAR n | (n = DT_LVAR_MAX = 17)
* +----------------+
* | ... |
* +----------------+
* LVAR(1) = -120 | LVAR 1 |
* LVAR(1) = -128 | LVAR 1 |
* +----------------+
* LVAR_BASE = LVAR(0) = -112 | LVAR 0 |
* LVAR_BASE = LVAR(0) = -120 | LVAR 0 |
* +----------------+
* SPILL(n) = -104 | %r8 | (n = DT_STK_NREGS - 1 = 8)
* SPILL(n) = -112 | %r8 | (n = DT_STK_NREGS - 1 = 8)
* +----------------+
* | ... |
* +----------------+
* SPILL(1) = -48 | %r1 |
* SPILL(1) = -56 | %r1 |
* +----------------+
* SPILL_BASE = SPILL(0) = -40 | %r0 |
* SPILL_BASE = SPILL(0) = -48 | %r0 |
* +----------------+
* DCTX = -32 | DTrace Context | -1
* DCTX = -40 | DTrace Context | -1
* +----------------+
*/
#define DT_STK_BASE ((int16_t)0)
Expand Down
1 change: 1 addition & 0 deletions libdtrace/dt_dlibs.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ static const dt_ident_t dt_bpf_symbols[] = {
DT_BPF_SYMBOL(dt_set_tvar, DT_IDENT_SYMBOL),
DT_BPF_SYMBOL(dt_strnlen, DT_IDENT_SYMBOL),
/* BPF maps */
DT_BPF_SYMBOL(aggs, DT_IDENT_PTR),
DT_BPF_SYMBOL(buffers, DT_IDENT_PTR),
DT_BPF_SYMBOL(cpuinfo, DT_IDENT_PTR),
DT_BPF_SYMBOL(gvars, DT_IDENT_PTR),
Expand Down
28 changes: 14 additions & 14 deletions test/unittest/codegen/tst.stack_layout.r
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
Base: 0
dctx: -32
%r0: -40
%r1: -48
%r2: -56
%r3: -64
%r4: -72
%r5: -80
%r6: -88
%r7: -96
%r8: -104
lvar[ -1]: -111 (ID -1)
lvar[ 0]: -112 (ID 0)
lvar[ 1]: -120 (ID 1)
lvar[ 18]: -256 (ID 18)
dctx: -40
%r0: -48
%r1: -56
%r2: -64
%r3: -72
%r4: -80
%r5: -88
%r6: -96
%r7: -104
%r8: -112
lvar[ -1]: -119 (ID -1)
lvar[ 0]: -120 (ID 0)
lvar[ 1]: -128 (ID 1)
lvar[ 17]: -256 (ID 17)
lvar[ -1]: -257 (ID -1)
scratch: -257 .. -512

0 comments on commit 287e4bf

Please sign in to comment.