Commit af34245

Merge pull request redis#7289 from oranagra/defrag_edge_case

fix a rare active defrag edge case bug leading to stagnation

2 parents: 23a85ba + 88d71f4

5 files changed: +173 additions, -30 deletions
deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h

Lines changed: 17 additions & 6 deletions
@@ -216,7 +216,7 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
 }
 
 JEMALLOC_ALWAYS_INLINE int
-iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
+iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
     int defrag = 0;
     rtree_ctx_t rtree_ctx_fallback;
     rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@@ -232,11 +232,22 @@ iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
         malloc_mutex_lock(tsdn, &bin->lock);
         /* don't bother moving allocations from the slab currently used for new allocations */
         if (slab != bin->slabcur) {
-            const bin_info_t *bin_info = &bin_infos[binind];
-            size_t availregs = bin_info->nregs * bin->stats.curslabs;
-            *bin_util = ((long long)bin->stats.curregs<<16) / availregs;
-            *run_util = ((long long)(bin_info->nregs - extent_nfree_get(slab))<<16) / bin_info->nregs;
-            defrag = 1;
+            int free_in_slab = extent_nfree_get(slab);
+            if (free_in_slab) {
+                const bin_info_t *bin_info = &bin_infos[binind];
+                int curslabs = bin->stats.curslabs;
+                size_t curregs = bin->stats.curregs;
+                if (bin->slabcur) {
+                    /* remove slabcur from the overall utilization */
+                    curregs -= bin_info->nregs - extent_nfree_get(bin->slabcur);
+                    curslabs -= 1;
+                }
+                /* Compare the utilization ratio of the slab in question to the overall average.
+                 * To avoid precision loss and division, we do that by extrapolating the usage
+                 * of the slab as if all slabs had the same usage. If this slab is less used
+                 * than the average, we'll prefer to move its data to the hopefully more used slabs. */
+                defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs;
+            }
         }
         malloc_mutex_unlock(tsdn, &bin->lock);
     }
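The new hint reduces to a single integer comparison. As a worked example, here is a standalone C sketch (hypothetical slab numbers, not part of the commit) of the cross-multiplication it performs: instead of dividing to compare the candidate slab's utilization (nregs - free_in_slab) / nregs against the bin average curregs / (curslabs * nregs), both sides are multiplied by nregs * curslabs, avoiding division and precision loss entirely.

#include <stdio.h>

int main(void) {
    /* hypothetical bin state: the 640-byte bin, 32 regions per slab */
    int nregs = 32;        /* regions per slab (bin_info->nregs) */
    int free_in_slab = 12; /* free regions in the candidate slab */
    int curslabs = 10;     /* slabs in the bin, slabcur excluded */
    int curregs = 250;     /* used regions in those slabs, slabcur excluded */

    /* slab utilization (20/32) vs. bin average (250/320), cross-multiplied */
    int defrag = (nregs - free_in_slab) * curslabs <= curregs;

    printf("candidate holds %d/%d regs, bin averages %.1f regs/slab -> defrag=%d\n",
           nregs - free_in_slab, nregs, (double)curregs / curslabs, defrag);
    return 0;
}

Here the candidate slab holds 20 of 32 regions while the average slab holds 25, so the hint says to move the allocation toward the fuller slabs.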

deps/jemalloc/src/jemalloc.c

Lines changed: 3 additions & 5 deletions
@@ -3326,12 +3326,10 @@ jemalloc_postfork_child(void) {
 /******************************************************************************/
 
 /* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation.
- * returns 0 if the allocation is in the currently active run,
- * or when it is not causing any frag issue (large or huge bin)
- * returns the bin utilization and run utilization both in fixed point 16:16.
+ * returns 1 if the allocation should be moved, and 0 if the allocation should be kept.
  * If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. */
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
-get_defrag_hint(void* ptr, int *bin_util, int *run_util) {
+get_defrag_hint(void* ptr) {
     assert(ptr != NULL);
-    return iget_defrag_hint(TSDN_NULL, ptr, bin_util, run_util);
+    return iget_defrag_hint(TSDN_NULL, ptr);
 }
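On the consumer side, the contract is now just a boolean plus the MALLOCX_TCACHE_NONE requirement. A minimal sketch of a caller, loosely modeled on activeDefragAlloc in src/defrag.c; assumptions: jemalloc built with the je_ symbol prefix (as Redis builds it), and defrag_realloc is a hypothetical helper name.

#include <string.h>
#include <jemalloc/jemalloc.h>

/* declared by hand, as src/defrag.c does; exported by Redis's patched jemalloc */
int je_get_defrag_hint(void* ptr);

/* hypothetical helper: move ptr to a better slab, or return NULL to keep it */
void *defrag_realloc(void *ptr) {
    if (!je_get_defrag_hint(ptr))
        return NULL;                      /* hint says the current slab is fine */
    size_t size = je_malloc_usable_size(ptr);
    /* bypass the tcache so the new region comes from a (hopefully fuller) slab */
    void *newptr = je_mallocx(size, MALLOCX_TCACHE_NONE);
    if (!newptr)
        return NULL;
    memcpy(newptr, ptr, size);
    je_dallocx(ptr, MALLOCX_TCACHE_NONE);
    return newptr;
}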

src/debug.c

Lines changed: 27 additions & 7 deletions
@@ -311,6 +311,13 @@ void mallctl_int(client *c, robj **argv, int argc) {
     size_t sz = sizeof(old);
     while (sz > 0) {
         if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, argc > 1? &val: NULL, argc > 1?sz: 0))) {
+            if (ret == EPERM && argc > 1) {
+                /* if this option is write only, try just writing to it. */
+                if (!(ret=je_mallctl(argv[0]->ptr, NULL, 0, &val, sz))) {
+                    addReply(c, shared.ok);
+                    return;
+                }
+            }
             if (ret==EINVAL) {
                 /* size might be wrong, try a smaller one */
                 sz /= 2;
@@ -333,17 +340,30 @@ void mallctl_int(client *c, robj **argv, int argc) {
 }
 
 void mallctl_string(client *c, robj **argv, int argc) {
-    int ret;
+    int rret, wret;
     char *old;
     size_t sz = sizeof(old);
     /* for strings, it seems we need to first get the old value, before overriding it. */
-    if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, NULL, 0))) {
-        addReplyErrorFormat(c,"%s", strerror(ret));
-        return;
+    if ((rret=je_mallctl(argv[0]->ptr, &old, &sz, NULL, 0))) {
+        /* return error unless this option is write only. */
+        if (!(rret == EPERM && argc > 1)) {
+            addReplyErrorFormat(c,"%s", strerror(rret));
+            return;
+        }
+    }
+    if(argc > 1) {
+        char *val = argv[1]->ptr;
+        char **valref = &val;
+        if ((!strcmp(val,"VOID")))
+            valref = NULL, sz = 0;
+        wret = je_mallctl(argv[0]->ptr, NULL, 0, valref, sz);
     }
-    addReplyBulkCString(c, old);
-    if(argc > 1)
-        je_mallctl(argv[0]->ptr, NULL, 0, &argv[1]->ptr, sizeof(char*));
+    if (!rret)
+        addReplyBulkCString(c, old);
+    else if (wret)
+        addReplyErrorFormat(c,"%s", strerror(wret));
+    else
+        addReply(c, shared.ok);
 }
 #endif
 
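The EPERM fallback matters because some mallctl nodes are write-only: thread.tcache.flush, which the new test below relies on, rejects any attempt to read it. A minimal sketch of probing such a node directly (same je_ prefix assumption as above); this is essentially what DEBUG MALLCTL-STR thread.tcache.flush VOID now ends up doing.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <jemalloc/jemalloc.h>

int main(void) {
    char *old;
    size_t sz = sizeof(old);
    /* reading a write-only node fails with EPERM... */
    int ret = je_mallctl("thread.tcache.flush", &old, &sz, NULL, 0);
    printf("read:  %s\n", strerror(ret));
    /* ...so fall back to a pure write; a void node takes no buffers at all */
    ret = je_mallctl("thread.tcache.flush", NULL, NULL, NULL, 0);
    printf("write: %s\n", ret ? strerror(ret) : "OK");
    return 0;
}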

src/defrag.c

Lines changed: 3 additions & 10 deletions
@@ -43,7 +43,7 @@
 
 /* this method was added to jemalloc in order to help us understand which
  * pointers are worthwhile moving and which aren't */
-int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util);
+int je_get_defrag_hint(void* ptr);
 
 /* forward declarations*/
 void defragDictBucketCallback(void *privdata, dictEntry **bucketref);
@@ -55,18 +55,11 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
  * when it returns a non-null value, the old pointer was already released
  * and should NOT be accessed. */
 void* activeDefragAlloc(void *ptr) {
-    int bin_util, run_util;
     size_t size;
     void *newptr;
-    if(!je_get_defrag_hint(ptr, &bin_util, &run_util)) {
-        server.stat_active_defrag_misses++;
-        return NULL;
-    }
-    /* if this run is more utilized than the average utilization in this bin
-     * (or it is full), skip it. This will eventually move all the allocations
-     * from relatively empty runs into relatively full runs. */
-    if (run_util > bin_util || run_util == 1<<16) {
+    if(!je_get_defrag_hint(ptr)) {
         server.stat_active_defrag_misses++;
+        size = zmalloc_size(ptr);
        return NULL;
     }
     /* move this allocation to a new allocation.
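The deleted comparison is where the stagnation came from: when every settled slab in a bin sits at exactly the same utilization and only slabcur is emptier, slabcur drags the bin average below every candidate slab, so run_util > bin_util holds for all of them and nothing is ever moved. A small numeric C sketch (made-up slab counts, not from the commit) contrasting the old and new decisions:

#include <stdio.h>

int main(void) {
    /* hypothetical bin: 99 settled slabs at 24/32 regs used, slabcur at 8/32 */
    int nregs = 32, slabs = 100, used = 24, slabcur_used = 8;

    /* old hint: 16:16 fixed-point slab utilization vs. whole-bin average,
     * with slabcur included in the average */
    long long bin_util = ((long long)((slabs - 1) * used + slabcur_used) << 16)
                         / ((long long)slabs * nregs);
    long long run_util = ((long long)used << 16) / nregs;
    printf("old: skip? %d\n", run_util > bin_util);   /* 1 -> stagnation */

    /* new hint: slabcur excluded from the average, and <= lets slabs sitting
     * at exactly the average still qualify; `used` here is nregs - free_in_slab */
    int curslabs = slabs - 1;
    long long curregs = (long long)curslabs * used;
    printf("new: move? %d\n", (long long)used * curslabs <= curregs); /* 1 */
    return 0;
}

Under the old formula every slab looks "above average" (49152 > 48824 in 16:16 fixed point) and is skipped; under the new one the equal-utilization slabs qualify, so the defragger makes progress instead of spinning on misses.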

tests/unit/memefficiency.tcl

Lines changed: 123 additions & 2 deletions
@@ -95,6 +95,10 @@ start_server {tags {"defrag"}} {
             }
             if {$::verbose} {
                 puts "frag $frag"
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                puts "hits: $hits"
+                puts "misses: $misses"
                 puts "max latency $max_latency"
                 puts [r latency latest]
                 puts [r latency history active-defrag-cycle]
@@ -221,6 +225,10 @@ start_server {tags {"defrag"}} {
             }
             if {$::verbose} {
                 puts "frag $frag"
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                puts "hits: $hits"
+                puts "misses: $misses"
                 puts "max latency $max_latency"
                 puts [r latency latest]
                 puts [r latency history active-defrag-cycle]
@@ -256,11 +264,12 @@ start_server {tags {"defrag"}} {
         set expected_frag 1.7
         # add a mass of list nodes to two lists (allocations are interlaced)
         set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
-        for {set j 0} {$j < 500000} {incr j} {
+        set elements 500000
+        for {set j 0} {$j < $elements} {incr j} {
             $rd lpush biglist1 $val
             $rd lpush biglist2 $val
         }
-        for {set j 0} {$j < 500000} {incr j} {
+        for {set j 0} {$j < $elements} {incr j} {
             $rd read ; # Discard replies
             $rd read ; # Discard replies
         }
@@ -302,6 +311,8 @@ start_server {tags {"defrag"}} {
 
         # test that the fragmentation is lower
         after 120 ;# serverCron only updates the info once in 100ms
+        set misses [s active_defrag_misses]
+        set hits [s active_defrag_hits]
        set frag [s allocator_frag_ratio]
         set max_latency 0
         foreach event [r latency latest] {
@@ -312,6 +323,8 @@ start_server {tags {"defrag"}} {
         }
         if {$::verbose} {
             puts "frag $frag"
+            puts "misses: $misses"
+            puts "hits: $hits"
             puts "max latency $max_latency"
             puts [r latency latest]
             puts [r latency history active-defrag-cycle]
@@ -320,13 +333,121 @@ start_server {tags {"defrag"}} {
             # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
             # we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
             assert {$max_latency <= 30}
+
+            # in extreme cases of stagnation, we see over 20m misses before the test aborts with "defrag didn't stop",
+            # in normal cases we only see 100k misses out of 500k elements
+            assert {$misses < $elements}
         }
         # verify the data isn't corrupted or changed
         set newdigest [r debug digest]
         assert {$digest eq $newdigest}
         r save ;# saving an rdb iterates over all the data / pointers
         r del biglist1 ;# coverage for quicklistBookmarksClear
     } {1}
+
+    test "Active defrag edge case" {
+        # there was an edge case in defrag where all the slabs of a certain bin are exactly the same
+        # % utilization, with the exception of the current slab from which new allocations are made.
+        # if the current slab is lower in utilization the defragger would have ended up in stagnation,
+        # kept running and not moving any allocation.
+        # this test is more consistent on a fresh server with no history
+        start_server {tags {"defrag"}} {
+            r flushdb
+            r config resetstat
+            r config set save "" ;# prevent bgsave from interfering with save below
+            r config set hz 100
+            r config set activedefrag no
+            r config set active-defrag-max-scan-fields 1000
+            r config set active-defrag-threshold-lower 5
+            r config set active-defrag-cycle-min 65
+            r config set active-defrag-cycle-max 75
+            r config set active-defrag-ignore-bytes 1mb
+            r config set maxmemory 0
+            set expected_frag 1.3
+
+            r debug mallctl-str thread.tcache.flush VOID
+            # fill the first slab containing 32 regs of 640 bytes.
+            for {set j 0} {$j < 32} {incr j} {
+                r setrange "_$j" 600 x
+                r debug mallctl-str thread.tcache.flush VOID
+            }
+
+            # add a mass of keys with 600-byte values, filling the 640-byte bin which has 32 regs per slab.
+            set rd [redis_deferring_client]
+            set keys 640000
+            for {set j 0} {$j < $keys} {incr j} {
+                $rd setrange $j 600 x
+            }
+            for {set j 0} {$j < $keys} {incr j} {
+                $rd read ; # Discard replies
+            }
+
+            # create some fragmentation of 50%
+            set sent 0
+            for {set j 0} {$j < $keys} {incr j 1} {
+                $rd del $j
+                incr sent
+                incr j 1
+            }
+            for {set j 0} {$j < $sent} {incr j} {
+                $rd read ; # Discard replies
+            }
+
+            # create higher fragmentation in the first slab
+            for {set j 10} {$j < 32} {incr j} {
+                r del "_$j"
+            }
+
+            # start defrag
+            after 120 ;# serverCron only updates the info once in 100ms
+            set frag [s allocator_frag_ratio]
+            if {$::verbose} {
+                puts "frag $frag"
+            }
+
+            assert {$frag >= $expected_frag}
+
+            set digest [r debug digest]
+            catch {r config set activedefrag yes} e
+            if {![string match {DISABLED*} $e]} {
+                # wait for the active defrag to start working (decision once a second)
+                wait_for_condition 50 100 {
+                    [s active_defrag_running] ne 0
+                } else {
+                    fail "defrag not started."
+                }
+
+                # wait for the active defrag to stop working
+                wait_for_condition 500 100 {
+                    [s active_defrag_running] eq 0
+                } else {
+                    after 120 ;# serverCron only updates the info once in 100ms
+                    puts [r info memory]
+                    puts [r info stats]
+                    puts [r memory malloc-stats]
+                    fail "defrag didn't stop."
+                }
+
+                # test that the fragmentation is lower
+                after 120 ;# serverCron only updates the info once in 100ms
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                set frag [s allocator_frag_ratio]
+                if {$::verbose} {
+                    puts "frag $frag"
+                    puts "hits: $hits"
+                    puts "misses: $misses"
+                }
+                assert {$frag < 1.1}
+                assert {$misses < 10000000} ;# when defrag doesn't stop, we have some 30m misses, when it does, we have 2m misses
+            }
+
+            # verify the data isn't corrupted or changed
+            set newdigest [r debug digest]
+            assert {$digest eq $newdigest}
+            r save ;# saving an rdb iterates over all the data / pointers
+        }
+    }
 }
 }
 } ;# run_solo
