Commit af34245

Merge pull request redis#7289 from oranagra/defrag_edge_case

fix a rare active defrag edge case bug leading to stagnation

2 parents: 23a85ba + 88d71f4

5 files changed: +173 additions, -30 deletions
deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h

Lines changed: 17 additions & 6 deletions
@@ -216,7 +216,7 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
 }
 
 JEMALLOC_ALWAYS_INLINE int
-iget_defrag_hint(tsdn_t *tsdn, void* ptr, int *bin_util, int *run_util) {
+iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
     int defrag = 0;
     rtree_ctx_t rtree_ctx_fallback;
     rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
@@ -232,11 +232,22 @@ iget_defrag_hint(tsdn_t *tsdn, void* ptr) {
         malloc_mutex_lock(tsdn, &bin->lock);
         /* don't bother moving allocations from the slab currently used for new allocations */
         if (slab != bin->slabcur) {
-            const bin_info_t *bin_info = &bin_infos[binind];
-            size_t availregs = bin_info->nregs * bin->stats.curslabs;
-            *bin_util = ((long long)bin->stats.curregs<<16) / availregs;
-            *run_util = ((long long)(bin_info->nregs - extent_nfree_get(slab))<<16) / bin_info->nregs;
-            defrag = 1;
+            int free_in_slab = extent_nfree_get(slab);
+            if (free_in_slab) {
+                const bin_info_t *bin_info = &bin_infos[binind];
+                int curslabs = bin->stats.curslabs;
+                size_t curregs = bin->stats.curregs;
+                if (bin->slabcur) {
+                    /* remove slabcur from the overall utilization */
+                    curregs -= bin_info->nregs - extent_nfree_get(bin->slabcur);
+                    curslabs -= 1;
+                }
+                /* Compare the utilization ratio of the slab in question to the overall average.
+                 * To avoid precision loss and division, we do that by extrapolating the usage
+                 * of the slab as if all slabs had the same usage. If this slab is less used
+                 * than the average, we'll prefer to move its data to the hopefully more used slabs. */
+                defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs;
+            }
         }
         malloc_mutex_unlock(tsdn, &bin->lock);
     }
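The new hint reduces to a single integer comparison. As a worked example, here is a standalone C sketch (hypothetical slab numbers, not part of the commit) of the cross-multiplication it performs: instead of dividing to compare the candidate slab's utilization (nregs - free_in_slab) / nregs against the bin average curregs / (curslabs * nregs), both sides are multiplied by nregs * curslabs, avoiding division and precision loss entirely.

#include <stdio.h>

int main(void) {
    /* hypothetical bin state: the 640-byte bin, 32 regions per slab */
    int nregs = 32;        /* regions per slab (bin_info->nregs) */
    int free_in_slab = 12; /* free regions in the candidate slab */
    int curslabs = 10;     /* slabs in the bin, slabcur excluded */
    int curregs = 250;     /* used regions in those slabs, slabcur excluded */

    /* slab utilization (20/32) vs. bin average (250/320), cross-multiplied */
    int defrag = (nregs - free_in_slab) * curslabs <= curregs;

    printf("candidate holds %d/%d regs, bin averages %.1f regs/slab -> defrag=%d\n",
           nregs - free_in_slab, nregs, (double)curregs / curslabs, defrag);
    return 0;
}

Here the candidate slab holds 20 of 32 regions while the average slab holds 25, so the hint says to move the allocation toward the fuller slabs.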

deps/jemalloc/src/jemalloc.c

Lines changed: 3 additions & 5 deletions
@@ -3326,12 +3326,10 @@ jemalloc_postfork_child(void) {
 /******************************************************************************/
 
 /* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation.
- * returns 0 if the allocation is in the currently active run,
- * or when it is not causing any frag issue (large or huge bin)
- * returns the bin utilization and run utilization both in fixed point 16:16.
+ * returns 1 if the allocation should be moved, and 0 if the allocation should be kept.
  * If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. */
 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
-get_defrag_hint(void* ptr, int *bin_util, int *run_util) {
+get_defrag_hint(void* ptr) {
     assert(ptr != NULL);
-    return iget_defrag_hint(TSDN_NULL, ptr, bin_util, run_util);
+    return iget_defrag_hint(TSDN_NULL, ptr);
 }
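On the consumer side, the contract is now just a boolean plus the MALLOCX_TCACHE_NONE requirement. A minimal sketch of a caller, loosely modeled on activeDefragAlloc in src/defrag.c; assumptions: jemalloc built with the je_ symbol prefix (as Redis builds it), and defrag_realloc is a hypothetical helper name.

#include <string.h>
#include <jemalloc/jemalloc.h>

/* declared by hand, as src/defrag.c does; exported by Redis's patched jemalloc */
int je_get_defrag_hint(void* ptr);

/* hypothetical helper: move ptr to a better slab, or return NULL to keep it */
void *defrag_realloc(void *ptr) {
    if (!je_get_defrag_hint(ptr))
        return NULL;                      /* hint says the current slab is fine */
    size_t size = je_malloc_usable_size(ptr);
    /* bypass the tcache so the new region comes from a (hopefully fuller) slab */
    void *newptr = je_mallocx(size, MALLOCX_TCACHE_NONE);
    if (!newptr)
        return NULL;
    memcpy(newptr, ptr, size);
    je_dallocx(ptr, MALLOCX_TCACHE_NONE);
    return newptr;
}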

src/debug.c

Lines changed: 27 additions & 7 deletions
@@ -311,6 +311,13 @@ void mallctl_int(client *c, robj **argv, int argc) {
     size_t sz = sizeof(old);
     while (sz > 0) {
         if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, argc > 1? &val: NULL, argc > 1?sz: 0))) {
+            if (ret == EPERM && argc > 1) {
+                /* if this option is write only, try just writing to it. */
+                if (!(ret=je_mallctl(argv[0]->ptr, NULL, 0, &val, sz))) {
+                    addReply(c, shared.ok);
+                    return;
+                }
+            }
             if (ret==EINVAL) {
                 /* size might be wrong, try a smaller one */
                 sz /= 2;
@@ -333,17 +340,30 @@ void mallctl_int(client *c, robj **argv, int argc) {
 }
 
 void mallctl_string(client *c, robj **argv, int argc) {
-    int ret;
+    int rret, wret;
     char *old;
     size_t sz = sizeof(old);
     /* for strings, it seems we need to first get the old value, before overriding it. */
-    if ((ret=je_mallctl(argv[0]->ptr, &old, &sz, NULL, 0))) {
-        addReplyErrorFormat(c,"%s", strerror(ret));
-        return;
+    if ((rret=je_mallctl(argv[0]->ptr, &old, &sz, NULL, 0))) {
+        /* return error unless this option is write only. */
+        if (!(rret == EPERM && argc > 1)) {
+            addReplyErrorFormat(c,"%s", strerror(rret));
+            return;
+        }
+    }
+    if(argc > 1) {
+        char *val = argv[1]->ptr;
+        char **valref = &val;
+        if ((!strcmp(val,"VOID")))
+            valref = NULL, sz = 0;
+        wret = je_mallctl(argv[0]->ptr, NULL, 0, valref, sz);
     }
-    addReplyBulkCString(c, old);
-    if(argc > 1)
-        je_mallctl(argv[0]->ptr, NULL, 0, &argv[1]->ptr, sizeof(char*));
+    if (!rret)
+        addReplyBulkCString(c, old);
+    else if (wret)
+        addReplyErrorFormat(c,"%s", strerror(wret));
+    else
+        addReply(c, shared.ok);
 }
 #endif
 
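The EPERM fallback matters because some mallctl nodes are write-only: thread.tcache.flush, which the new test below relies on, rejects any attempt to read it. A minimal sketch of probing such a node directly (same je_ prefix assumption as above); this is essentially what DEBUG MALLCTL-STR thread.tcache.flush VOID now ends up doing.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <jemalloc/jemalloc.h>

int main(void) {
    char *old;
    size_t sz = sizeof(old);
    /* reading a write-only node fails with EPERM... */
    int ret = je_mallctl("thread.tcache.flush", &old, &sz, NULL, 0);
    printf("read:  %s\n", strerror(ret));
    /* ...so fall back to a pure write; a void node takes no buffers at all */
    ret = je_mallctl("thread.tcache.flush", NULL, NULL, NULL, 0);
    printf("write: %s\n", ret ? strerror(ret) : "OK");
    return 0;
}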

src/defrag.c

Lines changed: 3 additions & 10 deletions
@@ -43,7 +43,7 @@
 
 /* this method was added to jemalloc in order to help us understand which
  * pointers are worthwhile moving and which aren't */
-int je_get_defrag_hint(void* ptr, int *bin_util, int *run_util);
+int je_get_defrag_hint(void* ptr);
 
 /* forward declarations*/
 void defragDictBucketCallback(void *privdata, dictEntry **bucketref);
@@ -55,18 +55,11 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
  * when it returns a non-null value, the old pointer was already released
  * and should NOT be accessed. */
 void* activeDefragAlloc(void *ptr) {
-    int bin_util, run_util;
     size_t size;
     void *newptr;
-    if(!je_get_defrag_hint(ptr, &bin_util, &run_util)) {
-        server.stat_active_defrag_misses++;
-        return NULL;
-    }
-    /* if this run is more utilized than the average utilization in this bin
-     * (or it is full), skip it. This will eventually move all the allocations
-     * from relatively empty runs into relatively full runs. */
-    if (run_util > bin_util || run_util == 1<<16) {
+    if(!je_get_defrag_hint(ptr)) {
         server.stat_active_defrag_misses++;
+        size = zmalloc_size(ptr);
        return NULL;
     }
     /* move this allocation to a new allocation.
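The deleted comparison is where the stagnation came from: when every settled slab in a bin sits at exactly the same utilization and only slabcur is emptier, slabcur drags the bin average below every candidate slab, so run_util > bin_util holds for all of them and nothing is ever moved. A small numeric C sketch (made-up slab counts, not from the commit) contrasting the old and new decisions:

#include <stdio.h>

int main(void) {
    /* hypothetical bin: 99 settled slabs at 24/32 regs used, slabcur at 8/32 */
    int nregs = 32, slabs = 100, used = 24, slabcur_used = 8;

    /* old hint: 16:16 fixed-point slab utilization vs. whole-bin average,
     * with slabcur included in the average */
    long long bin_util = ((long long)((slabs - 1) * used + slabcur_used) << 16)
                         / ((long long)slabs * nregs);
    long long run_util = ((long long)used << 16) / nregs;
    printf("old: skip? %d\n", run_util > bin_util);   /* 1 -> stagnation */

    /* new hint: slabcur excluded from the average, and <= lets slabs sitting
     * at exactly the average still qualify; `used` here is nregs - free_in_slab */
    int curslabs = slabs - 1;
    long long curregs = (long long)curslabs * used;
    printf("new: move? %d\n", (long long)used * curslabs <= curregs); /* 1 */
    return 0;
}

Under the old formula every slab looks "above average" (49152 > 48824 in 16:16 fixed point) and is skipped; under the new one the equal-utilization slabs qualify, so the defragger makes progress instead of spinning on misses.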

tests/unit/memefficiency.tcl

Lines changed: 123 additions & 2 deletions
@@ -95,6 +95,10 @@ start_server {tags {"defrag"}} {
             }
             if {$::verbose} {
                 puts "frag $frag"
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                puts "hits: $hits"
+                puts "misses: $misses"
                 puts "max latency $max_latency"
                 puts [r latency latest]
                 puts [r latency history active-defrag-cycle]
@@ -221,6 +225,10 @@ start_server {tags {"defrag"}} {
             }
             if {$::verbose} {
                 puts "frag $frag"
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                puts "hits: $hits"
+                puts "misses: $misses"
                 puts "max latency $max_latency"
                 puts [r latency latest]
                 puts [r latency history active-defrag-cycle]
@@ -256,11 +264,12 @@ start_server {tags {"defrag"}} {
         set expected_frag 1.7
         # add a mass of list nodes to two lists (allocations are interlaced)
         set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
-        for {set j 0} {$j < 500000} {incr j} {
+        set elements 500000
+        for {set j 0} {$j < $elements} {incr j} {
             $rd lpush biglist1 $val
             $rd lpush biglist2 $val
         }
-        for {set j 0} {$j < 500000} {incr j} {
+        for {set j 0} {$j < $elements} {incr j} {
             $rd read ; # Discard replies
             $rd read ; # Discard replies
         }
@@ -302,6 +311,8 @@ start_server {tags {"defrag"}} {
 
         # test that the fragmentation is lower
         after 120 ;# serverCron only updates the info once in 100ms
+        set misses [s active_defrag_misses]
+        set hits [s active_defrag_hits]
        set frag [s allocator_frag_ratio]
         set max_latency 0
         foreach event [r latency latest] {
@@ -312,6 +323,8 @@ start_server {tags {"defrag"}} {
         }
         if {$::verbose} {
             puts "frag $frag"
+            puts "misses: $misses"
+            puts "hits: $hits"
             puts "max latency $max_latency"
             puts [r latency latest]
             puts [r latency history active-defrag-cycle]
@@ -320,13 +333,121 @@ start_server {tags {"defrag"}} {
             # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
             # we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
             assert {$max_latency <= 30}
+
+            # in extreme cases of stagnation, we see over 20m misses before the test aborts with "defrag didn't stop",
+            # in normal cases we only see 100k misses out of 500k elements
+            assert {$misses < $elements}
         }
         # verify the data isn't corrupted or changed
         set newdigest [r debug digest]
         assert {$digest eq $newdigest}
         r save ;# saving an rdb iterates over all the data / pointers
         r del biglist1 ;# coverage for quicklistBookmarksClear
     } {1}
+
+    test "Active defrag edge case" {
+        # there was an edge case in defrag where all the slabs of a certain bin are exactly the same
+        # % utilization, with the exception of the current slab from which new allocations are made.
+        # if the current slab is lower in utilization the defragger would have ended up in stagnation,
+        # kept running and not moving any allocation.
+        # this test is more consistent on a fresh server with no history
+        start_server {tags {"defrag"}} {
+            r flushdb
+            r config resetstat
+            r config set save "" ;# prevent bgsave from interfering with save below
+            r config set hz 100
+            r config set activedefrag no
+            r config set active-defrag-max-scan-fields 1000
+            r config set active-defrag-threshold-lower 5
+            r config set active-defrag-cycle-min 65
+            r config set active-defrag-cycle-max 75
+            r config set active-defrag-ignore-bytes 1mb
+            r config set maxmemory 0
+            set expected_frag 1.3
+
+            r debug mallctl-str thread.tcache.flush VOID
+            # fill the first slab containing 32 regs of 640 bytes.
+            for {set j 0} {$j < 32} {incr j} {
+                r setrange "_$j" 600 x
+                r debug mallctl-str thread.tcache.flush VOID
+            }
+
+            # add a mass of keys with 600-byte values, filling the 640-byte bin which has 32 regs per slab.
+            set rd [redis_deferring_client]
+            set keys 640000
+            for {set j 0} {$j < $keys} {incr j} {
+                $rd setrange $j 600 x
+            }
+            for {set j 0} {$j < $keys} {incr j} {
+                $rd read ; # Discard replies
+            }
+
+            # create some fragmentation of 50%
+            set sent 0
+            for {set j 0} {$j < $keys} {incr j 1} {
+                $rd del $j
+                incr sent
+                incr j 1
+            }
+            for {set j 0} {$j < $sent} {incr j} {
+                $rd read ; # Discard replies
+            }
+
+            # create higher fragmentation in the first slab
+            for {set j 10} {$j < 32} {incr j} {
+                r del "_$j"
+            }
+
+            # start defrag
+            after 120 ;# serverCron only updates the info once in 100ms
+            set frag [s allocator_frag_ratio]
+            if {$::verbose} {
+                puts "frag $frag"
+            }
+
+            assert {$frag >= $expected_frag}
+
+            set digest [r debug digest]
+            catch {r config set activedefrag yes} e
+            if {![string match {DISABLED*} $e]} {
+                # wait for the active defrag to start working (decision once a second)
+                wait_for_condition 50 100 {
+                    [s active_defrag_running] ne 0
+                } else {
+                    fail "defrag not started."
+                }
+
+                # wait for the active defrag to stop working
+                wait_for_condition 500 100 {
+                    [s active_defrag_running] eq 0
+                } else {
+                    after 120 ;# serverCron only updates the info once in 100ms
+                    puts [r info memory]
+                    puts [r info stats]
+                    puts [r memory malloc-stats]
+                    fail "defrag didn't stop."
+                }
+
+                # test that the fragmentation is lower
+                after 120 ;# serverCron only updates the info once in 100ms
+                set misses [s active_defrag_misses]
+                set hits [s active_defrag_hits]
+                set frag [s allocator_frag_ratio]
+                if {$::verbose} {
+                    puts "frag $frag"
+                    puts "hits: $hits"
+                    puts "misses: $misses"
+                }
+                assert {$frag < 1.1}
+                assert {$misses < 10000000} ;# when defrag doesn't stop, we have some 30m misses, when it does, we have 2m misses
+            }
+
+            # verify the data isn't corrupted or changed
+            set newdigest [r debug digest]
+            assert {$digest eq $newdigest}
+            r save ;# saving an rdb iterates over all the data / pointers
+        }
+    }
 }
 }
 } ;# run_solo
