Optimize mono_gc_bzero and mono_gc_memmove to closely match native pe…

…rformance. * gc.c (mono_gc_memmove): Doing a naive word copy turned out to be several times slower than an optimized memmove. Writing an optimized memmove ourselves is a hellish problem because each target (x86, amd64, arm, etc.) has multiple implementations selected by the CPU you're currently running on. Given that we're not ready to pull in ten thousand lines of assembly for the benefit of a few targets, a better solution is needed. The problem we face comes from implementations that don't perform word stores on the word-aligned parts of the destination. This breaks things for us because the CLI environment disallows word tearing for normal operations. The solution is to work around the known implementations, which basically handle any sort of misalignment badly: we ensure that we only call memmove with a word-aligned destination and a size that is a multiple of the word size. This should be enough for OSX, Android, Linux and iOS on x86, amd64 and ARM.
mono · Mar 11, 2013 · 8a6a647 · 8a6a647
1 parent dde2c39
commit 8a6a647
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 56 deletions.
diff --git a/mono/metadata/gc-internal.h b/mono/metadata/gc-internal.h
@@ -362,6 +362,10 @@ gboolean mono_gc_reference_queue_add (MonoReferenceQueue *queue, MonoObject *obj
 BOOL APIENTRY mono_gc_dllmain (HMODULE module_handle, DWORD reason, LPVOID reserved) MONO_INTERNAL;
 #endif
 
+/*
+These functions must be used when it's possible that either the destination is not
+word aligned or the size is not a multiple of the word size.
+*/
 void mono_gc_bzero (void *dest, size_t size) MONO_INTERNAL;
 void mono_gc_memmove (void *dest, const void *src, size_t size) MONO_INTERNAL;
 

diff --git a/mono/metadata/gc.c b/mono/metadata/gc.c
@@ -1560,6 +1560,12 @@ mono_gc_reference_queue_free (MonoReferenceQueue *queue)
 #define align_down(ptr) ((void*)(_toi(ptr) & ~ptr_mask))
 #define align_up(ptr) ((void*) ((_toi(ptr) + ptr_mask) & ~ptr_mask))
 
+#define BZERO_WORDS(dest,words) do {	/* store 0 into `words` consecutive void* slots starting at dest */	\
+	int __i;	\
+	for (__i = 0; __i < (words); ++__i)	\
+		((void **)(dest))[__i] = 0;	\
+} while (0)
+
 /**
  * mono_gc_bzero:
  * @dest: address to start to clear
@@ -1574,24 +1580,53 @@ mono_gc_reference_queue_free (MonoReferenceQueue *queue)
 void
 mono_gc_bzero (void *dest, size_t size)
 {
-	char *p = (char*)dest;
-	char *end = p + size;
-	char *align_end = align_up (p);
-	char *word_end;
-
-	while (p < align_end)
-		*p++ = 0;
-
-	word_end = align_down (end);
-	while (p < word_end) {
-		*((void**)p) = NULL;
-		p += sizeof (void*);
+	char *d = (char*)dest;
+	size_t tail_bytes, word_bytes;
+
+	/*
+	If we're zeroing less than a word, just use memset.
+
+	We cannot bail out early when dest and size are both aligned because some
+	implementations (OSX, in this case) use byte-wise stores for sizes smaller than 16.
+	*/
+	if (size < sizeof(void*)) {
+		memset (dest, 0, size);
+		return;
 	}
 
-	while (p < end)
-		*p++ = 0;
-}
+	/* zero byte by byte until d reaches a word boundary */
+	while (unaligned_bytes (d) && size) {
+		*d++ = 0;
+		--size;
+	}
+
+	/* zero all whole words from d (not dest, which may be unaligned): word stores for 1-4 words, memset otherwise */
+	word_bytes = (size_t)align_down (size);
+	switch (word_bytes) {
+	case sizeof (void*) * 1:
+		BZERO_WORDS (d, 1);
+		break;
+	case sizeof (void*) * 2:
+		BZERO_WORDS (d, 2);
+		break;
+	case sizeof (void*) * 3:
+		BZERO_WORDS (d, 3);
+		break;
+	case sizeof (void*) * 4:
+		BZERO_WORDS (d, 4);
+		break;
+	default:
+		memset (d, 0, word_bytes);
+	}
 
+	tail_bytes = unaligned_bytes (size);
+	if (tail_bytes) {
+		d += word_bytes; /* skip past the words zeroed above */
+		do {
+			*d++ = 0;
+		} while (--tail_bytes);
+	}
+}
 
 /**
  * mono_gc_memmove:
@@ -1602,18 +1637,19 @@ mono_gc_bzero (void *dest, size_t size)
  * Move @size bytes from @src to @dest.
  * size MUST be a multiple of sizeof (gpointer)
  *
- * FIXME borrow faster code from some BSD libc or bionic
  */
 void
 mono_gc_memmove (void *dest, const void *src, size_t size)
 {
 	/*
-	 * If dest and src are differently aligned with respect to
-	 * pointer size then it makes no sense to do aligned copying.
-	 * In fact, we would end up with unaligned loads which is
-	 * incorrect on some architectures.
-	 */
-	if ((char*)dest - (char*)align_down (dest) != (char*)src - (char*)align_down (src)) {
+	If we're copying less than a word we don't need to worry about word tearing,
+	so we bail out to memmove early.
+
+	If dest is word aligned and size is a multiple of the word size, we can go
+	straight to memmove. Both are tested at once by masking their OR with the low word bits.
+
+	*/
+	if (size < sizeof(void*) || !((_toi (dest) | (size)) & (sizeof (void*) - 1))) {
 		memmove (dest, src, size);
 		return;
 	}
@@ -1622,45 +1658,51 @@ mono_gc_memmove (void *dest, const void *src, size_t size)
 	 * A bit of explanation on why we align only dest before doing word copies.
 	 * Pointers to managed objects must always be stored in word aligned addresses, so
 	 * even if dest is misaligned, src will be by the same amount - this ensure proper atomicity of reads.
+	 *
+	 * We don't need to handle the case where source and destination have different alignments since we only
+	 * do word stores using memmove, which must handle it.
 	 */
-	if (dest > src && ((size_t)((char*)dest - (char*)src) < size)) {
+	if (dest > src && ((size_t)((char*)dest - (char*)src) < size)) { /*backward copy*/
 		char *p = (char*)dest + size;
-		char *s = (char*)src + size;
-		char *start = (char*)dest;
-		char *align_end = MAX((char*)dest, (char*)align_down (p));
-		char *word_start;
-
-		while (p > align_end)
-			*--p = *--s;
-
-		word_start = align_up (start);
-		while (p > word_start) {
-			p -= sizeof (void*);
-			s -= sizeof (void*);
-			*((void**)p) = *((void**)s);
-		}
-
-		while (p > start)
-			*--p = *--s;
+		char *s = (char*)src + size;
+		char *start = (char*)dest;
+		char *align_end = MAX((char*)dest, (char*)align_down (p));
+		char *word_start;
+		size_t bytes_to_memmove;
+
+		while (p > align_end) /* bytes above the last word boundary of dest */
+			*--p = *--s;
+
+		word_start = align_up (start);
+		bytes_to_memmove = p - word_start;
+		p -= bytes_to_memmove;
+		s -= bytes_to_memmove;
+		memmove (p, s, bytes_to_memmove); /* word aligned dest, whole-word size */
+
+		while (p > start) /* bytes below the first word boundary of dest */
+			*--p = *--s;
 	} else {
-		char *p = (char*)dest;
-		char *s = (char*)src;
-		char *end = p + size;
-		char *align_end = MIN ((char*)end, (char*)align_up (p));
-		char *word_end;
-
-		while (p < align_end)
-			*p++ = *s++;
-
-		word_end = align_down (end);
-		while (p < word_end) {
-			*((void**)p) = *((void**)s);
-			p += sizeof (void*);
-			s += sizeof (void*);
+		char *d = (char*)dest;
+		const char *s = (const char*)src;
+		size_t tail_bytes;
+
+		/* copy byte by byte until d reaches a word boundary; size >= one word here, so this cannot overrun */
+		while (unaligned_bytes (d)) {
+			*d++ = *s++;
+			--size;
 		}
 
-		while (p < end)
-			*p++ = *s++;
+		/* copy all words with memmove */
+		memmove (d, s, (size_t)align_down (size));
+
+		tail_bytes = unaligned_bytes (size);
+		if (tail_bytes) {
+			d += (size_t)align_down (size);
+			s += (size_t)align_down (size);
+			do {
+				*d++ = *s++;
+			} while (--tail_bytes);
+		}
 	}
 }