
[ExpandMemCmp] Optimize ExpandMemCmp to reduce instruction count on x86 #69609

Closed
wants to merge 2 commits

Conversation

igogo-x86
Contributor

Refactored the sequence of operations in MemCmpExpansion to zero-extend before byte-swapping. This change enables the generation of fewer instructions for x86, thereby improving code efficiency.

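The reordering can be sketched as follows (Python used purely for illustration; the pass itself builds LLVM IR, and the helper names here are hypothetical):

```python
def bswap(value: int, width_bytes: int) -> int:
    """Reverse the byte order of an integer of the given byte width."""
    return int.from_bytes(value.to_bytes(width_bytes, "little"), "big")

def old_order(x16: int) -> int:
    # Previous lowering: bswap at the load width, then zero-extend
    # (zero-extension does not change the numeric value).
    return bswap(x16, 2)

def new_order(x16: int) -> int:
    # New lowering: zero-extend to the compare width first, then do a
    # single wide bswap.
    return bswap(x16, 4)

# The wide swap equals the narrow swap shifted into the high-order bytes:
assert new_order(0xBEEF) == old_order(0xBEEF) << 16

# Unsigned ordering and equality are preserved, which is all the
# expansion's icmp eq / icmp ult comparisons rely on.
for a, b in [(0x1234, 0x1235), (0xFF00, 0x00FF), (7, 7)]:
    assert (new_order(a) < new_order(b)) == (old_order(a) < old_order(b))
    assert (new_order(a) == new_order(b)) == (old_order(a) == old_order(b))
```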
@llvmbot
Collaborator

llvmbot commented Oct 19, 2023

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-transforms

Author: Igor Kirillov (igogo-x86)

Changes

Refactored the sequence of operations in MemCmpExpansion to zero-extend before byte-swapping. This change enables the generation of fewer instructions for x86, thereby improving code efficiency.


Patch is 132.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69609.diff

11 Files Affected:

  • (modified) llvm/lib/CodeGen/ExpandMemCmp.cpp (+16-15)
  • (modified) llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll (+10-16)
  • (modified) llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll (+8-14)
  • (modified) llvm/test/CodeGen/X86/memcmp-optsize-x32.ll (+4-6)
  • (modified) llvm/test/CodeGen/X86/memcmp-optsize.ll (+4-6)
  • (modified) llvm/test/CodeGen/X86/memcmp-pgso-x32.ll (+4-6)
  • (modified) llvm/test/CodeGen/X86/memcmp-pgso.ll (+4-6)
  • (modified) llvm/test/CodeGen/X86/memcmp-x32.ll (+14-22)
  • (modified) llvm/test/CodeGen/X86/memcmp.ll (+12-20)
  • (modified) llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll (+196-196)
  • (modified) llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll (+529-529)
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 911ebd41afc5b91..40fbe877ab7ac09 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -307,19 +307,20 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
   if (!Rhs)
     Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
 
+  // Zero extend if required.
+  if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+    Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
+    Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
+  }
+
   // Swap bytes if required.
   if (NeedsBSwap) {
+    Type *BSwapType = CmpSizeType ? CmpSizeType : LoadSizeType;
     Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
-                                                Intrinsic::bswap, LoadSizeType);
+                                                Intrinsic::bswap, BSwapType);
     Lhs = Builder.CreateCall(Bswap, Lhs);
     Rhs = Builder.CreateCall(Bswap, Rhs);
   }
-
-  // Zero extend if required.
-  if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
-    Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
-    Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
-  }
   return {Lhs, Rhs};
 }
 
@@ -694,10 +695,10 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 ///  %17 = getelementptr i32, i32* %15, i32 2
 ///  %18 = load i32, i32* %16
 ///  %19 = load i32, i32* %17
-///  %20 = call i32 @llvm.bswap.i32(i32 %18)
-///  %21 = call i32 @llvm.bswap.i32(i32 %19)
-///  %22 = zext i32 %20 to i64
-///  %23 = zext i32 %21 to i64
+///  %20 = zext i32 %18 to i64
+///  %21 = zext i32 %19 to i64
+///  %22 = call i64 @llvm.bswap.i64(i64 %20)
+///  %23 = call i64 @llvm.bswap.i64(i64 %21)
 ///  %24 = sub i64 %22, %23
 ///  %25 = icmp ne i64 %24, 0
 ///  br i1 %25, label %res_block, label %loadbb2
@@ -710,10 +711,10 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 ///  %31 = getelementptr i16, i16* %29, i16 6
 ///  %32 = load i16, i16* %30
 ///  %33 = load i16, i16* %31
-///  %34 = call i16 @llvm.bswap.i16(i16 %32)
-///  %35 = call i16 @llvm.bswap.i16(i16 %33)
-///  %36 = zext i16 %34 to i64
-///  %37 = zext i16 %35 to i64
+///  %34 = zext i16 %32 to i64
+///  %35 = zext i16 %33 to i64
+///  %36 = call i64 @llvm.bswap.i64(i64 %34)
+///  %37 = call i64 @llvm.bswap.i64(i64 %35)
 ///  %38 = sub i64 %36, %37
 ///  %39 = icmp ne i64 %38, 0
 ///  br i1 %39, label %res_block, label %loadbb3
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index c0f8f86e6e8b107..a89571656e46951 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -44,14 +44,12 @@ define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
 define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -75,14 +73,12 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_lt:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -99,10 +95,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%ecx), %ecx
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    setg %al
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index 56d06021867fa15..1f07ba39ecef9fe 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -52,10 +52,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -79,10 +77,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
@@ -97,10 +93,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setg %al
@@ -511,8 +505,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB29_3
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 762691151f4bd3b..8efd4fca91a9972 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -13,14 +13,12 @@ declare dso_local i32 @bcmp(ptr, ptr, i32)
 define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index c0c7b98d471cd46..a8df0ac1354f893 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -16,10 +16,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -251,8 +249,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB15_3
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index cb45fd3ebb9068c..b486eebd54b4a37 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -13,14 +13,12 @@ declare dso_local i32 @bcmp(ptr, ptr, i32)
 define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 720344a22e43b5c..afb57c8101b8221 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -16,10 +16,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -251,8 +249,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB15_3
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
index ab439b32f2f1b20..f5b67ab45f7255c 100644
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-x32.ll
@@ -43,14 +43,12 @@ define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
 define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -62,9 +60,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
   ret i32 %m
@@ -75,9 +72,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    retl
@@ -103,14 +99,12 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_lt:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -127,10 +121,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%ecx), %ecx
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    setg %al
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index 1330f3a241a5c2a..b8c0f509f1d081b 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -51,10 +51,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -65,9 +63,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length2_const:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
   ret i32 %m
@@ -77,9 +74,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length2_gt_const:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setg %al
 ; X64-NEXT:    retq
@@ -105,10 +101,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
@@ -123,10 +117,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setg %al
@@ -537,8 +529,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB31_3
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index f56d9688a01e12d..bd42d5f8d50859d 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -5,14 +5,14 @@ declare i32 @memcmp(ptr nocapture, ptr nocapture, i32)
 
 define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X32-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
@@ -20,14 +20,14 @@ define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y)  {
 ; X32-LABEL: @cmp2_align2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 2
+; X32-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X32-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
@@ -37,27 +37,27 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp3(
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP...
[truncated]

@topperc
Collaborator

topperc commented Oct 19, 2023

Wouldn't that byte swap the zeros that were added by the zext into the LSBs? I don't know this code, but that seems weird.

@igogo-x86
Contributor Author

This modification lays the groundwork for upcoming patches, which will alter the code generation for AArch64 when the memcmp size modulo 8 is 3, 5, or 6.

@igogo-x86
Contributor Author

Wouldn't that byte swap the zeros that were added by the zext into the LSBs? I don't know this code, but that seems weird.

Yeah, they would, but since they are zeroes, they wouldn't affect the comparison result.
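Concretely (a Python illustration, not the pass itself): after the `zext` the extra bytes are the most-significant bytes, so the wide `bswap` moves them into the least-significant positions, below every data byte. Equality and unsigned ordering of the swapped values are therefore unchanged:

```python
def bswap32(v: int) -> int:
    # Byte-reverse a 32-bit value, as @llvm.bswap.i32 does.
    return int.from_bytes(v.to_bytes(4, "little"), "big")

def bswap16(v: int) -> int:
    # Byte-reverse a 16-bit value, as @llvm.bswap.i16 does.
    return int.from_bytes(v.to_bytes(2, "little"), "big")

x, y = 0x0100, 0x00FF  # 16-bit loads, already zero-extended to 32 bits
sx, sy = bswap32(x), bswap32(y)

# The zext'd zero bytes land in the low 16 bits of each swapped value:
assert sx & 0xFFFF == 0 and sy & 0xFFFF == 0

# Equality and unsigned ordering match the narrow-bswap results:
assert (sx < sy) == (bswap16(x) < bswap16(y))
assert (sx == sy) == (bswap16(x) == bswap16(y))
```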

@@ -537,8 +529,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
-; X64-NEXT: bswapl %ecx
-; X64-NEXT: bswapl %edx
+; X64-NEXT: bswapq %rcx
Collaborator


bswapq is 2 uops vs 1 for bswapl on some CPUs, isn't it?

Collaborator


Also, on MOVBE targets this will no longer fold the loads.
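A rough illustration of this concern (hypothetical AT&T-syntax sequences; actual instruction selection varies):

```asm
# With the narrow bswap, MOVBE targets can fold the load and swap:
movbew  (%rdi), %ax        # 16-bit load + byte swap in one instruction
movzwl  %ax, %eax          # zero-extend afterwards

# With the wide bswap on an already-extended value, the 16-bit load
# can no longer fold into a movbe:
movzwl  (%rdi), %eax
bswapl  %eax
```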

@github-actions

github-actions bot commented Oct 19, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@igogo-x86
Contributor Author

Looks like it is not entirely profitable. I dropped it in favour of #69942.

@igogo-x86 igogo-x86 closed this Oct 23, 2023
@igogo-x86 igogo-x86 deleted the expandmemcpy-bswap-zext-order branch October 23, 2023 16:43