-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ExpandMemCmp] Optimize ExpandMemCmp to reduce instruction count on x86 #69609
Conversation
Refactored the sequence of operations in MemCmpExpansion to zero-extend before byte-swapping. This change enables the generation of fewer instructions for x86, thereby improving code efficiency.
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-transforms Author: Igor Kirillov (igogo-x86) Changes: Refactored the sequence of operations in MemCmpExpansion to zero-extend before byte-swapping. This change enables the generation of fewer instructions for x86, thereby improving code efficiency. Patch is 132.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69609.diff 11 Files Affected:
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 911ebd41afc5b91..40fbe877ab7ac09 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -307,19 +307,20 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
if (!Rhs)
Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
+ // Zero extend if required.
+ if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+ Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
+ Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
+ }
+
// Swap bytes if required.
if (NeedsBSwap) {
+ Type *BSwapType = CmpSizeType ? CmpSizeType : LoadSizeType;
Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
+ Intrinsic::bswap, BSwapType);
Lhs = Builder.CreateCall(Bswap, Lhs);
Rhs = Builder.CreateCall(Bswap, Rhs);
}
-
- // Zero extend if required.
- if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
- Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
- Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
- }
return {Lhs, Rhs};
}
@@ -694,10 +695,10 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
/// %17 = getelementptr i32, i32* %15, i32 2
/// %18 = load i32, i32* %16
/// %19 = load i32, i32* %17
-/// %20 = call i32 @llvm.bswap.i32(i32 %18)
-/// %21 = call i32 @llvm.bswap.i32(i32 %19)
-/// %22 = zext i32 %20 to i64
-/// %23 = zext i32 %21 to i64
+/// %20 = zext i32 %18 to i64
+/// %21 = zext i32 %19 to i64
+/// %22 = call i64 @llvm.bswap.i64(i64 %20)
+/// %23 = call i64 @llvm.bswap.i64(i64 %21)
/// %24 = sub i64 %22, %23
/// %25 = icmp ne i64 %24, 0
/// br i1 %25, label %res_block, label %loadbb2
@@ -710,10 +711,10 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
/// %31 = getelementptr i16, i16* %29, i16 6
/// %32 = load i16, i16* %30
/// %33 = load i16, i16* %31
-/// %34 = call i16 @llvm.bswap.i16(i16 %32)
-/// %35 = call i16 @llvm.bswap.i16(i16 %33)
-/// %36 = zext i16 %34 to i64
-/// %37 = zext i16 %35 to i64
+/// %34 = zext i16 %32 to i64
+/// %35 = zext i16 %33 to i64
+/// %36 = call i64 @llvm.bswap.i64(i64 %34)
+/// %37 = call i64 @llvm.bswap.i64(i64 %35)
/// %38 = sub i64 %36, %37
/// %39 = icmp ne i64 %38, 0
/// br i1 %39, label %res_block, label %loadbb3
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index c0f8f86e6e8b107..a89571656e46951 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -44,14 +44,12 @@ define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
define i32 @length2(ptr %X, ptr %Y) nounwind {
; X86-LABEL: length2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %dx
-; X86-NEXT: movzwl %cx, %eax
-; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: bswapl %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -75,14 +73,12 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; X86-LABEL: length2_lt:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %dx
-; X86-NEXT: movzwl %cx, %eax
-; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: bswapl %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
@@ -99,10 +95,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %ax
-; X86-NEXT: movzwl %cx, %ecx
-; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: bswapl %ecx
+; X86-NEXT: bswapl %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setg %al
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index 56d06021867fa15..1f07ba39ecef9fe 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -52,10 +52,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -79,10 +77,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: shrl $31, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
@@ -97,10 +93,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: testl %eax, %eax
; X64-NEXT: setg %al
@@ -511,8 +505,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
-; X64-NEXT: bswapl %ecx
-; X64-NEXT: bswapl %edx
+; X64-NEXT: bswapq %rcx
+; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: je .LBB29_3
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 762691151f4bd3b..8efd4fca91a9972 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -13,14 +13,12 @@ declare dso_local i32 @bcmp(ptr, ptr, i32)
define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
; X86-LABEL: length2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %dx
-; X86-NEXT: movzwl %cx, %eax
-; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: bswapl %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index c0c7b98d471cd46..a8df0ac1354f893 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -16,10 +16,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -251,8 +249,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
-; X64-NEXT: bswapl %ecx
-; X64-NEXT: bswapl %edx
+; X64-NEXT: bswapq %rcx
+; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: je .LBB15_3
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index cb45fd3ebb9068c..b486eebd54b4a37 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -13,14 +13,12 @@ declare dso_local i32 @bcmp(ptr, ptr, i32)
define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
; X86-LABEL: length2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %dx
-; X86-NEXT: movzwl %cx, %eax
-; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: bswapl %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 720344a22e43b5c..afb57c8101b8221 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -16,10 +16,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -251,8 +249,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
-; X64-NEXT: bswapl %ecx
-; X64-NEXT: bswapl %edx
+; X64-NEXT: bswapq %rcx
+; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: je .LBB15_3
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
index ab439b32f2f1b20..f5b67ab45f7255c 100644
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-x32.ll
@@ -43,14 +43,12 @@ define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
define i32 @length2(ptr %X, ptr %Y) nounwind {
; X86-LABEL: length2:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %dx
-; X86-NEXT: movzwl %cx, %eax
-; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: bswapl %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -62,9 +60,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: rolw $8, %ax
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: addl $-12594, %eax # imm = 0xCECE
+; X86-NEXT: bswapl %eax
+; X86-NEXT: addl $-825360384, %eax # imm = 0xCECE0000
; X86-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
ret i32 %m
@@ -75,9 +72,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: rolw $8, %ax
-; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: addl $-12594, %eax # imm = 0xCECE
+; X86-NEXT: bswapl %eax
+; X86-NEXT: addl $-825360384, %eax # imm = 0xCECE0000
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setg %al
; X86-NEXT: retl
@@ -103,14 +99,12 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; X86-LABEL: length2_lt:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %dx
-; X86-NEXT: movzwl %cx, %eax
-; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: bswapl %eax
+; X86-NEXT: bswapl %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: shrl $31, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
@@ -127,10 +121,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %ax
-; X86-NEXT: movzwl %cx, %ecx
-; X86-NEXT: movzwl %ax, %eax
+; X86-NEXT: bswapl %ecx
+; X86-NEXT: bswapl %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setg %al
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index 1330f3a241a5c2a..b8c0f509f1d081b 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -51,10 +51,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -65,9 +63,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length2_const:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: addl $-12594, %eax # imm = 0xCECE
+; X64-NEXT: bswapl %eax
+; X64-NEXT: addl $-825360384, %eax # imm = 0xCECE0000
; X64-NEXT: retq
%m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
ret i32 %m
@@ -77,9 +74,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length2_gt_const:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: addl $-12594, %eax # imm = 0xCECE
+; X64-NEXT: bswapl %eax
+; X64-NEXT: addl $-825360384, %eax # imm = 0xCECE0000
; X64-NEXT: testl %eax, %eax
; X64-NEXT: setg %al
; X64-NEXT: retq
@@ -105,10 +101,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: shrl $31, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
@@ -123,10 +117,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
-; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: bswapl %eax
+; X64-NEXT: bswapl %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: testl %eax, %eax
; X64-NEXT: setg %al
@@ -537,8 +529,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
-; X64-NEXT: bswapl %ecx
-; X64-NEXT: bswapl %edx
+; X64-NEXT: bswapq %rcx
+; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: je .LBB31_3
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index f56d9688a01e12d..bd42d5f8d50859d 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -5,14 +5,14 @@ declare i32 @memcmp(ptr nocapture, ptr nocapture, i32)
define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X32-LABEL: @cmp2(
-; X32-NEXT: [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT: [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT: ret i32 [[TMP9]]
+; X32-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT: [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X32-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT: ret i32 [[TMP7]]
;
%call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
ret i32 %call
@@ -20,14 +20,14 @@ define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y) {
define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y) {
; X32-LABEL: @cmp2_align2(
-; X32-NEXT: [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X32-NEXT: [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT: ret i32 [[TMP9]]
+; X32-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2
+; X32-NEXT: [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 2
+; X32-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X32-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT: ret i32 [[TMP7]]
;
%call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
ret i32 %call
@@ -37,27 +37,27 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) {
; X32-LABEL: @cmp3(
; X32-NEXT: br label [[LOADBB:%.*]]
; X32: res_block:
-; X32-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
; X32-NEXT: br label [[ENDBLOCK:%.*]]
; X32: loadbb:
-; X32-NEXT: [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT: [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X32-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X32-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT: [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT: [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT: [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X32-NEXT: [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X32-NEXT: [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X32-NEXT: br i1 [[TMP...
[truncated]
|
Wouldn't that byte swap the zeros that were added by the zext into the LSBs? I don't know this code, but that seems weird. |
This modification lays the groundwork for upcoming patches, which will alter the code generation for AArch64 when the memcmp size modulo 8 is 3, 5, or 6. |
Yeah, they would, but since they are zeroes, they wouldn't affect the comparison result. |
@@ -537,8 +529,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind { | |||
; X64-NEXT: # %bb.1: # %loadbb1 | |||
; X64-NEXT: movl 8(%rdi), %ecx | |||
; X64-NEXT: movl 8(%rsi), %edx | |||
; X64-NEXT: bswapl %ecx | |||
; X64-NEXT: bswapl %edx | |||
; X64-NEXT: bswapq %rcx |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bswapq is 2 uops vs. 1 for bswapl on some CPUs, isn't it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, on MOVBE targets this will no longer fold the loads.
✅ With the latest revision this PR passed the C/C++ code formatter. |
Looks like it is not entirely profitable. I dropped it in favour of #69942 |
Refactored the sequence of operations in MemCmpExpansion to zero-extend before byte-swapping. This change enables the generation of fewer instructions for x86, thereby improving code efficiency.