-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[LoongArch] Initial implementation for enableMemCmpExpansion hook
#166526
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
79e394b to
0c42622
Compare
|
@llvm/pr-subscribers-backend-loongarch Author: ZhaoQi (zhaoqi5) ChangesAfter overriding Patch is 220.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166526.diff 5 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index f548a8dd0532b..f6637ef58cf9c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -111,4 +111,27 @@ bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
}
}
-// TODO: Implement more hooks to provide TTI machinery for LoongArch.
+LoongArchTTIImpl::TTI::MemCmpExpansionOptions
+LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+ TTI::MemCmpExpansionOptions Options;
+
+ if (!ST->hasUAL())
+ return Options;
+
+ // TODO: Set same as the default value of MaxLoadsPerMemcmp or
+ // MaxLoadsPerMemcmpOptSize. May need more consideration?
+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+ Options.NumLoadsPerBlock = Options.MaxNumLoads;
+ Options.AllowOverlappingLoads = true;
+
+ // TODO: Support for vectors.
+ if (ST->is64Bit()) {
+ Options.LoadSizes = {8, 4, 2, 1};
+ Options.AllowedTailExpansions = {3, 5, 6};
+ } else {
+ Options.LoadSizes = {4, 2, 1};
+ Options.AllowedTailExpansions = {3};
+ }
+
+ return Options;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
index e3f16c7804994..9b479f9dc0dc5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h
@@ -55,7 +55,8 @@ class LoongArchTTIImpl : public BasicTTIImplBase<LoongArchTTIImpl> {
bool shouldExpandReduction(const IntrinsicInst *II) const override;
- // TODO: Implement more hooks to provide TTI machinery for LoongArch.
+ TTI::MemCmpExpansionOptions
+ enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
index 82fe899bb795b..a6ed1f1db1678 100644
--- a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
+++ b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
@@ -38,260 +38,488 @@ entry:
}
define i32 @bcmp_size_1(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_1:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 1
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_1:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.bu $a0, $a0, 0
+; LA32-UAL-NEXT: ld.bu $a1, $a1, 0
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_1:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 1
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_1:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.bu $a0, $a0, 0
+; LA64-UAL-NEXT: ld.bu $a1, $a1, 0
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_1:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero, 1
+; LA32-NUAL-NEXT: bl bcmp
+; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT: ret
+;
+; LA64-NUAL-LABEL: bcmp_size_1:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT: addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT: ori $a2, $zero, 1
+; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT: jirl $ra, $ra, 0
+; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NUAL-NEXT: addi.d $sp, $sp, 16
+; LA64-NUAL-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 1)
ret i32 %bcmp
}
define i32 @bcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_2:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 2
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_2:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.hu $a0, $a0, 0
+; LA32-UAL-NEXT: ld.hu $a1, $a1, 0
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_2:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 2
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_2:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.hu $a0, $a0, 0
+; LA64-UAL-NEXT: ld.hu $a1, $a1, 0
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_2:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero, 2
+; LA32-NUAL-NEXT: bl bcmp
+; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT: ret
+;
+; LA64-NUAL-LABEL: bcmp_size_2:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT: addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT: ori $a2, $zero, 2
+; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT: jirl $ra, $ra, 0
+; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NUAL-NEXT: addi.d $sp, $sp, 16
+; LA64-NUAL-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 2)
ret i32 %bcmp
}
define i32 @bcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_3:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 3
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_3:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.hu $a2, $a0, 0
+; LA32-UAL-NEXT: ld.hu $a3, $a1, 0
+; LA32-UAL-NEXT: ld.bu $a0, $a0, 2
+; LA32-UAL-NEXT: ld.bu $a1, $a1, 2
+; LA32-UAL-NEXT: xor $a2, $a2, $a3
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: or $a0, $a2, $a0
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_3:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 3
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_3:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.hu $a2, $a0, 0
+; LA64-UAL-NEXT: ld.hu $a3, $a1, 0
+; LA64-UAL-NEXT: ld.bu $a0, $a0, 2
+; LA64-UAL-NEXT: ld.bu $a1, $a1, 2
+; LA64-UAL-NEXT: xor $a2, $a2, $a3
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: or $a0, $a2, $a0
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_3:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero, 3
+; LA32-NUAL-NEXT: bl bcmp
+; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT: ret
+;
+; LA64-NUAL-LABEL: bcmp_size_3:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT: addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT: ori $a2, $zero, 3
+; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT: jirl $ra, $ra, 0
+; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NUAL-NEXT: addi.d $sp, $sp, 16
+; LA64-NUAL-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 3)
ret i32 %bcmp
}
define i32 @bcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_4:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 4
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_4:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.w $a0, $a0, 0
+; LA32-UAL-NEXT: ld.w $a1, $a1, 0
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_4:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 4
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_4:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.w $a0, $a0, 0
+; LA64-UAL-NEXT: ld.w $a1, $a1, 0
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_4:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero, 4
+; LA32-NUAL-NEXT: bl bcmp
+; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT: ret
+;
+; LA64-NUAL-LABEL: bcmp_size_4:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT: addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT: ori $a2, $zero, 4
+; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT: jirl $ra, $ra, 0
+; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NUAL-NEXT: addi.d $sp, $sp, 16
+; LA64-NUAL-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 4)
ret i32 %bcmp
}
define i32 @bcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_5:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 5
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_5:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.w $a2, $a0, 0
+; LA32-UAL-NEXT: ld.w $a3, $a1, 0
+; LA32-UAL-NEXT: ld.bu $a0, $a0, 4
+; LA32-UAL-NEXT: ld.bu $a1, $a1, 4
+; LA32-UAL-NEXT: xor $a2, $a2, $a3
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: or $a0, $a2, $a0
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_5:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 5
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_5:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.w $a2, $a0, 0
+; LA64-UAL-NEXT: ld.w $a3, $a1, 0
+; LA64-UAL-NEXT: ld.bu $a0, $a0, 4
+; LA64-UAL-NEXT: ld.bu $a1, $a1, 4
+; LA64-UAL-NEXT: xor $a2, $a2, $a3
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: or $a0, $a2, $a0
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_5:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero, 5
+; LA32-NUAL-NEXT: bl bcmp
+; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT: ret
+;
+; LA64-NUAL-LABEL: bcmp_size_5:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT: addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT: ori $a2, $zero, 5
+; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT: jirl $ra, $ra, 0
+; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NUAL-NEXT: addi.d $sp, $sp, 16
+; LA64-NUAL-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 5)
ret i32 %bcmp
}
define i32 @bcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_6:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 6
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_6:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.w $a2, $a0, 0
+; LA32-UAL-NEXT: ld.w $a3, $a1, 0
+; LA32-UAL-NEXT: ld.hu $a0, $a0, 4
+; LA32-UAL-NEXT: ld.hu $a1, $a1, 4
+; LA32-UAL-NEXT: xor $a2, $a2, $a3
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: or $a0, $a2, $a0
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_6:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 6
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_6:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.w $a2, $a0, 0
+; LA64-UAL-NEXT: ld.w $a3, $a1, 0
+; LA64-UAL-NEXT: ld.hu $a0, $a0, 4
+; LA64-UAL-NEXT: ld.hu $a1, $a1, 4
+; LA64-UAL-NEXT: xor $a2, $a2, $a3
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: or $a0, $a2, $a0
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_6:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero, 6
+; LA32-NUAL-NEXT: bl bcmp
+; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT: ret
+;
+; LA64-NUAL-LABEL: bcmp_size_6:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT: addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT: ori $a2, $zero, 6
+; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT: jirl $ra, $ra, 0
+; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NUAL-NEXT: addi.d $sp, $sp, 16
+; LA64-NUAL-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 6)
ret i32 %bcmp
}
define i32 @bcmp_size_7(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_7:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 7
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_7:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.w $a2, $a0, 0
+; LA32-UAL-NEXT: ld.w $a3, $a1, 0
+; LA32-UAL-NEXT: ld.w $a0, $a0, 3
+; LA32-UAL-NEXT: ld.w $a1, $a1, 3
+; LA32-UAL-NEXT: xor $a2, $a2, $a3
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: or $a0, $a2, $a0
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_7:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 7
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_7:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.w $a2, $a0, 0
+; LA64-UAL-NEXT: ld.w $a3, $a1, 0
+; LA64-UAL-NEXT: ld.w $a0, $a0, 3
+; LA64-UAL-NEXT: ld.w $a1, $a1, 3
+; LA64-UAL-NEXT: xor $a2, $a2, $a3
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: or $a0, $a2, $a0
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_7:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero, 7
+; LA32-NUAL-NEXT: bl bcmp
+; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NUAL-NEXT: addi.w $sp, $sp, 16
+; LA32-NUAL-NEXT: ret
+;
+; LA64-NUAL-LABEL: bcmp_size_7:
+; LA64-NUAL: # %bb.0: # %entry
+; LA64-NUAL-NEXT: addi.d $sp, $sp, -16
+; LA64-NUAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NUAL-NEXT: ori $a2, $zero, 7
+; LA64-NUAL-NEXT: pcaddu18i $ra, %call36(bcmp)
+; LA64-NUAL-NEXT: jirl $ra, $ra, 0
+; LA64-NUAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NUAL-NEXT: addi.d $sp, $sp, 16
+; LA64-NUAL-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iGRLen 7)
ret i32 %bcmp
}
define i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_8:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 8
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_8:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: ld.w $a2, $a0, 0
+; LA32-UAL-NEXT: ld.w $a3, $a1, 0
+; LA32-UAL-NEXT: ld.w $a0, $a0, 4
+; LA32-UAL-NEXT: ld.w $a1, $a1, 4
+; LA32-UAL-NEXT: xor $a2, $a2, $a3
+; LA32-UAL-NEXT: xor $a0, $a0, $a1
+; LA32-UAL-NEXT: or $a0, $a2, $a0
+; LA32-UAL-NEXT: sltu $a0, $zero, $a0
+; LA32-UAL-NEXT: ret
;
-; LA64-LABEL: bcmp_size_8:
-; LA64: # %bb.0: # %entry
-; LA64-NEXT: addi.d $sp, $sp, -16
-; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT: ori $a2, $zero, 8
-; LA64-NEXT: pcaddu18i $ra, %call36(bcmp)
-; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 16
-; LA64-NEXT: ret
+; LA64-UAL-LABEL: bcmp_size_8:
+; LA64-UAL: # %bb.0: # %entry
+; LA64-UAL-NEXT: ld.d $a0, $a0, 0
+; LA64-UAL-NEXT: ld.d $a1, $a1, 0
+; LA64-UAL-NEXT: xor $a0, $a0, $a1
+; LA64-UAL-NEXT: sltu $a0, $zero, $a0
+; LA64-UAL-NEXT: ret
+;
+; LA32-NUAL-LABEL: bcmp_size_8:
+; LA32-NUAL: # %bb.0: # %entry
+; LA32-NUAL-NEXT: addi.w $sp, $sp, -16
+; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NUAL-NEXT: ori $a2, $zero,...
[truncated]
|
|
How does this optimization affect the benchmark? For example Like: |
Okay, I will try to test some benchmarks such as test-suite or spec cpu and add the results later. Thanks. |
|
I have just tested the
Detail results of This benchmark is specifically designed to test |
|
|
||
| // TODO: Implement more hooks to provide TTI machinery for LoongArch. | ||
| LoongArchTTIImpl::TTI::MemCmpExpansionOptions | ||
| LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IsZeroCmp is not used currently?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. It is not been used for now. Whether it is ZeroCmp or not, enable memcmp expansion is beneficial.
| if (!ST->hasUAL()) | ||
| return Options; | ||
|
|
||
| // TODO: Set same as the default value of MaxLoadsPerMemcmp or |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't quite understand what this TODO mean...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MaxNumLoads is used to set the maximum number of loads allowed to use when expanding memcmp (refer to max-loads-per-memcmp option). In default, TLI->getMaxExpandSizeMemcmp returns the default value of MaxLoadsPerMemcmpOptSize(which is 4) or MaxLoadsPerMemcmp(which is 8).
Targets can set this two values to allow expanding memcmp to larger or smaller sequences.
I think that is enough. |
Great. Thanks a lot. |
b89ea92 to
25c01e2
Compare
| ; LA64-NEXT: ret | ||
| ; LA64-UAL-LABEL: bcmp_lt_zero: | ||
| ; LA64-UAL: # %bb.0: # %entry | ||
| ; LA64-UAL-NEXT: move $a0, $zero |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure whether it is correct.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After expand-memcmp pass, the original IR is expanded and optimized to:
%0 = load i32, ptr %s1, align 1
%1 = load i32, ptr %s2, align 1
%2 = icmp ne i32 %0, %1
%3 = zext i1 %2 to i32
ret i1 false
So the result is always false.
And I noticed that the test bcmp_ge_zero always returns true, so seems always assume bcmp never returns negative result? I am not sure if this is the assumption of llvm or the misprocessing of this pass. I tried below using -O0(will call bcmp actually):
#include <stdio.h>
#include <string.h>
int main () {
char *s0 = "0000000";
char *s1 = "1111111";
printf("= : %d\n", bcmp(s1, s1, 7));
printf("> : %d\n", bcmp(s1, s0, 7));
printf("< : %d\n", bcmp(s0, s1, 7));
return 0;
}
The result is:
= : 0
> : 1
< : -1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Compiling this using clang, the function test can be optimized after this commit.
#include <stdio.h>
#include <string.h>
int test(char* s0, char* s1) {
if (bcmp(s0, s1, 4) > -1)
return 1;
return 0;
}
int main() {
char s0[10], s1[10];
printf("cin s0 (more than 4): ");
scanf("%9s", s0);
printf("cin s1 (more than 4): ");
scanf("%9s", s1);
printf("%d\n", test(s0, s1));
return 0;
}
Results are difference:
> clang t.c -O3
> ./a.out
cin s0 (more than 4): aaaaa
cin s1 (more than 4): bbbbb
1
> clang t.c -O0
> ./a.out
cin s0 (more than 4): aaaaa
cin s1 (more than 4): bbbbb
0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Appears after 46a13a0.
After overriding `TargetTransformInfo::enableMemCmpExpansion` in this commit, `MergeICmps` and `ExpandMemCmp` passes will be enabled on LoongArch.
db651a0 to
fd004e2
Compare
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/59/builds/27046 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/17987 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/17/builds/12521 Here is the relevant piece of the build log for the reference |
After overriding
TargetTransformInfo::enableMemCmpExpansionin this commit,MergeICmpsandExpandMemCmppasses will be enabled on LoongArch.