-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[NVPTX] properly expand operations that we do not support on v2i32 #162391
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Artem Belevich (Artem-B) Changes: Follow-up on #153478 and #161715. The v2i32 register class exists mostly to facilitate v2f32's use of integer registers. There are no actual instructions that can apply to v2i32 directly (except bitwise logical ops). Everything else must be done elementwise. Patch is 66.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162391.diff 2 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bc047a4aa999d..640811a7144c0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -651,7 +651,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
- // Only logical ops can be done on v4i8 directly, others must be done
+ // Only logical ops can be done on v4i8/v2i32 directly, others must be done
// elementwise.
setOperationAction(
{ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE,
@@ -669,7 +669,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM,
ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT,
ISD::USUBSAT},
- MVT::v4i8, Expand);
+ {MVT::v4i8, MVT::v2i32}, Expand);
// Operations not directly supported by NVPTX.
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
@@ -684,12 +684,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
// For others we will expand to a SHL/SRA pair.
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+ setOperationAction(
+ ISD::SIGN_EXTEND_INREG,
+ {MVT::i64, MVT::i32, MVT::i16, MVT::i8, MVT::i1, MVT::v2i16, MVT::v2i32},
+ Legal);
setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
diff --git a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
new file mode 100644
index 0000000000000..3bdde4fa7c321
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
@@ -0,0 +1,1625 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOI32X2 %s
+; RUN: %if ptxas-sm_80 %{ \
+; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \
+; RUN: %}
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-I32X2 %s
+; RUN: %if ptxas-sm_100 %{ \
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+
+target triple = "nvptx64-nvidia-cuda"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define <2 x i32> @test_ret_const() #0 {
+; CHECK-LABEL: test_ret_const(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {-1, 2};
+; CHECK-NEXT: ret;
+ ret <2 x i32> <i32 -1, i32 2>
+}
+
+define i32 @test_extract_0(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_extract_0(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0];
+; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_extract_0(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<2>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, _}, %rd1;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-I32X2-NEXT: ret;
+ %e = extractelement <2 x i32> %a, i32 0
+ ret i32 %e
+}
+
+define i32 @test_extract_1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_extract_1(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0];
+; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_extract_1(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<2>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {_, %r1}, %rd1;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-I32X2-NEXT: ret;
+ %e = extractelement <2 x i32> %a, i32 1
+ ret i32 %e
+}
+
+define i32 @test_extract_i(<2 x i32> %a, i64 %idx) #0 {
+; CHECK-NOI32X2-LABEL: test_extract_i(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .local .align 8 .b8 __local_depot3[8];
+; CHECK-NOI32X2-NEXT: .reg .b64 %SP;
+; CHECK-NOI32X2-NEXT: .reg .b64 %SPL;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOI32X2-NEXT: .reg .b64 %rd<6>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: mov.b64 %SPL, __local_depot3;
+; CHECK-NOI32X2-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_i_param_0];
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
+; CHECK-NOI32X2-NEXT: st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NOI32X2-NEXT: and.b64 %rd2, %rd1, 1;
+; CHECK-NOI32X2-NEXT: shl.b64 %rd3, %rd2, 2;
+; CHECK-NOI32X2-NEXT: add.u64 %rd4, %SP, 0;
+; CHECK-NOI32X2-NEXT: or.b64 %rd5, %rd4, %rd3;
+; CHECK-NOI32X2-NEXT: ld.b32 %r3, [%rd5];
+; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_extract_i(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .pred %p<2>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_extract_i_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0];
+; CHECK-I32X2-NEXT: setp.eq.b64 %p1, %rd2, 0;
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: selp.b32 %r3, %r1, %r2, %p1;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-I32X2-NEXT: ret;
+ %e = extractelement <2 x i32> %a, i64 %idx
+ ret i32 %e
+}
+
+define <2 x i32> @test_add(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_add(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_add_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_add_param_0];
+; CHECK-NOI32X2-NEXT: add.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT: add.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_add(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_add_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_add_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: add.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT: add.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT: ret;
+ %r = add <2 x i32> %a, %b
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_add_imm_0(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_add_imm_0(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_add_imm_0_param_0];
+; CHECK-NOI32X2-NEXT: add.s32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_add_imm_0(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_add_imm_0_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: add.s32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT: ret;
+ %r = add <2 x i32> <i32 1, i32 2>, %a
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_add_imm_1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_add_imm_1(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_add_imm_1_param_0];
+; CHECK-NOI32X2-NEXT: add.s32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_add_imm_1(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_add_imm_1_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: add.s32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT: ret;
+ %r = add <2 x i32> %a, <i32 1, i32 2>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_sub(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_sub(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_sub_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sub_param_0];
+; CHECK-NOI32X2-NEXT: sub.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT: sub.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_sub(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_sub_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_sub_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: sub.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT: sub.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT: ret;
+ %r = sub <2 x i32> %a, %b
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_smax(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_smax(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_smax_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_smax_param_0];
+; CHECK-NOI32X2-NEXT: max.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT: max.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_smax(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_smax_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_smax_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: max.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT: max.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT: ret;
+ %cmp = icmp sgt <2 x i32> %a, %b
+ %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_umax(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_umax(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_umax_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_umax_param_0];
+; CHECK-NOI32X2-NEXT: max.u32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT: max.u32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_umax(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_umax_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_umax_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: max.u32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT: max.u32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT: ret;
+ %cmp = icmp ugt <2 x i32> %a, %b
+ %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_smin(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_smin(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_smin_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_smin_param_0];
+; CHECK-NOI32X2-NEXT: min.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT: min.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_smin(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_smin_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_smin_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: min.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT: min.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT: ret;
+ %cmp = icmp sle <2 x i32> %a, %b
+ %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_umin(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_umin(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_umin_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_umin_param_0];
+; CHECK-NOI32X2-NEXT: min.u32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT: min.u32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_umin(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_umin_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_umin_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: min.u32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT: min.u32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT: ret;
+ %cmp = icmp ule <2 x i32> %a, %b
+ %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_eq(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+; CHECK-NOI32X2-LABEL: test_eq(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_eq_param_2];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_eq_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_eq_param_0];
+; CHECK-NOI32X2-NEXT: setp.eq.b32 %p1, %r1, %r3;
+; CHECK-NOI32X2-NEXT: setp.eq.b32 %p2, %r2, %r4;
+; CHECK-NOI32X2-NEXT: selp.b32 %r7, %r2, %r6, %p2;
+; CHECK-NOI32X2-NEXT: selp.b32 %r8, %r1, %r5, %p1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_eq(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .pred %p<3>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd3, [test_eq_param_2];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_eq_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_eq_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: setp.eq.b32 %p1, %r3, %r1;
+; CHECK-I32X2-NEXT: setp.eq.b32 %p2, %r4, %r2;
+; CHECK-I32X2-NEXT: mov.b64 {%r5, %r6}, %rd3;
+; CHECK-I32X2-NEXT: selp.b32 %r7, %r4, %r6, %p2;
+; CHECK-I32X2-NEXT: selp.b32 %r8, %r3, %r5, %p1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-I32X2-NEXT: ret;
+ %cmp = icmp eq <2 x i32> %a, %b
+ %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %c
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_ne(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+; CHECK-NOI32X2-LABEL: test_ne(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_ne_param_2];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_ne_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ne_param_0];
+; CHECK-NOI32X2-NEXT: setp.ne.b32 %p1, %r1, %r3;
+; CHECK-NOI32X2-NEXT: setp.ne.b32 %p2, %r2, %r4;
+; CHECK-NOI32X2-NEXT: selp.b32 %r7, %r2, %r6, %p2;
+; CHECK-NOI32X2-NEXT: selp.b32 %r8, %r1, %r5, %p1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_ne(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .pred %p<3>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<9>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd3, [test_ne_param_2];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_ne_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_ne_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: setp.ne.b32 %p1, %r3, %r1;
+; CHECK-I32X2-NEXT: setp.ne.b32 %p2, %r4, %r2;
+; CHECK-I32X2-NEXT: mov.b64 {%r5, %r6}, %rd3;
+; CHECK-I32X2-NEXT: selp.b32 %r7, %r4, %r6, %p2;
+; CHECK-I32X2-NEXT: selp.b32 %r8, %r3, %r5, %p1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-I32X2-NEXT: ret;
+ %cmp = icmp ne <2 x i32> %a, %b
+ %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %c
+ ret <2 ...
[truncated]
|
This comment was marked as outdated.
This comment was marked as outdated.
824001b
to
918a987
Compare
v2i32 register class is present to facilitate v2f32's use of integer registers. There are no actual instructions that can apply to v2i32 directly (except bitwise logical ops). Everything else must be done elementwise.
Context: most of the time nothing generates |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Thanks, and sorry for the fallout from my change.
define void @test_ldst_v2i32_unaligned(ptr %a, ptr %b) { | ||
; CHECK-NOI32X2-LABEL: test_ldst_v2i32_unaligned( | ||
; CHECK-NOI32X2: { | ||
; CHECK-NOI32X2-NEXT: .reg .b32 %r<13>; | ||
; CHECK-NOI32X2-NEXT: .reg .b64 %rd<3>; | ||
; CHECK-NOI32X2-EMPTY: | ||
; CHECK-NOI32X2-NEXT: // %bb.0: | ||
; CHECK-NOI32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2i32_unaligned_param_1]; | ||
; CHECK-NOI32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2i32_unaligned_param_0]; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r1, [%rd1+2]; | ||
; CHECK-NOI32X2-NEXT: shl.b32 %r2, %r1, 16; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r3, [%rd1+3]; | ||
; CHECK-NOI32X2-NEXT: shl.b32 %r4, %r3, 24; | ||
; CHECK-NOI32X2-NEXT: or.b32 %r5, %r4, %r2; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r6, [%rd1]; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r7, [%rd1+1]; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r8, [%rd1+4]; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r9, [%rd1+5]; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r10, [%rd1+6]; | ||
; CHECK-NOI32X2-NEXT: ld.b8 %r11, [%rd1+7]; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2+7], %r11; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2+6], %r10; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2+5], %r9; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2+4], %r8; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2+1], %r7; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2], %r6; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2+3], %r3; | ||
; CHECK-NOI32X2-NEXT: shr.u32 %r12, %r5, 16; | ||
; CHECK-NOI32X2-NEXT: st.b8 [%rd2+2], %r12; | ||
; CHECK-NOI32X2-NEXT: ret; | ||
; | ||
; CHECK-I32X2-LABEL: test_ldst_v2i32_unaligned( | ||
; CHECK-I32X2: { | ||
; CHECK-I32X2-NEXT: .reg .b64 %rd<28>; | ||
; CHECK-I32X2-EMPTY: | ||
; CHECK-I32X2-NEXT: // %bb.0: | ||
; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_ldst_v2i32_unaligned_param_1]; | ||
; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_ldst_v2i32_unaligned_param_0]; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd3, [%rd1]; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd4, [%rd1+1]; | ||
; CHECK-I32X2-NEXT: shl.b64 %rd5, %rd4, 8; | ||
; CHECK-I32X2-NEXT: or.b64 %rd6, %rd5, %rd3; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd7, [%rd1+2]; | ||
; CHECK-I32X2-NEXT: shl.b64 %rd8, %rd7, 16; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd9, [%rd1+3]; | ||
; CHECK-I32X2-NEXT: shl.b64 %rd10, %rd9, 24; | ||
; CHECK-I32X2-NEXT: or.b64 %rd11, %rd10, %rd8; | ||
; CHECK-I32X2-NEXT: or.b64 %rd12, %rd11, %rd6; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd13, [%rd1+4]; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd14, [%rd1+5]; | ||
; CHECK-I32X2-NEXT: shl.b64 %rd15, %rd14, 8; | ||
; CHECK-I32X2-NEXT: or.b64 %rd16, %rd15, %rd13; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd17, [%rd1+6]; | ||
; CHECK-I32X2-NEXT: shl.b64 %rd18, %rd17, 16; | ||
; CHECK-I32X2-NEXT: ld.b8 %rd19, [%rd1+7]; | ||
; CHECK-I32X2-NEXT: shl.b64 %rd20, %rd19, 24; | ||
; CHECK-I32X2-NEXT: or.b64 %rd21, %rd20, %rd18; | ||
; CHECK-I32X2-NEXT: or.b64 %rd22, %rd21, %rd16; | ||
; CHECK-I32X2-NEXT: shl.b64 %rd23, %rd22, 32; | ||
; CHECK-I32X2-NEXT: or.b64 %rd24, %rd23, %rd12; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2+6], %rd17; | ||
; CHECK-I32X2-NEXT: shr.u64 %rd25, %rd24, 56; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2+7], %rd25; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2+4], %rd13; | ||
; CHECK-I32X2-NEXT: shr.u64 %rd26, %rd24, 40; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2+5], %rd26; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2+1], %rd4; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2], %rd3; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2+3], %rd9; | ||
; CHECK-I32X2-NEXT: shr.u64 %rd27, %rd24, 16; | ||
; CHECK-I32X2-NEXT: st.b8 [%rd2+2], %rd27; | ||
; CHECK-I32X2-NEXT: ret; | ||
%t1 = load <2 x i32>, ptr %a, align 1 | ||
store <2 x i32> %t1, ptr %b, align 1 | ||
ret void | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks significantly worse in -O0 but comparable in -O3, so I guess it's okay.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Handling of bytes 2 and 3 in -O3 still looks rather odd. Why are we still shifting around those bytes but not the others?
We may still have gaps in the lowering of v2i32 that we'll need to address, but at least we would not crash for the time being.
Follow-up on #153478 and #161715.
v2i32 register class exists mostly to facilitate v2f32's use of integer registers. There are no actual instructions that can apply to v2i32 directly (except bitwise logical ops). Everything else must be done elementwise.