diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 38912a7f09e30..0c581dccbbd75 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1458,7 +1458,6 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // Map the global virtual register number to a register class specific // virtual register number starting from 1 with that class. const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - //unsigned numRegClasses = TRI->getNumRegClasses(); // Emit the Fake Stack Object const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1479,13 +1478,12 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // global virtual // register number and the per class virtual register number. // We use the per class virtual register number in the ptx output. - unsigned int numVRs = MRI->getNumVirtRegs(); - for (unsigned i = 0; i < numVRs; i++) { - Register vr = Register::index2VirtReg(i); - const TargetRegisterClass *RC = MRI->getRegClass(vr); - DenseMap ®map = VRegMapping[RC]; - int n = regmap.size(); - regmap.insert(std::make_pair(vr, n + 1)); + for (unsigned I : llvm::seq(MRI->getNumVirtRegs())) { + Register VR = Register::index2VirtReg(I); + if (MRI->use_empty(VR) && MRI->def_empty(VR)) + continue; + auto &RCRegMap = VRegMapping[MRI->getRegClass(VR)]; + RCRegMap[VR] = RCRegMap.size() + 1; } // Emit declaration of the virtual registers or 'physical' registers for diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index abc873e2aa706..bf51973e88357 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -10,7 +10,7 @@ declare {float, float} @bars({float, float} %input) define void @test_v2f32(<2 x float> %input, ptr %output) { ; CHECK-LABEL: test_v2f32( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; @@ -21,8 +21,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) { ; CHECK-NEXT: call.uni (retval0), barv, (param0); ; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: ld.param.b64 %rd4, [test_v2f32_param_1]; -; CHECK-NEXT: st.b64 [%rd4], %rd2; +; CHECK-NEXT: ld.param.b64 %rd3, [test_v2f32_param_1]; +; CHECK-NEXT: st.b64 [%rd3], %rd2; ; CHECK-NEXT: ret; %call = tail call <2 x float> @barv(<2 x float> %input) store <2 x float> %call, ptr %output, align 8 @@ -32,8 +32,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) { define void @test_v3f32(<3 x float> %input, ptr %output) { ; CHECK-LABEL: test_v3f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0]; @@ -47,9 +47,9 @@ define void @test_v3f32(<3 x float> %input, ptr %output) { ; CHECK-NEXT: ld.param.b32 %r2, [retval0+8]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: ld.param.b64 %rd4, [test_v3f32_param_1]; -; CHECK-NEXT: st.b32 [%rd4+8], %r2; -; CHECK-NEXT: st.b64 [%rd4], %rd2; +; CHECK-NEXT: ld.param.b64 %rd3, [test_v3f32_param_1]; +; CHECK-NEXT: st.b32 [%rd3+8], %r2; +; CHECK-NEXT: st.b64 [%rd3], %rd2; ; CHECK-NEXT: ret; %call = tail call <3 x float> @barv3(<3 x float> %input) ; Make sure we don't load more values than than we need to. @@ -60,7 +60,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) { define void @test_a2f32([2 x float] %input, ptr %output) { ; CHECK-LABEL: test_a2f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -87,7 +87,7 @@ define void @test_a2f32([2 x float] %input, ptr %output) { define void @test_s2f32({float, float} %input, ptr %output) { ; CHECK-LABEL: test_s2f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index f710d7f883a1b..5f4856acb317c 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -47,90 +47,90 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62: { ; CHECKPTX62-NEXT: .reg .pred %p<5>; ; CHECKPTX62-NEXT: .reg .b16 %rs<11>; -; CHECKPTX62-NEXT: .reg .b32 %r<58>; +; CHECKPTX62-NEXT: .reg .b32 %r<50>; ; CHECKPTX62-EMPTY: ; CHECKPTX62-NEXT: // %bb.0: ; CHECKPTX62-NEXT: ld.param.b16 %rs1, [test_param_3]; -; CHECKPTX62-NEXT: ld.param.b32 %r23, [test_param_2]; -; CHECKPTX62-NEXT: ld.param.b32 %r22, [test_param_1]; -; CHECKPTX62-NEXT: ld.param.b32 %r24, [test_param_0]; -; CHECKPTX62-NEXT: and.b32 %r1, %r24, -4; -; CHECKPTX62-NEXT: and.b32 %r25, %r24, 3; -; CHECKPTX62-NEXT: shl.b32 %r2, %r25, 3; -; CHECKPTX62-NEXT: mov.b32 %r26, 65535; -; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX62-NEXT: not.b32 %r3, %r27; -; CHECKPTX62-NEXT: ld.b32 %r54, [%r1]; +; CHECKPTX62-NEXT: ld.param.b32 %r15, [test_param_2]; +; CHECKPTX62-NEXT: ld.param.b32 %r14, [test_param_1]; +; CHECKPTX62-NEXT: ld.param.b32 %r16, [test_param_0]; +; CHECKPTX62-NEXT: and.b32 %r1, %r16, -4; +; CHECKPTX62-NEXT: and.b32 %r17, %r16, 3; +; CHECKPTX62-NEXT: shl.b32 %r2, %r17, 3; +; CHECKPTX62-NEXT: mov.b32 %r18, 65535; +; CHECKPTX62-NEXT: shl.b32 %r19, %r18, %r2; +; CHECKPTX62-NEXT: not.b32 %r3, %r19; +; CHECKPTX62-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r28; +; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs3; -; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2; -; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3; -; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; -; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r6, %r54; -; CHECKPTX62-NEXT: mov.b32 %r54, %r6; +; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3; +; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2; +; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; +; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46; +; CHECKPTX62-NEXT: mov.b32 %r46, %r4; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44 -; CHECKPTX62-NEXT: ld.b32 %r55, [%r1]; +; CHECKPTX62-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r33; +; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25; ; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00; ; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5; -; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs6; -; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2; -; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3; -; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; -; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r9, %r55; -; CHECKPTX62-NEXT: mov.b32 %r55, %r9; +; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6; +; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; +; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27; +; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; +; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47; +; CHECKPTX62-NEXT: mov.b32 %r47, %r5; ; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26 -; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4; -; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3; -; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24; -; CHECKPTX62-NEXT: mov.b32 %r39, 65535; -; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11; -; CHECKPTX62-NEXT: not.b32 %r12, %r40; -; CHECKPTX62-NEXT: ld.global.b32 %r56, [%r10]; +; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4; +; CHECKPTX62-NEXT: shl.b32 %r30, %r14, 3; +; CHECKPTX62-NEXT: and.b32 %r7, %r30, 24; +; CHECKPTX62-NEXT: mov.b32 %r31, 65535; +; CHECKPTX62-NEXT: shl.b32 %r32, %r31, %r7; +; CHECKPTX62-NEXT: not.b32 %r8, %r32; +; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r41; +; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33; ; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs8; -; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12; -; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; -; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r15, %r56; -; CHECKPTX62-NEXT: mov.b32 %r56, %r15; +; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8; +; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7; +; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35; +; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; +; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48; +; CHECKPTX62-NEXT: mov.b32 %r48, %r9; ; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8 -; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4; -; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3; -; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24; -; CHECKPTX62-NEXT: mov.b32 %r47, 65535; -; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17; -; CHECKPTX62-NEXT: not.b32 %r18, %r48; -; CHECKPTX62-NEXT: ld.shared.b32 %r57, [%r16]; +; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4; +; CHECKPTX62-NEXT: shl.b32 %r38, %r15, 3; +; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24; +; CHECKPTX62-NEXT: mov.b32 %r39, 65535; +; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11; +; CHECKPTX62-NEXT: not.b32 %r12, %r40; +; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17; -; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r49; +; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41; ; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1; -; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs10; -; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17; -; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18; -; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; -; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r21, %r57; -; CHECKPTX62-NEXT: mov.b32 %r57, %r21; +; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10; +; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11; +; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43; +; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; +; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49; +; CHECKPTX62-NEXT: mov.b32 %r49, %r13; ; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX62-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index f96fd30019025..e560d4386c20d 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -47,93 +47,93 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71: { ; CHECKPTX71-NEXT: .reg .pred %p<5>; ; CHECKPTX71-NEXT: .reg .b16 %rs<14>; -; CHECKPTX71-NEXT: .reg .b32 %r<58>; +; CHECKPTX71-NEXT: .reg .b32 %r<50>; ; CHECKPTX71-EMPTY: ; CHECKPTX71-NEXT: // %bb.0: ; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3]; -; CHECKPTX71-NEXT: ld.param.b32 %r23, [test_param_2]; -; CHECKPTX71-NEXT: ld.param.b32 %r22, [test_param_1]; -; CHECKPTX71-NEXT: ld.param.b32 %r24, [test_param_0]; -; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4; -; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3; -; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3; -; CHECKPTX71-NEXT: mov.b32 %r26, 65535; -; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; -; CHECKPTX71-NEXT: not.b32 %r3, %r27; -; CHECKPTX71-NEXT: ld.b32 %r54, [%r1]; +; CHECKPTX71-NEXT: ld.param.b32 %r15, [test_param_2]; +; CHECKPTX71-NEXT: ld.param.b32 %r14, [test_param_1]; +; CHECKPTX71-NEXT: ld.param.b32 %r16, [test_param_0]; +; CHECKPTX71-NEXT: and.b32 %r1, %r16, -4; +; CHECKPTX71-NEXT: and.b32 %r17, %r16, 3; +; CHECKPTX71-NEXT: shl.b32 %r2, %r17, 3; +; CHECKPTX71-NEXT: mov.b32 %r18, 65535; +; CHECKPTX71-NEXT: shl.b32 %r19, %r18, %r2; +; CHECKPTX71-NEXT: not.b32 %r3, %r19; +; CHECKPTX71-NEXT: ld.b32 %r46, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28; +; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20; ; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1; -; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4; -; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2; -; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3; -; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32; -; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r6, %r54; -; CHECKPTX71-NEXT: mov.b32 %r54, %r6; +; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4; +; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2; +; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3; +; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24; +; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46; +; CHECKPTX71-NEXT: mov.b32 %r46, %r4; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44 -; CHECKPTX71-NEXT: ld.b32 %r55, [%r1]; +; CHECKPTX71-NEXT: ld.b32 %r47, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33; +; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25; ; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6; -; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7; -; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2; -; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3; -; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; -; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37; -; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r9, %r55; -; CHECKPTX71-NEXT: mov.b32 %r55, %r9; +; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7; +; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; +; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3; +; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27; +; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29; +; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47; +; CHECKPTX71-NEXT: mov.b32 %r47, %r5; ; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3; ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26 -; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4; -; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3; -; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24; -; CHECKPTX71-NEXT: mov.b32 %r39, 65535; -; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11; -; CHECKPTX71-NEXT: not.b32 %r12, %r40; -; CHECKPTX71-NEXT: ld.global.b32 %r56, [%r10]; +; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4; +; CHECKPTX71-NEXT: shl.b32 %r30, %r14, 3; +; CHECKPTX71-NEXT: and.b32 %r7, %r30, 24; +; CHECKPTX71-NEXT: mov.b32 %r31, 65535; +; CHECKPTX71-NEXT: shl.b32 %r32, %r31, %r7; +; CHECKPTX71-NEXT: not.b32 %r8, %r32; +; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41; +; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33; ; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1; -; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10; -; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; -; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12; -; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; -; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45; -; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r15, %r56; -; CHECKPTX71-NEXT: mov.b32 %r56, %r15; +; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10; +; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7; +; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8; +; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35; +; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37; +; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48; +; CHECKPTX71-NEXT: mov.b32 %r48, %r9; ; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5; ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8 -; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4; -; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3; -; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24; -; CHECKPTX71-NEXT: mov.b32 %r47, 65535; -; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17; -; CHECKPTX71-NEXT: not.b32 %r18, %r48; -; CHECKPTX71-NEXT: ld.shared.b32 %r57, [%r16]; +; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4; +; CHECKPTX71-NEXT: shl.b32 %r38, %r15, 3; +; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24; +; CHECKPTX71-NEXT: mov.b32 %r39, 65535; +; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11; +; CHECKPTX71-NEXT: not.b32 %r12, %r40; +; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17; -; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49; +; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11; +; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41; ; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80; ; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1; -; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13; -; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17; -; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18; -; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51; -; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53; -; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r21, %r57; -; CHECKPTX71-NEXT: mov.b32 %r57, %r21; +; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13; +; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11; +; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12; +; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43; +; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45; +; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49; +; CHECKPTX71-NEXT: mov.b32 %r49, %r13; ; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7; ; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end ; CHECKPTX71-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index 04a58cf22cfc5..6ea02f35e9626 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -425,40 +425,40 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1]; ; CHECK-NEXT: ld.param.b64 %rd2, [atomicrmw_add_f16_generic_param_0]; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; -; CHECK-NEXT: cvt.u32.u64 %r6, %rd2; -; CHECK-NEXT: and.b32 %r7, %r6, 3; -; CHECK-NEXT: shl.b32 %r1, %r7, 3; -; CHECK-NEXT: mov.b32 %r8, 65535; -; CHECK-NEXT: shl.b32 %r9, %r8, %r1; -; CHECK-NEXT: not.b32 %r2, %r9; -; CHECK-NEXT: ld.b32 %r19, [%rd1]; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; +; CHECK-NEXT: and.b32 %r5, %r4, 3; +; CHECK-NEXT: shl.b32 %r1, %r5, 3; +; CHECK-NEXT: mov.b32 %r6, 65535; +; CHECK-NEXT: shl.b32 %r7, %r6, %r1; +; CHECK-NEXT: not.b32 %r2, %r7; +; CHECK-NEXT: ld.b32 %r17, [%rd1]; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u32 %r10, %r19, %r1; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r10; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs2; -; CHECK-NEXT: add.rn.f32 %r13, %r11, %r12; -; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r13; -; CHECK-NEXT: cvt.u32.u16 %r14, %rs3; -; CHECK-NEXT: shl.b32 %r15, %r14, %r1; -; CHECK-NEXT: and.b32 %r16, %r19, %r2; -; CHECK-NEXT: or.b32 %r17, %r16, %r15; +; CHECK-NEXT: shr.u32 %r8, %r17, %r1; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r8; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs2; +; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10; +; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r11; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs3; +; CHECK-NEXT: shl.b32 %r13, %r12, %r1; +; CHECK-NEXT: and.b32 %r14, %r17, %r2; +; CHECK-NEXT: or.b32 %r15, %r14, %r13; ; CHECK-NEXT: membar.sys; -; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r19, %r17; -; CHECK-NEXT: setp.ne.b32 %p1, %r5, %r19; -; CHECK-NEXT: mov.b32 %r19, %r5; +; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r17, %r15; +; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r17; +; CHECK-NEXT: mov.b32 %r17, %r3; ; CHECK-NEXT: @%p1 bra $L__BB24_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end -; CHECK-NEXT: shr.u32 %r18, %r5, %r1; -; CHECK-NEXT: st.param.b16 [func_retval0], %r18; +; CHECK-NEXT: shr.u32 %r16, %r3, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %r16; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, half %val seq_cst ret half %ret diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index b4641d01eb927..bd4c7775354ae 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -204,7 +204,7 @@ declare <2 x bfloat> @test_callee(<2 x bfloat> %a, <2 x bfloat> %b) #0 define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-LABEL: test_call( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll index 9988d5b122cc1..579f02a9539c6 100644 --- a/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll +++ b/llvm/test/CodeGen/NVPTX/byval-arg-vectorize.ll @@ -11,7 +11,7 @@ declare %struct.double2 @add(ptr align(16) byval(%struct.double2), ptr align(16) define void @call_byval(ptr %out, ptr %in1, ptr %in2) { ; CHECK-LABEL: call_byval( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<12>; +; CHECK-NEXT: .reg .b64 %rd<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [call_byval_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 63c389c36e87e..6e480996e7e6a 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -7,41 +7,41 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB0_1; ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -52,42 +52,42 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -98,43 +98,43 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -145,42 +145,42 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -191,42 +191,42 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -237,43 +237,43 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -284,42 +284,42 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -330,43 +330,43 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -377,43 +377,43 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -424,43 +424,43 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -471,43 +471,43 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB10_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -518,43 +518,43 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB11_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -565,43 +565,43 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB12_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -612,43 +612,43 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB13_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -659,43 +659,43 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB14_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -706,40 +706,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB15_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB15_1; ; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new @@ -750,41 +750,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB16_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB16_1; ; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new @@ -795,42 +795,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB17_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB17_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB17_1; ; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new @@ -841,41 +841,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB18_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB18_1; ; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new @@ -886,41 +886,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1]; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB19_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB19_1; ; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new @@ -931,42 +931,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB20_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB20_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB20_1; ; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new @@ -977,41 +977,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB21_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB21_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB21_1; ; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new @@ -1022,42 +1022,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB22_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB22_1; ; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new @@ -1068,42 +1068,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB23_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB23_1; ; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new @@ -1114,42 +1114,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB24_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB24_1; ; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new @@ -1160,42 +1160,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB25_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB25_1; ; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new @@ -1206,42 +1206,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB26_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB26_1; ; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new @@ -1252,42 +1252,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB27_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB27_1; ; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new @@ -1298,42 +1298,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB28_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB28_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB28_1; ; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new @@ -1344,42 +1344,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<20>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM60-NEXT: ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 65535; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r15, [%rd1]; -; SM60-NEXT: and.b32 %r19, %r15, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 65535; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r16, %r19, %r3; -; SM60-NEXT: or.b32 %r17, %r19, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB29_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM60-NEXT: mov.b32 %r19, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB29_1; ; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst ret i16 %new @@ -1899,43 +1899,43 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.global.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB60_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1997,43 +1997,43 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB64_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -2044,43 +2044,43 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<21>; +; SM60-NEXT: .reg .b32 %r<18>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: ; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM60-NEXT: membar.cta; -; SM60-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM60-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: cvt.u32.u64 %r10, %rd2; -; SM60-NEXT: and.b32 %r11, %r10, 3; -; SM60-NEXT: shl.b32 %r1, %r11, 3; -; SM60-NEXT: mov.b32 %r12, 255; -; SM60-NEXT: shl.b32 %r13, %r12, %r1; -; SM60-NEXT: not.b32 %r2, %r13; -; SM60-NEXT: cvt.u32.u16 %r14, %rs1; -; SM60-NEXT: and.b32 %r15, %r14, 255; -; SM60-NEXT: shl.b32 %r3, %r15, %r1; -; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM60-NEXT: and.b32 %r20, %r16, %r2; +; SM60-NEXT: cvt.u32.u64 %r8, %rd2; +; SM60-NEXT: and.b32 %r9, %r8, 3; +; SM60-NEXT: shl.b32 %r1, %r9, 3; +; SM60-NEXT: mov.b32 %r10, 255; +; SM60-NEXT: shl.b32 %r11, %r10, %r1; +; SM60-NEXT: not.b32 %r2, %r11; +; SM60-NEXT: cvt.u32.u16 %r12, %rs1; +; SM60-NEXT: and.b32 %r13, %r12, 255; +; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r4, %r7, %r1; +; SM60-NEXT: ld.shared.b32 %r14, [%rd1]; +; SM60-NEXT: and.b32 %r17, %r14, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r17, %r20, %r3; -; SM60-NEXT: or.b32 %r18, %r20, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM60-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM60-NEXT: or.b32 %r15, %r17, %r3; +; SM60-NEXT: or.b32 %r16, %r17, %r4; +; SM60-NEXT: atom.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM60-NEXT: @%p1 bra $L__BB65_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM60-NEXT: and.b32 %r8, %r7, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM60-NEXT: mov.b32 %r20, %r8; +; SM60-NEXT: and.b32 %r6, %r5, %r2; +; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM60-NEXT: mov.b32 %r17, %r6; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; -; SM60-NEXT: st.param.b32 [func_retval0], %r14; +; SM60-NEXT: st.param.b32 [func_retval0], %r12; ; SM60-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 5cb344d5ded84..065b89c7ebf74 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -7,41 +7,41 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -52,42 +52,42 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -98,43 +98,43 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -145,42 +145,42 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -191,42 +191,42 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -237,43 +237,43 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -284,42 +284,42 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -330,43 +330,43 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -377,43 +377,43 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -424,43 +424,43 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -471,43 +471,43 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB10_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -518,43 +518,43 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB11_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -565,43 +565,43 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB12_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -612,43 +612,43 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB13_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -659,43 +659,43 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB14_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -706,40 +706,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB15_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB15_1; ; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new @@ -750,41 +750,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB16_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB16_1; ; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new @@ -795,42 +795,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB17_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB17_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB17_1; ; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new @@ -841,41 +841,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB18_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB18_1; ; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new @@ -886,41 +886,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB19_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB19_1; ; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new @@ -931,42 +931,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB20_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB20_1; ; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new @@ -977,41 +977,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB21_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB21_1; ; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new @@ -1022,42 +1022,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB22_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB22_1; ; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new @@ -1068,42 +1068,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB23_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB23_1; ; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new @@ -1114,42 +1114,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB24_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB24_1; ; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new @@ -1160,42 +1160,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB25_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB25_1; ; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new @@ -1206,42 +1206,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB26_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB26_1; ; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new @@ -1252,42 +1252,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB27_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB27_1; ; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new @@ -1298,42 +1298,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB28_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB28_1; ; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new @@ -1344,42 +1344,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM70-NEXT: fence.sc.cta; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB29_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB29_1; ; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst ret i16 %new @@ -1899,43 +1899,43 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.global.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB60_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -1997,43 +1997,43 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB64_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -2044,43 +2044,43 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.shared.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB65_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 7cb259023d6dd..e4433570bdd70 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -7,41 +7,41 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [monotonic_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB0_1; ; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic ret i8 %new @@ -52,42 +52,42 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_acquire_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [monotonic_acquire_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire ret i8 %new @@ -98,43 +98,43 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [monotonic_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [monotonic_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst ret i8 %new @@ -145,42 +145,42 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_monotonic_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [acquire_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic ret i8 %new @@ -191,42 +191,42 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b8 %r9, [acquire_acquire_i8_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: ld.param.b8 %r7, [acquire_acquire_i8_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire ret i8 %new @@ -237,43 +237,43 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acquire_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acquire_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst ret i8 %new @@ -284,42 +284,42 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [release_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic ret i8 %new @@ -330,43 +330,43 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [release_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire ret i8 %new @@ -377,43 +377,43 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [release_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [release_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst ret i8 %new @@ -424,43 +424,43 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic ret i8 %new @@ -471,43 +471,43 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB10_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -518,43 +518,43 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB11_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst ret i8 %new @@ -565,43 +565,43 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_monotonic_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [seq_cst_monotonic_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB12_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic ret i8 %new @@ -612,43 +612,43 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_acquire_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [seq_cst_acquire_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB13_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire ret i8 %new @@ -659,43 +659,43 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [seq_cst_seq_cst_i8_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB14_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst ret i8 %new @@ -706,40 +706,40 @@ define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [monotonic_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB15_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB15_1; ; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic ret i16 %new @@ -750,41 +750,41 @@ define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [monotonic_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB16_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB16_1; ; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire ret i16 %new @@ -795,42 +795,42 @@ define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [monotonic_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB17_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB17_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB17_1; ; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst ret i16 %new @@ -841,41 +841,41 @@ define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [acquire_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB18_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB18_1; ; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic ret i16 %new @@ -886,41 +886,41 @@ define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_cta_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_cta_param_1]; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: ld.param.b16 %r7, [acquire_acquire_i16_global_cta_param_1]; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB19_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB19_1; ; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire ret i16 %new @@ -931,42 +931,42 @@ define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acquire_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB20_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB20_1; ; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst ret i16 %new @@ -977,41 +977,41 @@ define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [release_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB21_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB21_1; ; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic ret i16 %new @@ -1022,42 +1022,42 @@ define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [release_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB22_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB22_1; ; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire ret i16 %new @@ -1068,42 +1068,42 @@ define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [release_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB23_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB23_1; ; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst ret i16 %new @@ -1114,42 +1114,42 @@ define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acq_rel_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB24_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB24_1; ; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic ret i16 %new @@ -1160,42 +1160,42 @@ define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acq_rel_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB25_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB25_1; ; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire ret i16 %new @@ -1206,42 +1206,42 @@ define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [acq_rel_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB26_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB26_1; ; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst ret i16 %new @@ -1252,42 +1252,42 @@ define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [seq_cst_monotonic_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB27_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB27_1; ; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic ret i16 %new @@ -1298,42 +1298,42 @@ define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [seq_cst_acquire_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB28_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB28_1; ; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire ret i16 %new @@ -1344,42 +1344,42 @@ define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<20>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0]; ; SM90-NEXT: fence.sc.cta; -; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1]; +; SM90-NEXT: ld.param.b16 %r7, [seq_cst_seq_cst_i16_global_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 65535; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r15, [%rd1]; -; SM90-NEXT: and.b32 %r19, %r15, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 65535; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r16, %r19, %r3; -; SM90-NEXT: or.b32 %r17, %r19, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB29_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM90-NEXT: mov.b32 %r19, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB29_1; ; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst ret i16 %new @@ -1899,43 +1899,43 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.global.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB60_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ret i8 %new @@ -2014,43 +2014,43 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_generic_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_generic_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB65_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new @@ -2061,43 +2061,43 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<21>; +; SM90-NEXT: .reg .b32 %r<18>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2]; ; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0]; ; SM90-NEXT: fence.release.cta; -; SM90-NEXT: ld.param.b8 %r9, [acq_rel_acquire_i8_shared_cta_param_1]; +; SM90-NEXT: ld.param.b8 %r7, [acq_rel_acquire_i8_shared_cta_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: cvt.u32.u64 %r10, %rd2; -; SM90-NEXT: and.b32 %r11, %r10, 3; -; SM90-NEXT: shl.b32 %r1, %r11, 3; -; SM90-NEXT: mov.b32 %r12, 255; -; SM90-NEXT: shl.b32 %r13, %r12, %r1; -; SM90-NEXT: not.b32 %r2, %r13; -; SM90-NEXT: cvt.u32.u16 %r14, %rs1; -; SM90-NEXT: and.b32 %r15, %r14, 255; -; SM90-NEXT: shl.b32 %r3, %r15, %r1; -; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; -; SM90-NEXT: and.b32 %r20, %r16, %r2; +; SM90-NEXT: cvt.u32.u64 %r8, %rd2; +; SM90-NEXT: and.b32 %r9, %r8, 3; +; SM90-NEXT: shl.b32 %r1, %r9, 3; +; SM90-NEXT: mov.b32 %r10, 255; +; SM90-NEXT: shl.b32 %r11, %r10, %r1; +; SM90-NEXT: not.b32 %r2, %r11; +; SM90-NEXT: cvt.u32.u16 %r12, %rs1; +; SM90-NEXT: and.b32 %r13, %r12, 255; +; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r4, %r7, %r1; +; SM90-NEXT: ld.shared.b32 %r14, [%rd1]; +; SM90-NEXT: and.b32 %r17, %r14, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r17, %r20, %r3; -; SM90-NEXT: or.b32 %r18, %r20, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17; -; SM90-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM90-NEXT: or.b32 %r15, %r17, %r3; +; SM90-NEXT: or.b32 %r16, %r17, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM90-NEXT: @%p1 bra $L__BB66_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 -; SM90-NEXT: and.b32 %r8, %r7, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM90-NEXT: mov.b32 %r20, %r8; +; SM90-NEXT: and.b32 %r6, %r5, %r2; +; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM90-NEXT: mov.b32 %r17, %r6; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; -; SM90-NEXT: st.param.b32 [func_retval0], %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r12; ; SM90-NEXT: ret; %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire ret i8 %new diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 237e42394ba2f..997df7a8ad8b8 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -14,82 +14,82 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<18>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: ld.param.b8 %r7, [relaxed_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: and.b32 %r13, %r12, 255; +; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r14, [%rd1]; +; SM30-NEXT: and.b32 %r17, %r14, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r15, %r17, %r3; +; SM30-NEXT: or.b32 %r16, %r17, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM30-NEXT: @%p1 bra $L__BB0_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM30-NEXT: mov.b32 %r17, %r6; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [relaxed_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i8( ; SM90: { @@ -140,84 +140,84 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<18>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: ld.param.b8 %r7, [acquire_sys_i8_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: and.b32 %r13, %r12, 255; +; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r14, [%rd1]; +; SM30-NEXT: and.b32 %r17, %r14, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r15, %r17, %r3; +; SM30-NEXT: or.b32 %r16, %r17, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM30-NEXT: @%p1 bra $L__BB1_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM30-NEXT: mov.b32 %r17, %r6; ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acquire_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: ld.param.b8 %r7, [acquire_sys_i8_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i8( ; SM90: { @@ -269,84 +269,84 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<18>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r7, [release_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: and.b32 %r13, %r12, 255; +; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r14, [%rd1]; +; SM30-NEXT: and.b32 %r17, %r14, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r15, %r17, %r3; +; SM30-NEXT: or.b32 %r16, %r17, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM30-NEXT: @%p1 bra $L__BB2_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM30-NEXT: mov.b32 %r17, %r6; ; SM30-NEXT: @%p2 bra $L__BB2_1; ; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: release_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [release_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i8( ; SM90: { @@ -398,86 +398,86 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<18>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r7, [acq_rel_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: and.b32 %r13, %r12, 255; +; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r14, [%rd1]; +; SM30-NEXT: and.b32 %r17, %r14, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r15, %r17, %r3; +; SM30-NEXT: or.b32 %r16, %r17, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM30-NEXT: @%p1 bra $L__BB3_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM30-NEXT: mov.b32 %r17, %r6; ; SM30-NEXT: @%p2 bra $L__BB3_1; ; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acq_rel_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [acq_rel_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i8( ; SM90: { @@ -530,86 +530,86 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<21>; +; SM30-NEXT: .reg .b32 %r<18>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r7, [seq_cst_sys_i8_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 255; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: and.b32 %r15, %r14, 255; -; SM30-NEXT: shl.b32 %r3, %r15, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r16, [%rd1]; -; SM30-NEXT: and.b32 %r20, %r16, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 255; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: and.b32 %r13, %r12, 255; +; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r14, [%rd1]; +; SM30-NEXT: and.b32 %r17, %r14, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r17, %r20, %r3; -; SM30-NEXT: or.b32 %r18, %r20, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM30-NEXT: or.b32 %r15, %r17, %r3; +; SM30-NEXT: or.b32 %r16, %r17, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM30-NEXT: @%p1 bra $L__BB4_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM30-NEXT: mov.b32 %r20, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM30-NEXT: mov.b32 %r17, %r6; ; SM30-NEXT: @%p2 bra $L__BB4_1; ; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: seq_cst_sys_i8( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<21>; +; SM70-NEXT: .reg .b32 %r<18>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r7, [seq_cst_sys_i8_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 255; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: and.b32 %r15, %r14, 255; -; SM70-NEXT: shl.b32 %r3, %r15, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r16, [%rd1]; -; SM70-NEXT: and.b32 %r20, %r16, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 255; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: and.b32 %r13, %r12, 255; +; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r14, [%rd1]; +; SM70-NEXT: and.b32 %r17, %r14, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r17, %r20, %r3; -; SM70-NEXT: or.b32 %r18, %r20, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r18; +; SM70-NEXT: or.b32 %r15, %r17, %r3; +; SM70-NEXT: or.b32 %r16, %r17, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r20, %r8; -; SM70-NEXT: mov.b32 %r20, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; +; SM70-NEXT: mov.b32 %r17, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i8( ; SM90: { @@ -663,80 +663,80 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: ld.param.b16 %r7, [relaxed_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB5_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB5_1; ; SM30-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: relaxed_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [relaxed_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: relaxed_sys_i16( ; SM90: { @@ -786,82 +786,82 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: ld.param.b16 %r7, [acquire_sys_i16_param_1]; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB6_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB6_1; ; SM30-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acquire_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: ld.param.b16 %r7, [acquire_sys_i16_param_1]; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acquire_sys_i16( ; SM90: { @@ -912,82 +912,82 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r7, [release_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB7_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB7_1; ; SM30-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: release_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [release_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: release_sys_i16( ; SM90: { @@ -1038,84 +1038,84 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r7, [acq_rel_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB8_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB8_1; ; SM30-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: acq_rel_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [acq_rel_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: acq_rel_sys_i16( ; SM90: { @@ -1168,84 +1168,84 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<20>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: ; SM30-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; ; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r7, [seq_cst_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: cvt.u32.u64 %r10, %rd2; -; SM30-NEXT: and.b32 %r11, %r10, 3; -; SM30-NEXT: shl.b32 %r1, %r11, 3; -; SM30-NEXT: mov.b32 %r12, 65535; -; SM30-NEXT: shl.b32 %r13, %r12, %r1; -; SM30-NEXT: not.b32 %r2, %r13; -; SM30-NEXT: cvt.u32.u16 %r14, %rs1; -; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.b32 %r15, [%rd1]; -; SM30-NEXT: and.b32 %r19, %r15, %r2; +; SM30-NEXT: cvt.u32.u64 %r8, %rd2; +; SM30-NEXT: and.b32 %r9, %r8, 3; +; SM30-NEXT: shl.b32 %r1, %r9, 3; +; SM30-NEXT: mov.b32 %r10, 65535; +; SM30-NEXT: shl.b32 %r11, %r10, %r1; +; SM30-NEXT: not.b32 %r2, %r11; +; SM30-NEXT: cvt.u32.u16 %r12, %rs1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; +; SM30-NEXT: shl.b32 %r4, %r7, %r1; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r16, %r19, %r3; -; SM30-NEXT: or.b32 %r17, %r19, %r4; -; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16; -; SM30-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB9_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM30-NEXT: and.b32 %r8, %r7, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM30-NEXT: mov.b32 %r19, %r8; +; SM30-NEXT: and.b32 %r6, %r5, %r2; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB9_1; ; SM30-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; -; SM30-NEXT: st.param.b32 [func_retval0], %r14; +; SM30-NEXT: st.param.b32 [func_retval0], %r12; ; SM30-NEXT: ret; ; ; SM70-LABEL: seq_cst_sys_i16( ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<20>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; ; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r7, [seq_cst_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: cvt.u32.u64 %r10, %rd2; -; SM70-NEXT: and.b32 %r11, %r10, 3; -; SM70-NEXT: shl.b32 %r1, %r11, 3; -; SM70-NEXT: mov.b32 %r12, 65535; -; SM70-NEXT: shl.b32 %r13, %r12, %r1; -; SM70-NEXT: not.b32 %r2, %r13; -; SM70-NEXT: cvt.u32.u16 %r14, %rs1; -; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.b32 %r15, [%rd1]; -; SM70-NEXT: and.b32 %r19, %r15, %r2; +; SM70-NEXT: cvt.u32.u64 %r8, %rd2; +; SM70-NEXT: and.b32 %r9, %r8, 3; +; SM70-NEXT: shl.b32 %r1, %r9, 3; +; SM70-NEXT: mov.b32 %r10, 65535; +; SM70-NEXT: shl.b32 %r11, %r10, %r1; +; SM70-NEXT: not.b32 %r2, %r11; +; SM70-NEXT: cvt.u32.u16 %r12, %rs1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; +; SM70-NEXT: shl.b32 %r4, %r7, %r1; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r16, %r19, %r3; -; SM70-NEXT: or.b32 %r17, %r19, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16; -; SM70-NEXT: setp.eq.b32 %p1, %r7, %r17; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 -; SM70-NEXT: and.b32 %r8, %r7, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r19, %r8; -; SM70-NEXT: mov.b32 %r19, %r8; +; SM70-NEXT: and.b32 %r6, %r5, %r2; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: st.param.b32 [func_retval0], %r14; +; SM70-NEXT: st.param.b32 [func_retval0], %r12; ; SM70-NEXT: ret; ; SM90-LABEL: seq_cst_sys_i16( ; SM90: { diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index da303b7c38eb7..e6bce8991a71d 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -189,7 +189,7 @@ declare i32 @use(i32 %0, i32 %1) define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: test_mad_multi_use( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_mad_multi_use_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll index dd3e4ecddcd2e..6c80055ef4673 100644 --- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll +++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll @@ -9,7 +9,7 @@ declare i64 @callee_variadic(ptr %p, ...); define %struct.64 @test_return_type_mismatch(ptr %p) { ; CHECK-LABEL: test_return_type_mismatch( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<40>; +; CHECK-NEXT: .reg .b64 %rd<32>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_return_type_mismatch_param_0]; @@ -29,35 +29,35 @@ define %struct.64 @test_return_type_mismatch(ptr %p) { ; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1]; ; CHECK-NEXT: ld.param.b8 %rd10, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: shl.b64 %rd13, %rd9, 8; -; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10; -; CHECK-NEXT: shl.b64 %rd16, %rd8, 16; -; CHECK-NEXT: shl.b64 %rd18, %rd7, 24; -; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14; -; CHECK-NEXT: shl.b64 %rd23, %rd5, 8; -; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6; -; CHECK-NEXT: shl.b64 %rd26, %rd4, 16; -; CHECK-NEXT: shl.b64 %rd28, %rd3, 24; -; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26; -; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24; -; CHECK-NEXT: shl.b64 %rd31, %rd30, 32; -; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20; +; CHECK-NEXT: shl.b64 %rd11, %rd9, 8; +; CHECK-NEXT: or.b64 %rd12, %rd11, %rd10; +; CHECK-NEXT: shl.b64 %rd13, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd14, %rd7, 24; +; CHECK-NEXT: or.b64 %rd15, %rd14, %rd13; +; CHECK-NEXT: or.b64 %rd16, %rd15, %rd12; +; CHECK-NEXT: shl.b64 %rd17, %rd5, 8; +; CHECK-NEXT: or.b64 %rd18, %rd17, %rd6; +; CHECK-NEXT: shl.b64 %rd19, %rd4, 16; +; CHECK-NEXT: shl.b64 %rd20, %rd3, 24; +; CHECK-NEXT: or.b64 %rd21, %rd20, %rd19; +; CHECK-NEXT: or.b64 %rd22, %rd21, %rd18; +; CHECK-NEXT: shl.b64 %rd23, %rd22, 32; +; CHECK-NEXT: or.b64 %rd24, %rd23, %rd16; ; CHECK-NEXT: st.param.b8 [func_retval0], %rd10; -; CHECK-NEXT: shr.u64 %rd33, %rd32, 56; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33; -; CHECK-NEXT: shr.u64 %rd34, %rd32, 48; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34; -; CHECK-NEXT: shr.u64 %rd35, %rd32, 40; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35; -; CHECK-NEXT: shr.u64 %rd36, %rd32, 32; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36; -; CHECK-NEXT: shr.u64 %rd37, %rd32, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37; -; CHECK-NEXT: shr.u64 %rd38, %rd32, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38; -; CHECK-NEXT: shr.u64 %rd39, %rd32, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39; +; CHECK-NEXT: shr.u64 %rd25, %rd24, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd25; +; CHECK-NEXT: shr.u64 %rd26, %rd24, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd26; +; CHECK-NEXT: shr.u64 %rd27, %rd24, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd27; +; CHECK-NEXT: shr.u64 %rd28, %rd24, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd28; +; CHECK-NEXT: shr.u64 %rd29, %rd24, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd29; +; CHECK-NEXT: shr.u64 %rd30, %rd24, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd30; +; CHECK-NEXT: shr.u64 %rd31, %rd24, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd31; ; CHECK-NEXT: ret; %ret = call %struct.64 @callee(ptr %p) ret %struct.64 %ret @@ -66,7 +66,7 @@ define %struct.64 @test_return_type_mismatch(ptr %p) { define i64 @test_param_type_mismatch(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 1, 0 @@ -87,7 +87,7 @@ define i64 @test_param_type_mismatch(ptr %p) { define i64 @test_param_count_mismatch(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_param_count_mismatch_param_0]; @@ -111,7 +111,7 @@ define i64 @test_param_count_mismatch(ptr %p) { define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_return_type_mismatch_variadic( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<40>; +; CHECK-NEXT: .reg .b64 %rd<32>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_return_type_mismatch_variadic_param_0]; @@ -131,35 +131,35 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { ; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1]; ; CHECK-NEXT: ld.param.b8 %rd10, [retval0]; ; CHECK-NEXT: } // callseq 3 -; CHECK-NEXT: shl.b64 %rd13, %rd9, 8; -; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10; -; CHECK-NEXT: shl.b64 %rd16, %rd8, 16; -; CHECK-NEXT: shl.b64 %rd18, %rd7, 24; -; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16; -; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14; -; CHECK-NEXT: shl.b64 %rd23, %rd5, 8; -; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6; -; CHECK-NEXT: shl.b64 %rd26, %rd4, 16; -; CHECK-NEXT: shl.b64 %rd28, %rd3, 24; -; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26; -; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24; -; CHECK-NEXT: shl.b64 %rd31, %rd30, 32; -; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20; +; CHECK-NEXT: shl.b64 %rd11, %rd9, 8; +; CHECK-NEXT: or.b64 %rd12, %rd11, %rd10; +; CHECK-NEXT: shl.b64 %rd13, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd14, %rd7, 24; +; CHECK-NEXT: or.b64 %rd15, %rd14, %rd13; +; CHECK-NEXT: or.b64 %rd16, %rd15, %rd12; +; CHECK-NEXT: shl.b64 %rd17, %rd5, 8; +; CHECK-NEXT: or.b64 %rd18, %rd17, %rd6; +; CHECK-NEXT: shl.b64 %rd19, %rd4, 16; +; CHECK-NEXT: shl.b64 %rd20, %rd3, 24; +; CHECK-NEXT: or.b64 %rd21, %rd20, %rd19; +; CHECK-NEXT: or.b64 %rd22, %rd21, %rd18; +; CHECK-NEXT: shl.b64 %rd23, %rd22, 32; +; CHECK-NEXT: or.b64 %rd24, %rd23, %rd16; ; CHECK-NEXT: st.param.b8 [func_retval0], %rd10; -; CHECK-NEXT: shr.u64 %rd33, %rd32, 56; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33; -; CHECK-NEXT: shr.u64 %rd34, %rd32, 48; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34; -; CHECK-NEXT: shr.u64 %rd35, %rd32, 40; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35; -; CHECK-NEXT: shr.u64 %rd36, %rd32, 32; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36; -; CHECK-NEXT: shr.u64 %rd37, %rd32, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37; -; CHECK-NEXT: shr.u64 %rd38, %rd32, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38; -; CHECK-NEXT: shr.u64 %rd39, %rd32, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39; +; CHECK-NEXT: shr.u64 %rd25, %rd24, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd25; +; CHECK-NEXT: shr.u64 %rd26, %rd24, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd26; +; CHECK-NEXT: shr.u64 %rd27, %rd24, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd27; +; CHECK-NEXT: shr.u64 %rd28, %rd24, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd28; +; CHECK-NEXT: shr.u64 %rd29, %rd24, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd29; +; CHECK-NEXT: shr.u64 %rd30, %rd24, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd30; +; CHECK-NEXT: shr.u64 %rd31, %rd24, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd31; ; CHECK-NEXT: ret; %ret = call %struct.64 (ptr, ...) @callee_variadic(ptr %p) ret %struct.64 %ret @@ -168,7 +168,7 @@ define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { define i64 @test_param_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch_variadic( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_param_type_mismatch_variadic_param_0]; @@ -190,7 +190,7 @@ define i64 @test_param_type_mismatch_variadic(ptr %p) { define i64 @test_param_count_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch_variadic( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_param_count_mismatch_variadic_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll index 31ecce5a66b64..2e68208786d24 100644 --- a/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll +++ b/llvm/test/CodeGen/NVPTX/cse-mov-sym.ll @@ -13,33 +13,33 @@ define i32 @test_mov_sym(i32 %offset1, i32 %offset2, i1 %cond) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<4>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b8 %rs1, [test_mov_sym_param_2]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r4, [test_mov_sym_param_0]; -; CHECK-NEXT: cvt.s64.s32 %rd1, %r4; +; CHECK-NEXT: ld.param.b32 %r1, [test_mov_sym_param_0]; +; CHECK-NEXT: cvt.s64.s32 %rd1, %r1; ; CHECK-NEXT: mov.b64 %rd2, global_smem; ; CHECK-NEXT: add.s64 %rd3, %rd2, %rd1; -; CHECK-NEXT: ld.shared.b32 %r7, [%rd3]; +; CHECK-NEXT: ld.shared.b32 %r4, [%rd3]; ; CHECK-NEXT: not.pred %p2, %p1; ; CHECK-NEXT: @%p2 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %if1.preheader -; CHECK-NEXT: ld.param.b32 %r5, [test_mov_sym_param_1]; -; CHECK-NEXT: setp.ne.b32 %p3, %r4, %r5; +; CHECK-NEXT: ld.param.b32 %r2, [test_mov_sym_param_1]; +; CHECK-NEXT: setp.ne.b32 %p3, %r1, %r2; ; CHECK-NEXT: $L__BB0_2: // %if1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: @%p3 bra $L__BB0_2; ; CHECK-NEXT: // %bb.3: // %if2 -; CHECK-NEXT: cvt.s64.s32 %rd4, %r5; -; CHECK-NEXT: add.s64 %rd6, %rd2, %rd4; -; CHECK-NEXT: ld.shared.b32 %r6, [%rd6]; -; CHECK-NEXT: add.s32 %r7, %r7, %r6; +; CHECK-NEXT: cvt.s64.s32 %rd4, %r2; +; CHECK-NEXT: add.s64 %rd5, %rd2, %rd4; +; CHECK-NEXT: ld.shared.b32 %r3, [%rd5]; +; CHECK-NEXT: add.s32 %r4, %r4, %r3; ; CHECK-NEXT: $L__BB0_4: // %end -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; entry: %gep = getelementptr inbounds i8, ptr addrspace(3) @global_smem, i32 %offset1 diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index 2841e6751d029..1d70b9deb6089 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -166,23 +166,23 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-LABEL: test_distributed_shared_cluster_cmpxchg( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<11>; -; CHECK-NEXT: .reg .b32 %r<53>; +; CHECK-NEXT: .reg .b32 %r<43>; ; CHECK-NEXT: .reg .b64 %rd<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; -; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r27, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r28, [%rd2], 1, 0; -; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r29, [%rd2], 1, 0; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r15, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r16, [%rd2], 1, 0; +; CHECK-NEXT: atom.release.sys.shared::cluster.cas.b32 %r17, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r18, [%rd2], 1, 0; +; CHECK-NEXT: atom.acq_rel.sys.shared::cluster.cas.b32 %r19, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r30, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r20, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r31, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r21, [%rd2], 1, 0; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r32, [%rd2], 1, 0; +; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b32 %r22, [%rd2], 1, 0; ; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0; ; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0; ; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0; @@ -196,92 +196,92 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: fence.sc.sys; ; CHECK-NEXT: atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; -; CHECK-NEXT: cvt.u32.u64 %r33, %rd2; -; CHECK-NEXT: and.b32 %r34, %r33, 3; -; CHECK-NEXT: shl.b32 %r1, %r34, 3; -; CHECK-NEXT: mov.b32 %r35, 65535; -; CHECK-NEXT: shl.b32 %r36, %r35, %r1; -; CHECK-NEXT: not.b32 %r2, %r36; -; CHECK-NEXT: mov.b32 %r37, 1; -; CHECK-NEXT: shl.b32 %r3, %r37, %r1; -; CHECK-NEXT: ld.shared::cluster.b32 %r38, [%rd1]; -; CHECK-NEXT: and.b32 %r48, %r38, %r2; +; CHECK-NEXT: cvt.u32.u64 %r23, %rd2; +; CHECK-NEXT: and.b32 %r24, %r23, 3; +; CHECK-NEXT: shl.b32 %r1, %r24, 3; +; CHECK-NEXT: mov.b32 %r25, 65535; +; CHECK-NEXT: shl.b32 %r26, %r25, %r1; +; CHECK-NEXT: not.b32 %r2, %r26; +; CHECK-NEXT: mov.b32 %r27, 1; +; CHECK-NEXT: shl.b32 %r3, %r27, %r1; +; CHECK-NEXT: ld.shared::cluster.b32 %r28, [%rd1]; +; CHECK-NEXT: and.b32 %r38, %r28, %r2; ; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r39, %r48, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r39, %r48; -; CHECK-NEXT: setp.eq.b32 %p1, %r6, %r39; +; CHECK-NEXT: or.b32 %r29, %r38, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r4, [%rd1], %r29, %r38; +; CHECK-NEXT: setp.eq.b32 %p1, %r4, %r29; ; CHECK-NEXT: @%p1 bra $L__BB4_3; ; CHECK-NEXT: // %bb.2: // %partword.cmpxchg.failure32 ; CHECK-NEXT: // in Loop: Header=BB4_1 Depth=1 -; CHECK-NEXT: and.b32 %r7, %r6, %r2; -; CHECK-NEXT: setp.ne.b32 %p2, %r48, %r7; -; CHECK-NEXT: mov.b32 %r48, %r7; +; CHECK-NEXT: and.b32 %r5, %r4, %r2; +; CHECK-NEXT: setp.ne.b32 %p2, %r38, %r5; +; CHECK-NEXT: mov.b32 %r38, %r5; ; CHECK-NEXT: @%p2 bra $L__BB4_1; ; CHECK-NEXT: $L__BB4_3: // %partword.cmpxchg.end31 -; CHECK-NEXT: ld.shared::cluster.b32 %r40, [%rd1]; -; CHECK-NEXT: and.b32 %r49, %r40, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r30, [%rd1]; +; CHECK-NEXT: and.b32 %r39, %r30, %r2; ; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r41, %r49, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r41, %r49; -; CHECK-NEXT: setp.eq.b32 %p3, %r10, %r41; +; CHECK-NEXT: or.b32 %r31, %r39, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r31, %r39; +; CHECK-NEXT: setp.eq.b32 %p3, %r6, %r31; ; CHECK-NEXT: @%p3 bra $L__BB4_6; ; CHECK-NEXT: // %bb.5: // %partword.cmpxchg.failure22 ; CHECK-NEXT: // in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: and.b32 %r11, %r10, %r2; -; CHECK-NEXT: setp.ne.b32 %p4, %r49, %r11; -; CHECK-NEXT: mov.b32 %r49, %r11; +; CHECK-NEXT: and.b32 %r7, %r6, %r2; +; CHECK-NEXT: setp.ne.b32 %p4, %r39, %r7; +; CHECK-NEXT: mov.b32 %r39, %r7; ; CHECK-NEXT: @%p4 bra $L__BB4_4; ; CHECK-NEXT: $L__BB4_6: // %partword.cmpxchg.end21 ; CHECK-NEXT: fence.acq_rel.sys; ; CHECK-NEXT: fence.acq_rel.sys; -; CHECK-NEXT: ld.shared::cluster.b32 %r42, [%rd1]; -; CHECK-NEXT: and.b32 %r50, %r42, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r32, [%rd1]; +; CHECK-NEXT: and.b32 %r40, %r32, %r2; ; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r43, %r50, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd1], %r43, %r50; -; CHECK-NEXT: setp.eq.b32 %p5, %r14, %r43; +; CHECK-NEXT: or.b32 %r33, %r40, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r8, [%rd1], %r33, %r40; +; CHECK-NEXT: setp.eq.b32 %p5, %r8, %r33; ; CHECK-NEXT: @%p5 bra $L__BB4_9; ; CHECK-NEXT: // %bb.8: // %partword.cmpxchg.failure12 ; CHECK-NEXT: // in Loop: Header=BB4_7 Depth=1 -; CHECK-NEXT: and.b32 %r15, %r14, %r2; -; CHECK-NEXT: setp.ne.b32 %p6, %r50, %r15; -; CHECK-NEXT: mov.b32 %r50, %r15; +; CHECK-NEXT: and.b32 %r9, %r8, %r2; +; CHECK-NEXT: setp.ne.b32 %p6, %r40, %r9; +; CHECK-NEXT: mov.b32 %r40, %r9; ; CHECK-NEXT: @%p6 bra $L__BB4_7; ; CHECK-NEXT: $L__BB4_9: // %partword.cmpxchg.end11 ; CHECK-NEXT: fence.acq_rel.sys; -; CHECK-NEXT: ld.shared::cluster.b32 %r44, [%rd1]; -; CHECK-NEXT: and.b32 %r51, %r44, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r34, [%rd1]; +; CHECK-NEXT: and.b32 %r41, %r34, %r2; ; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r45, %r51, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r18, [%rd1], %r45, %r51; -; CHECK-NEXT: setp.eq.b32 %p7, %r18, %r45; +; CHECK-NEXT: or.b32 %r35, %r41, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r35, %r41; +; CHECK-NEXT: setp.eq.b32 %p7, %r10, %r35; ; CHECK-NEXT: @%p7 bra $L__BB4_12; ; CHECK-NEXT: // %bb.11: // %partword.cmpxchg.failure2 ; CHECK-NEXT: // in Loop: Header=BB4_10 Depth=1 -; CHECK-NEXT: and.b32 %r19, %r18, %r2; -; CHECK-NEXT: setp.ne.b32 %p8, %r51, %r19; -; CHECK-NEXT: mov.b32 %r51, %r19; +; CHECK-NEXT: and.b32 %r11, %r10, %r2; +; CHECK-NEXT: setp.ne.b32 %p8, %r41, %r11; +; CHECK-NEXT: mov.b32 %r41, %r11; ; CHECK-NEXT: @%p8 bra $L__BB4_10; ; CHECK-NEXT: $L__BB4_12: // %partword.cmpxchg.end1 ; CHECK-NEXT: fence.acq_rel.sys; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: ld.shared::cluster.b32 %r46, [%rd1]; -; CHECK-NEXT: and.b32 %r52, %r46, %r2; +; CHECK-NEXT: ld.shared::cluster.b32 %r36, [%rd1]; +; CHECK-NEXT: and.b32 %r42, %r36, %r2; ; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: or.b32 %r47, %r52, %r3; -; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r22, [%rd1], %r47, %r52; -; CHECK-NEXT: setp.eq.b32 %p9, %r22, %r47; +; CHECK-NEXT: or.b32 %r37, %r42, %r3; +; CHECK-NEXT: atom.relaxed.sys.shared::cluster.cas.b32 %r12, [%rd1], %r37, %r42; +; CHECK-NEXT: setp.eq.b32 %p9, %r12, %r37; ; CHECK-NEXT: @%p9 bra $L__BB4_15; ; CHECK-NEXT: // %bb.14: // %partword.cmpxchg.failure ; CHECK-NEXT: // in Loop: Header=BB4_13 Depth=1 -; CHECK-NEXT: and.b32 %r23, %r22, %r2; -; CHECK-NEXT: setp.ne.b32 %p10, %r52, %r23; -; CHECK-NEXT: mov.b32 %r52, %r23; +; CHECK-NEXT: and.b32 %r13, %r12, %r2; +; CHECK-NEXT: setp.ne.b32 %p10, %r42, %r13; +; CHECK-NEXT: mov.b32 %r42, %r13; ; CHECK-NEXT: @%p10 bra $L__BB4_13; ; CHECK-NEXT: $L__BB4_15: // %partword.cmpxchg.end ; CHECK-NEXT: fence.acq_rel.sys; diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 06fb8d2c7c54d..ce2f0f32a8748 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -12,7 +12,7 @@ define i32 @test_dynamic_stackalloc(i64 %n) { ; CHECK-32-LABEL: test_dynamic_stackalloc( ; CHECK-32: { -; CHECK-32-NEXT: .reg .b32 %r<8>; +; CHECK-32-NEXT: .reg .b32 %r<7>; ; CHECK-32-EMPTY: ; CHECK-32-NEXT: // %bb.0: ; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_param_0]; @@ -32,7 +32,7 @@ define i32 @test_dynamic_stackalloc(i64 %n) { ; ; CHECK-64-LABEL: test_dynamic_stackalloc( ; CHECK-64: { -; CHECK-64-NEXT: .reg .b32 %r<3>; +; CHECK-64-NEXT: .reg .b32 %r<2>; ; CHECK-64-NEXT: .reg .b64 %rd<6>; ; CHECK-64-EMPTY: ; CHECK-64-NEXT: // %bb.0: diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index d61a63ce24f89..6d67ed0b4d539 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -31,16 +31,16 @@ define i1 @test_v2i8_load(ptr %a) { ; CHECK-LABEL: test_v2i8_load( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b16 %rs<5>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_v2i8_load_param_0]; ; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd1]; -; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2; -; CHECK-NEXT: and.b16 %rs6, %rs5, 255; -; CHECK-NEXT: setp.eq.b16 %p1, %rs6, 0; +; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: and.b16 %rs4, %rs3, 255; +; CHECK-NEXT: setp.eq.b16 %p1, %rs4, 0; ; CHECK-NEXT: selp.b32 %r1, -1, 0, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 991311f9492b9..64c7792a61c8c 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -455,7 +455,7 @@ declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-LABEL: test_call( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1]; @@ -478,7 +478,7 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-LABEL: test_call_flipped( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; @@ -501,7 +501,7 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-LABEL: test_tailcall_flipped( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index 467459759c42c..bcaefa1699d8b 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -863,7 +863,7 @@ declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0 define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-LABEL: test_call( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1]; @@ -886,7 +886,7 @@ define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-LABEL: test_call_flipped( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1]; @@ -909,7 +909,7 @@ define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-LABEL: test_tailcall_flipped( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll index 87274aa759bea..ba4bb76d25113 100644 --- a/llvm/test/CodeGen/NVPTX/fma.ll +++ b/llvm/test/CodeGen/NVPTX/fma.ll @@ -25,7 +25,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) { define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { ; CHECK-LABEL: t2_f32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [t2_f32_param_0]; @@ -72,7 +72,7 @@ define ptx_device double @t1_f64(double %x, double %y, double %z) { define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { ; CHECK-LABEL: t2_f64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-NEXT: .reg .b64 %rd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [t2_f64_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll index 636e12bf98943..4f1454d3788a4 100644 --- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll +++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll @@ -7,7 +7,6 @@ define i32 @test_ld_param_const(ptr byval(i32) %a) { ; CHECK-LABEL: test_ld_param_const( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_const_param_0+4]; @@ -61,7 +60,6 @@ define void @test_ld_param_byval(ptr byval(i32) %a) { ; CHECK-LABEL: test_ld_param_byval( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 1, 0 @@ -98,8 +96,7 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<3>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [test_multi_block_param_1]; @@ -108,12 +105,12 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) { ; CHECK-NEXT: not.pred %p2, %p1; ; CHECK-NEXT: @%p2 bra $L__BB5_2; ; CHECK-NEXT: // %bb.1: // %if -; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+4]; +; CHECK-NEXT: ld.param.b32 %r1, [test_multi_block_param_0+4]; ; CHECK-NEXT: bra.uni $L__BB5_3; ; CHECK-NEXT: $L__BB5_2: // %else -; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r1, [test_multi_block_param_0+8]; ; CHECK-NEXT: $L__BB5_3: // %end -; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; br i1 %p, label %if, label %else if: diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll index df32e2a4cfad2..264f38021e1de 100644 --- a/llvm/test/CodeGen/NVPTX/i1-select.ll +++ b/llvm/test/CodeGen/NVPTX/i1-select.ll @@ -66,22 +66,22 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals ; CHECK-LABEL: test_select_i1_basic( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<4>; -; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_param_0]; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_param_1]; -; CHECK-NEXT: or.b32 %r4, %r1, %r2; +; CHECK-NEXT: or.b32 %r3, %r1, %r2; ; CHECK-NEXT: setp.ne.b32 %p1, %r1, 0; -; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_2]; -; CHECK-NEXT: setp.eq.b32 %p2, %r5, 0; -; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_param_3]; -; CHECK-NEXT: setp.eq.b32 %p3, %r4, 0; -; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_basic_param_4]; -; CHECK-NEXT: selp.b32 %r9, %r7, %r8, %p2; -; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1; -; CHECK-NEXT: selp.b32 %r11, %r7, %r10, %p3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r11; +; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_param_2]; +; CHECK-NEXT: setp.eq.b32 %p2, %r4, 0; +; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_3]; +; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0; +; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_param_4]; +; CHECK-NEXT: selp.b32 %r7, %r5, %r6, %p2; +; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1; +; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r9; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 %b2 = icmp eq i32 %v2, 0 @@ -94,7 +94,7 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) { ; CHECK-LABEL: test_select_i1_basic_folding( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<13>; +; CHECK-NEXT: .reg .pred %p<11>; ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -106,14 +106,14 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i ; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2]; ; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0; ; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3]; -; CHECK-NEXT: xor.pred %p6, %p1, %p3; +; CHECK-NEXT: xor.pred %p5, %p1, %p3; ; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4]; -; CHECK-NEXT: and.pred %p8, %p6, %p4; -; CHECK-NEXT: and.pred %p9, %p2, %p4; -; CHECK-NEXT: and.pred %p10, %p3, %p8; -; CHECK-NEXT: or.pred %p11, %p10, %p9; -; CHECK-NEXT: xor.pred %p12, %p11, %p3; -; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12; +; CHECK-NEXT: and.pred %p6, %p5, %p4; +; CHECK-NEXT: and.pred %p7, %p2, %p4; +; CHECK-NEXT: and.pred %p8, %p3, %p6; +; CHECK-NEXT: or.pred %p9, %p8, %p7; +; CHECK-NEXT: xor.pred %p10, %p9, %p3; +; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p10; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 diff --git a/llvm/test/CodeGen/NVPTX/i128-array.ll b/llvm/test/CodeGen/NVPTX/i128-array.ll index 3bb9c6aec51ac..7bd8a0021f1b5 100644 --- a/llvm/test/CodeGen/NVPTX/i128-array.ll +++ b/llvm/test/CodeGen/NVPTX/i128-array.ll @@ -27,13 +27,13 @@ define [2 x i128] @foo(i64 %a, i32 %b) { define [2 x i128] @foo2(ptr byval([2 x i128]) %a) { ; CHECK-LABEL: foo2( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [foo2_param_0+16]; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd6}; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0+16]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; ; CHECK-NEXT: ret; %ptr0 = getelementptr [2 x i128], ptr %a, i64 0, i32 0 %1 = load i128, i128* %ptr0 diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 44d85589b5056..cdbbabe3e3b05 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -7,137 +7,137 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<20>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<127>; +; CHECK-NEXT: .reg .b64 %rd<79>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [srem_i128_param_1]; -; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; -; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; -; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; -; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2; -; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6; -; CHECK-NEXT: setp.eq.b64 %p3, %rd55, 0; -; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; -; CHECK-NEXT: setp.eq.b64 %p4, %rd56, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [srem_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [srem_i128_param_1]; +; CHECK-NEXT: shr.s64 %rd1, %rd9, 63; +; CHECK-NEXT: sub.cc.s64 %rd12, 0, %rd8; +; CHECK-NEXT: subc.cc.s64 %rd13, 0, %rd9; +; CHECK-NEXT: setp.lt.s64 %p1, %rd9, 0; +; CHECK-NEXT: selp.b64 %rd3, %rd13, %rd9, %p1; +; CHECK-NEXT: selp.b64 %rd2, %rd12, %rd8, %p1; +; CHECK-NEXT: sub.cc.s64 %rd14, 0, %rd10; +; CHECK-NEXT: subc.cc.s64 %rd15, 0, %rd11; +; CHECK-NEXT: setp.lt.s64 %p2, %rd11, 0; +; CHECK-NEXT: selp.b64 %rd5, %rd15, %rd11, %p2; +; CHECK-NEXT: selp.b64 %rd4, %rd14, %rd10, %p2; +; CHECK-NEXT: or.b64 %rd16, %rd4, %rd5; +; CHECK-NEXT: setp.eq.b64 %p3, %rd16, 0; +; CHECK-NEXT: or.b64 %rd17, %rd2, %rd3; +; CHECK-NEXT: setp.eq.b64 %p4, %rd17, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; -; CHECK-NEXT: setp.ne.b64 %p6, %rd6, 0; -; CHECK-NEXT: clz.b64 %r1, %rd6; -; CHECK-NEXT: cvt.u64.u32 %rd57, %r1; -; CHECK-NEXT: clz.b64 %r2, %rd5; -; CHECK-NEXT: cvt.u64.u32 %rd58, %r2; -; CHECK-NEXT: add.s64 %rd59, %rd58, 64; -; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6; -; CHECK-NEXT: setp.ne.b64 %p7, %rd4, 0; -; CHECK-NEXT: clz.b64 %r3, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd61, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd62, %r4; -; CHECK-NEXT: add.s64 %rd63, %rd62, 64; -; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7; -; CHECK-NEXT: mov.b64 %rd117, 0; -; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64; -; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; -; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127; -; CHECK-NEXT: setp.eq.b64 %p9, %rd67, 0; +; CHECK-NEXT: setp.ne.b64 %p6, %rd5, 0; +; CHECK-NEXT: clz.b64 %r1, %rd5; +; CHECK-NEXT: cvt.u64.u32 %rd18, %r1; +; CHECK-NEXT: clz.b64 %r2, %rd4; +; CHECK-NEXT: cvt.u64.u32 %rd19, %r2; +; CHECK-NEXT: add.s64 %rd20, %rd19, 64; +; CHECK-NEXT: selp.b64 %rd21, %rd18, %rd20, %p6; +; CHECK-NEXT: setp.ne.b64 %p7, %rd3, 0; +; CHECK-NEXT: clz.b64 %r3, %rd3; +; CHECK-NEXT: cvt.u64.u32 %rd22, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd2; +; CHECK-NEXT: cvt.u64.u32 %rd23, %r4; +; CHECK-NEXT: add.s64 %rd24, %rd23, 64; +; CHECK-NEXT: selp.b64 %rd25, %rd22, %rd24, %p7; +; CHECK-NEXT: mov.b64 %rd70, 0; +; CHECK-NEXT: sub.cc.s64 %rd26, %rd21, %rd25; +; CHECK-NEXT: subc.cc.s64 %rd27, %rd70, 0; +; CHECK-NEXT: setp.gt.u64 %p8, %rd26, 127; +; CHECK-NEXT: setp.eq.b64 %p9, %rd27, 0; ; CHECK-NEXT: and.pred %p10, %p9, %p8; -; CHECK-NEXT: setp.ne.b64 %p11, %rd67, 0; +; CHECK-NEXT: setp.ne.b64 %p11, %rd27, 0; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: or.pred %p13, %p5, %p12; -; CHECK-NEXT: xor.b64 %rd68, %rd66, 127; -; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67; -; CHECK-NEXT: setp.eq.b64 %p14, %rd69, 0; -; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13; -; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13; +; CHECK-NEXT: xor.b64 %rd28, %rd26, 127; +; CHECK-NEXT: or.b64 %rd29, %rd28, %rd27; +; CHECK-NEXT: setp.eq.b64 %p14, %rd29, 0; +; CHECK-NEXT: selp.b64 %rd78, 0, %rd3, %p13; +; CHECK-NEXT: selp.b64 %rd77, 0, %rd2, %p13; ; CHECK-NEXT: or.pred %p15, %p13, %p14; ; CHECK-NEXT: @%p15 bra $L__BB0_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0; -; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120; -; CHECK-NEXT: setp.eq.b64 %p16, %rd72, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd66; +; CHECK-NEXT: add.cc.s64 %rd71, %rd26, 1; +; CHECK-NEXT: addc.cc.s64 %rd72, %rd27, 0; +; CHECK-NEXT: or.b64 %rd30, %rd71, %rd72; +; CHECK-NEXT: setp.eq.b64 %p16, %rd30, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd26; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6; +; CHECK-NEXT: shl.b64 %rd31, %rd3, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7; -; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; +; CHECK-NEXT: shr.u64 %rd32, %rd2, %r7; +; CHECK-NEXT: or.b64 %rd33, %rd31, %rd32; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8; +; CHECK-NEXT: shl.b64 %rd34, %rd2, %r8; ; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17; -; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6; -; CHECK-NEXT: mov.b64 %rd114, %rd117; +; CHECK-NEXT: selp.b64 %rd76, %rd34, %rd33, %p17; +; CHECK-NEXT: shl.b64 %rd75, %rd2, %r6; +; CHECK-NEXT: mov.b64 %rd69, %rd70; ; CHECK-NEXT: @%p16 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd119; -; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd71; +; CHECK-NEXT: shr.u64 %rd35, %rd2, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10; -; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; +; CHECK-NEXT: shl.b64 %rd36, %rd3, %r10; +; CHECK-NEXT: or.b64 %rd37, %rd35, %rd36; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11; +; CHECK-NEXT: shr.u64 %rd38, %rd3, %r11; ; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; -; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18; -; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9; -; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; -; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.b64 %rd114, 0; -; CHECK-NEXT: mov.b64 %rd117, %rd114; +; CHECK-NEXT: selp.b64 %rd73, %rd38, %rd37, %p18; +; CHECK-NEXT: shr.u64 %rd74, %rd3, %r9; +; CHECK-NEXT: add.cc.s64 %rd6, %rd4, -1; +; CHECK-NEXT: addc.cc.s64 %rd7, %rd5, -1; +; CHECK-NEXT: mov.b64 %rd69, 0; +; CHECK-NEXT: mov.b64 %rd70, %rd69; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd83, %rd121, 63; -; CHECK-NEXT: shl.b64 %rd84, %rd122, 1; -; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; -; CHECK-NEXT: shl.b64 %rd86, %rd121, 1; -; CHECK-NEXT: shr.u64 %rd87, %rd124, 63; -; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; -; CHECK-NEXT: shr.u64 %rd89, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd124, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd123, 1; -; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92; -; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91; -; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; -; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; -; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; -; CHECK-NEXT: and.b64 %rd117, %rd95, 1; -; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5; -; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6; -; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97; -; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1; -; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120; -; CHECK-NEXT: setp.eq.b64 %p19, %rd98, 0; +; CHECK-NEXT: shr.u64 %rd39, %rd73, 63; +; CHECK-NEXT: shl.b64 %rd40, %rd74, 1; +; CHECK-NEXT: or.b64 %rd41, %rd40, %rd39; +; CHECK-NEXT: shl.b64 %rd42, %rd73, 1; +; CHECK-NEXT: shr.u64 %rd43, %rd76, 63; +; CHECK-NEXT: or.b64 %rd44, %rd42, %rd43; +; CHECK-NEXT: shr.u64 %rd45, %rd75, 63; +; CHECK-NEXT: shl.b64 %rd46, %rd76, 1; +; CHECK-NEXT: or.b64 %rd47, %rd46, %rd45; +; CHECK-NEXT: shl.b64 %rd48, %rd75, 1; +; CHECK-NEXT: or.b64 %rd75, %rd70, %rd48; +; CHECK-NEXT: or.b64 %rd76, %rd69, %rd47; +; CHECK-NEXT: sub.cc.s64 %rd49, %rd6, %rd44; +; CHECK-NEXT: subc.cc.s64 %rd50, %rd7, %rd41; +; CHECK-NEXT: shr.s64 %rd51, %rd50, 63; +; CHECK-NEXT: and.b64 %rd70, %rd51, 1; +; CHECK-NEXT: and.b64 %rd52, %rd51, %rd4; +; CHECK-NEXT: and.b64 %rd53, %rd51, %rd5; +; CHECK-NEXT: sub.cc.s64 %rd73, %rd44, %rd52; +; CHECK-NEXT: subc.cc.s64 %rd74, %rd41, %rd53; +; CHECK-NEXT: add.cc.s64 %rd71, %rd71, -1; +; CHECK-NEXT: addc.cc.s64 %rd72, %rd72, -1; +; CHECK-NEXT: or.b64 %rd54, %rd71, %rd72; +; CHECK-NEXT: setp.eq.b64 %p19, %rd54, 0; ; CHECK-NEXT: @%p19 bra $L__BB0_4; ; CHECK-NEXT: bra.uni $L__BB0_2; ; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd99, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd100, %rd124, 1; -; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; -; CHECK-NEXT: shl.b64 %rd102, %rd123, 1; -; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102; -; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101; +; CHECK-NEXT: shr.u64 %rd55, %rd75, 63; +; CHECK-NEXT: shl.b64 %rd56, %rd76, 1; +; CHECK-NEXT: or.b64 %rd57, %rd56, %rd55; +; CHECK-NEXT: shl.b64 %rd58, %rd75, 1; +; CHECK-NEXT: or.b64 %rd77, %rd70, %rd58; +; CHECK-NEXT: or.b64 %rd78, %rd69, %rd57; ; CHECK-NEXT: $L__BB0_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125; -; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103; -; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104; -; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105; -; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2; -; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2; -; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2; -; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112}; +; CHECK-NEXT: mul.hi.u64 %rd59, %rd4, %rd77; +; CHECK-NEXT: mad.lo.s64 %rd60, %rd4, %rd78, %rd59; +; CHECK-NEXT: mad.lo.s64 %rd61, %rd5, %rd77, %rd60; +; CHECK-NEXT: mul.lo.s64 %rd62, %rd4, %rd77; +; CHECK-NEXT: sub.cc.s64 %rd63, %rd2, %rd62; +; CHECK-NEXT: subc.cc.s64 %rd64, %rd3, %rd61; +; CHECK-NEXT: xor.b64 %rd65, %rd63, %rd1; +; CHECK-NEXT: xor.b64 %rd66, %rd64, %rd1; +; CHECK-NEXT: sub.cc.s64 %rd67, %rd65, %rd1; +; CHECK-NEXT: subc.cc.s64 %rd68, %rd66, %rd1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd67, %rd68}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -148,122 +148,122 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<113>; +; CHECK-NEXT: .reg .b64 %rd<66>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [urem_i128_param_1]; -; CHECK-NEXT: or.b64 %rd45, %rd3, %rd4; -; CHECK-NEXT: setp.eq.b64 %p1, %rd45, 0; -; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42; -; CHECK-NEXT: setp.eq.b64 %p2, %rd46, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [urem_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [urem_i128_param_1]; +; CHECK-NEXT: or.b64 %rd7, %rd1, %rd2; +; CHECK-NEXT: setp.eq.b64 %p1, %rd7, 0; +; CHECK-NEXT: or.b64 %rd8, %rd5, %rd6; +; CHECK-NEXT: setp.eq.b64 %p2, %rd8, 0; ; CHECK-NEXT: or.pred %p3, %p1, %p2; -; CHECK-NEXT: setp.ne.b64 %p4, %rd4, 0; -; CHECK-NEXT: clz.b64 %r1, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd47, %r1; -; CHECK-NEXT: clz.b64 %r2, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd48, %r2; -; CHECK-NEXT: add.s64 %rd49, %rd48, 64; -; CHECK-NEXT: selp.b64 %rd50, %rd47, %rd49, %p4; -; CHECK-NEXT: setp.ne.b64 %p5, %rd42, 0; -; CHECK-NEXT: clz.b64 %r3, %rd42; -; CHECK-NEXT: cvt.u64.u32 %rd51, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd41; -; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; -; CHECK-NEXT: add.s64 %rd53, %rd52, 64; -; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd103, 0; -; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p4, %rd2, 0; +; CHECK-NEXT: clz.b64 %r1, %rd2; +; CHECK-NEXT: cvt.u64.u32 %rd9, %r1; +; CHECK-NEXT: clz.b64 %r2, %rd1; +; CHECK-NEXT: cvt.u64.u32 %rd10, %r2; +; CHECK-NEXT: add.s64 %rd11, %rd10, 64; +; CHECK-NEXT: selp.b64 %rd12, %rd9, %rd11, %p4; +; CHECK-NEXT: setp.ne.b64 %p5, %rd6, 0; +; CHECK-NEXT: clz.b64 %r3, %rd6; +; CHECK-NEXT: cvt.u64.u32 %rd13, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd5; +; CHECK-NEXT: cvt.u64.u32 %rd14, %r4; +; CHECK-NEXT: add.s64 %rd15, %rd14, 64; +; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5; +; CHECK-NEXT: mov.b64 %rd57, 0; +; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16; +; CHECK-NEXT: subc.cc.s64 %rd18, %rd57, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127; +; CHECK-NEXT: setp.eq.b64 %p7, %rd18, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; -; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; -; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd19, %rd17, 127; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18; +; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0; +; CHECK-NEXT: selp.b64 %rd65, 0, %rd6, %p11; +; CHECK-NEXT: selp.b64 %rd64, 0, %rd5, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106; -; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; +; CHECK-NEXT: add.cc.s64 %rd58, %rd17, 1; +; CHECK-NEXT: addc.cc.s64 %rd59, %rd18, 0; +; CHECK-NEXT: or.b64 %rd21, %rd58, %rd59; +; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd22, %rd6, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; +; CHECK-NEXT: shr.u64 %rd23, %rd5, %r7; +; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd25, %rd5, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15; -; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd100, %rd103; +; CHECK-NEXT: selp.b64 %rd63, %rd25, %rd24, %p15; +; CHECK-NEXT: shl.b64 %rd62, %rd5, %r6; +; CHECK-NEXT: mov.b64 %rd56, %rd57; ; CHECK-NEXT: @%p14 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd105; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd58; +; CHECK-NEXT: shr.u64 %rd26, %rd5, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; +; CHECK-NEXT: shl.b64 %rd27, %rd6, %r10; +; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; +; CHECK-NEXT: shr.u64 %rd29, %rd6, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16; -; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9; -; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; -; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd100, 0; -; CHECK-NEXT: mov.b64 %rd103, %rd100; +; CHECK-NEXT: selp.b64 %rd60, %rd29, %rd28, %p16; +; CHECK-NEXT: shr.u64 %rd61, %rd6, %r9; +; CHECK-NEXT: add.cc.s64 %rd3, %rd1, -1; +; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, -1; +; CHECK-NEXT: mov.b64 %rd56, 0; +; CHECK-NEXT: mov.b64 %rd57, %rd56; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd73, %rd107, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd108, 1; -; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd107, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd110, 63; -; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd110, 1; -; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 %rd82, %rd109, 1; -; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82; -; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81; -; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; -; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; -; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd103, %rd85, 1; -; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3; -; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1; -; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106; -; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0; +; CHECK-NEXT: shr.u64 %rd30, %rd60, 63; +; CHECK-NEXT: shl.b64 %rd31, %rd61, 1; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd30; +; CHECK-NEXT: shl.b64 %rd33, %rd60, 1; +; CHECK-NEXT: shr.u64 %rd34, %rd63, 63; +; CHECK-NEXT: or.b64 %rd35, %rd33, %rd34; +; CHECK-NEXT: shr.u64 %rd36, %rd62, 63; +; CHECK-NEXT: shl.b64 %rd37, %rd63, 1; +; CHECK-NEXT: or.b64 %rd38, %rd37, %rd36; +; CHECK-NEXT: shl.b64 %rd39, %rd62, 1; +; CHECK-NEXT: or.b64 %rd62, %rd57, %rd39; +; CHECK-NEXT: or.b64 %rd63, %rd56, %rd38; +; CHECK-NEXT: sub.cc.s64 %rd40, %rd3, %rd35; +; CHECK-NEXT: subc.cc.s64 %rd41, %rd4, %rd32; +; CHECK-NEXT: shr.s64 %rd42, %rd41, 63; +; CHECK-NEXT: and.b64 %rd57, %rd42, 1; +; CHECK-NEXT: and.b64 %rd43, %rd42, %rd1; +; CHECK-NEXT: and.b64 %rd44, %rd42, %rd2; +; CHECK-NEXT: sub.cc.s64 %rd60, %rd35, %rd43; +; CHECK-NEXT: subc.cc.s64 %rd61, %rd32, %rd44; +; CHECK-NEXT: add.cc.s64 %rd58, %rd58, -1; +; CHECK-NEXT: addc.cc.s64 %rd59, %rd59, -1; +; CHECK-NEXT: or.b64 %rd45, %rd58, %rd59; +; CHECK-NEXT: setp.eq.b64 %p17, %rd45, 0; ; CHECK-NEXT: @%p17 bra $L__BB1_4; ; CHECK-NEXT: bra.uni $L__BB1_2; ; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd110, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd109, 1; -; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92; -; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91; +; CHECK-NEXT: shr.u64 %rd46, %rd62, 63; +; CHECK-NEXT: shl.b64 %rd47, %rd63, 1; +; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46; +; CHECK-NEXT: shl.b64 %rd49, %rd62, 1; +; CHECK-NEXT: or.b64 %rd64, %rd57, %rd49; +; CHECK-NEXT: or.b64 %rd65, %rd56, %rd48; ; CHECK-NEXT: $L__BB1_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111; -; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93; -; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94; -; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111; -; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98}; +; CHECK-NEXT: mul.hi.u64 %rd50, %rd1, %rd64; +; CHECK-NEXT: mad.lo.s64 %rd51, %rd1, %rd65, %rd50; +; CHECK-NEXT: mad.lo.s64 %rd52, %rd2, %rd64, %rd51; +; CHECK-NEXT: mul.lo.s64 %rd53, %rd1, %rd64; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd5, %rd53; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd6, %rd52; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd54, %rd55}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div @@ -308,132 +308,132 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<20>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<122>; +; CHECK-NEXT: .reg .b64 %rd<74>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [sdiv_i128_param_1]; -; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; -; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; -; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; -; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1; -; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1; -; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49; -; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50; -; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0; -; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2; -; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2; -; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46; -; CHECK-NEXT: shr.s64 %rd5, %rd55, 63; -; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4; -; CHECK-NEXT: setp.eq.b64 %p3, %rd56, 0; -; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2; -; CHECK-NEXT: setp.eq.b64 %p4, %rd57, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [sdiv_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [sdiv_i128_param_1]; +; CHECK-NEXT: sub.cc.s64 %rd12, 0, %rd8; +; CHECK-NEXT: subc.cc.s64 %rd13, 0, %rd9; +; CHECK-NEXT: setp.lt.s64 %p1, %rd9, 0; +; CHECK-NEXT: selp.b64 %rd2, %rd13, %rd9, %p1; +; CHECK-NEXT: selp.b64 %rd1, %rd12, %rd8, %p1; +; CHECK-NEXT: sub.cc.s64 %rd14, 0, %rd10; +; CHECK-NEXT: subc.cc.s64 %rd15, 0, %rd11; +; CHECK-NEXT: setp.lt.s64 %p2, %rd11, 0; +; CHECK-NEXT: selp.b64 %rd4, %rd15, %rd11, %p2; +; CHECK-NEXT: selp.b64 %rd3, %rd14, %rd10, %p2; +; CHECK-NEXT: xor.b64 %rd16, %rd11, %rd9; +; CHECK-NEXT: shr.s64 %rd5, %rd16, 63; +; CHECK-NEXT: or.b64 %rd17, %rd3, %rd4; +; CHECK-NEXT: setp.eq.b64 %p3, %rd17, 0; +; CHECK-NEXT: or.b64 %rd18, %rd1, %rd2; +; CHECK-NEXT: setp.eq.b64 %p4, %rd18, 0; ; CHECK-NEXT: or.pred %p5, %p3, %p4; ; CHECK-NEXT: setp.ne.b64 %p6, %rd4, 0; ; CHECK-NEXT: clz.b64 %r1, %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd58, %r1; +; CHECK-NEXT: cvt.u64.u32 %rd19, %r1; ; CHECK-NEXT: clz.b64 %r2, %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd59, %r2; -; CHECK-NEXT: add.s64 %rd60, %rd59, 64; -; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6; +; CHECK-NEXT: cvt.u64.u32 %rd20, %r2; +; CHECK-NEXT: add.s64 %rd21, %rd20, 64; +; CHECK-NEXT: selp.b64 %rd22, %rd19, %rd21, %p6; ; CHECK-NEXT: setp.ne.b64 %p7, %rd2, 0; ; CHECK-NEXT: clz.b64 %r3, %rd2; -; CHECK-NEXT: cvt.u64.u32 %rd62, %r3; +; CHECK-NEXT: cvt.u64.u32 %rd23, %r3; ; CHECK-NEXT: clz.b64 %r4, %rd1; -; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; -; CHECK-NEXT: add.s64 %rd64, %rd63, 64; -; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: mov.b64 %rd112, 0; -; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65; -; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; -; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127; -; CHECK-NEXT: setp.eq.b64 %p9, %rd68, 0; +; CHECK-NEXT: cvt.u64.u32 %rd24, %r4; +; CHECK-NEXT: add.s64 %rd25, %rd24, 64; +; CHECK-NEXT: selp.b64 %rd26, %rd23, %rd25, %p7; +; CHECK-NEXT: mov.b64 %rd65, 0; +; CHECK-NEXT: sub.cc.s64 %rd27, %rd22, %rd26; +; CHECK-NEXT: subc.cc.s64 %rd28, %rd65, 0; +; CHECK-NEXT: setp.gt.u64 %p8, %rd27, 127; +; CHECK-NEXT: setp.eq.b64 %p9, %rd28, 0; ; CHECK-NEXT: and.pred %p10, %p9, %p8; -; CHECK-NEXT: setp.ne.b64 %p11, %rd68, 0; +; CHECK-NEXT: setp.ne.b64 %p11, %rd28, 0; ; CHECK-NEXT: or.pred %p12, %p10, %p11; ; CHECK-NEXT: or.pred %p13, %p5, %p12; -; CHECK-NEXT: xor.b64 %rd69, %rd67, 127; -; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68; -; CHECK-NEXT: setp.eq.b64 %p14, %rd70, 0; -; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13; -; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13; +; CHECK-NEXT: xor.b64 %rd29, %rd27, 127; +; CHECK-NEXT: or.b64 %rd30, %rd29, %rd28; +; CHECK-NEXT: setp.eq.b64 %p14, %rd30, 0; +; CHECK-NEXT: selp.b64 %rd73, 0, %rd2, %p13; +; CHECK-NEXT: selp.b64 %rd72, 0, %rd1, %p13; ; CHECK-NEXT: or.pred %p15, %p13, %p14; ; CHECK-NEXT: @%p15 bra $L__BB4_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1; -; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0; -; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115; -; CHECK-NEXT: setp.eq.b64 %p16, %rd73, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd67; +; CHECK-NEXT: add.cc.s64 %rd66, %rd27, 1; +; CHECK-NEXT: addc.cc.s64 %rd67, %rd28, 0; +; CHECK-NEXT: or.b64 %rd31, %rd66, %rd67; +; CHECK-NEXT: setp.eq.b64 %p16, %rd31, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd27; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6; +; CHECK-NEXT: shl.b64 %rd32, %rd2, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7; -; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; +; CHECK-NEXT: shr.u64 %rd33, %rd1, %r7; +; CHECK-NEXT: or.b64 %rd34, %rd32, %rd33; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8; +; CHECK-NEXT: shl.b64 %rd35, %rd1, %r8; ; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17; -; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6; -; CHECK-NEXT: mov.b64 %rd109, %rd112; +; CHECK-NEXT: selp.b64 %rd71, %rd35, %rd34, %p17; +; CHECK-NEXT: shl.b64 %rd70, %rd1, %r6; +; CHECK-NEXT: mov.b64 %rd64, %rd65; ; CHECK-NEXT: @%p16 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd114; -; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd66; +; CHECK-NEXT: shr.u64 %rd36, %rd1, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10; -; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81; +; CHECK-NEXT: shl.b64 %rd37, %rd2, %r10; +; CHECK-NEXT: or.b64 %rd38, %rd36, %rd37; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11; +; CHECK-NEXT: shr.u64 %rd39, %rd2, %r11; ; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; -; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18; -; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9; -; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; -; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd109, 0; -; CHECK-NEXT: mov.b64 %rd112, %rd109; +; CHECK-NEXT: selp.b64 %rd68, %rd39, %rd38, %p18; +; CHECK-NEXT: shr.u64 %rd69, %rd2, %r9; +; CHECK-NEXT: add.cc.s64 %rd6, %rd3, -1; +; CHECK-NEXT: addc.cc.s64 %rd7, %rd4, -1; +; CHECK-NEXT: mov.b64 %rd64, 0; +; CHECK-NEXT: mov.b64 %rd65, %rd64; ; CHECK-NEXT: $L__BB4_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd84, %rd116, 63; -; CHECK-NEXT: shl.b64 %rd85, %rd117, 1; -; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84; -; CHECK-NEXT: shl.b64 %rd87, %rd116, 1; -; CHECK-NEXT: shr.u64 %rd88, %rd119, 63; -; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88; -; CHECK-NEXT: shr.u64 %rd90, %rd118, 63; -; CHECK-NEXT: shl.b64 %rd91, %rd119, 1; -; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90; -; CHECK-NEXT: shl.b64 %rd93, %rd118, 1; -; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93; -; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92; -; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89; -; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86; -; CHECK-NEXT: shr.s64 %rd96, %rd95, 63; -; CHECK-NEXT: and.b64 %rd112, %rd96, 1; -; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3; -; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97; -; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98; -; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1; -; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1; -; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115; -; CHECK-NEXT: setp.eq.b64 %p19, %rd99, 0; +; CHECK-NEXT: shr.u64 %rd40, %rd68, 63; +; CHECK-NEXT: shl.b64 %rd41, %rd69, 1; +; CHECK-NEXT: or.b64 %rd42, %rd41, %rd40; +; CHECK-NEXT: shl.b64 %rd43, %rd68, 1; +; CHECK-NEXT: shr.u64 %rd44, %rd71, 63; +; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44; +; CHECK-NEXT: shr.u64 %rd46, %rd70, 63; +; CHECK-NEXT: shl.b64 %rd47, %rd71, 1; +; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46; +; CHECK-NEXT: shl.b64 %rd49, %rd70, 1; +; CHECK-NEXT: or.b64 %rd70, %rd65, %rd49; +; CHECK-NEXT: or.b64 %rd71, %rd64, %rd48; +; CHECK-NEXT: sub.cc.s64 %rd50, %rd6, %rd45; +; CHECK-NEXT: subc.cc.s64 %rd51, %rd7, %rd42; +; CHECK-NEXT: shr.s64 %rd52, %rd51, 63; +; CHECK-NEXT: and.b64 %rd65, %rd52, 1; +; CHECK-NEXT: and.b64 %rd53, %rd52, %rd3; +; CHECK-NEXT: and.b64 %rd54, %rd52, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd68, %rd45, %rd53; +; CHECK-NEXT: subc.cc.s64 %rd69, %rd42, %rd54; +; CHECK-NEXT: add.cc.s64 %rd66, %rd66, -1; +; CHECK-NEXT: addc.cc.s64 %rd67, %rd67, -1; +; CHECK-NEXT: or.b64 %rd55, %rd66, %rd67; +; CHECK-NEXT: setp.eq.b64 %p19, %rd55, 0; ; CHECK-NEXT: @%p19 bra $L__BB4_4; ; CHECK-NEXT: bra.uni $L__BB4_2; ; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd100, %rd118, 63; -; CHECK-NEXT: shl.b64 %rd101, %rd119, 1; -; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100; -; CHECK-NEXT: shl.b64 %rd103, %rd118, 1; -; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103; -; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102; +; CHECK-NEXT: shr.u64 %rd56, %rd70, 63; +; CHECK-NEXT: shl.b64 %rd57, %rd71, 1; +; CHECK-NEXT: or.b64 %rd58, %rd57, %rd56; +; CHECK-NEXT: shl.b64 %rd59, %rd70, 1; +; CHECK-NEXT: or.b64 %rd72, %rd65, %rd59; +; CHECK-NEXT: or.b64 %rd73, %rd64, %rd58; ; CHECK-NEXT: $L__BB4_5: // %udiv-end -; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5; -; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5; -; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5; -; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107}; +; CHECK-NEXT: xor.b64 %rd60, %rd72, %rd5; +; CHECK-NEXT: xor.b64 %rd61, %rd73, %rd5; +; CHECK-NEXT: sub.cc.s64 %rd62, %rd60, %rd5; +; CHECK-NEXT: subc.cc.s64 %rd63, %rd61, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd62, %rd63}; ; CHECK-NEXT: ret; %div = sdiv i128 %lhs, %rhs ret i128 %div @@ -444,116 +444,116 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<107>; +; CHECK-NEXT: .reg .b64 %rd<60>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd43, %rd44}, [udiv_i128_param_1]; -; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44; -; CHECK-NEXT: setp.eq.b64 %p1, %rd45, 0; -; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42; -; CHECK-NEXT: setp.eq.b64 %p2, %rd46, 0; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [udiv_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [udiv_i128_param_1]; +; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6; +; CHECK-NEXT: setp.eq.b64 %p1, %rd7, 0; +; CHECK-NEXT: or.b64 %rd8, %rd3, %rd4; +; CHECK-NEXT: setp.eq.b64 %p2, %rd8, 0; ; CHECK-NEXT: or.pred %p3, %p1, %p2; -; CHECK-NEXT: setp.ne.b64 %p4, %rd44, 0; -; CHECK-NEXT: clz.b64 %r1, %rd44; -; CHECK-NEXT: cvt.u64.u32 %rd47, %r1; -; CHECK-NEXT: clz.b64 %r2, %rd43; -; CHECK-NEXT: cvt.u64.u32 %rd48, %r2; -; CHECK-NEXT: add.s64 %rd49, %rd48, 64; -; CHECK-NEXT: selp.b64 %rd50, %rd47, %rd49, %p4; -; CHECK-NEXT: setp.ne.b64 %p5, %rd42, 0; -; CHECK-NEXT: clz.b64 %r3, %rd42; -; CHECK-NEXT: cvt.u64.u32 %rd51, %r3; -; CHECK-NEXT: clz.b64 %r4, %rd41; -; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; -; CHECK-NEXT: add.s64 %rd53, %rd52, 64; -; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd97, 0; -; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; -; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p4, %rd6, 0; +; CHECK-NEXT: clz.b64 %r1, %rd6; +; CHECK-NEXT: cvt.u64.u32 %rd9, %r1; +; CHECK-NEXT: clz.b64 %r2, %rd5; +; CHECK-NEXT: cvt.u64.u32 %rd10, %r2; +; CHECK-NEXT: add.s64 %rd11, %rd10, 64; +; CHECK-NEXT: selp.b64 %rd12, %rd9, %rd11, %p4; +; CHECK-NEXT: setp.ne.b64 %p5, %rd4, 0; +; CHECK-NEXT: clz.b64 %r3, %rd4; +; CHECK-NEXT: cvt.u64.u32 %rd13, %r3; +; CHECK-NEXT: clz.b64 %r4, %rd3; +; CHECK-NEXT: cvt.u64.u32 %rd14, %r4; +; CHECK-NEXT: add.s64 %rd15, %rd14, 64; +; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5; +; CHECK-NEXT: mov.b64 %rd51, 0; +; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16; +; CHECK-NEXT: subc.cc.s64 %rd18, %rd51, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127; +; CHECK-NEXT: setp.eq.b64 %p7, %rd18, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0; +; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; -; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; -; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd19, %rd17, 127; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18; +; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0; +; CHECK-NEXT: selp.b64 %rd59, 0, %rd4, %p11; +; CHECK-NEXT: selp.b64 %rd58, 0, %rd3, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB5_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100; -; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; +; CHECK-NEXT: add.cc.s64 %rd52, %rd17, 1; +; CHECK-NEXT: addc.cc.s64 %rd53, %rd18, 0; +; CHECK-NEXT: or.b64 %rd21, %rd52, %rd53; +; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd17; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd22, %rd4, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; +; CHECK-NEXT: shr.u64 %rd23, %rd3, %r7; +; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd25, %rd3, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15; -; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd94, %rd97; +; CHECK-NEXT: selp.b64 %rd57, %rd25, %rd24, %p15; +; CHECK-NEXT: shl.b64 %rd56, %rd3, %r6; +; CHECK-NEXT: mov.b64 %rd50, %rd51; ; CHECK-NEXT: @%p14 bra $L__BB5_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd99; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd52; +; CHECK-NEXT: shr.u64 %rd26, %rd3, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; +; CHECK-NEXT: shl.b64 %rd27, %rd4, %r10; +; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; +; CHECK-NEXT: shr.u64 %rd29, %rd4, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16; -; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9; -; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1; -; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1; -; CHECK-NEXT: mov.b64 %rd94, 0; -; CHECK-NEXT: mov.b64 %rd97, %rd94; +; CHECK-NEXT: selp.b64 %rd54, %rd29, %rd28, %p16; +; CHECK-NEXT: shr.u64 %rd55, %rd4, %r9; +; CHECK-NEXT: add.cc.s64 %rd1, %rd5, -1; +; CHECK-NEXT: addc.cc.s64 %rd2, %rd6, -1; +; CHECK-NEXT: mov.b64 %rd50, 0; +; CHECK-NEXT: mov.b64 %rd51, %rd50; ; CHECK-NEXT: $L__BB5_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd73, %rd101, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd102, 1; -; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd101, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd104, 63; -; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd103, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd104, 1; -; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 %rd82, %rd103, 1; -; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82; -; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81; -; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; -; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; -; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd97, %rd85, 1; -; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43; -; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44; -; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1; -; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1; -; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100; -; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0; +; CHECK-NEXT: shr.u64 %rd30, %rd54, 63; +; CHECK-NEXT: shl.b64 %rd31, %rd55, 1; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd30; +; CHECK-NEXT: shl.b64 %rd33, %rd54, 1; +; CHECK-NEXT: shr.u64 %rd34, %rd57, 63; +; CHECK-NEXT: or.b64 %rd35, %rd33, %rd34; +; CHECK-NEXT: shr.u64 %rd36, %rd56, 63; +; CHECK-NEXT: shl.b64 %rd37, %rd57, 1; +; CHECK-NEXT: or.b64 %rd38, %rd37, %rd36; +; CHECK-NEXT: shl.b64 %rd39, %rd56, 1; +; CHECK-NEXT: or.b64 %rd56, %rd51, %rd39; +; CHECK-NEXT: or.b64 %rd57, %rd50, %rd38; +; CHECK-NEXT: sub.cc.s64 %rd40, %rd1, %rd35; +; CHECK-NEXT: subc.cc.s64 %rd41, %rd2, %rd32; +; CHECK-NEXT: shr.s64 %rd42, %rd41, 63; +; CHECK-NEXT: and.b64 %rd51, %rd42, 1; +; CHECK-NEXT: and.b64 %rd43, %rd42, %rd5; +; CHECK-NEXT: and.b64 %rd44, %rd42, %rd6; +; CHECK-NEXT: sub.cc.s64 %rd54, %rd35, %rd43; +; CHECK-NEXT: subc.cc.s64 %rd55, %rd32, %rd44; +; CHECK-NEXT: add.cc.s64 %rd52, %rd52, -1; +; CHECK-NEXT: addc.cc.s64 %rd53, %rd53, -1; +; CHECK-NEXT: or.b64 %rd45, %rd52, %rd53; +; CHECK-NEXT: setp.eq.b64 %p17, %rd45, 0; ; CHECK-NEXT: @%p17 bra $L__BB5_4; ; CHECK-NEXT: bra.uni $L__BB5_2; ; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd103, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd104, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd103, 1; -; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92; -; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91; +; CHECK-NEXT: shr.u64 %rd46, %rd56, 63; +; CHECK-NEXT: shl.b64 %rd47, %rd57, 1; +; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46; +; CHECK-NEXT: shl.b64 %rd49, %rd56, 1; +; CHECK-NEXT: or.b64 %rd58, %rd51, %rd49; +; CHECK-NEXT: or.b64 %rd59, %rd50, %rd48; ; CHECK-NEXT: $L__BB5_5: // %udiv-end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd58, %rd59}; ; CHECK-NEXT: ret; %div = udiv i128 %lhs, %rhs ret i128 %div diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 74136bbe478c9..7f48245af4a26 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -635,7 +635,7 @@ declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0 define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_call( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b32 %r2, [test_call_param_1]; @@ -658,7 +658,7 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_call_flipped( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; @@ -681,7 +681,7 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-LABEL: test_tailcall_flipped( ; COMMON: { -; COMMON-NEXT: .reg .b32 %r<5>; +; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll index 98f94bb7b3ac1..53150c1a01314 100644 --- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -69,7 +69,7 @@ define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) { define <2 x i8> @test_call_2xi8(<2 x i8> %a) { ; O0-LABEL: test_call_2xi8( ; O0: { -; O0-NEXT: .reg .b16 %rs<7>; +; O0-NEXT: .reg .b16 %rs<5>; ; O0-NEXT: .reg .b32 %r<2>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: @@ -87,7 +87,7 @@ define <2 x i8> @test_call_2xi8(<2 x i8> %a) { ; ; O3-LABEL: test_call_2xi8( ; O3: { -; O3-NEXT: .reg .b16 %rs<7>; +; O3-NEXT: .reg .b16 %rs<5>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 26336b83c4f96..40d6a07310265 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1298,7 +1298,7 @@ declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0 define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_call( ; O0: { -; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b32 %r<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_call_param_1]; @@ -1317,7 +1317,7 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; ; O3-LABEL: test_call( ; O3: { -; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b32 %r<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_call_param_0]; @@ -1340,7 +1340,7 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_call_flipped( ; O0: { -; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b32 %r<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; @@ -1359,7 +1359,7 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; ; O3-LABEL: test_call_flipped( ; O3: { -; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b32 %r<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; @@ -1382,7 +1382,7 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_tailcall_flipped( ; O0: { -; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b32 %r<4>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; @@ -1401,7 +1401,7 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; ; O3-LABEL: test_tailcall_flipped( ; O3: { -; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b32 %r<4>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll index 782e6720e5112..673fb73948268 100644 --- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll +++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll @@ -16,8 +16,8 @@ define internal i32 @foo() { ; CHECK-NEXT: .reg .b64 %SP; ; CHECK-NEXT: .reg .b64 %SPL; ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov.b64 %SPL, __local_depot0; @@ -29,8 +29,8 @@ define internal i32 @foo() { ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: add.u64 %rd2, %SP, 0; ; CHECK-NEXT: st.param.b64 [param1], %rd2; -; CHECK-NEXT: add.u64 %rd4, %SPL, 1; -; CHECK-NEXT: ld.local.b8 %rs1, [%rd4]; +; CHECK-NEXT: add.u64 %rd3, %SPL, 1; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd3]; ; CHECK-NEXT: st.param.b8 [param0], %rs1; ; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _); ; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_0; @@ -53,8 +53,8 @@ define internal i32 @bar() { ; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16]; ; CHECK-NEXT: .reg .b64 %SP; ; CHECK-NEXT: .reg .b64 %SPL; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov.b64 %SPL, __local_depot1; @@ -66,9 +66,9 @@ define internal i32 @bar() { ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: add.u64 %rd2, %SP, 0; ; CHECK-NEXT: st.param.b64 [param1], %rd2; -; CHECK-NEXT: add.u64 %rd4, %SPL, 8; -; CHECK-NEXT: ld.local.b64 %rd5, [%rd4]; -; CHECK-NEXT: st.param.b64 [param0], %rd5; +; CHECK-NEXT: add.u64 %rd3, %SPL, 8; +; CHECK-NEXT: ld.local.b64 %rd4, [%rd3]; +; CHECK-NEXT: st.param.b64 [param0], %rd4; ; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _); ; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_1; ; CHECK-NEXT: ld.param.b32 %r1, [retval0]; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index 037d7df1aee59..bf0dd58e27a35 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -11,32 +11,32 @@ define void @test_b128_in_loop() { ; CHECK-LABEL: test_b128_in_loop( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<3>; -; CHECK-NEXT: .reg .b64 %rd<15>; -; CHECK-NEXT: .reg .b128 %rq<3>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b128 %rq<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.global.s32 %rd1, [size]; ; CHECK-NEXT: setp.eq.b64 %p1, %rd1, 0; ; CHECK-NEXT: @%p1 bra $L__BB0_3; ; CHECK-NEXT: // %bb.1: // %BB1 -; CHECK-NEXT: ld.global.v2.b64 {%rd12, %rd13}, [x]; -; CHECK-NEXT: mov.b64 %rd14, 0; +; CHECK-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [x]; +; CHECK-NEXT: mov.b64 %rd4, 0; ; CHECK-NEXT: $L__BB0_2: // %BB2 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov.b128 %rq1, {%rd12, %rd13}; +; CHECK-NEXT: mov.b128 %rq1, {%rd2, %rd3}; ; CHECK-NEXT: // begin inline asm ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b64 lo; ; CHECK-NEXT: .reg .b64 hi; ; CHECK-NEXT: mov.b128 {lo, hi}, %rq1; -; CHECK-NEXT: add.cc.u64 lo, lo, %rd14; +; CHECK-NEXT: add.cc.u64 lo, lo, %rd4; ; CHECK-NEXT: mov.b128 %rq1, {lo, hi}; ; CHECK-NEXT: } ; CHECK-NEXT: // end inline asm -; CHECK-NEXT: mov.b128 {%rd12, %rd13}, %rq1; -; CHECK-NEXT: st.global.v2.b64 [x], {%rd12, %rd13}; -; CHECK-NEXT: add.s64 %rd14, %rd14, 1; -; CHECK-NEXT: setp.ne.b64 %p2, %rd1, %rd14; +; CHECK-NEXT: mov.b128 {%rd2, %rd3}, %rq1; +; CHECK-NEXT: st.global.v2.b64 [x], {%rd2, %rd3}; +; CHECK-NEXT: add.s64 %rd4, %rd4, 1; +; CHECK-NEXT: setp.ne.b64 %p2, %rd1, %rd4; ; CHECK-NEXT: @%p2 bra $L__BB0_2; ; CHECK-NEXT: $L__BB0_3: // %BB3 ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll index a6238352179ca..4620c5e01008c 100644 --- a/llvm/test/CodeGen/NVPTX/jump-table.ll +++ b/llvm/test/CodeGen/NVPTX/jump-table.ll @@ -10,11 +10,11 @@ define void @foo(i32 %i) { ; CHECK-LABEL: foo( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b32 %r2, [foo_param_0]; -; CHECK-NEXT: setp.gt.u32 %p1, %r2, 3; +; CHECK-NEXT: ld.param.b32 %r1, [foo_param_0]; +; CHECK-NEXT: setp.gt.u32 %p1, %r1, 3; ; CHECK-NEXT: @%p1 bra $L__BB0_6; ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: $L_brx_0: .branchtargets @@ -22,7 +22,7 @@ define void @foo(i32 %i) { ; CHECK-NEXT: $L__BB0_3, ; CHECK-NEXT: $L__BB0_4, ; CHECK-NEXT: $L__BB0_5; -; CHECK-NEXT: brx.idx %r2, $L_brx_0; +; CHECK-NEXT: brx.idx %r1, $L_brx_0; ; CHECK-NEXT: $L__BB0_2: // %case0 ; CHECK-NEXT: st.global.b32 [out], 0; ; CHECK-NEXT: bra.uni $L__BB0_6; diff --git a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll index 03523a3be50c2..dfb0e80d0907d 100644 --- a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll +++ b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll @@ -12,7 +12,7 @@ define ptr @foo(i1 %cond) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<2>; ; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.b8 %rs1, [foo_param_0]; @@ -21,14 +21,14 @@ define ptr @foo(i1 %cond) { ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b64 retval0; ; CHECK-NEXT: call.uni (retval0), baz, (); -; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: ld.param.b64 %rd1, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: @%p1 bra $L__BB0_2; ; CHECK-NEXT: // %bb.1: // %bb ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .b64 param0; ; CHECK-NEXT: .param .b64 retval0; -; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: st.param.b64 [param0], %rd1; ; CHECK-NEXT: call.uni (retval0), bar, (param0); ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: $L__BB0_2: // %common.ret diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index efa2666090ccc..3ac8f65ff858b 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -7,7 +7,7 @@ declare <4 x float> @bar() define void @foo(ptr %ptr) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll index ec8dd0c5c9350..d542fa58684a1 100644 --- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll +++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll @@ -593,51 +593,51 @@ define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) { ; SM20-LABEL: foo19( ; SM20: { ; SM20-NEXT: .reg .pred %p<2>; -; SM20-NEXT: .reg .b32 %r<10>; -; SM20-NEXT: .reg .b64 %rd<8>; +; SM20-NEXT: .reg .b32 %r<4>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: // %entry -; SM20-NEXT: ld.param.b32 %r8, [foo19_param_2]; -; SM20-NEXT: ld.param.b64 %rd5, [foo19_param_0]; -; SM20-NEXT: cvta.to.global.u64 %rd7, %rd5; -; SM20-NEXT: ld.param.b64 %rd6, [foo19_param_1]; -; SM20-NEXT: cvta.to.global.u64 %rd2, %rd6; -; SM20-NEXT: mov.b32 %r9, 0f00000000; +; SM20-NEXT: ld.param.b32 %r2, [foo19_param_2]; +; SM20-NEXT: ld.param.b64 %rd2, [foo19_param_0]; +; SM20-NEXT: cvta.to.global.u64 %rd4, %rd2; +; SM20-NEXT: ld.param.b64 %rd3, [foo19_param_1]; +; SM20-NEXT: cvta.to.global.u64 %rd1, %rd3; +; SM20-NEXT: mov.b32 %r3, 0f00000000; ; SM20-NEXT: $L__BB18_1: // %loop ; SM20-NEXT: // =>This Inner Loop Header: Depth=1 -; SM20-NEXT: ld.global.b32 %r7, [%rd7]; -; SM20-NEXT: add.rn.f32 %r9, %r7, %r9; -; SM20-NEXT: add.s64 %rd7, %rd7, 4; -; SM20-NEXT: add.s32 %r8, %r8, -1; -; SM20-NEXT: setp.ne.b32 %p1, %r8, 0; +; SM20-NEXT: ld.global.b32 %r1, [%rd4]; +; SM20-NEXT: add.rn.f32 %r3, %r1, %r3; +; SM20-NEXT: add.s64 %rd4, %rd4, 4; +; SM20-NEXT: add.s32 %r2, %r2, -1; +; SM20-NEXT: setp.ne.b32 %p1, %r2, 0; ; SM20-NEXT: @%p1 bra $L__BB18_1; ; SM20-NEXT: // %bb.2: // %exit -; SM20-NEXT: st.global.b32 [%rd2], %r9; +; SM20-NEXT: st.global.b32 [%rd1], %r3; ; SM20-NEXT: ret; ; ; SM35-LABEL: foo19( ; SM35: { ; SM35-NEXT: .reg .pred %p<2>; -; SM35-NEXT: .reg .b32 %r<10>; -; SM35-NEXT: .reg .b64 %rd<8>; +; SM35-NEXT: .reg .b32 %r<4>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: // %entry -; SM35-NEXT: ld.param.b32 %r8, [foo19_param_2]; -; SM35-NEXT: ld.param.b64 %rd5, [foo19_param_0]; -; SM35-NEXT: cvta.to.global.u64 %rd7, %rd5; -; SM35-NEXT: ld.param.b64 %rd6, [foo19_param_1]; -; SM35-NEXT: cvta.to.global.u64 %rd2, %rd6; -; SM35-NEXT: mov.b32 %r9, 0f00000000; +; SM35-NEXT: ld.param.b32 %r2, [foo19_param_2]; +; SM35-NEXT: ld.param.b64 %rd2, [foo19_param_0]; +; SM35-NEXT: cvta.to.global.u64 %rd4, %rd2; +; SM35-NEXT: ld.param.b64 %rd3, [foo19_param_1]; +; SM35-NEXT: cvta.to.global.u64 %rd1, %rd3; +; SM35-NEXT: mov.b32 %r3, 0f00000000; ; SM35-NEXT: $L__BB18_1: // %loop ; SM35-NEXT: // =>This Inner Loop Header: Depth=1 -; SM35-NEXT: ld.global.nc.b32 %r7, [%rd7]; -; SM35-NEXT: add.rn.f32 %r9, %r7, %r9; -; SM35-NEXT: add.s64 %rd7, %rd7, 4; -; SM35-NEXT: add.s32 %r8, %r8, -1; -; SM35-NEXT: setp.ne.b32 %p1, %r8, 0; +; SM35-NEXT: ld.global.nc.b32 %r1, [%rd4]; +; SM35-NEXT: add.rn.f32 %r3, %r1, %r3; +; SM35-NEXT: add.s64 %rd4, %rd4, 4; +; SM35-NEXT: add.s32 %r2, %r2, -1; +; SM35-NEXT: setp.ne.b32 %p1, %r2, 0; ; SM35-NEXT: @%p1 bra $L__BB18_1; ; SM35-NEXT: // %bb.2: // %exit -; SM35-NEXT: st.global.b32 [%rd2], %r9; +; SM35-NEXT: st.global.b32 [%rd1], %r3; ; SM35-NEXT: ret; entry: br label %loop diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll index ae069cf956c36..f7137e05a5e4f 100644 --- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll @@ -12,13 +12,13 @@ define void @foo(i32 %a) { ; PTX32-NEXT: .local .align 4 .b8 __local_depot0[4]; ; PTX32-NEXT: .reg .b32 %SP; ; PTX32-NEXT: .reg .b32 %SPL; -; PTX32-NEXT: .reg .b32 %r<4>; +; PTX32-NEXT: .reg .b32 %r<3>; ; PTX32-EMPTY: ; PTX32-NEXT: // %bb.0: ; PTX32-NEXT: mov.b32 %SPL, __local_depot0; ; PTX32-NEXT: ld.param.b32 %r1, [foo_param_0]; -; PTX32-NEXT: add.u32 %r3, %SPL, 0; -; PTX32-NEXT: st.local.b32 [%r3], %r1; +; PTX32-NEXT: add.u32 %r2, %SPL, 0; +; PTX32-NEXT: st.local.b32 [%r2], %r1; ; PTX32-NEXT: ret; ; ; PTX64-LABEL: foo( @@ -27,13 +27,13 @@ define void @foo(i32 %a) { ; PTX64-NEXT: .reg .b64 %SP; ; PTX64-NEXT: .reg .b64 %SPL; ; PTX64-NEXT: .reg .b32 %r<2>; -; PTX64-NEXT: .reg .b64 %rd<3>; +; PTX64-NEXT: .reg .b64 %rd<2>; ; PTX64-EMPTY: ; PTX64-NEXT: // %bb.0: ; PTX64-NEXT: mov.b64 %SPL, __local_depot0; ; PTX64-NEXT: ld.param.b32 %r1, [foo_param_0]; -; PTX64-NEXT: add.u64 %rd2, %SPL, 0; -; PTX64-NEXT: st.local.b32 [%rd2], %r1; +; PTX64-NEXT: add.u64 %rd1, %SPL, 0; +; PTX64-NEXT: st.local.b32 [%rd1], %r1; ; PTX64-NEXT: ret; %local = alloca i32, align 4 store volatile i32 %a, ptr %local @@ -97,15 +97,15 @@ define void @foo3(i32 %a) { ; PTX32-NEXT: .local .align 4 .b8 __local_depot2[12]; ; PTX32-NEXT: .reg .b32 %SP; ; PTX32-NEXT: .reg .b32 %SPL; -; PTX32-NEXT: .reg .b32 %r<6>; +; PTX32-NEXT: .reg .b32 %r<5>; ; PTX32-EMPTY: ; PTX32-NEXT: // %bb.0: ; PTX32-NEXT: mov.b32 %SPL, __local_depot2; ; PTX32-NEXT: ld.param.b32 %r1, [foo3_param_0]; -; PTX32-NEXT: add.u32 %r3, %SPL, 0; -; PTX32-NEXT: shl.b32 %r4, %r1, 2; -; PTX32-NEXT: add.s32 %r5, %r3, %r4; -; PTX32-NEXT: st.local.b32 [%r5], %r1; +; PTX32-NEXT: add.u32 %r2, %SPL, 0; +; PTX32-NEXT: shl.b32 %r3, %r1, 2; +; PTX32-NEXT: add.s32 %r4, %r2, %r3; +; PTX32-NEXT: st.local.b32 [%r4], %r1; ; PTX32-NEXT: ret; ; ; PTX64-LABEL: foo3( @@ -114,14 +114,14 @@ define void @foo3(i32 %a) { ; PTX64-NEXT: .reg .b64 %SP; ; PTX64-NEXT: .reg .b64 %SPL; ; PTX64-NEXT: .reg .b32 %r<2>; -; PTX64-NEXT: .reg .b64 %rd<4>; +; PTX64-NEXT: .reg .b64 %rd<3>; ; PTX64-EMPTY: ; PTX64-NEXT: // %bb.0: ; PTX64-NEXT: mov.b64 %SPL, __local_depot2; ; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0]; -; PTX64-NEXT: add.u64 %rd2, %SPL, 0; -; PTX64-NEXT: mad.wide.s32 %rd3, %r1, 4, %rd2; -; PTX64-NEXT: st.local.b32 [%rd3], %r1; +; PTX64-NEXT: add.u64 %rd1, %SPL, 0; +; PTX64-NEXT: mad.wide.s32 %rd2, %r1, 4, %rd1; +; PTX64-NEXT: st.local.b32 [%rd2], %r1; ; PTX64-NEXT: ret; %local = alloca [3 x i32], align 4 %1 = getelementptr inbounds i32, ptr %local, i32 %a diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index f5df0fcde1883..8adde4ceefbf4 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -340,19 +340,19 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.b64 %rd6, grid_const_phi_param_0; -; PTX-NEXT: ld.param.b64 %rd5, [grid_const_phi_param_1]; -; PTX-NEXT: cvta.to.global.u64 %rd1, %rd5; +; PTX-NEXT: mov.b64 %rd3, grid_const_phi_param_0; +; PTX-NEXT: ld.param.b64 %rd2, [grid_const_phi_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd2; ; PTX-NEXT: ld.global.b32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: @%p1 bra $L__BB9_2; ; PTX-NEXT: // %bb.1: // %second -; PTX-NEXT: add.s64 %rd6, %rd6, 4; +; PTX-NEXT: add.s64 %rd3, %rd3, 4; ; PTX-NEXT: $L__BB9_2: // %merge -; PTX-NEXT: ld.param.b32 %r2, [%rd6]; +; PTX-NEXT: ld.param.b32 %r2, [%rd3]; ; PTX-NEXT: st.global.b32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi( @@ -396,20 +396,20 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<8>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.b64 %rd7, grid_const_phi_ngc_param_0; -; PTX-NEXT: ld.param.b64 %rd6, [grid_const_phi_ngc_param_2]; -; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX-NEXT: mov.b64 %rd4, grid_const_phi_ngc_param_0; +; PTX-NEXT: ld.param.b64 %rd3, [grid_const_phi_ngc_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd3; ; PTX-NEXT: ld.global.b32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: @%p1 bra $L__BB10_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1; -; PTX-NEXT: add.s64 %rd7, %rd2, 4; +; PTX-NEXT: add.s64 %rd4, %rd2, 4; ; PTX-NEXT: $L__BB10_2: // %merge -; PTX-NEXT: ld.param.b32 %r2, [%rd7]; +; PTX-NEXT: ld.param.b32 %r2, [%rd4]; ; PTX-NEXT: st.global.b32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc( diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 4d36ff9496ede..21257e21bea9f 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -456,63 +456,63 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<48>; +; PTX-NEXT: .reg .b64 %rd<47>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot9; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0]; -; PTX-NEXT: add.u64 %rd3, %SPL, 0; +; PTX-NEXT: add.u64 %rd2, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4]; -; PTX-NEXT: st.local.b32 [%rd3+4], %r1; +; PTX-NEXT: st.local.b32 [%rd2+4], %r1; ; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1]; -; PTX-NEXT: st.local.b32 [%rd3], %r2; -; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1]; -; PTX-NEXT: ld.volatile.b8 %rd5, [%rd1+1]; -; PTX-NEXT: shl.b64 %rd6, %rd5, 8; -; PTX-NEXT: or.b64 %rd7, %rd6, %rd4; -; PTX-NEXT: ld.volatile.b8 %rd8, [%rd1+2]; -; PTX-NEXT: shl.b64 %rd9, %rd8, 16; -; PTX-NEXT: ld.volatile.b8 %rd10, [%rd1+3]; -; PTX-NEXT: shl.b64 %rd11, %rd10, 24; -; PTX-NEXT: or.b64 %rd12, %rd11, %rd9; -; PTX-NEXT: or.b64 %rd13, %rd12, %rd7; -; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+4]; -; PTX-NEXT: ld.volatile.b8 %rd15, [%rd1+5]; -; PTX-NEXT: shl.b64 %rd16, %rd15, 8; -; PTX-NEXT: or.b64 %rd17, %rd16, %rd14; -; PTX-NEXT: ld.volatile.b8 %rd18, [%rd1+6]; -; PTX-NEXT: shl.b64 %rd19, %rd18, 16; -; PTX-NEXT: ld.volatile.b8 %rd20, [%rd1+7]; -; PTX-NEXT: shl.b64 %rd21, %rd20, 24; -; PTX-NEXT: or.b64 %rd22, %rd21, %rd19; -; PTX-NEXT: or.b64 %rd23, %rd22, %rd17; -; PTX-NEXT: shl.b64 %rd24, %rd23, 32; -; PTX-NEXT: or.b64 %rd25, %rd24, %rd13; -; PTX-NEXT: st.volatile.b64 [%SP], %rd25; -; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+8]; -; PTX-NEXT: ld.volatile.b8 %rd27, [%rd1+9]; -; PTX-NEXT: shl.b64 %rd28, %rd27, 8; -; PTX-NEXT: or.b64 %rd29, %rd28, %rd26; -; PTX-NEXT: ld.volatile.b8 %rd30, [%rd1+10]; -; PTX-NEXT: shl.b64 %rd31, %rd30, 16; -; PTX-NEXT: ld.volatile.b8 %rd32, [%rd1+11]; -; PTX-NEXT: shl.b64 %rd33, %rd32, 24; -; PTX-NEXT: or.b64 %rd34, %rd33, %rd31; -; PTX-NEXT: or.b64 %rd35, %rd34, %rd29; -; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+12]; -; PTX-NEXT: ld.volatile.b8 %rd37, [%rd1+13]; -; PTX-NEXT: shl.b64 %rd38, %rd37, 8; -; PTX-NEXT: or.b64 %rd39, %rd38, %rd36; -; PTX-NEXT: ld.volatile.b8 %rd40, [%rd1+14]; -; PTX-NEXT: shl.b64 %rd41, %rd40, 16; -; PTX-NEXT: ld.volatile.b8 %rd42, [%rd1+15]; -; PTX-NEXT: shl.b64 %rd43, %rd42, 24; -; PTX-NEXT: or.b64 %rd44, %rd43, %rd41; -; PTX-NEXT: or.b64 %rd45, %rd44, %rd39; -; PTX-NEXT: shl.b64 %rd46, %rd45, 32; -; PTX-NEXT: or.b64 %rd47, %rd46, %rd35; -; PTX-NEXT: st.volatile.b64 [%SP+8], %rd47; +; PTX-NEXT: st.local.b32 [%rd2], %r2; +; PTX-NEXT: ld.volatile.b8 %rd3, [%rd1]; +; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1+1]; +; PTX-NEXT: shl.b64 %rd5, %rd4, 8; +; PTX-NEXT: or.b64 %rd6, %rd5, %rd3; +; PTX-NEXT: ld.volatile.b8 %rd7, [%rd1+2]; +; PTX-NEXT: shl.b64 %rd8, %rd7, 16; +; PTX-NEXT: ld.volatile.b8 %rd9, [%rd1+3]; +; PTX-NEXT: shl.b64 %rd10, %rd9, 24; +; PTX-NEXT: or.b64 %rd11, %rd10, %rd8; +; PTX-NEXT: or.b64 %rd12, %rd11, %rd6; +; PTX-NEXT: ld.volatile.b8 %rd13, [%rd1+4]; +; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+5]; +; PTX-NEXT: shl.b64 %rd15, %rd14, 8; +; PTX-NEXT: or.b64 %rd16, %rd15, %rd13; +; PTX-NEXT: ld.volatile.b8 %rd17, [%rd1+6]; +; PTX-NEXT: shl.b64 %rd18, %rd17, 16; +; PTX-NEXT: ld.volatile.b8 %rd19, [%rd1+7]; +; PTX-NEXT: shl.b64 %rd20, %rd19, 24; +; PTX-NEXT: or.b64 %rd21, %rd20, %rd18; +; PTX-NEXT: or.b64 %rd22, %rd21, %rd16; +; PTX-NEXT: shl.b64 %rd23, %rd22, 32; +; PTX-NEXT: or.b64 %rd24, %rd23, %rd12; +; PTX-NEXT: st.volatile.b64 [%SP], %rd24; +; PTX-NEXT: ld.volatile.b8 %rd25, [%rd1+8]; +; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+9]; +; PTX-NEXT: shl.b64 %rd27, %rd26, 8; +; PTX-NEXT: or.b64 %rd28, %rd27, %rd25; +; PTX-NEXT: ld.volatile.b8 %rd29, [%rd1+10]; +; PTX-NEXT: shl.b64 %rd30, %rd29, 16; +; PTX-NEXT: ld.volatile.b8 %rd31, [%rd1+11]; +; PTX-NEXT: shl.b64 %rd32, %rd31, 24; +; PTX-NEXT: or.b64 %rd33, %rd32, %rd30; +; PTX-NEXT: or.b64 %rd34, %rd33, %rd28; +; PTX-NEXT: ld.volatile.b8 %rd35, [%rd1+12]; +; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+13]; +; PTX-NEXT: shl.b64 %rd37, %rd36, 8; +; PTX-NEXT: or.b64 %rd38, %rd37, %rd35; +; PTX-NEXT: ld.volatile.b8 %rd39, [%rd1+14]; +; PTX-NEXT: shl.b64 %rd40, %rd39, 16; +; PTX-NEXT: ld.volatile.b8 %rd41, [%rd1+15]; +; PTX-NEXT: shl.b64 %rd42, %rd41, 24; +; PTX-NEXT: or.b64 %rd43, %rd42, %rd40; +; PTX-NEXT: or.b64 %rd44, %rd43, %rd38; +; PTX-NEXT: shl.b64 %rd45, %rd44, 32; +; PTX-NEXT: or.b64 %rd46, %rd45, %rd34; +; PTX-NEXT: st.volatile.b64 [%SP+8], %rd46; ; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) @@ -651,7 +651,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b16 %rs<3>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %bb ; PTX-NEXT: mov.b64 %SPL, __local_depot12; @@ -663,10 +663,10 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by ; PTX-NEXT: st.b32 [%SP], %r1; ; PTX-NEXT: ld.param.b32 %r2, [test_select_write_param_0]; ; PTX-NEXT: st.b32 [%SP+4], %r2; -; PTX-NEXT: add.u64 %rd2, %SPL, 4; -; PTX-NEXT: add.u64 %rd4, %SPL, 0; -; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1; -; PTX-NEXT: st.local.b32 [%rd5], 1; +; PTX-NEXT: add.u64 %rd1, %SPL, 4; +; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; PTX-NEXT: st.local.b32 [%rd3], 1; ; PTX-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 @@ -743,7 +743,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_60: { ; PTX_60-NEXT: .reg .pred %p<2>; ; PTX_60-NEXT: .reg .b16 %rs<3>; -; PTX_60-NEXT: .reg .b32 %r<5>; +; PTX_60-NEXT: .reg .b32 %r<2>; ; PTX_60-NEXT: .reg .b64 %rd<3>; ; PTX_60-EMPTY: ; PTX_60-NEXT: // %bb.0: // %bb @@ -752,12 +752,12 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0; ; PTX_60-NEXT: ld.param.b64 %rd2, [test_phi_param_2]; ; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2; -; PTX_60-NEXT: ld.param.b32 %r4, [test_phi_param_0]; +; PTX_60-NEXT: ld.param.b32 %r1, [test_phi_param_0]; ; PTX_60-NEXT: @%p1 bra $L__BB13_2; ; PTX_60-NEXT: // %bb.1: // %second -; PTX_60-NEXT: ld.param.b32 %r4, [test_phi_param_1+4]; +; PTX_60-NEXT: ld.param.b32 %r1, [test_phi_param_1+4]; ; PTX_60-NEXT: $L__BB13_2: // %merge -; PTX_60-NEXT: st.global.b32 [%rd1], %r4; +; PTX_60-NEXT: st.global.b32 [%rd1], %r1; ; PTX_60-NEXT: ret; ; ; PTX_70-LABEL: test_phi( @@ -765,21 +765,21 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_70-NEXT: .reg .pred %p<2>; ; PTX_70-NEXT: .reg .b16 %rs<3>; ; PTX_70-NEXT: .reg .b32 %r<2>; -; PTX_70-NEXT: .reg .b64 %rd<8>; +; PTX_70-NEXT: .reg .b64 %rd<5>; ; PTX_70-EMPTY: ; PTX_70-NEXT: // %bb.0: // %bb ; PTX_70-NEXT: ld.param.b8 %rs1, [test_phi_param_3]; ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; ; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX_70-NEXT: mov.b64 %rd7, test_phi_param_0; -; PTX_70-NEXT: ld.param.b64 %rd6, [test_phi_param_2]; -; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX_70-NEXT: mov.b64 %rd4, test_phi_param_0; +; PTX_70-NEXT: ld.param.b64 %rd3, [test_phi_param_2]; +; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd3; ; PTX_70-NEXT: @%p1 bra $L__BB13_2; ; PTX_70-NEXT: // %bb.1: // %second ; PTX_70-NEXT: mov.b64 %rd2, test_phi_param_1; -; PTX_70-NEXT: add.s64 %rd7, %rd2, 4; +; PTX_70-NEXT: add.s64 %rd4, %rd2, 4; ; PTX_70-NEXT: $L__BB13_2: // %merge -; PTX_70-NEXT: ld.param.b32 %r1, [%rd7]; +; PTX_70-NEXT: ld.param.b32 %r1, [%rd4]; ; PTX_70-NEXT: st.global.b32 [%rd1], %r1; ; PTX_70-NEXT: ret; bb: @@ -830,7 +830,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b16 %rs<3>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %bb ; PTX-NEXT: mov.b64 %SPL, __local_depot14; @@ -841,14 +841,14 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: add.u64 %rd1, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [test_phi_write_param_1+4]; ; PTX-NEXT: st.b32 [%SP], %r1; -; PTX-NEXT: add.u64 %rd6, %SPL, 4; +; PTX-NEXT: add.u64 %rd2, %SPL, 4; ; PTX-NEXT: ld.param.b32 %r2, [test_phi_write_param_0]; ; PTX-NEXT: st.b32 [%SP+4], %r2; ; PTX-NEXT: @%p1 bra $L__BB14_2; ; PTX-NEXT: // %bb.1: // %second -; PTX-NEXT: mov.b64 %rd6, %rd1; +; PTX-NEXT: mov.b64 %rd2, %rd1; ; PTX-NEXT: $L__BB14_2: // %merge -; PTX-NEXT: st.local.b32 [%rd6], 1; +; PTX-NEXT: st.local.b32 [%rd2], 1; ; PTX-NEXT: ret; bb: br i1 %cond, label %first, label %second @@ -882,13 +882,13 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %SPL, __local_depot15; -; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: add.u64 %rd1, %SPL, 0; ; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0]; -; PTX-NEXT: st.local.b32 [%rd2], %r1; +; PTX-NEXT: st.local.b32 [%rd1], %r1; ; PTX-NEXT: { // callseq 2, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; @@ -908,7 +908,6 @@ define void @device_func(ptr byval(i32) align 4 %input) { ; PTX-LABEL: device_func( ; PTX: { ; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: { // callseq 3, 0 diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index b2994c0a97585..62f99e991ea1e 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -7,14 +7,14 @@ target triple = "nvptx64-nvidia-cuda" define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-LABEL: wombat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<11>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb -; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2]; -; CHECK-NEXT: ld.param.b32 %r3, [wombat_param_1]; -; CHECK-NEXT: ld.param.b32 %r2, [wombat_param_0]; -; CHECK-NEXT: mov.b32 %r10, 0; +; CHECK-NEXT: ld.param.b32 %r3, [wombat_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [wombat_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [wombat_param_0]; +; CHECK-NEXT: mov.b32 %r7, 0; ; CHECK-NEXT: $L__BB0_1: // %bb3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: { // callseq 0, 0 @@ -23,15 +23,15 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: st.param.b64 [param0], 0; ; CHECK-NEXT: call.uni (retval0), quux, (param0); ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; -; CHECK-NEXT: or.b32 %r8, %r4, %r7; -; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8; -; CHECK-NEXT: cvt.rn.f64.s32 %rd1, %r9; -; CHECK-NEXT: cvt.rn.f64.u32 %rd2, %r10; +; CHECK-NEXT: mul.lo.s32 %r4, %r7, %r2; +; CHECK-NEXT: or.b32 %r5, %r3, %r4; +; CHECK-NEXT: mul.lo.s32 %r6, %r1, %r5; +; CHECK-NEXT: cvt.rn.f64.s32 %rd1, %r6; +; CHECK-NEXT: cvt.rn.f64.u32 %rd2, %r7; ; CHECK-NEXT: add.rn.f64 %rd3, %rd2, %rd1; ; CHECK-NEXT: mov.b64 %rd4, 0; ; CHECK-NEXT: st.global.b64 [%rd4], %rd3; -; CHECK-NEXT: mov.b32 %r10, 1; +; CHECK-NEXT: mov.b32 %r7, 1; ; CHECK-NEXT: bra.uni $L__BB0_1; bb: br label %bb3 diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll index c5ea9f850ea1f..06d7384200696 100644 --- a/llvm/test/CodeGen/NVPTX/param-add.ll +++ b/llvm/test/CodeGen/NVPTX/param-add.ll @@ -14,7 +14,7 @@ declare i32 @callee(%struct.1float %a) define i32 @test(%struct.1float alignstack(32) %data) { ; CHECK-LABEL: test( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll index 2155fb4031c36..8899709d1cf15 100644 --- a/llvm/test/CodeGen/NVPTX/param-overalign.ll +++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll @@ -21,7 +21,7 @@ target triple = "nvptx64-nvidia-cuda" define float @caller_md(float %a, float %b) { ; CHECK-LABEL: caller_md( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [caller_md_param_0]; @@ -62,7 +62,7 @@ define float @callee_md(%struct.float2 alignstack(8) %a) { define float @caller(float %a, float %b) { ; CHECK-LABEL: caller( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [caller_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll index 8056855a0d539..d443aebf32447 100644 --- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll @@ -37,7 +37,7 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) { ; CHECK-LABEL: bar( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [bar_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll index abc2ea89b62cf..c0ced65709610 100644 --- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll @@ -34,7 +34,6 @@ define ptx_kernel void @bar(i32 %val, i32 %idx) { ; CHECK-LABEL: bar( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [bar_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll index 3138d7c4c14db..20f6e2ec50c2c 100644 --- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -37,7 +37,7 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) { ; CHECK-LABEL: bar( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<6>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [bar_param_0]; @@ -58,7 +58,7 @@ declare float @texfunc(i64) define ptx_kernel void @baz(ptr %red, i32 %idx) { ; CHECK-LABEL: baz( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -74,8 +74,8 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) { ; CHECK-NEXT: call.uni (retval0), texfunc, (param0); ; CHECK-NEXT: ld.param.b32 %r6, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: add.rn.f32 %r8, %r2, %r6; -; CHECK-NEXT: st.global.b32 [%rd2], %r8; +; CHECK-NEXT: add.rn.f32 %r7, %r2, %r6; +; CHECK-NEXT: st.global.b32 [%rd2], %r7; ; CHECK-NEXT: ret; %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0) %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx) diff --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll index 4edbec48e6bec..c5299046e1db3 100644 --- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll +++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll @@ -35,7 +35,6 @@ define i32 @t1() { ; CHECK-LABEL: t1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: txq.width.b32 %r1, [tex0]; @@ -66,7 +65,6 @@ define i32 @t3() { ; CHECK-LABEL: t3( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: txq.height.b32 %r1, [tex0]; @@ -97,7 +95,6 @@ define i32 @s1() { ; CHECK-LABEL: s1( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: suq.width.b32 %r1, [surf0]; @@ -128,7 +125,6 @@ define i32 @s3() { ; CHECK-LABEL: s3( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: suq.height.b32 %r1, [surf0]; diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll index 697eb90fb1740..526355247c009 100644 --- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll @@ -24,9 +24,9 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { ; CHECK-LABEL: test_s_i8i16p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-NEXT: .reg .b16 %rs<9>; ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i16p_param_0]; @@ -45,14 +45,14 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { ; CHECK-NEXT: ld.param.b8 %rs4, [retval0+4]; ; CHECK-NEXT: ld.param.b8 %rs5, [retval0+3]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: shl.b16 %rs8, %rs4, 8; -; CHECK-NEXT: or.b16 %rs9, %rs8, %rs5; +; CHECK-NEXT: shl.b16 %rs6, %rs4, 8; +; CHECK-NEXT: or.b16 %rs7, %rs6, %rs5; ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs5; ; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs2; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-NEXT: shr.u16 %rs12, %rs9, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs12; +; CHECK-NEXT: shr.u16 %rs8, %rs7, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs8; ; CHECK-NEXT: ret; %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) ret %s_i8i16p %r @@ -62,9 +62,9 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { ; CHECK-LABEL: test_s_i8i32p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<24>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i32p_param_0]; @@ -91,22 +91,22 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { ; CHECK-NEXT: ld.param.b8 %r8, [retval0+6]; ; CHECK-NEXT: ld.param.b8 %r9, [retval0+5]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: shl.b32 %r12, %r8, 8; -; CHECK-NEXT: or.b32 %r13, %r12, %r9; -; CHECK-NEXT: shl.b32 %r15, %r7, 16; -; CHECK-NEXT: shl.b32 %r17, %r6, 24; -; CHECK-NEXT: or.b32 %r18, %r17, %r15; -; CHECK-NEXT: or.b32 %r19, %r18, %r13; +; CHECK-NEXT: shl.b32 %r10, %r8, 8; +; CHECK-NEXT: or.b32 %r11, %r10, %r9; +; CHECK-NEXT: shl.b32 %r12, %r7, 16; +; CHECK-NEXT: shl.b32 %r13, %r6, 24; +; CHECK-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-NEXT: or.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; -; CHECK-NEXT: shr.u32 %r21, %r19, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21; -; CHECK-NEXT: shr.u32 %r22, %r19, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22; -; CHECK-NEXT: shr.u32 %r23, %r19, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23; +; CHECK-NEXT: shr.u32 %r16, %r15, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r16; +; CHECK-NEXT: shr.u32 %r17, %r15, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; +; CHECK-NEXT: shr.u32 %r18, %r15, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r18; ; CHECK-NEXT: ret; %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) ret %s_i8i32p %r @@ -116,8 +116,8 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { ; CHECK-LABEL: test_s_i8i64p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b64 %rd<46>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b64 %rd<36>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i64p_param_0]; @@ -144,38 +144,38 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { ; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10]; ; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9]; ; CHECK-NEXT: } // callseq 2 -; CHECK-NEXT: shl.b64 %rd17, %rd13, 8; -; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14; -; CHECK-NEXT: shl.b64 %rd20, %rd12, 16; -; CHECK-NEXT: shl.b64 %rd22, %rd11, 24; -; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20; -; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18; -; CHECK-NEXT: shl.b64 %rd27, %rd9, 8; -; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10; -; CHECK-NEXT: shl.b64 %rd30, %rd8, 16; -; CHECK-NEXT: shl.b64 %rd32, %rd7, 24; -; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30; -; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28; -; CHECK-NEXT: shl.b64 %rd35, %rd34, 32; -; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24; +; CHECK-NEXT: shl.b64 %rd15, %rd13, 8; +; CHECK-NEXT: or.b64 %rd16, %rd15, %rd14; +; CHECK-NEXT: shl.b64 %rd17, %rd12, 16; +; CHECK-NEXT: shl.b64 %rd18, %rd11, 24; +; CHECK-NEXT: or.b64 %rd19, %rd18, %rd17; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd16; +; CHECK-NEXT: shl.b64 %rd21, %rd9, 8; +; CHECK-NEXT: or.b64 %rd22, %rd21, %rd10; +; CHECK-NEXT: shl.b64 %rd23, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd24, %rd7, 24; +; CHECK-NEXT: or.b64 %rd25, %rd24, %rd23; +; CHECK-NEXT: or.b64 %rd26, %rd25, %rd22; +; CHECK-NEXT: shl.b64 %rd27, %rd26, 32; +; CHECK-NEXT: or.b64 %rd28, %rd27, %rd20; ; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14; ; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5; ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; -; CHECK-NEXT: shr.u64 %rd39, %rd36, 56; -; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39; -; CHECK-NEXT: shr.u64 %rd40, %rd36, 48; -; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40; -; CHECK-NEXT: shr.u64 %rd41, %rd36, 40; -; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41; -; CHECK-NEXT: shr.u64 %rd42, %rd36, 32; -; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42; -; CHECK-NEXT: shr.u64 %rd43, %rd36, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43; -; CHECK-NEXT: shr.u64 %rd44, %rd36, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44; -; CHECK-NEXT: shr.u64 %rd45, %rd36, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45; +; CHECK-NEXT: shr.u64 %rd29, %rd28, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd29; +; CHECK-NEXT: shr.u64 %rd30, %rd28, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd30; +; CHECK-NEXT: shr.u64 %rd31, %rd28, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd31; +; CHECK-NEXT: shr.u64 %rd32, %rd28, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd32; +; CHECK-NEXT: shr.u64 %rd33, %rd28, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd33; +; CHECK-NEXT: shr.u64 %rd34, %rd28, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd34; +; CHECK-NEXT: shr.u64 %rd35, %rd28, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35; ; CHECK-NEXT: ret; %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) ret %s_i8i64p %r @@ -185,8 +185,8 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { ; CHECK-LABEL: test_s_i8f16p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<15>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b16 %rs<11>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16p_param_0]; @@ -207,14 +207,14 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { ; CHECK-NEXT: ld.param.b8 %rs6, [retval0+4]; ; CHECK-NEXT: ld.param.b8 %rs7, [retval0+3]; ; CHECK-NEXT: } // callseq 3 -; CHECK-NEXT: shl.b16 %rs10, %rs6, 8; -; CHECK-NEXT: or.b16 %rs11, %rs10, %rs7; +; CHECK-NEXT: shl.b16 %rs8, %rs6, 8; +; CHECK-NEXT: or.b16 %rs9, %rs8, %rs7; ; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs7; ; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2; ; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs4; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-NEXT: shr.u16 %rs14, %rs11, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs14; +; CHECK-NEXT: shr.u16 %rs10, %rs9, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10; ; CHECK-NEXT: ret; %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) ret %s_i8f16p %r @@ -224,9 +224,9 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK-LABEL: test_s_i8f16x2p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<24>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f16x2p_param_0]; @@ -253,22 +253,22 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK-NEXT: ld.param.b8 %r8, [retval0+6]; ; CHECK-NEXT: ld.param.b8 %r9, [retval0+5]; ; CHECK-NEXT: } // callseq 4 -; CHECK-NEXT: shl.b32 %r12, %r8, 8; -; CHECK-NEXT: or.b32 %r13, %r12, %r9; -; CHECK-NEXT: shl.b32 %r15, %r7, 16; -; CHECK-NEXT: shl.b32 %r17, %r6, 24; -; CHECK-NEXT: or.b32 %r18, %r17, %r15; -; CHECK-NEXT: or.b32 %r19, %r18, %r13; +; CHECK-NEXT: shl.b32 %r10, %r8, 8; +; CHECK-NEXT: or.b32 %r11, %r10, %r9; +; CHECK-NEXT: shl.b32 %r12, %r7, 16; +; CHECK-NEXT: shl.b32 %r13, %r6, 24; +; CHECK-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-NEXT: or.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; -; CHECK-NEXT: shr.u32 %r21, %r19, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21; -; CHECK-NEXT: shr.u32 %r22, %r19, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22; -; CHECK-NEXT: shr.u32 %r23, %r19, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23; +; CHECK-NEXT: shr.u32 %r16, %r15, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r16; +; CHECK-NEXT: shr.u32 %r17, %r15, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; +; CHECK-NEXT: shr.u32 %r18, %r15, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r18; ; CHECK-NEXT: ret; %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) ret %s_i8f16x2p %r @@ -278,9 +278,9 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-LABEL: test_s_i8f32p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<24>; -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f32p_param_0]; @@ -307,22 +307,22 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-NEXT: ld.param.b8 %r8, [retval0+6]; ; CHECK-NEXT: ld.param.b8 %r9, [retval0+5]; ; CHECK-NEXT: } // callseq 5 -; CHECK-NEXT: shl.b32 %r12, %r8, 8; -; CHECK-NEXT: or.b32 %r13, %r12, %r9; -; CHECK-NEXT: shl.b32 %r15, %r7, 16; -; CHECK-NEXT: shl.b32 %r17, %r6, 24; -; CHECK-NEXT: or.b32 %r18, %r17, %r15; -; CHECK-NEXT: or.b32 %r19, %r18, %r13; +; CHECK-NEXT: shl.b32 %r10, %r8, 8; +; CHECK-NEXT: or.b32 %r11, %r10, %r9; +; CHECK-NEXT: shl.b32 %r12, %r7, 16; +; CHECK-NEXT: shl.b32 %r13, %r6, 24; +; CHECK-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-NEXT: or.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; ; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; -; CHECK-NEXT: shr.u32 %r21, %r19, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21; -; CHECK-NEXT: shr.u32 %r22, %r19, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22; -; CHECK-NEXT: shr.u32 %r23, %r19, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23; +; CHECK-NEXT: shr.u32 %r16, %r15, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r16; +; CHECK-NEXT: shr.u32 %r17, %r15, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; +; CHECK-NEXT: shr.u32 %r18, %r15, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r18; ; CHECK-NEXT: ret; %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) ret %s_i8f32p %r @@ -332,8 +332,8 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) { ; CHECK-LABEL: test_s_i8f64p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b64 %rd<46>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b64 %rd<36>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f64p_param_0]; @@ -360,38 +360,38 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) { ; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10]; ; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9]; ; CHECK-NEXT: } // callseq 6 -; CHECK-NEXT: shl.b64 %rd17, %rd13, 8; -; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14; -; CHECK-NEXT: shl.b64 %rd20, %rd12, 16; -; CHECK-NEXT: shl.b64 %rd22, %rd11, 24; -; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20; -; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18; -; CHECK-NEXT: shl.b64 %rd27, %rd9, 8; -; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10; -; CHECK-NEXT: shl.b64 %rd30, %rd8, 16; -; CHECK-NEXT: shl.b64 %rd32, %rd7, 24; -; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30; -; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28; -; CHECK-NEXT: shl.b64 %rd35, %rd34, 32; -; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24; +; CHECK-NEXT: shl.b64 %rd15, %rd13, 8; +; CHECK-NEXT: or.b64 %rd16, %rd15, %rd14; +; CHECK-NEXT: shl.b64 %rd17, %rd12, 16; +; CHECK-NEXT: shl.b64 %rd18, %rd11, 24; +; CHECK-NEXT: or.b64 %rd19, %rd18, %rd17; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd16; +; CHECK-NEXT: shl.b64 %rd21, %rd9, 8; +; CHECK-NEXT: or.b64 %rd22, %rd21, %rd10; +; CHECK-NEXT: shl.b64 %rd23, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd24, %rd7, 24; +; CHECK-NEXT: or.b64 %rd25, %rd24, %rd23; +; CHECK-NEXT: or.b64 %rd26, %rd25, %rd22; +; CHECK-NEXT: shl.b64 %rd27, %rd26, 32; +; CHECK-NEXT: or.b64 %rd28, %rd27, %rd20; ; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14; ; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5; ; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; -; CHECK-NEXT: shr.u64 %rd39, %rd36, 56; -; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39; -; CHECK-NEXT: shr.u64 %rd40, %rd36, 48; -; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40; -; CHECK-NEXT: shr.u64 %rd41, %rd36, 40; -; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41; -; CHECK-NEXT: shr.u64 %rd42, %rd36, 32; -; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42; -; CHECK-NEXT: shr.u64 %rd43, %rd36, 24; -; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43; -; CHECK-NEXT: shr.u64 %rd44, %rd36, 16; -; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44; -; CHECK-NEXT: shr.u64 %rd45, %rd36, 8; -; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45; +; CHECK-NEXT: shr.u64 %rd29, %rd28, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd29; +; CHECK-NEXT: shr.u64 %rd30, %rd28, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd30; +; CHECK-NEXT: shr.u64 %rd31, %rd28, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd31; +; CHECK-NEXT: shr.u64 %rd32, %rd28, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd32; +; CHECK-NEXT: shr.u64 %rd33, %rd28, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd33; +; CHECK-NEXT: shr.u64 %rd34, %rd28, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd34; +; CHECK-NEXT: shr.u64 %rd35, %rd28, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35; ; CHECK-NEXT: ret; %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) ret %s_i8f64p %r diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index a9b3675b67155..890753b6ac5aa 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -104,7 +104,7 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot1[40]; ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; -; CHECK-PTX-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-NEXT: .reg .b32 %r<2>; ; CHECK-PTX-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry @@ -143,29 +143,29 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) { ; CHECK-PTX-NEXT: .reg .b64 %SPL; ; CHECK-PTX-NEXT: .reg .b16 %rs<4>; ; CHECK-PTX-NEXT: .reg .b32 %r<6>; -; CHECK-PTX-NEXT: .reg .b64 %rd<9>; +; CHECK-PTX-NEXT: .reg .b64 %rd<8>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot2; ; CHECK-PTX-NEXT: ld.param.b32 %r1, [variadics2_param_0]; ; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics2_param_1]; -; CHECK-PTX-NEXT: add.u64 %rd3, %SPL, 0; -; CHECK-PTX-NEXT: add.s64 %rd4, %rd1, 7; -; CHECK-PTX-NEXT: and.b64 %rd5, %rd4, -8; -; CHECK-PTX-NEXT: ld.b32 %r2, [%rd5]; -; CHECK-PTX-NEXT: ld.s8 %r3, [%rd5+4]; -; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd5+7]; -; CHECK-PTX-NEXT: st.local.b8 [%rd3+2], %rs1; -; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd5+6]; -; CHECK-PTX-NEXT: st.local.b8 [%rd3+1], %rs2; -; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd5+5]; -; CHECK-PTX-NEXT: st.local.b8 [%rd3], %rs3; -; CHECK-PTX-NEXT: ld.b64 %rd6, [%rd5+8]; +; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0; +; CHECK-PTX-NEXT: add.s64 %rd3, %rd1, 7; +; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8; +; CHECK-PTX-NEXT: ld.b32 %r2, [%rd4]; +; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4+4]; +; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd4+7]; +; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1; +; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd4+6]; +; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2; +; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd4+5]; +; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3; +; CHECK-PTX-NEXT: ld.b64 %rd5, [%rd4+8]; ; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2; ; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3; -; CHECK-PTX-NEXT: cvt.u64.u32 %rd7, %r5; -; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd6; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd8; +; CHECK-PTX-NEXT: cvt.u64.u32 %rd6, %r5; +; CHECK-PTX-NEXT: add.s64 %rd7, %rd6, %rd5; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd7; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -202,19 +202,19 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; ; CHECK-PTX-NEXT: .reg .b16 %rs<4>; -; CHECK-PTX-NEXT: .reg .b32 %r<3>; -; CHECK-PTX-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-NEXT: .reg .b32 %r<2>; +; CHECK-PTX-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot3; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0; +; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0; ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7]; -; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1; +; CHECK-PTX-NEXT: st.local.b8 [%rd1+2], %rs1; ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6]; -; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2; +; CHECK-PTX-NEXT: st.local.b8 [%rd1+1], %rs2; ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5]; -; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3; +; CHECK-PTX-NEXT: st.local.b8 [%rd1], %rs3; ; CHECK-PTX-NEXT: st.b32 [%SP+8], 1; ; CHECK-PTX-NEXT: st.b8 [%SP+12], 1; ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1; @@ -222,8 +222,8 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: .param .b32 param0; ; CHECK-PTX-NEXT: .param .b64 param1; ; CHECK-PTX-NEXT: .param .b32 retval0; -; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8; -; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3; +; CHECK-PTX-NEXT: add.u64 %rd2, %SP, 8; +; CHECK-PTX-NEXT: st.param.b64 [param1], %rd2; ; CHECK-PTX-NEXT: st.param.b32 [param0], 1; ; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1); ; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0]; @@ -282,7 +282,7 @@ define dso_local i32 @baz() { ; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot5[16]; ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; -; CHECK-PTX-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-NEXT: .reg .b32 %r<2>; ; CHECK-PTX-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry @@ -309,18 +309,18 @@ entry: define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) { ; CHECK-PTX-LABEL: variadics4( ; CHECK-PTX: { -; CHECK-PTX-NEXT: .reg .b64 %rd<10>; +; CHECK-PTX-NEXT: .reg .b64 %rd<9>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry -; CHECK-PTX-NEXT: ld.param.b64 %rd2, [variadics4_param_1]; -; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 7; -; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8; -; CHECK-PTX-NEXT: ld.b64 %rd5, [%rd4]; -; CHECK-PTX-NEXT: ld.param.b64 %rd6, [variadics4_param_0]; -; CHECK-PTX-NEXT: ld.param.b64 %rd7, [variadics4_param_0+8]; -; CHECK-PTX-NEXT: add.s64 %rd8, %rd6, %rd7; -; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, %rd5; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd9; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics4_param_1]; +; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7; +; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; +; CHECK-PTX-NEXT: ld.b64 %rd4, [%rd3]; +; CHECK-PTX-NEXT: ld.param.b64 %rd5, [variadics4_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd6, [variadics4_param_0+8]; +; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6; +; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd8; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -348,27 +348,27 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24]; ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; -; CHECK-PTX-NEXT: .reg .b64 %rd<8>; +; CHECK-PTX-NEXT: .reg .b64 %rd<7>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot7; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0; -; CHECK-PTX-NEXT: ld.global.nc.b64 %rd3, [__const_$_qux_$_s+8]; -; CHECK-PTX-NEXT: st.local.b64 [%rd2+8], %rd3; -; CHECK-PTX-NEXT: ld.global.nc.b64 %rd4, [__const_$_qux_$_s]; -; CHECK-PTX-NEXT: st.local.b64 [%rd2], %rd4; +; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0; +; CHECK-PTX-NEXT: ld.global.nc.b64 %rd2, [__const_$_qux_$_s+8]; +; CHECK-PTX-NEXT: st.local.b64 [%rd1+8], %rd2; +; CHECK-PTX-NEXT: ld.global.nc.b64 %rd3, [__const_$_qux_$_s]; +; CHECK-PTX-NEXT: st.local.b64 [%rd1], %rd3; ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1; ; CHECK-PTX-NEXT: { // callseq 3, 0 ; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16]; ; CHECK-PTX-NEXT: .param .b64 param1; ; CHECK-PTX-NEXT: .param .b32 retval0; -; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 16; -; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5; -; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8]; -; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6; -; CHECK-PTX-NEXT: ld.local.b64 %rd7, [%rd2]; -; CHECK-PTX-NEXT: st.param.b64 [param0], %rd7; +; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 16; +; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4; +; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd1+8]; +; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd5; +; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd1]; +; CHECK-PTX-NEXT: st.param.b64 [param0], %rd6; ; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1); ; CHECK-PTX-NEXT: } // callseq 3 ; CHECK-PTX-NEXT: ret; diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index e1da112ceebb5..c368a1c7e8ebe 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -6,8 +6,8 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) { ; CHECK-LABEL: caller_St8x4( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 0, 0 @@ -23,11 +23,11 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0]; ; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: ld.param.b32 %r2, [caller_St8x4_param_1]; -; CHECK-NEXT: st.b64 [%r2], %rd5; -; CHECK-NEXT: st.b64 [%r2+8], %rd6; -; CHECK-NEXT: st.b64 [%r2+16], %rd7; -; CHECK-NEXT: st.b64 [%r2+24], %rd8; +; CHECK-NEXT: ld.param.b32 %r1, [caller_St8x4_param_1]; +; CHECK-NEXT: st.b64 [%r1], %rd5; +; CHECK-NEXT: st.b64 [%r1+8], %rd6; +; CHECK-NEXT: st.b64 [%r1+16], %rd7; +; CHECK-NEXT: st.b64 [%r1+24], %rd8; ; CHECK-NEXT: ret; %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2 %.fca.0.extract = extractvalue [4 x i64] %call, 0 @@ -48,7 +48,6 @@ define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly by ; CHECK-LABEL: callee_St8x4( ; CHECK: // @callee_St8x4 ; CHECK-NEXT: { -; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: