diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2e66796bcb6bc..91df516b80857 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -232,8 +232,7 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
   const unsigned Opc = UseMI.getOpcode();
   switch (Opc) {
   case AMDGPU::S_ADD_I32:
-  case AMDGPU::S_OR_B32:
-  case AMDGPU::S_AND_B32:
+  case AMDGPU::S_ADD_U32:
   case AMDGPU::V_ADD_U32_e32:
   case AMDGPU::V_ADD_CO_U32_e32:
     // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 413408b417c5a..4417f205646ee 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -426,4 +426,150 @@ body: |
     $sgpr4 = COPY %4
     $sgpr5 = COPY %5
     SI_RETURN implicit $sgpr4, implicit $sgpr5
+
+...
+
+name: fold_frame_index__s_add_u32__fi_const
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 16384
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_frame_index__s_add_u32__fi_const
+    ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 %stack.0, 128, implicit-def $scc
+    ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+    ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_ADD_U32 %0, 128, implicit-def $scc
+    $sgpr4 = COPY %1
+    SI_RETURN implicit $sgpr4
+...
+
+---
+name: fold_frame_index__s_add_u32__const_fi
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 16384
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_frame_index__s_add_u32__const_fi
+    ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 128, %stack.0, implicit-def $scc
+    ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+    ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_ADD_U32 128, %0, implicit-def $scc
+    $sgpr4 = COPY %1
+    SI_RETURN implicit $sgpr4
+...
+
+---
+name: fold_frame_index__s_add_u32__fi_inlineimm
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 16384
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_frame_index__s_add_u32__fi_inlineimm
+    ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 %stack.0, 16, implicit-def $scc
+    ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+    ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_ADD_U32 %0, 16, implicit-def $scc
+    $sgpr4 = COPY %1
+    SI_RETURN implicit $sgpr4
+...
+
+---
+name: fold_frame_index__s_add_u32__inlineimm_fi
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 16384
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_frame_index__s_add_u32__inlineimm_fi
+    ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 16, %stack.0, implicit-def $scc
+    ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+    ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_ADD_U32 16, %0, implicit-def $scc
+    $sgpr4 = COPY %1
+    SI_RETURN implicit $sgpr4
+...
+
+---
+name: no_fold_literal_and_fi_s_and_b32
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 16
+  localFrameSize: 8192
+stack:
+  - { id: 0, size: 4096, alignment: 4, local-offset: 0 }
+  - { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_fold_literal_and_fi_s_and_b32
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
+    ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]]
+    %0:sreg_32 = S_MOV_B32 12345
+    %1:sreg_32 = S_MOV_B32 %stack.1
+    %2:sreg_32 = S_AND_B32 killed %1, killed %0, implicit-def dead $scc
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name: no_fold_literal_or_fi_s_or_b32
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 16
+  localFrameSize: 8192
+stack:
+  - { id: 0, size: 4096, alignment: 4, local-offset: 0 }
+  - { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_fold_literal_or_fi_s_or_b32
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
+    ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_OR_B32_]]
+    %0:sreg_32 = S_MOV_B32 12345
+    %1:sreg_32 = S_MOV_B32 %stack.1
+    %2:sreg_32 = S_OR_B32 killed %1, killed %0, implicit-def dead $scc
+    S_ENDPGM 0, implicit %2
+
+...
+
+---
+name: no_fold_literal_and_fi_s_mul_i32
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 16
+  localFrameSize: 8192
+stack:
+  - { id: 0, size: 4096, alignment: 4, local-offset: 0 }
+  - { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_fold_literal_and_fi_s_mul_i32
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
+    ; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_MUL_I32_]]
+    %0:sreg_32 = S_MOV_B32 12345
+    %1:sreg_32 = S_MOV_B32 %stack.1
+    %2:sreg_32 = S_MUL_I32 killed %1, killed %0, implicit-def dead $scc
+    S_ENDPGM 0, implicit %2
+
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
index ab0aa16cf6c09..2bdc3f671897c 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
@@ -394,8 +394,10 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
-    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
-    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
+    ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_]], 128, implicit-def dead $scc
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+    ; CHECK-NEXT: SI_RETURN implicit [[COPY]]
     %0:sreg_32 = S_MOV_B32 %stack.0
     %1:sreg_32 = S_OR_B32 %0, 128, implicit-def dead $scc
     %2:vgpr_32 = COPY %1
@@ -410,8 +412,10 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
-    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
-    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
+    ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 128, [[S_MOV_B32_]], implicit-def dead $scc
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+    ; CHECK-NEXT: SI_RETURN implicit [[COPY]]
     %0:sreg_32 = S_MOV_B32 %stack.0
     %1:sreg_32 = S_OR_B32 128, %0, implicit-def dead $scc
     %2:vgpr_32 = COPY %1
@@ -426,8 +430,8 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
-    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
-    ; CHECK-NEXT: SI_RETURN implicit %1
+    ; CHECK: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e64_]]
     %0:sreg_32 = disjoint S_OR_B32 %stack.0, 64, implicit-def dead $scc
     %1:vgpr_32 = COPY %0
     SI_RETURN implicit %1
@@ -441,8 +445,8 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
-    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
-    ; CHECK-NEXT: SI_RETURN implicit %1
+    ; CHECK: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e64_]]
     %0:sreg_32 = disjoint S_OR_B32 64, %stack.0, implicit-def dead $scc
     %1:vgpr_32 = COPY %0
     SI_RETURN implicit %1
@@ -521,8 +525,10 @@ stack:
 body: |
   bb.0:
     ; CHECK-LABEL: name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
-    ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
-    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+    ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
+    ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], 128, implicit-def dead $scc
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+    ; CHECK-NEXT: SI_RETURN implicit [[COPY]]
     %0:sreg_32 = S_MOV_B32 %stack.0
     %1:sreg_32 = S_AND_B32 %0, 128, implicit-def dead $scc
     %2:vgpr_32 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 004403f46a4d4..7125e7740c10a 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -374,4 +374,46 @@ vector.body.i.i.i.i:                              ; preds = %.shuffle.then.i.i.i
   ret void
 }
 
+; GCN-LABEL: {{^}}fi_sop2_and_literal_error:
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1fe00
+define amdgpu_kernel void @fi_sop2_and_literal_error() #0 {
+entry:
+  %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+  %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+  %p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
+  br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i:                            ; preds = %.shuffle.then.i.i.i.i, %entry
+  store i64 0, ptr addrspace(5) null, align 4
+  %and = and i32 %p2i, -512
+  %icmp = icmp ugt i32 %and, 9999999
+  br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i:                              ; preds = %.shuffle.then.i.i.i.i
+  %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+  store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fi_sop2_or_literal_error:
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
+define amdgpu_kernel void @fi_sop2_or_literal_error() #0 {
+entry:
+  %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+  %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+  %p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
+  br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i:                            ; preds = %.shuffle.then.i.i.i.i, %entry
+  store i64 0, ptr addrspace(5) null, align 4
+  %or = or i32 %p2i, 12345
+  %icmp = icmp ugt i32 %or, 9999999
+  br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i:                              ; preds = %.shuffle.then.i.i.i.i
+  %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+  store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 2cb440b1b7a01..08ea81ad81ae5 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -7,9 +7,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
 
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; GCN: s_and_b32 s{{[0-9]+}}, [[FI]], 0xfffc
+; GCN: v_mov_b32_e32 [[VFI:v[0-9]+]], [[FI]]{{$}}
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}], [[VFI]]
 define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 15, ptr addrspace(5) %alloca
@@ -20,11 +21,15 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
 }
 
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; SCRATCH128K-NOT: v_and_b32
-; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
-; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
-; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH256K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH256K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc
+
+; SCRATCH1024K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH1024K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc
+
+; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc
+
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
   %alloca = alloca i32, align 4, addrspace(5)
@@ -36,11 +41,17 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
 }
 
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; SCRATCH128K-NOT: v_and_b32
-; SCRATCH256K-NOT: v_and_b32
-; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
-; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; SCRATCH128K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH256K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH128K-NOT: and_b32
+; SCRATCH256K-NOT: and_b32
+
+; SCRATCH1024K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH1024K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x3fffc
+
+; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x3fffc
+
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
   %alloca = alloca i32, align 4, addrspace(5)
@@ -52,11 +63,16 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
 }
 
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; SCRATCH128K-NOT: v_and_b32
-; SCRATCH256K-NOT: v_and_b32
-; SCRATCH1024K-NOT: v_and_b32
-; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0xffffc, [[FI]]
+; SCRATCH128K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH256K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH1024K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+
+; SCRATCH128K-NOT: and_b32
+; SCRATCH256K-NOT: and_b32
+; SCRATCH1024K-NOT: and_b32
+
+; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0xffffc
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
   %alloca = alloca i32, align 4, addrspace(5)
@@ -69,7 +85,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
 
 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; GCN-NOT: v_and_b32
+; GCN-NOT: and_b32
 ; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
 define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
   %alloca = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 8ec3b7e2508ac..a3ebaec4811a9 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -224,54 +224,55 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
 ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
 ; MUBUF-NEXT: ; %bb.2: ; %split
 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c0, v1
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12d4, v2
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen glc
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12d0, v2
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d4, v1
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
+; MUBUF-NEXT: s_movk_i32 s4, 0x4000
+; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c4, v2
-; MUBUF-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d0, v1
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
+; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0
+; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c4, v1
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12cc, v1
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c8, v2
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT: v_mov_b32_e32 v0, s4
+; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v3
+; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000
 ; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
-; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000
+; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000
-; MUBUF-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c8, v6
+; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000
+; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000
+; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:4 glc
+; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000
+; MUBUF-NEXT: buffer_load_dword v6, v7, s[0:3], 0 offen glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; MUBUF-NEXT: v_mov_b32_e32 v11, 0x4000
-; MUBUF-NEXT: buffer_load_dword v2, v3, s[0:3], 0 offen offset:8 glc
+; MUBUF-NEXT: buffer_load_dword v7, v8, s[0:3], 0 offen offset:4 glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000
-; MUBUF-NEXT: buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc
+; MUBUF-NEXT: buffer_load_dword v8, v9, s[0:3], 0 offen offset:8 glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; MUBUF-NEXT: buffer_load_dword v9, v10, s[0:3], 0 offen offset:12 glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8
 ; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2
+; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
 ; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc
 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v7, v8
-; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v9, vcc
-; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
+; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
+; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
 ; MUBUF-NEXT: v_mov_b32_e32 v12, 0
+; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
 ; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
 ; MUBUF-NEXT: global_store_dwordx2 v12, v[4:5], s[4:5] offset:16