diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index ba72a3b71ffd7..a4e9c92b48976 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -196,6 +196,10 @@ class CombinerHelper { /// Match (and (load x), mask) -> zextload x bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Combine a G_EXTRACT_VECTOR_ELT of a load into a narrowed + /// load. + bool matchCombineExtractedVectorLoad(MachineInstr &MI, BuildFnTy &MatchInfo); + bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 76b83cc5df073..237d8d63aafa4 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -259,6 +259,12 @@ def sext_inreg_to_zext_inreg : GICombineRule< }]) >; +def combine_extracted_vector_load : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root, + [{ return Helper.matchCombineExtractedVectorLoad(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + def combine_indexed_load_store : GICombineRule< (defs root:$root, indexed_load_store_matchdata:$matchinfo), (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root, @@ -1283,8 +1289,8 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop, constant_fold_fp_binop]>; def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, - extract_vec_elt_combines, combines_for_extload, - undef_combines, identity_combines, phi_combines, + extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load, + undef_combines, identity_combines, phi_combines, simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big, reassocs, ptr_add_immed_chain, shl_ashr_to_sext_inreg, sext_inreg_of_load, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 51c268ab77c22..9123ae5835a57 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1165,6 +1165,101 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr, return RealUse; } +bool CombinerHelper::matchCombineExtractedVectorLoad(MachineInstr &MI, + BuildFnTy &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT); + + // Check if there is a load that defines the vector being extracted from. + auto *LoadMI = getOpcodeDef(MI.getOperand(1).getReg(), MRI); + if (!LoadMI) + return false; + + Register Vector = MI.getOperand(1).getReg(); + LLT VecEltTy = MRI.getType(Vector).getElementType(); + LLT ResultTy = MRI.getType(MI.getOperand(0).getReg()); + + assert(ResultTy == VecEltTy); + + // Checking whether we should reduce the load width. + if (!MRI.hasOneNonDBGUse(Vector)) + return false; + + // Check if the defining load is simple. + if (!LoadMI->isSimple()) + return false; + + // If the vector element type is not a multiple of a byte then we are unable + // to correctly compute an address to load only the extracted element as a + // scalar. + if (!VecEltTy.isByteSized()) + return false; + + // Check if the new load that we are going to create is legal + // if we are in the post-legalization phase. + MachineMemOperand MMO = LoadMI->getMMO(); + Align Alignment = MMO.getAlign(); + MachinePointerInfo PtrInfo; + uint64_t Offset; + + // Finding the appropriate PtrInfo if offset is a known constant. + // This is required to create the memory operand for the narrowed load. + // This machine memory operand object helps us infer about legality + // before we proceed to combine the instruction. + if (auto CVal = getIConstantVRegVal(Vector, MRI)) { + int Elt = CVal->getZExtValue(); + // FIXME: should be (ABI size)*Elt. + Offset = VecEltTy.getSizeInBits() * Elt / 8; + PtrInfo = MMO.getPointerInfo().getWithOffset(Offset); + } else { + // Discard the pointer info except the address space because the memory + // operand can't represent this new access since the offset is variable. + Offset = VecEltTy.getSizeInBits() / 8; + PtrInfo = MachinePointerInfo(MMO.getPointerInfo().getAddrSpace()); + } + + Alignment = commonAlignment(Alignment, Offset); + + Register VecPtr = LoadMI->getPointerReg(); + LLT PtrTy = MRI.getType(VecPtr); + + MachineFunction &MF = *MI.getMF(); + auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, VecEltTy); + + LegalityQuery::MemDesc MMDesc(*NewMMO); + + LegalityQuery Q = {TargetOpcode::G_LOAD, {VecEltTy, PtrTy}, {MMDesc}}; + + if (!isLegalOrBeforeLegalizer(Q)) + return false; + + // Load must be allowed and fast on the target. + LLVMContext &C = MF.getFunction().getContext(); + auto &DL = MF.getDataLayout(); + unsigned Fast = 0; + if (!getTargetLowering().allowsMemoryAccess(C, DL, VecEltTy, *NewMMO, + &Fast) || + !Fast) + return false; + + Register Result = MI.getOperand(0).getReg(); + Register Index = MI.getOperand(2).getReg(); + + MatchInfo = [=](MachineIRBuilder &B) { + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(B.getMF(), DummyObserver, B); + //// Get pointer to the vector element. + Register finalPtr = Helper.getVectorElementPointer( + LoadMI->getPointerReg(), MRI.getType(LoadMI->getOperand(0).getReg()), + Index); + // New G_LOAD instruction. + B.buildLoad(Result, finalPtr, PtrInfo, Alignment); + // Remove original GLOAD instruction. + LoadMI->eraseFromParent(); + }; + + return true; +} + bool CombinerHelper::matchCombineIndexedLoadStore( MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) { auto &LdSt = cast(MI); diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index 0d7620d1c883d..7493afd672d43 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -14659,17 +14659,9 @@ define i8 @load_single_extract_variable_index_i8(ptr %A, i32 %idx) { ; ; CHECK-GISEL-LABEL: load_single_extract_variable_index_i8: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: sub sp, sp, #16 -; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GISEL-NEXT: mov w9, w1 -; CHECK-GISEL-NEXT: ldr q0, [x0] -; CHECK-GISEL-NEXT: mov x8, sp -; CHECK-GISEL-NEXT: and x9, x9, #0xf -; CHECK-GISEL-NEXT: lsl x10, x9, #1 -; CHECK-GISEL-NEXT: str q0, [sp] -; CHECK-GISEL-NEXT: sub x9, x10, x9 -; CHECK-GISEL-NEXT: ldrb w0, [x8, x9] -; CHECK-GISEL-NEXT: add sp, sp, #16 +; CHECK-GISEL-NEXT: mov w8, w1 +; CHECK-GISEL-NEXT: and x8, x8, #0xf +; CHECK-GISEL-NEXT: ldrb w0, [x0, x8] ; CHECK-GISEL-NEXT: ret %lv = load <16 x i8>, ptr %A %e = extractelement <16 x i8> %lv, i32 %idx @@ -14692,15 +14684,9 @@ define i16 @load_single_extract_variable_index_i16(ptr %A, i32 %idx) { ; ; CHECK-GISEL-LABEL: load_single_extract_variable_index_i16: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: sub sp, sp, #16 -; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GISEL-NEXT: ldr q0, [x0] -; CHECK-GISEL-NEXT: mov w9, w1 -; CHECK-GISEL-NEXT: mov x8, sp -; CHECK-GISEL-NEXT: and x9, x9, #0x7 -; CHECK-GISEL-NEXT: str q0, [sp] -; CHECK-GISEL-NEXT: ldrh w0, [x8, x9, lsl #1] -; CHECK-GISEL-NEXT: add sp, sp, #16 +; CHECK-GISEL-NEXT: mov w8, w1 +; CHECK-GISEL-NEXT: and x8, x8, #0x7 +; CHECK-GISEL-NEXT: ldrh w0, [x0, x8, lsl #1] ; CHECK-GISEL-NEXT: ret %lv = load <8 x i16>, ptr %A %e = extractelement <8 x i16> %lv, i32 %idx @@ -14717,15 +14703,9 @@ define i32 @load_single_extract_variable_index_i32(ptr %A, i32 %idx) { ; ; CHECK-GISEL-LABEL: load_single_extract_variable_index_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: sub sp, sp, #16 -; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GISEL-NEXT: ldr q0, [x0] -; CHECK-GISEL-NEXT: mov w9, w1 -; CHECK-GISEL-NEXT: mov x8, sp -; CHECK-GISEL-NEXT: and x9, x9, #0x3 -; CHECK-GISEL-NEXT: str q0, [sp] -; CHECK-GISEL-NEXT: ldr w0, [x8, x9, lsl #2] -; CHECK-GISEL-NEXT: add sp, sp, #16 +; CHECK-GISEL-NEXT: mov w8, w1 +; CHECK-GISEL-NEXT: and x8, x8, #0x3 +; CHECK-GISEL-NEXT: ldr w0, [x0, x8, lsl #2] ; CHECK-GISEL-NEXT: ret %lv = load <4 x i32>, ptr %A %e = extractelement <4 x i32> %lv, i32 %idx @@ -14779,14 +14759,8 @@ define i32 @load_single_extract_variable_index_masked_i32(ptr %A, i32 %idx) { ; ; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: sub sp, sp, #16 -; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GISEL-NEXT: ldr q0, [x0] -; CHECK-GISEL-NEXT: mov x8, sp -; CHECK-GISEL-NEXT: and w9, w1, #0x3 -; CHECK-GISEL-NEXT: str q0, [sp] -; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2] -; CHECK-GISEL-NEXT: add sp, sp, #16 +; CHECK-GISEL-NEXT: and w8, w1, #0x3 +; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2] ; CHECK-GISEL-NEXT: ret %idx.x = and i32 %idx, 3 %lv = load <4 x i32>, ptr %A @@ -14803,14 +14777,8 @@ define i32 @load_single_extract_variable_index_masked2_i32(ptr %A, i32 %idx) { ; ; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked2_i32: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: sub sp, sp, #16 -; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GISEL-NEXT: ldr q0, [x0] -; CHECK-GISEL-NEXT: mov x8, sp -; CHECK-GISEL-NEXT: and w9, w1, #0x1 -; CHECK-GISEL-NEXT: str q0, [sp] -; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2] -; CHECK-GISEL-NEXT: add sp, sp, #16 +; CHECK-GISEL-NEXT: and w8, w1, #0x1 +; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2] ; CHECK-GISEL-NEXT: ret %idx.x = and i32 %idx, 1 %lv = load <4 x i32>, ptr %A diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll index 69346de9bb798..80dc3dead35ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -124,71 +124,71 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) ; GFX9-LABEL: test_add_mul_multiple_defs_z: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3 +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z: ; GFX9-UNSAFE: ; %bb.0: ; %.entry ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3 +; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_add_mul_multiple_defs_z: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul float %x, %y @@ -202,71 +202,71 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace ; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3 +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX9-UNSAFE: ; %bb.0: ; %.entry ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3 +; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v3, v0, v1 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul float %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 33a4d3c5494f7..a13c60b4e8414 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -8,170 +8,12 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-LABEL: v_extract_v64i32_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 -; GCN-NEXT: s_add_i32 s32, s32, 0x10000 -; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 -; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: v_and_b32_e32 v0, 63, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_add_u32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v20 -; GCN-NEXT: v_mov_b32_e32 v17, v21 -; GCN-NEXT: v_mov_b32_e32 v18, v22 -; GCN-NEXT: v_mov_b32_e32 v19, v23 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: v_and_b32_e32 v2, 63, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: global_load_dword v0, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr @@ -183,174 +25,12 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-LABEL: v_extract_v128i16_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 -; GCN-NEXT: s_add_i32 s32, s32, 0x10000 -; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 -; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: v_bfe_u32 v0, v6, 1, 6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_add_u32_e32 v0, v1, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v20 -; GCN-NEXT: v_mov_b32_e32 v17, v21 -; GCN-NEXT: v_mov_b32_e32 v18, v22 -; GCN-NEXT: v_mov_b32_e32 v19, v23 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(16) -; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: global_load_ushort v0, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <128 x i16>, ptr addrspace(1) %ptr @@ -362,171 +42,12 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) { ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96 -; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 -; GCN-NEXT: s_add_i32 s32, s32, 0x10000 -; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: v_and_b32_e32 v0, 31, v6 -; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 -; GCN-NEXT: v_add_u32_e32 v1, v2, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v16, v20 -; GCN-NEXT: v_mov_b32_e32 v17, v21 -; GCN-NEXT: v_mov_b32_e32 v18, v22 -; GCN-NEXT: v_mov_b32_e32 v19, v23 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 -; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: v_and_b32_e32 v2, 31, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll index 786d65f7dcc40..057790617204c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -8,29 +8,36 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { ; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GCN-NEXT: s_lshl_b32 m0, s4, 1 +; GCN-NEXT: s_and_b32 s0, s4, 3 +; GCN-NEXT: s_lshl_b32 s0, s0, 4 +; GCN-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-NEXT: s_add_u32 s0, s2, s0 +; GCN-NEXT: s_addc_u32 s1, s3, s1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GCN-NEXT: s_movrels_b64 s[2:3], s[10:11] ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 -; GFX10-NEXT: s_lshl_b32 m0, s4, 1 +; GFX10-NEXT: s_and_b32 s0, s4, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_add_u32 s0, s2, s0 +; GFX10-NEXT: s_addc_u32 s1, s3, s1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX10-NEXT: s_movrels_b64 s[2:3], s[10:11] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b512 s[8:23], s[2:3], 0x0 -; GFX11-NEXT: s_lshl_b32 m0, s4, 1 +; GFX11-NEXT: s_and_b32 s0, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX11-NEXT: s_movrels_b64 s[2:3], s[10:11] ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 %idx @@ -40,46 +47,32 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 -; GFX9-NEXT: s_lshl_b32 s0, s2, 1 -; GFX9-NEXT: s_lshl_b32 s2, s0, 1 +; GFX9-NEXT: s_and_b32 s0, s2, 3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v18, v2 -; GFX9-NEXT: s_set_gpr_idx_off ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) -; GFX9-NEXT: v_mov_b32_e32 v3, v3 -; GFX9-NEXT: s_set_gpr_idx_off -; GFX9-NEXT: v_readfirstlane_b32 s2, v18 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[6:7] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[10:11] -; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1] -; GFX8-NEXT: s_lshl_b32 s0, s2, 1 -; GFX8-NEXT: s_lshl_b32 m0, s0, 1 +; GFX8-NEXT: s_and_b32 s0, s2, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_movrels_b32_e32 v1, v3 -; GFX8-NEXT: v_movrels_b32_e32 v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -88,20 +81,13 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr ; ; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: s_lshl_b32 s0, s2, 1 -; GFX7-NEXT: s_lshl_b32 m0, s0, 1 +; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_ashr_i32 s1, s0, 31 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_movrels_b32_e32 v1, v3 -; GFX7-NEXT: v_movrels_b32_e32 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NEXT: v_readfirstlane_b32 s2, v2 @@ -110,44 +96,38 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr ; ; GFX10-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 -; GFX10-NEXT: s_lshl_b32 s0, s2, 1 -; GFX10-NEXT: s_lshl_b32 m0, s0, 1 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_movrels_b32_e32 v1, v3 -; GFX10-NEXT: v_movrels_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_vgpr_v4i128_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off -; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 -; GFX11-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32 -; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48 -; GFX11-NEXT: s_lshl_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshl_b32 m0, s0, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_movrels_b32_e32 v0, v2 -; GFX11-NEXT: v_movrels_b32_e32 v1, v3 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(1) %ptr %element = extractelement <4 x i128> %vector, i32 %idx @@ -158,298 +138,66 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_add_u32_e32 v16, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v8, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:16 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 -; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v18, v12, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v13, v15, s4 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v19, v5, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v3 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v15, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off -; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 -; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32 -; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, ptr addrspace(1) %ptr %element = extractelement <4 x i128> %vector, i32 %idx @@ -459,68 +207,15 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) { ; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_add_u32_e32 v19, 1, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: v_mov_b32_e32 v8, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, s8 -; GFX9-NEXT: v_mov_b32_e32 v10, s9 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, s10 -; GFX9-NEXT: v_mov_b32_e32 v12, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GFX9-NEXT: v_mov_b32_e32 v13, s12 -; GFX9-NEXT: v_mov_b32_e32 v14, s13 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 -; GFX9-NEXT: v_mov_b32_e32 v15, s14 -; GFX9-NEXT: v_mov_b32_e32 v16, s15 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 @@ -529,68 +224,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre ; ; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: v_mov_b32_e32 v8, s7 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v9, s8 -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GFX8-NEXT: v_mov_b32_e32 v11, s10 -; GFX8-NEXT: v_mov_b32_e32 v12, s11 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GFX8-NEXT: v_mov_b32_e32 v13, s12 -; GFX8-NEXT: v_mov_b32_e32 v14, s13 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -599,68 +241,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre ; ; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: v_mov_b32_e32 v6, s5 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX7-NEXT: v_mov_b32_e32 v7, s6 -; GFX7-NEXT: v_mov_b32_e32 v8, s7 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX7-NEXT: v_mov_b32_e32 v9, s8 -; GFX7-NEXT: v_mov_b32_e32 v10, s9 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GFX7-NEXT: v_mov_b32_e32 v11, s10 -; GFX7-NEXT: v_mov_b32_e32 v12, s11 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GFX7-NEXT: v_mov_b32_e32 v13, s12 -; GFX7-NEXT: v_mov_b32_e32 v14, s13 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc -; GFX7-NEXT: v_add_i32_e32 v19, vcc, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 -; GFX7-NEXT: v_mov_b32_e32 v15, s14 -; GFX7-NEXT: v_mov_b32_e32 v16, s15 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NEXT: v_readfirstlane_b32 s2, v2 @@ -669,54 +258,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre ; ; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -725,63 +275,18 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre ; ; GFX11-LABEL: extractelement_sgpr_v4i128_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_add_nc_u32 v1, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 ; GFX11-NEXT: ; return to shader part epilog @@ -793,19 +298,19 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) { ; GCN-LABEL: extractelement_sgpr_v4i128_idx0: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr @@ -814,34 +319,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %p } define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i128_idx1: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s2, s6 -; GCN-NEXT: s_mov_b32 s3, s7 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i128_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i128_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i128_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s2, s6 -; GFX10-NEXT: s_mov_b32 s3, s7 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s4 -; GFX11-NEXT: s_mov_b32 s1, s5 -; GFX11-NEXT: s_mov_b32 s2, s6 -; GFX11-NEXT: s_mov_b32 s3, s7 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 1 @@ -849,34 +354,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %p } define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i128_idx2: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s8 -; GCN-NEXT: s_mov_b32 s1, s9 -; GCN-NEXT: s_mov_b32 s2, s10 -; GCN-NEXT: s_mov_b32 s3, s11 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i128_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i128_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i128_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s8 -; GFX10-NEXT: s_mov_b32 s1, s9 -; GFX10-NEXT: s_mov_b32 s2, s10 -; GFX10-NEXT: s_mov_b32 s3, s11 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x20 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s8 -; GFX11-NEXT: s_mov_b32 s1, s9 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 2 @@ -884,34 +389,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %p } define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i128_idx3: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s12 -; GCN-NEXT: s_mov_b32 s1, s13 -; GCN-NEXT: s_mov_b32 s2, s14 -; GCN-NEXT: s_mov_b32 s3, s15 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i128_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i128_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i128_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i128_idx3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s12 -; GFX10-NEXT: s_mov_b32 s1, s13 -; GFX10-NEXT: s_mov_b32 s2, s14 -; GFX10-NEXT: s_mov_b32 s3, s15 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i128_idx3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x30 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s12 -; GFX11-NEXT: s_mov_b32 s1, s13 -; GFX11-NEXT: s_mov_b32 s2, s14 -; GFX11-NEXT: s_mov_b32 s3, s15 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, ptr addrspace(4) %ptr %element = extractelement <4 x i128> %vector, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll index 7028d1157787f..6d772df3fa281 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -6,42 +6,74 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { -; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_lshr_b32 s2, s4, 1 -; GCN-NEXT: s_cmp_eq_u32 s2, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_and_b32 s1, s4, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 4 -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s4, 3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s4, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_and_b32 s2, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, s2, 1 +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s0, s1, s0 -; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s0, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_add_u32 s0, s2, s0 +; GFX10-NEXT: s_addc_u32 s1, s3, s1 +; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_eq_u32 s2, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s0, s1, s0 -; GFX11-NEXT: s_and_b32 s1, s4, 1 -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: s_and_b32 s0, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, ptr addrspace(4) %ptr %element = extractelement <4 x i16> %vector, i32 %idx @@ -51,71 +83,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s0, s2, 1 -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_lshl_b32 s0, s1, 4 +; GFX9-NEXT: s_and_b32 s0, s2, 3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_lshl_b32 s0, s1, 4 +; GFX8-NEXT: s_and_b32 s0, s2, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_lshr_b32 s0, s2, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_lshl_b32 s0, s1, 4 +; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: s_ashr_i32 s1, s0, 31 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s0, s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX11-NEXT: s_and_b32 s0, s2, 1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, ptr addrspace(1) %ptr @@ -127,70 +159,66 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) ; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, ptr addrspace(1) %ptr %element = extractelement <4 x i16> %vector, i32 %idx @@ -198,48 +226,74 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) } define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) { -; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, ptr addrspace(4) %ptr @@ -248,22 +302,48 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg } define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i16_idx0: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_idx0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_idx0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, ptr addrspace(4) %ptr %element = extractelement <4 x i16> %vector, i32 0 @@ -271,25 +351,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i16_idx1: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 2 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_idx1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_idx1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, ptr addrspace(4) %ptr %element = extractelement <4 x i16> %vector, i32 1 @@ -297,25 +402,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i16_idx2: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 4 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_idx2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_idx2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, ptr addrspace(4) %ptr %element = extractelement <4 x i16> %vector, i32 2 @@ -323,25 +453,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i16_idx3: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s1, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 6 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_idx3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_idx3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:6 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, ptr addrspace(4) %ptr %element = extractelement <4 x i16> %vector, i32 3 @@ -352,14 +507,14 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -369,21 +524,21 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, ptr addrspace(1) %ptr @@ -395,17 +550,17 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i16_idx1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i16_idx1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i16_idx1: @@ -414,25 +569,22 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i16_idx1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i16_idx1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, ptr addrspace(1) %ptr %element = extractelement <4 x i16> %vector, i32 1 @@ -443,17 +595,17 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i16_idx2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i16_idx2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i16_idx2: @@ -462,25 +614,22 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i16_idx2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i16_idx2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, ptr addrspace(1) %ptr %element = extractelement <4 x i16> %vector, i32 2 @@ -491,17 +640,17 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i16_idx3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i16_idx3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i16_idx3: @@ -510,25 +659,22 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i16_idx3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i16_idx3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, ptr addrspace(1) %ptr %element = extractelement <4 x i16> %vector, i32 3 @@ -536,54 +682,74 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) { } define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { -; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_lshr_b32 s5, s4, 1 -; GCN-NEXT: s_cmp_eq_u32 s5, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_cmp_eq_u32 s5, 2 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_cmp_eq_u32 s5, 3 -; GCN-NEXT: s_cselect_b32 s0, s3, s0 -; GCN-NEXT: s_and_b32 s1, s4, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 4 -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s4, 7 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s4, 7 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_and_b32 s2, s4, 7 +; GFX7-NEXT: s_lshl_b32 s4, s2, 1 +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s0, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 2 -; GFX10-NEXT: s_cselect_b32 s0, s2, s0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 3 -; GFX10-NEXT: s_cselect_b32 s0, s3, s0 -; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s0, s4, 7 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_add_u32 s0, s2, s0 +; GFX10-NEXT: s_addc_u32 s1, s3, s1 +; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s5, s4, 1 +; GFX11-NEXT: s_and_b32 s0, s4, 7 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s0, s1, s0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: s_cselect_b32 s0, s2, s0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: s_cselect_b32 s0, s3, s0 -; GFX11-NEXT: s_and_b32 s1, s4, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 %idx @@ -593,92 +759,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s0, s2, 1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_and_b32 s0, s2, 7 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: s_lshl_b32 s0, s1, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_and_b32 s0, s2, 7 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_lshl_b32 s0, s1, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_lshr_b32 s0, s2, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_and_b32 s0, s2, 7 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: s_ashr_i32 s1, s0, 31 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: s_lshl_b32 s0, s1, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s0, s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_and_b32 s0, s2, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 -; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 -; GFX11-NEXT: s_and_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(1) %ptr @@ -690,91 +835,66 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) ; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 %idx @@ -782,64 +902,74 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) } define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) { -; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr @@ -848,22 +978,48 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx0: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 0 @@ -871,25 +1027,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx1: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 2 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 1 @@ -897,25 +1078,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx2: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 4 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 2 @@ -923,25 +1129,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx3: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s1, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 6 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:6 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 3 @@ -949,25 +1180,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx4: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s2 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 8 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 4 @@ -975,25 +1231,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx5: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s2, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 10 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx5: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s2, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:10 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:10 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 5 @@ -1001,25 +1282,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx6: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s3 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 12 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx6: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 6 @@ -1027,25 +1333,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr } define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i16_idx7: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s3, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 14 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_idx7: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:14 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_idx7: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s3, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:14 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, ptr addrspace(4) %ptr %element = extractelement <8 x i16> %vector, i32 7 @@ -1056,14 +1387,14 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1073,21 +1404,21 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr @@ -1099,17 +1430,17 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx1: @@ -1118,25 +1449,22 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 1 @@ -1147,17 +1475,17 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx2: @@ -1166,25 +1494,22 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 2 @@ -1195,17 +1520,17 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx3: @@ -1214,25 +1539,22 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 3 @@ -1243,17 +1565,17 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx4: @@ -1262,25 +1584,22 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 4 @@ -1291,17 +1610,17 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx5: @@ -1310,25 +1629,22 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:10 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx5: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:10 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 5 @@ -1339,17 +1655,17 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx6: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx6: @@ -1358,25 +1674,22 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:12 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx6: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:12 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx6: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:12 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 6 @@ -1387,17 +1700,17 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i16_idx7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:14 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx7: @@ -1406,27 +1719,26 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_idx7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:14 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_idx7: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:14 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, ptr addrspace(1) %ptr %element = extractelement <8 x i16> %vector, i32 7 ret i16 %element } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index c7f49d526fac0..c2394ec461490 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -6,32 +6,68 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { -; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NEXT: s_and_b32 s1, s4, 3 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s4, 3 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s4, 3 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_and_b32 s4, s4, 3 +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i8_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_and_b32 s1, s4, 3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s0, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_add_u32 s0, s2, s0 +; GFX10-NEXT: s_addc_u32 s1, s3, s1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i8_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_and_b32 s1, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s1, s1, 3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, ptr addrspace(4) %ptr %element = extractelement <4 x i8> %vector, i32 %idx @@ -41,55 +77,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %p define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_vgpr_v4i8_sgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_and_b32 s0, s2, 3 -; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v4i8_sgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_and_b32 s0, s2, 3 -; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_vgpr_v4i8_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_and_b32 s0, s2, 3 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: s_ashr_i32 s1, s0, 31 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_vgpr_v4i8_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s2, 3 -; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_vgpr_v4i8_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_and_b32 s0, s2, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, ptr addrspace(1) %ptr @@ -101,55 +147,60 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i8_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i8_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, ptr addrspace(1) %ptr %element = extractelement <4 x i8> %vector, i32 %idx @@ -159,53 +210,67 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) { define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) { ; GFX9-LABEL: extractelement_sgpr_v4i8_vgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_sgpr_v4i8_vgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_sgpr_v4i8_vgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i8_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX10-NEXT: v_and_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 3, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, ptr addrspace(4) %ptr @@ -214,22 +279,48 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %p } define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i8_idx0: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i8_idx0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i8_idx0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, ptr addrspace(4) %ptr %element = extractelement <4 x i8> %vector, i32 0 @@ -237,25 +328,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i8_idx1: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i8_idx1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i8_idx1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 8 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, ptr addrspace(4) %ptr %element = extractelement <4 x i8> %vector, i32 1 @@ -263,25 +379,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i8_idx2: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 2 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i8_idx2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i8_idx2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, ptr addrspace(4) %ptr %element = extractelement <4 x i8> %vector, i32 2 @@ -289,25 +430,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v4i8_idx3: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 24 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 3 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i8_idx3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i8_idx3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, ptr addrspace(4) %ptr %element = extractelement <4 x i8> %vector, i32 3 @@ -318,14 +484,14 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i8_idx0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i8_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -335,21 +501,21 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i8_idx0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i8_idx0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, ptr addrspace(1) %ptr @@ -361,17 +527,17 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i8_idx1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i8_idx1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i8_idx1: @@ -380,25 +546,22 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i8_idx1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i8_idx1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, ptr addrspace(1) %ptr %element = extractelement <4 x i8> %vector, i32 1 @@ -409,17 +572,17 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i8_idx2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i8_idx2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i8_idx2: @@ -428,25 +591,22 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i8_idx2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i8_idx2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, ptr addrspace(1) %ptr %element = extractelement <4 x i8> %vector, i32 2 @@ -457,17 +617,17 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v4i8_idx3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i8_idx3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i8_idx3: @@ -476,25 +636,22 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i8_idx3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i8_idx3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, ptr addrspace(1) %ptr %element = extractelement <4 x i8> %vector, i32 3 @@ -502,42 +659,68 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) { } define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { -; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_lshr_b32 s2, s4, 2 -; GCN-NEXT: s_cmp_eq_u32 s2, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_and_b32 s1, s4, 3 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s4, 7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s4, 7 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_and_b32 s4, s4, 7 +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s2, s4, 2 -; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s0, s1, s0 -; GFX10-NEXT: s_and_b32 s1, s4, 3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 3 -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s0, s4, 7 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_add_u32 s0, s2, s0 +; GFX10-NEXT: s_addc_u32 s1, s3, s1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s2, s4, 2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_eq_u32 s2, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s0, s1, s0 -; GFX11-NEXT: s_and_b32 s1, s4, 3 -; GFX11-NEXT: s_lshl_b32 s1, s1, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s4, 7 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 %idx @@ -547,71 +730,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %p define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s0, s2, 2 -; GFX9-NEXT: s_and_b32 s1, s2, 3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_lshl_b32 s0, s1, 3 +; GFX9-NEXT: s_and_b32 s0, s2, 7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v8i8_sgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s0, s2, 2 -; GFX8-NEXT: s_and_b32 s1, s2, 3 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_lshl_b32 s0, s1, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 7 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_vgpr_v8i8_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_lshr_b32 s0, s2, 2 -; GFX7-NEXT: s_and_b32 s1, s2, 3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_lshl_b32 s0, s1, 3 +; GFX7-NEXT: s_and_b32 s0, s2, 7 +; GFX7-NEXT: s_ashr_i32 s1, s0, 31 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_vgpr_v8i8_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: s_and_b32 s0, s2, 3 -; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: s_and_b32 s0, s2, 7 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_vgpr_v8i8_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s0, s2, 2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX11-NEXT: s_and_b32 s0, s2, 3 -; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_and_b32 s0, s2, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(1) %ptr @@ -623,70 +800,60 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 %idx @@ -694,48 +861,69 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) { } define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) { -; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v2, 7, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_and_b32_e32 v2, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 7, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr @@ -744,22 +932,48 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx0: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 0 @@ -767,25 +981,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx1: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 8 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 1 @@ -793,25 +1032,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx2: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 2 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 2 @@ -819,25 +1083,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx3: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s0, 24 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 3 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx3: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx3: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 3 @@ -845,25 +1134,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx4: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 4 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx4: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx4: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 4 @@ -871,25 +1185,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx5: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s1, 8 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 5 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx5: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx5: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s1, 8 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 5 @@ -897,25 +1236,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx6: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s1, 16 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 6 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx6: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx6: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:6 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 6 @@ -923,25 +1287,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr) } define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(ptr addrspace(4) inreg %ptr) { -; GCN-LABEL: extractelement_sgpr_v8i8_idx7: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, s1, 24 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i8_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i8_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s2, 7 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i8_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i8_idx7: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s1, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i8_idx7: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s0, s1, 24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:7 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, ptr addrspace(4) %ptr %element = extractelement <8 x i8> %vector, i32 7 @@ -952,14 +1341,14 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -969,21 +1358,21 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr @@ -995,17 +1384,17 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx1: @@ -1014,25 +1403,22 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 1 @@ -1043,17 +1429,17 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx2: @@ -1062,25 +1448,22 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 2 @@ -1091,17 +1474,17 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx3: @@ -1110,25 +1493,22 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 3 @@ -1139,17 +1519,17 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx4: @@ -1158,25 +1538,22 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 4 @@ -1187,17 +1564,17 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 5, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx5: @@ -1206,25 +1583,22 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx5: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 5 @@ -1235,17 +1609,17 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx6: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx6: @@ -1254,25 +1628,22 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx6: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx6: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:6 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 6 @@ -1283,17 +1654,17 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v8i8_idx7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i8_idx7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 7, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i8_idx7: @@ -1302,25 +1673,22 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i8_idx7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i8_idx7: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:7 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, ptr addrspace(1) %ptr %element = extractelement <8 x i8> %vector, i32 7 @@ -1328,54 +1696,68 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) { } define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) { -; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_lshr_b32 s5, s4, 2 -; GCN-NEXT: s_cmp_eq_u32 s5, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_cmp_eq_u32 s5, 2 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_cmp_eq_u32 s5, 3 -; GCN-NEXT: s_cselect_b32 s0, s3, s0 -; GCN-NEXT: s_and_b32 s1, s4, 3 -; GCN-NEXT: s_lshl_b32 s1, s1, 3 -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v16i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s4, 15 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_u32 s0, s2, s0 +; GFX9-NEXT: s_addc_u32 s1, s3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v16i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s4, 15 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: s_add_u32 s0, s2, s0 +; GFX8-NEXT: s_addc_u32 s1, s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v16i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s5, s4, 2 -; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s0, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 2 -; GFX10-NEXT: s_cselect_b32 s0, s2, s0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 3 -; GFX10-NEXT: s_cselect_b32 s0, s3, s0 -; GFX10-NEXT: s_and_b32 s1, s4, 3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 3 -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s0, s4, 15 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: s_add_u32 s0, s2, s0 +; GFX10-NEXT: s_addc_u32 s1, s3, s1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s5, s4, 2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s0, s1, s0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: s_cselect_b32 s0, s2, s0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: s_cselect_b32 s0, s3, s0 -; GFX11-NEXT: s_and_b32 s1, s4, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s1, s1, 3 -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s0, s4, 15 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <16 x i8>, ptr addrspace(4) %ptr %element = extractelement <16 x i8> %vector, i32 %idx @@ -1385,92 +1767,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg % define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) { ; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s0, s2, 2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s2, 3 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: s_lshl_b32 s0, s1, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v16i8_sgpr_idx: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_lshr_b32 s0, s2, 2 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: s_ashr_i32 s1, s0, 31 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_lshl_b32 s0, s1, 3 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_vgpr_v16i8_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_lshr_b32 s0, s2, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: s_and_b32 s0, s2, 15 +; GFX7-NEXT: s_ashr_i32 s1, s0, 31 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: s_lshl_b32 s0, s1, 3 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_vgpr_v16i8_sgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 -; GFX10-NEXT: s_and_b32 s0, s2, 3 -; GFX10-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_vgpr_v16i8_sgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s0, s2, 2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_ashr_i32 s1, s0, 31 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 -; GFX11-NEXT: s_and_b32 s0, s2, 3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshl_b32 s0, s0, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <16 x i8>, ptr addrspace(1) %ptr @@ -1482,91 +1837,60 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) { ; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 %idx @@ -1574,64 +1898,69 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) { } define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) { -; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v16i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v16i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v16i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <16 x i8>, ptr addrspace(4) %ptr @@ -1643,14 +1972,14 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1660,21 +1989,21 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx0: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr @@ -1686,17 +2015,17 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx1: @@ -1705,25 +2034,22 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 1 @@ -1734,17 +2060,17 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx2: @@ -1753,25 +2079,22 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 2 @@ -1782,17 +2105,17 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx3: @@ -1801,25 +2124,22 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx3: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 3 @@ -1830,17 +2150,17 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx4: @@ -1849,25 +2169,22 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 4 @@ -1878,17 +2195,17 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 5, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx5: @@ -1897,25 +2214,22 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx5: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 5 @@ -1926,17 +2240,17 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx6: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx6: @@ -1945,25 +2259,22 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx6: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx6: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:6 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 6 @@ -1974,17 +2285,17 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 7, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx7: @@ -1993,25 +2304,22 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx7: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:7 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 7 @@ -2022,17 +2330,17 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx8: @@ -2041,25 +2349,22 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 8 @@ -2070,17 +2375,17 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx9: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:9 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx9: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 9, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx9: @@ -2089,25 +2394,22 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:9 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx9: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:9 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx9: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:9 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 9 @@ -2118,17 +2420,17 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx10: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx10: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx10: @@ -2137,25 +2439,22 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:10 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx10: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:10 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx10: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:10 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 10 @@ -2166,17 +2465,17 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx11: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx11: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 11, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx11: @@ -2185,25 +2484,22 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx11: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:11 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx11: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:11 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 11 @@ -2214,17 +2510,17 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx12: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx12: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx12: @@ -2233,25 +2529,22 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:12 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx12: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:12 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx12: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:12 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 12 @@ -2262,17 +2555,17 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx13: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:13 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx13: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 13, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx13: @@ -2281,25 +2574,22 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:13 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx13: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:13 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx13: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:13 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 13 @@ -2310,17 +2600,17 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx14: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:14 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx14: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx14: @@ -2329,25 +2619,22 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx14: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:14 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx14: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:14 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 14 @@ -2358,17 +2645,17 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) { ; GFX9-LABEL: extractelement_vgpr_v16i8_idx15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v16i8_idx15: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v16i8_idx15: @@ -2377,27 +2664,26 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:15 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v16i8_idx15: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:15 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v16i8_idx15: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:15 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, ptr addrspace(1) %ptr %element = extractelement <16 x i8> %vector, i32 15 ret i8 %element } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 8bf34caea4051..d4c536bdd5ebe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -4738,35 +4738,31 @@ define i32 @v_extract_v64i32_7(ptr addrspace(1) %ptr) { ; GPRIDX-LABEL: v_extract_v64i32_7: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:28 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v7 ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; ; MOVREL-LABEL: v_extract_v64i32_7: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 28, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; MOVREL-NEXT: flat_load_dword v0, v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) -; MOVREL-NEXT: v_mov_b32_e32 v0, v7 ; MOVREL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_extract_v64i32_7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:28 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_extract_v64i32_7: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:28 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr %elt = extractelement <64 x i32> %vec, i32 7 @@ -4777,7 +4773,7 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) { ; GPRIDX-LABEL: v_extract_v64i32_32: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128 +; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:128 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; @@ -4786,21 +4782,21 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) { ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; MOVREL-NEXT: flat_load_dword v0, v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_extract_v64i32_32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:128 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_extract_v64i32_32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:128 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr @@ -4812,35 +4808,31 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) { ; GPRIDX-LABEL: v_extract_v64i32_33: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128 +; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:132 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; ; MOVREL-LABEL: v_extract_v64i32_33: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x84, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; MOVREL-NEXT: flat_load_dword v0, v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) -; MOVREL-NEXT: v_mov_b32_e32 v0, v1 ; MOVREL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_extract_v64i32_33: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:132 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_extract_v64i32_33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:132 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr %elt = extractelement <64 x i32> %vec, i32 33 @@ -4851,35 +4843,31 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) { ; GPRIDX-LABEL: v_extract_v64i32_37: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 +; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:148 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v5 ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; ; MOVREL-LABEL: v_extract_v64i32_37: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x90, v0 +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x94, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; MOVREL-NEXT: flat_load_dword v0, v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) -; MOVREL-NEXT: v_mov_b32_e32 v0, v5 ; MOVREL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_extract_v64i32_37: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:148 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_extract_v64i32_37: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:144 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:148 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, ptr addrspace(1) %ptr %elt = extractelement <64 x i32> %vec, i32 37