diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 4e509816716698..359fc55035fa21 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -984,65 +984,13 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, if (VWidth == 1) return nullptr; - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(II); - - // Assume the arguments are unchanged and later override them, if needed. - SmallVector Args(II->arg_begin(), II->arg_end()); + ConstantInt *NewDMask = nullptr; if (DMaskIdx < 0) { - // Buffer case. - - const unsigned ActiveBits = DemandedElts.getActiveBits(); - const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); - - // Start assuming the prefix of elements is demanded, but possibly clear some other bits if - // there are trailing zeros (unused components at front) and update offset. - DemandedElts = (1 << ActiveBits) - 1; - - if (UnusedComponentsAtFront > 0) { - static const unsigned InvalidOffsetIdx = 0xf; - - unsigned OffsetIdx; - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_raw_buffer_load: - case Intrinsic::amdgcn_raw_buffer_load_format: - OffsetIdx = 1; - break; - case Intrinsic::amdgcn_s_buffer_load: - // If resulting type is vec3, there is no point in trimming the - // load with updated offset, as the vec3 would most likely be widened to - // vec4 anyway during lowering. - if (ActiveBits == 4 && UnusedComponentsAtFront == 1) - OffsetIdx = InvalidOffsetIdx; - else - OffsetIdx = 1; - break; - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: - case Intrinsic::amdgcn_struct_buffer_load: - case Intrinsic::amdgcn_struct_buffer_load_format: - OffsetIdx = 2; - break; - default: - // TODO: handle *tbuffer* intrinsics. - OffsetIdx = InvalidOffsetIdx; - break; - } - - if (OffsetIdx != InvalidOffsetIdx) { - // Clear demanded bits and update the offset. - DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); - auto Offset = II->getArgOperand(OffsetIdx); - unsigned SingleComponentSizeInBits = getDataLayout().getTypeSizeInBits(II->getType()->getScalarType()); - unsigned OffsetAdd = UnusedComponentsAtFront * SingleComponentSizeInBits / 8; - auto OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); - Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal); - } - } + // Pretend that a prefix of elements is demanded to simplify the code + // below. + DemandedElts = (1 << DemandedElts.getActiveBits()) - 1; } else { - // Image case. - ConstantInt *DMask = cast(II->getArgOperand(DMaskIdx)); unsigned DMaskVal = DMask->getZExtValue() & 0xf; @@ -1061,7 +1009,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, } if (DMaskVal != NewDMaskVal) - Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); + NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal); } unsigned NewNumElts = DemandedElts.countPopulation(); @@ -1069,8 +1017,8 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, return UndefValue::get(II->getType()); if (NewNumElts >= VWidth && DemandedElts.isMask()) { - if (DMaskIdx >= 0) - II->setArgOperand(DMaskIdx, Args[DMaskIdx]); + if (NewDMask) + II->setArgOperand(DMaskIdx, NewDMask); return nullptr; } @@ -1093,6 +1041,16 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, OverloadTys[0] = NewTy; Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys); + SmallVector Args; + for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) + Args.push_back(II->getArgOperand(I)); + + if (NewDMask) + Args[DMaskIdx] = NewDMask; + + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(II); + CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); NewCall->takeName(II); NewCall->copyMetadata(*II); @@ -1761,7 +1719,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: case Intrinsic::amdgcn_raw_tbuffer_load: - case Intrinsic::amdgcn_s_buffer_load: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index 40797ec88ab0cb..389ca6010049c2 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -46,9 +46,9 @@ define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i3 } ; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 @@ -65,9 +65,9 @@ define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i3 } ; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 @@ -75,9 +75,9 @@ define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i3 } ; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 @@ -85,9 +85,9 @@ define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i3 } ; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 @@ -104,9 +104,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -114,9 +114,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -133,9 +133,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> } ; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> +; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -246,9 +246,9 @@ define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i3 } ; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 @@ -256,9 +256,9 @@ define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i3 } ; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 @@ -275,9 +275,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -427,9 +427,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -446,9 +446,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -456,9 +456,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -466,9 +466,9 @@ define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc } ; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -485,9 +485,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -495,9 +495,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> } ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -514,9 +514,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x } ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> +; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -543,9 +543,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -553,9 +553,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -572,9 +572,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -666,9 +666,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -685,9 +685,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -695,9 +695,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -705,9 +705,9 @@ define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -724,9 +724,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -734,9 +734,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 } ; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -753,9 +753,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f3 } ; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> +; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -782,9 +782,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -792,9 +792,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inre } ; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -811,9 +811,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 } ; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -901,9 +901,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %r } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -920,9 +920,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %r } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -930,9 +930,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %r } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -940,9 +940,9 @@ define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %r } ; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -959,9 +959,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i3 } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -969,9 +969,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i3 } ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -988,9 +988,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 } ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> +; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -1017,9 +1017,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %r } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -1027,9 +1027,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %r } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -1046,9 +1046,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i3 } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> @@ -1140,9 +1140,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> i } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 @@ -1159,9 +1159,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> i } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 @@ -1169,9 +1169,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> i } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 @@ -1179,9 +1179,9 @@ define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> i } ; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 @@ -1198,9 +1198,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32( } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -1208,9 +1208,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32( } ; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> @@ -1227,9 +1227,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v } ; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data +; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> +; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> @@ -1256,9 +1256,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> i } ; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 @@ -1266,9 +1266,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> i } ; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 +; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 @@ -1285,9 +1285,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32( } ; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data +; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) +; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> +; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32>