diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index adf4e0139e03c..4bf68d2934256 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -384,7 +384,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64, - MVT::v16f64, MVT::v16i64}, + MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16}, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 7d457edad0d5c..413b3b5afa57a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -165,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass); + addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -269,13 +271,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
for (MVT VT : - {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, - MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, - MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, - MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32}) { + {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, + MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, + MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, + MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, + MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, + MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, + MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -553,8 +555,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (STI.hasMadF16()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + for (MVT VT : + {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, + MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -640,6 +643,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16f16, Promote); AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v32i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32); + setOperationAction(ISD::LOAD, MVT::v32f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::v32i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32); + 
setOperationAction(ISD::STORE, MVT::v32f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); @@ -662,12 +675,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, - {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom); + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Custom); setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, - {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand); + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Expand); - for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v32i16, MVT::v32f16}) { setOperationAction( {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, Vec16, Custom); @@ -690,10 +706,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, - MVT::v16f16, MVT::v16i16}, + MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16}, Custom); - for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16}) + for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) // Split vector operations. setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, @@ -701,7 +717,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::SSUBSAT}, VT, Custom); - for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16}) + for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) // Split vector operations. 
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, VT, Custom); @@ -737,7 +753,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, -MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}, +MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v32i16, MVT::v32f16}, Custom); setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); @@ -5107,7 +5124,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -5130,7 +5147,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -5897,7 +5914,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || + VT == MVT::v32f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -6415,7 +6433,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (VecSize == 128 || VecSize == 256) { + if (VecSize == 128 || VecSize == 256 || VecSize == 512) { SDValue Lo, Hi; EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); @@ -6428,9 +6446,7 @@ Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, DAG.getConstant(1, SL, MVT::i32))); - } else { - assert(VecSize == 256); - + } else if (VecSize == 256) { SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); SDValue Parts[4]; for (unsigned P = 0; P < 4; ++P) { @@ -6442,6 +6458,22 @@ Parts[0], Parts[1])); Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, Parts[2], Parts[3])); + } else { + assert(VecSize == 512); + + SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); + SDValue Parts[8]; + for (unsigned P = 0; P < 8; ++P) { + Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(P, SL, MVT::i32)); + } + + Lo = DAG.getBitcast(LoVT, + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, + Parts[0], Parts[1], Parts[2], Parts[3])); + Hi = DAG.getBitcast(HiVT, + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, + Parts[4], Parts[5], Parts[6], Parts[7])); } EVT IdxVT = Idx.getValueType(); @@ -6607,6 +6639,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } + if (VT == MVT::v32i16 || VT == MVT::v32f16) { + EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 8); + MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + + SmallVector<SDValue, 4> Parts[8]; + for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) { + for (unsigned P = 0; P < 8; ++P) + Parts[P].push_back(Op.getOperand(I + P * E)); + } + SDValue Casts[8]; + for (unsigned P = 0; P < 8; ++P) { + SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); + Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); + } + + SDValue Blend = + DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts); + return
DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); @@ -9507,7 +9560,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || + VT.getSizeInBits() == 512) return splitTernaryVectorOp(Op, DAG); assert(VT.getSizeInBits() == 64); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 3cd0821b0f86c..e56269438472e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1619,6 +1619,16 @@ def : BitConvert <v12i32, v12f32, VReg_384>; def : BitConvert <v12f32, v12i32, VReg_384>; // 512-bit bitcast +def : BitConvert <v32f16, v32i16, VReg_512>; +def : BitConvert <v32i16, v32f16, VReg_512>; +def : BitConvert <v32f16, v16i32, VReg_512>; +def : BitConvert <v32f16, v16f32, VReg_512>; +def : BitConvert <v16f32, v32f16, VReg_512>; +def : BitConvert <v16i32, v32f16, VReg_512>; +def : BitConvert <v32i16, v16i32, VReg_512>; +def : BitConvert <v32i16, v16f32, VReg_512>; +def : BitConvert <v16f32, v32i16, VReg_512>; +def : BitConvert <v16i32, v32i16, VReg_512>; def : BitConvert <v16i32, v16f32, VReg_512>; def : BitConvert <v16f32, v16i32, VReg_512>; def : BitConvert <v8i64, v8f64, VReg_512>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 304ee53cc5a87..7ea2280c474b0 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -930,7 +930,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>; let GlobalPriority = true in { -defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>; defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; } @@ -984,7 +984,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>; defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>; let
GlobalPriority = true in { -defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; +defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll index 2bebc5ed9b53b..2a966b4ea178f 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll @@ -76,7 +76,7 @@ define amdgpu_kernel void @add_i16() #0 { ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef -; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef +; FAST16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW16-LABEL: 'add_i16' @@ -98,7 +98,7 @@ define amdgpu_kernel void @add_i16() #0 { ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; 
; SLOW16-SIZE-LABEL: 'add_i16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll index b57f26cdc2928..564bc4912af7d 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll @@ -57,8 +57,8 @@ define i32 @add(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -115,8 +115,8 @@ define i32 @add(i32 %arg) { ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> 
@llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -237,8 +237,8 @@ define i32 @sub(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an 
estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -295,8 +295,8 @@ define i32 @sub(i32 %arg) { ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> 
undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll index b1ff4a4a0acb1..d6481caef916d 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll @@ -57,8 +57,8 @@ define i32 @add(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> 
undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -115,8 +115,8 @@ define i32 @add(i32 %arg) { ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> 
@llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -237,8 +237,8 @@ define i32 @sub(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -295,8 +295,8 @@ define i32 @sub(i32 %arg) { ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found 
an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index d22d8a98b4a43..55994d865fa6c 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -115,7 +115,7 @@ define amdgpu_kernel void @fadd_f16() #0 { ; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: 
Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f16' @@ -135,7 +135,7 @@ define amdgpu_kernel void @fadd_f16() #0 { ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fadd_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll index 2830bfcdaed20..911b4319eaa4e 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -227,7 +227,7 @@ define amdgpu_kernel void @fdiv_f16_f32ieee() #0 { ; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-NEXT: Cost Model: Found an estimated cost of 1152 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ieee' @@ -247,7 +247,7 
@@ define amdgpu_kernel void @fdiv_f16_f32ieee() #0 { ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f16 = fdiv half undef, undef @@ -278,7 +278,7 @@ define amdgpu_kernel void @fdiv_f16_f32ftzdaz() #1 { ; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-NEXT: Cost Model: Found an estimated cost of 1152 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ftzdaz' @@ -298,7 +298,7 @@ define amdgpu_kernel void @fdiv_f16_f32ftzdaz() #1 { ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 512 
for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f16 = fdiv half undef, undef diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll index a9f1210a598f0..ab4e98201f6d7 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -145,7 +145,7 @@ define amdgpu_kernel void @fma_f16() #0 { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2 ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2 ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2 -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 +; FAST-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_f16' @@ -165,7 +165,7 @@ define amdgpu_kernel void @fma_f16() #0 { ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2 ; FAST-SIZE-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2 -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index c8dab09e0dbf7..2e4a9c70f3717 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -115,7 +115,7 @@ define amdgpu_kernel void @fmul_f16() #0 { ; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef ; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef ; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f16' @@ -135,7 +135,7 @@ define amdgpu_kernel void @fmul_f16() #0 { ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 
x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index b3bf580e75e66..4e71a71326bad 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -115,7 +115,7 @@ define amdgpu_kernel void @fsub_f16() #0 { ; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f16' @@ -135,7 +135,7 @@ define amdgpu_kernel void @fsub_f16() #0 { ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub 
<17 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fsub_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll index 1444db7248330..e6193791ff53a 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll @@ -79,7 +79,7 @@ define amdgpu_kernel void @mul_i16() #0 { ; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = mul <4 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5i16 = mul <5 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = mul <16 x i16> undef, undef -; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v17i16 = mul <17 x i16> undef, undef +; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17i16 = mul <17 x i16> undef, undef ; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW16-SIZE-LABEL: 'mul_i16' @@ -99,7 +99,7 @@ define amdgpu_kernel void @mul_i16() #0 { ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = mul <4 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = mul <5 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = mul <16 x i16> undef, undef -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17i16 = mul <17 x i16> undef, undef +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17i16 = mul <17 x i16> undef, undef ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %i16 = mul i16 undef, undef @@ -144,7 +144,7 @@ define i32 @mul_constpow2() { ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %I16 = mul i16 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -182,7 +182,7 @@ define i32 @mul_constpow2() { ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, @@ -244,7 +244,7 @@ define i32 @mul_uniformconstpow2() { ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 
for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -282,7 +282,7 @@ define i32 @mul_uniformconstpow2() { ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, @@ -344,7 +344,7 @@ define i32 @mul_constnegpow2() { ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an 
estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -382,7 +382,7 @@ define i32 @mul_constnegpow2() { ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, @@ -444,7 +444,7 @@ define i32 @mul_uniformconstnegpow2() { ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16 ; FAST16-NEXT: Cost 
Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, @@ -482,7 +482,7 @@ define i32 @mul_uniformconstnegpow2() { ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, -; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i16 = mul <32 x i16> undef, +; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll index e81c12126953d..ba9d9f75230d1 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -18,14 +18,6 @@ define amdgpu_kernel void @v_input_output_i8() { ret void } -; GCN: error: couldn't allocate output register for constraint 's' -; GCN: error: couldn't allocate input reg for constraint 's' -define amdgpu_kernel void @s_input_output_v32f16() { - %v = tail call <32 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() - tail call void asm sideeffect "; use $0", "s"(<32 x half> %v) - ret void -} - ; SICI: error: couldn't allocate output register for constraint 's' ; SICI: error: couldn't allocate input reg for constraint 's' ; VI-NOT: error diff --git 
a/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll index 2093324622225..c3c08b55f607d 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-v16.ll @@ -90,4 +90,26 @@ define amdgpu_kernel void @v_input_output_v16i16() { ret void } +; GCN-LABEL: {{^}}v_input_output_v32f16 +; GCN: v_mov_b32 v[0:15], -1 +; GCN: ; use v[0:15] +; INVALID: error: couldn't allocate output register for constraint 'v' +; INVALID: error: couldn't allocate input reg for constraint 'v' +define amdgpu_kernel void @v_input_output_v32f16() { + %v = tail call <32 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() + tail call void asm sideeffect "; use $0", "v"(<32 x half> %v) + ret void +} + +; GCN-LABEL: {{^}}v_input_output_v32i16 +; GCN: v_mov_b32 v[0:15], -1 +; GCN: ; use v[0:15] +; INVALID: error: couldn't allocate output register for constraint 'v' +; INVALID: error: couldn't allocate input reg for constraint 'v' +define amdgpu_kernel void @v_input_output_v32i16() { + %v = tail call <32 x i16> asm sideeffect "v_mov_b32 $0, -1", "=v"() + tail call void asm sideeffect "; use $0", "v"(<32 x i16> %v) + ret void +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index bee3d455187ca..5332da6827ec3 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2341,104 +2341,104 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s7, 16 -; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s1, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s3, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s2, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s7, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s11, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s13, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s12, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s14, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s15, 0xffff +; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 32 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s17 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 
@@ -6598,134 +6598,127 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s2, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s0, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s4, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s1, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s3, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s4, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s7, 
16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s9, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s11, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s15, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s15, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, 
s16, 0x90 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 16 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x90 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xe0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xc0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x60 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x80 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, 
s16, 0x60 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 48 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 @@ -6733,6 +6726,13 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7269,197 +7269,201 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s15 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s13 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, 
s11 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s9 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s56, s5 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s3 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s72, s0, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s1, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[20:21], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[26:27], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, 
s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s13 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s15 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[42:43], s[4:5], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[14:15], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 
s[48:49], s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xd0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xb0 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0x90 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0x70 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v2, s74 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 -; GCN-NOHSA-VI-NEXT: s_add_u32 s42, s16, 0x50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s43, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s43 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-VI-NEXT: s_add_u32 s36, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s37, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 
s[54:55], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s13, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s15, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xe0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xf0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NOHSA-VI-NEXT: s_add_u32 s14, s16, 0xe0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s15, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s15 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xc0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xd0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v4, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-VI-NEXT: s_add_u32 s12, s16, 0xc0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s13, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x80 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x90 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s16, 0x80 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 0x50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s16, 64 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v4, s16 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index e89c44d5b94a8..25a84e9e787fb 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -2767,45 +2767,45 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, 
v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v10 +; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -6108,29 +6108,29 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v8i16_to_v8i64: @@ -7522,95 +7522,95 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NOHSA-VI-NEXT: 
buffer_load_dwordx4 v[5:8], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[21:24], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v60, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v58, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; GCN-NOHSA-VI-NEXT: 
s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v55 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v55 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v34 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: 
v_lshrrev_b32_e32 v59, 16, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v57, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v32 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v55 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v55 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v50, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v55 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -8242,100 +8242,116 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, 
v[0:1] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v3 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[4:5] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v19, 0, 16 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[8:9] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v7 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[22:23], 48, v[12:13] -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[21:22], 48, v[2:3] +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v18 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[20:21], 48, v[6:7] -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v17, 0, 
16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[19:20], 48, v[10:11] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v8, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i64 v[18:19], 48, v[14:15] -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-VI-NEXT: 
v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v21, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v18, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 
16, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v17, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i16_to_v32i64: