-
Notifications
You must be signed in to change notification settings - Fork 10.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] make v32i16/v32f16 legal #70484
Merged
Merged
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some upcoming intrinsics use these new types
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-amdgpu Author: Changpeng Fang (changpeng) Changes: Some upcoming intrinsics will be using these new types. Patch is 120.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70484.diff 17 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index adf4e0139e03c1d..4bf68d293425621 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -384,7 +384,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
- MVT::v16f64, MVT::v16i64},
+ MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16},
Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7d457edad0d5cdf..568f8078373fcfa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -165,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
+ addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -275,7 +277,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
- MVT::v32i32, MVT::v32f32}) {
+ MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -554,7 +556,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16,
+ MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -640,6 +643,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
+
setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
@@ -662,12 +675,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Custom);
setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Expand);
- for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16}) {
setOperationAction(
{ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
Vec16, Custom);
@@ -690,10 +706,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE,
{MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
- MVT::v16f16, MVT::v16i16},
+ MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
Custom);
- for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
+ for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
@@ -701,7 +717,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SSUBSAT},
VT, Custom);
- for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
+ for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
// Split vector operations.
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
VT, Custom);
@@ -737,8 +753,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT,
{MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
- Custom);
+ MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16}, Custom);
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
@@ -5107,7 +5123,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -5130,7 +5146,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -5897,7 +5913,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+ VT == MVT::v32f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -6415,7 +6432,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (VecSize == 128 || VecSize == 256) {
+ if (VecSize == 128 || VecSize == 256 || VecSize == 512 ) {
SDValue Lo, Hi;
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -6428,9 +6445,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Hi = DAG.getBitcast(HiVT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
DAG.getConstant(1, SL, MVT::i32)));
- } else {
- assert(VecSize == 256);
-
+ } else if (VecSize == 256) {
SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
SDValue Parts[4];
for (unsigned P = 0; P < 4; ++P) {
@@ -6442,6 +6457,20 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Parts[0], Parts[1]));
Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
Parts[2], Parts[3]));
+ } else {
+ assert(VecSize == 512);
+
+ SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
+ SDValue Parts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(P, SL, MVT::i32));
+ }
+
+ Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[0], Parts[1], Parts[2], Parts[3]));
+ Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[4], Parts[5],Parts[6], Parts[7]));
}
EVT IdxVT = Idx.getValueType();
@@ -6607,6 +6636,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
+ if (VT == MVT::v32i16 || VT == MVT::v32f16) {
+ EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 8);
+ MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
+
+ SmallVector<SDValue, 8> Parts[8];
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
+ for (unsigned P = 0; P < 8; ++P)
+ Parts[P].push_back(Op.getOperand(I + P * E));
+ }
+ SDValue Casts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
+ Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
+ }
+
+ SDValue Blend =
+ DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
+
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
@@ -9507,7 +9557,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+ if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
+ VT.getSizeInBits() == 512)
return splitTernaryVectorOp(Op, DAG);
assert(VT.getSizeInBits() == 64);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3cd0821b0f86c5f..e56269438472ee2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1619,6 +1619,16 @@ def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;
// 512-bit bitcast
+def : BitConvert <v32f16, v32i16, VReg_512>;
+def : BitConvert <v32i16, v32f16, VReg_512>;
+def : BitConvert <v32f16, v16i32, VReg_512>;
+def : BitConvert <v32f16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32f16, VReg_512>;
+def : BitConvert <v16i32, v32f16, VReg_512>;
+def : BitConvert <v32i16, v16i32, VReg_512>;
+def : BitConvert <v32i16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32i16, VReg_512>;
+def : BitConvert <v16i32, v32i16, VReg_512>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64, v8f64, VReg_512>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 304ee53cc5a87da..7ea2280c474b05e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -930,7 +930,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;
let GlobalPriority = true in {
-defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>;
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}
@@ -984,7 +984,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;
let GlobalPriority = true in {
-defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 2bebc5ed9b53bde..2a966b4ea178f27 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @add_i16() #0 {
; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
-; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef
+; FAST16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef
; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SLOW16-LABEL: 'add_i16'
@@ -98,7 +98,7 @@ define amdgpu_kernel void @add_i16() #0 {
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5i16 = add <5 x i16> undef, undef
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v6i16 = add <6 x i16> undef, undef
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = add <16 x i16> undef, undef
-; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v17i16 = add <17 x i16> undef, undef
+; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = add <17 x i16> undef, undef
; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLOW16-SIZE-LABEL: 'add_i16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
index b57f26cdc2928be..564bc4912af7d30 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
@@ -57,8 +57,8 @@ define i32 @add(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -115,8 +115,8 @@ define i32 @add(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -237,8 +237,8 @@ define i32 @sub(i32 %arg) {
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
@@ -295,8 +295,8 @@ define i32 @sub(i32 %arg) {
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT: Cos...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff 793e636ecb194f43a1fbcc7ccbe9b38888b96f0d 57b3c5726c5c25a429c745f45d4eb423b8181446 -- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.cpp View the diff from clang-format here.diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 413b3b5afa57..387018e58bb4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6473,7 +6473,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Parts[0], Parts[1], Parts[2], Parts[3]));
Hi = DAG.getBitcast(HiVT,
DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
- Parts[4], Parts[5],Parts[6], Parts[7]));
+ Parts[4], Parts[5], Parts[6], Parts[7]));
}
EVT IdxVT = Idx.getValueType();
|
Typos "v32" |
Some upcoming intrinsics use these new types
changpeng
changed the title
[AMDGPU] make w32i16/w32f16 legal
[AMDGPU] make v32i16/v32f16 legal
Oct 27, 2023
rampitec
approved these changes
Oct 27, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Some upcoming intrinsics will be using these new types