Skip to content

Commit c444af1

Browse files
committed
[CostModel][X86] Add CostKinds handling for mul ops
This was achieved using the 'cost-tables vs llvm-mca' script (D103695). Also fix a missing pmullw v16i16 half-rate throughput, as znver1 double-pumps - this matches the numbers from AMD SoG + Agner.
1 parent 28e5e3d commit c444af1

24 files changed

+2307
-818
lines changed

llvm/lib/Target/X86/X86ScheduleZnver1.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ defm : ZnWriteResFpuPair<WriteVecALUY, [ZnFPU], 1, [2], 2>;
422422
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
423423
defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
424424
defm : ZnWriteResFpuPair<WriteVecIMulX, [ZnFPU0], 4>;
425-
defm : ZnWriteResFpuPair<WriteVecIMulY, [ZnFPU0], 4>;
425+
defm : ZnWriteResFpuPair<WriteVecIMulY, [ZnFPU0], 4, [2], 2>;
426426
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
427427
defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4, [2]>;
428428
defm : ZnWriteResFpuPair<WritePMULLDY, [ZnFPU0], 4, [4], 2>;

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -341,8 +341,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
341341
return LT.first * KindCost.value();
342342

343343
static const CostKindTblEntry SLMCostTable[] = {
344-
{ ISD::MUL, MVT::v4i32, { 11 } }, // pmulld
345-
{ ISD::MUL, MVT::v8i16, { 2 } }, // pmullw
344+
{ ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
345+
{ ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
346346
{ ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
347347
{ ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
348348
{ ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
@@ -358,7 +358,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
358358
// slm muldq version throughput is 2 and addq throughput 4
359359
// thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
360360
// 2X4 (addq throughput) = 17
361-
{ ISD::MUL, MVT::v2i64, { 17 } },
361+
{ ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
362362
// slm addq\subq throughput is 4
363363
{ ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
364364
{ ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
@@ -629,9 +629,9 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
629629
return LT.first * KindCost.value();
630630

631631
static const CostKindTblEntry AVX512DQCostTable[] = {
632-
{ ISD::MUL, MVT::v2i64, { 2 } }, // pmullq
633-
{ ISD::MUL, MVT::v4i64, { 2 } }, // pmullq
634-
{ ISD::MUL, MVT::v8i64, { 2 } } // pmullq
632+
{ ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
633+
{ ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
634+
{ ISD::MUL, MVT::v8i64, { 2, 15, 1, 3 } } // pmullq
635635
};
636636

637637
// Look for AVX512DQ lowering tricks for custom cases.
@@ -656,6 +656,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
656656
{ ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
657657
{ ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
658658

659+
{ ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
660+
659661
{ ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
660662
{ ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
661663
{ ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
@@ -711,10 +713,10 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
711713
{ ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
712714
{ ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
713715

714-
{ ISD::MUL, MVT::v16i32, { 1 } }, // pmulld (Skylake from agner.org)
715-
{ ISD::MUL, MVT::v8i32, { 1 } }, // pmulld (Skylake from agner.org)
716-
{ ISD::MUL, MVT::v4i32, { 1 } }, // pmulld (Skylake from agner.org)
717-
{ ISD::MUL, MVT::v8i64, { 6 } }, // 3*pmuludq/3*shift/2*add
716+
{ ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
717+
{ ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
718+
{ ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
719+
{ ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
718720
{ ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
719721

720722
{ ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
@@ -906,9 +908,11 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
906908
{ ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
907909
{ ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
908910

909-
{ ISD::MUL, MVT::v16i16, { 1 } }, // pmullw
910-
{ ISD::MUL, MVT::v8i32, { 4 } }, // pmulld
911-
{ ISD::MUL, MVT::v4i64, { 6 } }, // 3*pmuludq/3*shift/2*add
911+
{ ISD::MUL, MVT::v16i16, { 2, 5, 1, 1 } }, // pmullw
912+
{ ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
913+
{ ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
914+
{ ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
915+
{ ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
912916

913917
{ ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
914918
{ ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
@@ -952,9 +956,10 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
952956
// We don't have to scalarize unsupported ops. We can issue two half-sized
953957
// operations and we only need to extract the upper YMM half.
954958
// Two ops + 1 extract + 1 insert = 4.
955-
{ ISD::MUL, MVT::v16i16, { 4 } },
956-
{ ISD::MUL, MVT::v8i32, { 5 } }, // BTVER2 from http://www.agner.org/
957-
{ ISD::MUL, MVT::v4i64, { 12 } },
959+
{ ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
960+
{ ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
961+
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
962+
{ ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
958963

959964
{ ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
960965
{ ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
@@ -1062,7 +1067,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
10621067
{ ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
10631068
{ ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
10641069

1065-
{ ISD::MUL, MVT::v2i64, { 6 } } // 3*pmuludq/3*shift/2*add
1070+
{ ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
10661071
};
10671072

10681073
if (ST->hasSSE42())
@@ -1083,7 +1088,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
10831088
{ ISD::SRA, MVT::v16i8, { 21 } }, // pblendvb sequence.
10841089
{ ISD::SRA, MVT::v8i16, { 13 } }, // pblendvb sequence.
10851090

1086-
{ ISD::MUL, MVT::v4i32, { 2 } } // pmulld (Nehalem from agner.org)
1091+
{ ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
10871092
};
10881093

10891094
if (ST->hasSSE41())
@@ -1127,9 +1132,9 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
11271132
{ ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
11281133
{ ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
11291134

1130-
{ ISD::MUL, MVT::v8i16, { 1 } }, // pmullw
1131-
{ ISD::MUL, MVT::v4i32, { 6 } }, // 3*pmuludq/4*shuffle
1132-
{ ISD::MUL, MVT::v2i64, { 8 } }, // 3*pmuludq/3*shift/2*add
1135+
{ ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
1136+
{ ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
1137+
{ ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
11331138

11341139
{ ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
11351140
{ ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/

llvm/test/Analysis/CostModel/X86/arith-fix.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ define i32 @smul(i32 %arg) {
103103
; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
104104
; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
105105
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
106-
; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
107-
; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
108-
; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
106+
; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
107+
; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
108+
; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
109109
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
110110
;
111111
; AVX512F-LABEL: 'smul'
@@ -122,7 +122,7 @@ define i32 @smul(i32 %arg) {
122122
; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
123123
; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
124124
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
125-
; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
125+
; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
126126
; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
127127
; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
128128
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -141,7 +141,7 @@ define i32 @smul(i32 %arg) {
141141
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
142142
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
143143
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
144-
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
144+
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
145145
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
146146
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
147147
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -160,7 +160,7 @@ define i32 @smul(i32 %arg) {
160160
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
161161
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
162162
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
163-
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
163+
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
164164
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
165165
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V64I8 = call <64 x i8> @llvm.smul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
166166
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -337,9 +337,9 @@ define i32 @umul(i32 %arg) {
337337
; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
338338
; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
339339
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
340-
; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
341-
; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
342-
; AVX2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
340+
; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
341+
; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
342+
; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
343343
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
344344
;
345345
; AVX512F-LABEL: 'umul'
@@ -356,7 +356,7 @@ define i32 @umul(i32 %arg) {
356356
; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
357357
; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
358358
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
359-
; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
359+
; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
360360
; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
361361
; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
362362
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -375,7 +375,7 @@ define i32 @umul(i32 %arg) {
375375
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
376376
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
377377
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
378-
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
378+
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
379379
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
380380
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
381381
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -394,7 +394,7 @@ define i32 @umul(i32 %arg) {
394394
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
395395
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
396396
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
397-
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
397+
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
398398
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
399399
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V64I8 = call <64 x i8> @llvm.umul.fix.v64i8(<64 x i8> undef, <64 x i8> undef, i32 3)
400400
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef

0 commit comments

Comments
 (0)