Skip to content

Commit eb6429d

Browse files
committed
[CostModel][X86] Add fptoui v4f32->v4i32 + v8f32->v8i32 SSE/AVX costs
These conversions were previously using the (default) scalarized cost values.
1 parent e6ec7ab commit eb6429d

File tree

5 files changed

+49
-197
lines changed

5 files changed

+49
-197
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1877,12 +1877,12 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
18771877
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
18781878
{ ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
18791879
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
1880+
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 9 },
18801881
// This node is expanded into scalarized operations but BasicTTI is overly
18811882
// optimistic estimating its cost. It computes 3 per element (one
18821883
// vector-extract, one scalar conversion and one vector-insert). The
18831884
// problem is that the inserts form a read-modify-write chain so latency
18841885
// should be factored in too. Inflating the cost per element by 1.
1885-
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
18861886
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
18871887

18881888
{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
@@ -1985,6 +1985,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
19851985
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
19861986
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
19871987
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
1988+
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
19881989

19891990
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
19901991
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },

llvm/test/Analysis/CostModel/X86/fptoui.ll

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -233,28 +233,20 @@ define i32 @fptoui_float_i64(i32 %arg) {
233233
}
234234

235235
define i32 @fptoui_float_i32(i32 %arg) {
236-
; SSE2-LABEL: 'fptoui_float_i32'
237-
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
238-
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
239-
; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
240-
; SSE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
241-
; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
242-
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
243-
;
244-
; SSE42-LABEL: 'fptoui_float_i32'
245-
; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
246-
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
247-
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
248-
; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
249-
; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
250-
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
236+
; SSE-LABEL: 'fptoui_float_i32'
237+
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
238+
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
239+
; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
240+
; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
241+
; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
242+
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
251243
;
252244
; AVX-LABEL: 'fptoui_float_i32'
253245
; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
254246
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
255-
; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
256-
; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
257-
; AVX-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
247+
; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
248+
; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
249+
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
258250
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
259251
;
260252
; AVX512-LABEL: 'fptoui_float_i32'
@@ -264,14 +256,6 @@ define i32 @fptoui_float_i32(i32 %arg) {
264256
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
265257
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
266258
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
267-
;
268-
; SLM-LABEL: 'fptoui_float_i32'
269-
; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
270-
; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
271-
; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
272-
; SLM-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
273-
; SLM-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
274-
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
275259
;
276260
%I32 = fptoui float undef to i32
277261
%V2I32 = fptoui <2 x float> undef to <2 x i32>

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll

Lines changed: 5 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -61,55 +61,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
6161
}
6262

6363
define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
64-
; SSE-LABEL: @fptosi_fptoui(
65-
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
66-
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
67-
; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
68-
; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
69-
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
70-
; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
71-
; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
72-
; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
73-
; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
74-
; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
75-
; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
76-
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
77-
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
78-
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
79-
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
80-
; SSE-NEXT: ret <8 x i32> [[R7]]
81-
;
82-
; SLM-LABEL: @fptosi_fptoui(
83-
; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
84-
; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
85-
; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
86-
; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
87-
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
88-
; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
89-
; SLM-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
90-
; SLM-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
91-
; SLM-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
92-
; SLM-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
93-
; SLM-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
94-
; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
95-
; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
96-
; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
97-
; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
98-
; SLM-NEXT: ret <8 x i32> [[R7]]
99-
;
100-
; AVX-LABEL: @fptosi_fptoui(
101-
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
102-
; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
103-
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
104-
; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
105-
; AVX-NEXT: [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
106-
; AVX-NEXT: ret <8 x i32> [[R72]]
107-
;
108-
; AVX512-LABEL: @fptosi_fptoui(
109-
; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
110-
; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
111-
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
112-
; AVX512-NEXT: ret <8 x i32> [[TMP3]]
64+
; CHECK-LABEL: @fptosi_fptoui(
65+
; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
66+
; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
67+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
68+
; CHECK-NEXT: ret <8 x i32> [[TMP3]]
11369
;
11470
%a0 = extractelement <8 x float> %a, i32 0
11571
%a1 = extractelement <8 x float> %a, i32 1

llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll

Lines changed: 5 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -61,55 +61,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
6161
}
6262

6363
define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
64-
; SSE-LABEL: @fptosi_fptoui(
65-
; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
66-
; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
67-
; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
68-
; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
69-
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
70-
; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
71-
; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
72-
; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
73-
; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
74-
; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
75-
; SSE-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
76-
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
77-
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
78-
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
79-
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
80-
; SSE-NEXT: ret <8 x i32> [[R7]]
81-
;
82-
; SLM-LABEL: @fptosi_fptoui(
83-
; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
84-
; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
85-
; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
86-
; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
87-
; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
88-
; SLM-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
89-
; SLM-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
90-
; SLM-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
91-
; SLM-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
92-
; SLM-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
93-
; SLM-NEXT: [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
94-
; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
95-
; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
96-
; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
97-
; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
98-
; SLM-NEXT: ret <8 x i32> [[R7]]
99-
;
100-
; AVX-LABEL: @fptosi_fptoui(
101-
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
102-
; AVX-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
103-
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
104-
; AVX-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
105-
; AVX-NEXT: [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
106-
; AVX-NEXT: ret <8 x i32> [[R72]]
107-
;
108-
; AVX512-LABEL: @fptosi_fptoui(
109-
; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
110-
; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
111-
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
112-
; AVX512-NEXT: ret <8 x i32> [[TMP3]]
64+
; CHECK-LABEL: @fptosi_fptoui(
65+
; CHECK-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
66+
; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
67+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
68+
; CHECK-NEXT: ret <8 x i32> [[TMP3]]
11369
;
11470
%a0 = extractelement <8 x float> %a, i32 0
11571
%a1 = extractelement <8 x float> %a, i32 1

0 commit comments

Comments
 (0)