Skip to content

Commit

Permalink
[SLP]Initialize the lane with the given value instead of default 0.
Browse files Browse the repository at this point in the history
There is a bug in the reordering analysis stage. If the element with the
given hash has not yet been added to the map, but has the same number of
APOs and of instructions with the same parent (with a different
instruction opcode), it is initialized with default values and then its
counter is increased by 1. However, the lane is not updated and defaults
to 0 instead of the actual `Lane` value. As a result, the analysis is
useless in many cases: it defaults to lane 0 instead of the actual lane
with the minimum number of APO operands.

Differential Revision: https://reviews.llvm.org/D116690
  • Loading branch information
alexey-bataev committed Jan 6, 2022
1 parent 31c7165 commit 7cb19fe
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 27 deletions.
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -1417,7 +1417,11 @@ class BoUpSLP {
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
} else if (NumFreeOpsHash.NumOfAPOs == Min &&
NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
++HashMap[NumFreeOpsHash.Hash].first;
auto It = HashMap.find(NumFreeOpsHash.Hash);
if (It == HashMap.end())
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
else
++It->second.first;
}
}
// Select the lane with the minimum counter.
Expand Down
Expand Up @@ -68,10 +68,11 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[TMP6]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
Expand Down Expand Up @@ -207,8 +208,8 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
Expand Down
11 changes: 6 additions & 5 deletions llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
Expand Up @@ -68,10 +68,11 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[TMP6]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
Expand Down Expand Up @@ -207,8 +208,8 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 5, i32 0, i32 7, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
Expand Up @@ -17,7 +17,7 @@ define { <2 x float>, <2 x float> } @foo(%struct.sw* %v) {
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]]
; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> poison, [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], poison
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], poison
; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP8]], poison
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
Expand Down
27 changes: 13 additions & 14 deletions llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
Expand Up @@ -16,24 +16,23 @@ define i32 @bar() local_unnamed_addr {
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[SUB86_1]], i32 4
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[ADD78_2]], i32 5
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB86_1]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD78_1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD94_1]], i32 2
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_1]], i32 3
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[ADD78_2]], i32 4
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SUB102_3]], i32 5
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 5>
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 7, i32 24, i32 25, i32 10, i32 27, i32 28, i32 13, i32 30, i32 31>
; CHECK-NEXT: [[TMP15:%.*]] = lshr <16 x i32> [[TMP14]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i32> [[TMP15]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP17:%.*]] = mul nuw <16 x i32> [[TMP16]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP18:%.*]] = add <16 x i32> [[TMP17]], [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP18]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP19]])
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP20]], 16
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SUB102_3]], i32 4
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> poison, <16 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 4>
; CHECK-NEXT: [[TMP11:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
; CHECK-NEXT: [[TMP12:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]]
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 21, i32 22, i32 7, i32 24, i32 25, i32 10, i32 27, i32 28, i32 13, i32 30, i32 31>
; CHECK-NEXT: [[TMP14:%.*]] = lshr <16 x i32> [[TMP13]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i32> [[TMP14]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <16 x i32> [[TMP15]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP17:%.*]] = add <16 x i32> [[TMP16]], [[TMP13]]
; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i32> [[TMP17]], [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP18]])
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP19]], 16
; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]]
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
; CHECK-NEXT: ret i32 [[SHR120]]
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
Expand Up @@ -421,7 +421,7 @@ define void @opcode_reorder(float* noalias nocapture %a, float* noalias nocaptur
; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[D:%.*]] to <4 x float>*
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP9]], align 4
; CHECK-NEXT: ret void
Expand Down

0 comments on commit 7cb19fe

Please sign in to comment.