Skip to content

Commit

Permalink
[PowerPC] Disable perfect shuffle by default
Browse files Browse the repository at this point in the history
We are going to remove the old 'perfect shuffle' optimization since it
incurs a performance penalty in hot loops over vectors. For example, in
the following loop, where both shuffles share the same mask:

  %v.1 = shufflevector ... <0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27>
  %v.2 = shufflevector ... <0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27>

The generated instructions will be `vmrglw-vmrghw-vmrglw-vmrghw` instead
of `vperm-vperm`. In some large loops, this causes a 20%+ performance
penalty.

The original attempt to resolve this was to pre-record the masks of every
shufflevector operation in the DAG, but that is somewhat complex and adds
unnecessary computation (scanning all nodes) during optimization. Instead,
this patch disables perfect shuffle by default. Some cases do become worse
after this change; they will be fixed more carefully in future patches.

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D121082
  • Loading branch information
ecnelises committed Mar 15, 2022
1 parent 23e3cbe commit 300e129
Show file tree
Hide file tree
Showing 16 changed files with 363 additions and 251 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Expand Up @@ -129,7 +129,7 @@ static cl::opt<bool> EnableQuadwordAtomics(
static cl::opt<bool>
DisablePerfectShuffle("ppc-disable-perfect-shuffle",
cl::desc("disable vector permute decomposition"),
cl::init(false), cl::Hidden);
cl::init(true), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll
@@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | grep vsldoi
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | not grep vor
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | grep vsldoi
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | not grep vor

; TODO: Fix this case when disabling perfect shuffle

define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) {
%tmp76 = shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 > ; <<4 x float>> [#uses=1]
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
Expand Up @@ -1447,16 +1447,16 @@ entry:
define <4 x float> @testSameVecEl0LE(<4 x float> %a) {
; CHECK-64-LABEL: testSameVecEl0LE:
; CHECK-64: # %bb.0: # %entry
; CHECK-64-NEXT: xxspltw 0, 34, 2
; CHECK-64-NEXT: xxsldwi 0, 34, 0, 1
; CHECK-64-NEXT: xxsldwi 34, 0, 0, 3
; CHECK-64-NEXT: ld 3, L..C0(2) # %const.0
; CHECK-64-NEXT: lxv 35, 0(3)
; CHECK-64-NEXT: vperm 2, 2, 2, 3
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: testSameVecEl0LE:
; CHECK-32: # %bb.0: # %entry
; CHECK-32-NEXT: xxspltw 0, 34, 2
; CHECK-32-NEXT: xxsldwi 0, 34, 0, 1
; CHECK-32-NEXT: xxsldwi 34, 0, 0, 3
; CHECK-32-NEXT: lwz 3, L..C0(2) # %const.0
; CHECK-32-NEXT: lxv 35, 0(3)
; CHECK-32-NEXT: vperm 2, 2, 2, 3
; CHECK-32-NEXT: blr
entry:
%vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
Expand All @@ -1465,16 +1465,16 @@ entry:
define <4 x float> @testSameVecEl1LE(<4 x float> %a) {
; CHECK-64-LABEL: testSameVecEl1LE:
; CHECK-64: # %bb.0: # %entry
; CHECK-64-NEXT: xxswapd 0, 34
; CHECK-64-NEXT: xxmrghw 1, 34, 0
; CHECK-64-NEXT: xxmrghw 34, 1, 0
; CHECK-64-NEXT: ld 3, L..C1(2) # %const.0
; CHECK-64-NEXT: lxv 35, 0(3)
; CHECK-64-NEXT: vperm 2, 2, 2, 3
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: testSameVecEl1LE:
; CHECK-32: # %bb.0: # %entry
; CHECK-32-NEXT: xxswapd 0, 34
; CHECK-32-NEXT: xxmrghw 1, 34, 0
; CHECK-32-NEXT: xxmrghw 34, 1, 0
; CHECK-32-NEXT: lwz 3, L..C1(2) # %const.0
; CHECK-32-NEXT: lxv 35, 0(3)
; CHECK-32-NEXT: vperm 2, 2, 2, 3
; CHECK-32-NEXT: blr
entry:
%vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
Expand All @@ -1483,16 +1483,16 @@ entry:
define <4 x float> @testSameVecEl3LE(<4 x float> %a) {
; CHECK-64-LABEL: testSameVecEl3LE:
; CHECK-64: # %bb.0: # %entry
; CHECK-64-NEXT: xxspltw 0, 34, 2
; CHECK-64-NEXT: xxswapd 1, 34
; CHECK-64-NEXT: xxsldwi 34, 1, 0, 2
; CHECK-64-NEXT: ld 3, L..C2(2) # %const.0
; CHECK-64-NEXT: lxv 35, 0(3)
; CHECK-64-NEXT: vperm 2, 2, 2, 3
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: testSameVecEl3LE:
; CHECK-32: # %bb.0: # %entry
; CHECK-32-NEXT: xxspltw 0, 34, 2
; CHECK-32-NEXT: xxswapd 1, 34
; CHECK-32-NEXT: xxsldwi 34, 1, 0, 2
; CHECK-32-NEXT: lwz 3, L..C2(2) # %const.0
; CHECK-32-NEXT: lxv 35, 0(3)
; CHECK-32-NEXT: vperm 2, 2, 2, 3
; CHECK-32-NEXT: blr
entry:
%vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
Expand Down
11 changes: 6 additions & 5 deletions llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
Expand Up @@ -30,12 +30,13 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-AIX-NEXT: lxvw4x 35, 0, 3
; CHECK-AIX-NEXT: addi 3, 1, -16
; CHECK-AIX-NEXT: lxvw4x 36, 0, 3
; CHECK-AIX-NEXT: ld 3, L..C0(2) # %const.0
; CHECK-AIX-NEXT: vmrghh 3, 2, 3
; CHECK-AIX-NEXT: vsplth 5, 2, 0
; CHECK-AIX-NEXT: vmrghh 2, 4, 2
; CHECK-AIX-NEXT: xxmrghw 35, 35, 37
; CHECK-AIX-NEXT: xxswapd 0, 35
; CHECK-AIX-NEXT: xxsldwi 34, 0, 34, 2
; CHECK-AIX-NEXT: vmrghh 4, 4, 2
; CHECK-AIX-NEXT: vsplth 2, 2, 0
; CHECK-AIX-NEXT: xxmrghw 34, 35, 34
; CHECK-AIX-NEXT: lxvw4x 35, 0, 3
; CHECK-AIX-NEXT: vperm 2, 2, 4, 3
; CHECK-AIX-NEXT: vsplth 3, 2, 1
; CHECK-AIX-NEXT: vsplth 2, 2, 4
; CHECK-AIX-NEXT: stxvw4x 35, 0, 5
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
Expand Up @@ -55,15 +55,15 @@ entry:
define <2 x i64> @buildl(i64 %a) {
; CHECK-LABEL: buildl:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lwz 5, L..C0(2) # %const.0
; CHECK-NEXT: stw 4, -16(1)
; CHECK-NEXT: stw 3, -32(1)
; CHECK-NEXT: addi 3, 1, -16
; CHECK-NEXT: addi 4, 1, -32
; CHECK-NEXT: lxvw4x 0, 0, 3
; CHECK-NEXT: lxvw4x 1, 0, 4
; CHECK-NEXT: xxmrghw 34, 1, 0
; CHECK-NEXT: xxswapd 0, 34
; CHECK-NEXT: xxsldwi 34, 0, 34, 2
; CHECK-NEXT: lxvw4x 35, 0, 3
; CHECK-NEXT: lxvw4x 36, 0, 4
; CHECK-NEXT: lxvw4x 34, 0, 5
; CHECK-NEXT: vperm 2, 4, 3, 2
; CHECK-NEXT: blr
entry:
%splat.splatinsert = insertelement <2 x i64> undef, i64 %a, i32 0
Expand All @@ -90,7 +90,7 @@ entry:
define <2 x double> @buildd() {
; CHECK-LABEL: buildd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lwz 3, L..C0(2) # @d
; CHECK-NEXT: lwz 3, L..C1(2) # @d
; CHECK-NEXT: lxvdsx 34, 0, 3
; CHECK-NEXT: blr
entry:
Expand Down
24 changes: 14 additions & 10 deletions llvm/test/CodeGen/PowerPC/extract-and-store.ll
Expand Up @@ -584,14 +584,16 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
;
; CHECK-BE-LABEL: test_stores_exceed_vec_size:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xxspltw vs0, vs34, 0
; CHECK-BE-NEXT: xxsldwi vs1, vs34, vs34, 1
; CHECK-BE-NEXT: li r3, 16
; CHECK-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha
; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-BE-NEXT: li r4, 20
; CHECK-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l
; CHECK-BE-NEXT: lxvw4x vs35, 0, r3
; CHECK-BE-NEXT: li r3, 16
; CHECK-BE-NEXT: stxsiwx vs34, r5, r3
; CHECK-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
; CHECK-BE-NEXT: stfiwx f1, r5, r4
; CHECK-BE-NEXT: stxvw4x vs0, 0, r5
; CHECK-BE-NEXT: stfiwx f0, r5, r4
; CHECK-BE-NEXT: vperm v3, v2, v2, v3
; CHECK-BE-NEXT: stxvw4x vs35, 0, r5
; CHECK-BE-NEXT: blr
;
; CHECK-P9-LABEL: test_stores_exceed_vec_size:
Expand All @@ -610,14 +612,16 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
;
; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0
; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI16_0@toc@ha
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI16_0@toc@l
; CHECK-P9-BE-NEXT: lxv vs35, 0(r3)
; CHECK-P9-BE-NEXT: li r3, 16
; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3
; CHECK-P9-BE-NEXT: li r3, 20
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
; CHECK-P9-BE-NEXT: stxv vs0, 0(r5)
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3
; CHECK-P9-BE-NEXT: vperm v3, v2, v2, v3
; CHECK-P9-BE-NEXT: stxv vs35, 0(r5)
; CHECK-P9-BE-NEXT: blr
entry:
%vecext = extractelement <4 x i32> %a, i32 2
Expand Down

0 comments on commit 300e129

Please sign in to comment.