[PowerPC] Disable perfect shuffle by default

We are going to remove the old 'perfect shuffle' optimization, since it brings a performance penalty in hot loops over vectors. For example, in the following loop, where the shuffles share the same mask: `%v.1 = shufflevector ... <0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27>` `%v.2 = shufflevector ... <0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27>` the generated instructions will be `vmrglw-vmrghw-vmrglw-vmrghw` instead of `vperm-vperm`. In some large-loop cases, this causes a 20%+ performance penalty. The original attempt to resolve this was to pre-record the masks of every shufflevector operation in the DAG, but that is somewhat complex and adds unnecessary computation (scanning all nodes) during optimization. Here we disable the optimization by default. There are indeed some cases that become worse after this change; they will be fixed in a more careful way in future patches. Reviewed By: jsji Differential Revision: https://reviews.llvm.org/D121082
llvm · Mar 15, 2022 · 300e129 · 300e129
1 parent 23e3cbe
commit 300e129
Show file tree

Hide file tree

Showing 16 changed files with 363 additions and 251 deletions.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -129,7 +129,7 @@ static cl::opt<bool> EnableQuadwordAtomics(
 static cl::opt<bool>
     DisablePerfectShuffle("ppc-disable-perfect-shuffle",
                           cl::desc("disable vector permute decomposition"),
-                          cl::init(false), cl::Hidden);
+                          cl::init(true), cl::Hidden);
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");

diff --git a/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll b/llvm/test/CodeGen/PowerPC/2006-08-11-RetVector.ll
@@ -1,5 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | grep vsldoi
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 | not grep vor
+; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | grep vsldoi
+; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -mcpu=g5 -ppc-disable-perfect-shuffle=false | not grep vor
+
+; TODO: Fix this case when disabling perfect shuffle
 
 define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) {
         %tmp76 = shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 >     ; <<4 x float>> [#uses=1]

diff --git a/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll b/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
@@ -1447,16 +1447,16 @@ entry:
 define <4 x float> @testSameVecEl0LE(<4 x float> %a) {
 ; CHECK-64-LABEL: testSameVecEl0LE:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    xxspltw 0, 34, 2
-; CHECK-64-NEXT:    xxsldwi 0, 34, 0, 1
-; CHECK-64-NEXT:    xxsldwi 34, 0, 0, 3
+; CHECK-64-NEXT:    ld 3, L..C0(2) # %const.0
+; CHECK-64-NEXT:    lxv 35, 0(3)
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-64-NEXT:    blr
 ;
 ; CHECK-32-LABEL: testSameVecEl0LE:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    xxspltw 0, 34, 2
-; CHECK-32-NEXT:    xxsldwi 0, 34, 0, 1
-; CHECK-32-NEXT:    xxsldwi 34, 0, 0, 3
+; CHECK-32-NEXT:    lwz 3, L..C0(2) # %const.0
+; CHECK-32-NEXT:    lxv 35, 0(3)
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-32-NEXT:    blr
 entry:
   %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
@@ -1465,16 +1465,16 @@ entry:
 define <4 x float> @testSameVecEl1LE(<4 x float> %a) {
 ; CHECK-64-LABEL: testSameVecEl1LE:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    xxswapd 0, 34
-; CHECK-64-NEXT:    xxmrghw 1, 34, 0
-; CHECK-64-NEXT:    xxmrghw 34, 1, 0
+; CHECK-64-NEXT:    ld 3, L..C1(2) # %const.0
+; CHECK-64-NEXT:    lxv 35, 0(3)
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-64-NEXT:    blr
 ;
 ; CHECK-32-LABEL: testSameVecEl1LE:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    xxswapd 0, 34
-; CHECK-32-NEXT:    xxmrghw 1, 34, 0
-; CHECK-32-NEXT:    xxmrghw 34, 1, 0
+; CHECK-32-NEXT:    lwz 3, L..C1(2) # %const.0
+; CHECK-32-NEXT:    lxv 35, 0(3)
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-32-NEXT:    blr
 entry:
   %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
@@ -1483,16 +1483,16 @@ entry:
 define <4 x float> @testSameVecEl3LE(<4 x float> %a) {
 ; CHECK-64-LABEL: testSameVecEl3LE:
 ; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    xxspltw 0, 34, 2
-; CHECK-64-NEXT:    xxswapd 1, 34
-; CHECK-64-NEXT:    xxsldwi 34, 1, 0, 2
+; CHECK-64-NEXT:    ld 3, L..C2(2) # %const.0
+; CHECK-64-NEXT:    lxv 35, 0(3)
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-64-NEXT:    blr
 ;
 ; CHECK-32-LABEL: testSameVecEl3LE:
 ; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    xxspltw 0, 34, 2
-; CHECK-32-NEXT:    xxswapd 1, 34
-; CHECK-32-NEXT:    xxsldwi 34, 1, 0, 2
+; CHECK-32-NEXT:    lwz 3, L..C2(2) # %const.0
+; CHECK-32-NEXT:    lxv 35, 0(3)
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
 ; CHECK-32-NEXT:    blr
 entry:
   %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>

diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll
@@ -30,12 +30,13 @@ define void @test_aix_splatimm(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-AIX-NEXT:    lxvw4x 35, 0, 3
 ; CHECK-AIX-NEXT:    addi 3, 1, -16
 ; CHECK-AIX-NEXT:    lxvw4x 36, 0, 3
+; CHECK-AIX-NEXT:    ld 3, L..C0(2) # %const.0
 ; CHECK-AIX-NEXT:    vmrghh 3, 2, 3
-; CHECK-AIX-NEXT:    vsplth 5, 2, 0
-; CHECK-AIX-NEXT:    vmrghh 2, 4, 2
-; CHECK-AIX-NEXT:    xxmrghw 35, 35, 37
-; CHECK-AIX-NEXT:    xxswapd 0, 35
-; CHECK-AIX-NEXT:    xxsldwi 34, 0, 34, 2
+; CHECK-AIX-NEXT:    vmrghh 4, 4, 2
+; CHECK-AIX-NEXT:    vsplth 2, 2, 0
+; CHECK-AIX-NEXT:    xxmrghw 34, 35, 34
+; CHECK-AIX-NEXT:    lxvw4x 35, 0, 3
+; CHECK-AIX-NEXT:    vperm 2, 2, 4, 3
 ; CHECK-AIX-NEXT:    vsplth 3, 2, 1
 ; CHECK-AIX-NEXT:    vsplth 2, 2, 4
 ; CHECK-AIX-NEXT:    stxvw4x 35, 0, 5

diff --git a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
@@ -55,15 +55,15 @@ entry:
 define <2 x i64> @buildl(i64 %a) {
 ; CHECK-LABEL: buildl:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lwz 5, L..C0(2) # %const.0
 ; CHECK-NEXT:    stw 4, -16(1)
 ; CHECK-NEXT:    stw 3, -32(1)
 ; CHECK-NEXT:    addi 3, 1, -16
 ; CHECK-NEXT:    addi 4, 1, -32
-; CHECK-NEXT:    lxvw4x 0, 0, 3
-; CHECK-NEXT:    lxvw4x 1, 0, 4
-; CHECK-NEXT:    xxmrghw 34, 1, 0
-; CHECK-NEXT:    xxswapd 0, 34
-; CHECK-NEXT:    xxsldwi 34, 0, 34, 2
+; CHECK-NEXT:    lxvw4x 35, 0, 3
+; CHECK-NEXT:    lxvw4x 36, 0, 4
+; CHECK-NEXT:    lxvw4x 34, 0, 5
+; CHECK-NEXT:    vperm 2, 4, 3, 2
 ; CHECK-NEXT:    blr
 entry:
   %splat.splatinsert = insertelement <2 x i64> undef, i64 %a, i32 0
@@ -90,7 +90,7 @@ entry:
 define <2 x double> @buildd() {
 ; CHECK-LABEL: buildd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lwz 3, L..C0(2) # @d
+; CHECK-NEXT:    lwz 3, L..C1(2) # @d
 ; CHECK-NEXT:    lxvdsx 34, 0, 3
 ; CHECK-NEXT:    blr
 entry:

diff --git a/llvm/test/CodeGen/PowerPC/extract-and-store.ll b/llvm/test/CodeGen/PowerPC/extract-and-store.ll
@@ -584,14 +584,16 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
 ;
 ; CHECK-BE-LABEL: test_stores_exceed_vec_size:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    xxspltw vs0, vs34, 0
-; CHECK-BE-NEXT:    xxsldwi vs1, vs34, vs34, 1
-; CHECK-BE-NEXT:    li r3, 16
+; CHECK-BE-NEXT:    addis r3, r2, .LCPI16_0@toc@ha
+; CHECK-BE-NEXT:    xxsldwi vs0, vs34, vs34, 1
 ; CHECK-BE-NEXT:    li r4, 20
+; CHECK-BE-NEXT:    addi r3, r3, .LCPI16_0@toc@l
+; CHECK-BE-NEXT:    lxvw4x vs35, 0, r3
+; CHECK-BE-NEXT:    li r3, 16
 ; CHECK-BE-NEXT:    stxsiwx vs34, r5, r3
-; CHECK-BE-NEXT:    xxsldwi vs0, vs34, vs0, 2
-; CHECK-BE-NEXT:    stfiwx f1, r5, r4
-; CHECK-BE-NEXT:    stxvw4x vs0, 0, r5
+; CHECK-BE-NEXT:    stfiwx f0, r5, r4
+; CHECK-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-BE-NEXT:    stxvw4x vs35, 0, r5
 ; CHECK-BE-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: test_stores_exceed_vec_size:
@@ -610,14 +612,16 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
 ;
 ; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size:
 ; CHECK-P9-BE:       # %bb.0: # %entry
-; CHECK-P9-BE-NEXT:    xxspltw vs0, vs34, 0
+; CHECK-P9-BE-NEXT:    addis r3, r2, .LCPI16_0@toc@ha
+; CHECK-P9-BE-NEXT:    xxsldwi vs0, vs34, vs34, 1
+; CHECK-P9-BE-NEXT:    addi r3, r3, .LCPI16_0@toc@l
+; CHECK-P9-BE-NEXT:    lxv vs35, 0(r3)
 ; CHECK-P9-BE-NEXT:    li r3, 16
 ; CHECK-P9-BE-NEXT:    stxsiwx vs34, r5, r3
 ; CHECK-P9-BE-NEXT:    li r3, 20
-; CHECK-P9-BE-NEXT:    xxsldwi vs0, vs34, vs0, 2
-; CHECK-P9-BE-NEXT:    stxv vs0, 0(r5)
-; CHECK-P9-BE-NEXT:    xxsldwi vs0, vs34, vs34, 1
 ; CHECK-P9-BE-NEXT:    stfiwx f0, r5, r3
+; CHECK-P9-BE-NEXT:    vperm v3, v2, v2, v3
+; CHECK-P9-BE-NEXT:    stxv vs35, 0(r5)
 ; CHECK-P9-BE-NEXT:    blr
 entry:
   %vecext = extractelement <4 x i32> %a, i32 2