Skip to content

Commit

Permalink
Merging r339260:
Browse files Browse the repository at this point in the history
------------------------------------------------------------------------
r339260 | syzaara | 2018-08-08 08:20:43 -0700 (Wed, 08 Aug 2018) | 13 lines

[PowerPC] Improve codegen for vector loads using scalar_to_vector

This patch aims to improve the codegen for vector loads involving the
scalar_to_vector (load X) sequence. Initially, ld->mv instructions were used
for scalar_to_vector (load X), so this patch allows scalar_to_vector (load X)
to utilize:

LXSD and LXSDX for i64 and f64
LXSIWAX for i32 (sign extension to i64)
LXSIWZX for i32 and f32

Committing on behalf of Amy Kwan.
Differential Revision: https://reviews.llvm.org/D48950
------------------------------------------------------------------------

llvm-svn: 347957
  • Loading branch information
tstellar committed Nov 30, 2018
1 parent d6ffc0c commit 4a6ae60
Show file tree
Hide file tree
Showing 15 changed files with 1,529 additions and 242 deletions.
1 change: 1 addition & 0 deletions llvm/lib/Target/PowerPC/P9InstrResources.td
Expand Up @@ -592,6 +592,7 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C],
XXPERM,
XXPERMR,
XXSLDWI,
XXSLDWIs,
XXSPLTIB,
XXSPLTW,
XXSPLTWs,
Expand Down
11 changes: 0 additions & 11 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Expand Up @@ -8454,17 +8454,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);

// If the source for the shuffle is a scalar_to_vector that came from a
// 32-bit load, it will have used LXVWSX so we don't need to splat again.
if (Subtarget.hasP9Vector() &&
((isLittleEndian && SplatIdx == 3) ||
(!isLittleEndian && SplatIdx == 0))) {
SDValue Src = V1.getOperand(0);
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
Src.getOperand(0).getOpcode() == ISD::LOAD &&
Src.getOperand(0).hasOneUse())
return V1;
}
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
DAG.getConstant(SplatIdx, dl, MVT::i32));
Expand Down
91 changes: 82 additions & 9 deletions llvm/lib/Target/PowerPC/PPCInstrVSX.td
Expand Up @@ -877,6 +877,12 @@ let Uses = [RM] in {
"xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm,
[(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB,
imm32SExt16:$SHW))]>;

// Codegen-only variant of xxsldwi. It takes a single scalar-class source
// register (vsfrc:$XA) plus a 2-bit immediate, and its assembly string prints
// $XA as both source operands. It carries no selection pattern of its own
// (the pattern list is empty); it is referenced explicitly from the results
// of Pat<> definitions.
let isCodeGenOnly = 1 in
def XXSLDWIs : XX3Form_2s<60, 2,
(outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$SHW),
"xxsldwi $XT, $XA, $XA, $SHW", IIC_VecPerm, []>;

def XXSPLTW : XX2Form_2<60, 164,
(outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
"xxspltw $XT, $XB, $UIM", IIC_VecPerm,
Expand All @@ -886,6 +892,7 @@ let Uses = [RM] in {
def XXSPLTWs : XX2Form_2<60, 164,
(outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM),
"xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;

} // hasSideEffects
} // UseVSXReg = 1

Expand Down Expand Up @@ -1466,8 +1473,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(f64 (PPCmtvsra (i64 (vector_extract v2i64:$S, 1)))))),
(f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
}
def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
(v4i32 (XXSPLTWs (LIWAX xoaddr:$src), 1))>;

// Instructions for converting float to i64 feeding a store.
let Predicates = [NoP9Vector] in {
Expand Down Expand Up @@ -3050,13 +3055,47 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
(v4i32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
(v4f32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector
(f32 (fpround (f64 (extloadf32 xoaddr:$src)))))),
(v4f32 (LXVWSX xoaddr:$src))>;

// Selection patterns for scalar_to_vector fed by a 32-bit load. The value is
// loaded with LIWAX/LIWZX, copied into the VSRC register class and then, for
// most cases, repositioned with a permute (little endian) or a word shift
// (big endian).
let AddedComplexity = 400 in {
// LIWAX - This instruction is used for sign extending i32 -> i64.
// LIWZX - This instruction will be emitted for i32, f32, and when
// zero-extending i32 to i64 (zext i32 -> i64).
// NOTE(review): the inner `let Predicates` lists below name only the
// endianness; confirm whether a subtarget-feature predicate is also needed.
let Predicates = [IsLittleEndian] in {
// Little endian: every pattern follows the load with XXPERMDIs ..., 2.
// NOTE(review): presumably this moves the loaded value into the lane that is
// element 0 on little-endian targets - confirm against the ISA.

// v2i64 from a sign-extending 32-bit load (LIWAX).
def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC), 2))>;

// v2i64 from a zero-extending 32-bit load (LIWZX).
def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;

// v4i32 from a plain i32 load (LIWZX).
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
(v4i32 (XXPERMDIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;

// v4f32 from a plain f32 load (LIWZX).
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
(v4f32 (XXPERMDIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
}

let Predicates = [IsBigEndian] in {
// Big endian, 64-bit element results: the loaded register is used as-is after
// the register-class copy; no permute is emitted.
def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
(v2i64 (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC))>;

def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
(v2i64 (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC))>;

// Big endian, 32-bit element results: the load is followed by XXSLDWIs with a
// shift amount of 1.
// NOTE(review): presumably this shifts the loaded word into word element 0 -
// confirm against the ISA.
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
(v4i32 (XXSLDWIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;

def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
(v4f32 (XXSLDWIs
(COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
}

}

// Build vectors from i8 loads
def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)),
Expand Down Expand Up @@ -3218,6 +3257,39 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
(f32 (DFLOADf32 ixaddr:$src))>;


// Selection patterns for scalar_to_vector fed by a 64-bit (i64 or f64) load.
// Each type has two patterns, one per addressing form: ixaddr selects
// DFLOADf64 and xaddr selects XFLOADf64.
let AddedComplexity = 400 in {
// The following pseudoinstructions are used to ensure the utilization
// of all 64 VSX registers.
// NOTE(review): DFLOADf64/XFLOADf64 are presumably the pseudos that later
// expand to the actual scalar load instructions - confirm at their definition.
let Predicates = [IsLittleEndian, HasP9Vector] in {
// Little endian: the loaded value is copied to VSRC and then passed through
// XXPERMDIs ..., 2.
// NOTE(review): presumably this places the value in the doubleword that is
// element 0 on little-endian targets - confirm against the ISA.
def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>;
def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))),
(v2i64 (XXPERMDIs
(COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>;

def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))),
(v2f64 (XXPERMDIs
(COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC), 2))>;
def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))),
(v2f64 (XXPERMDIs
(COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC), 2))>;
}

let Predicates = [IsBigEndian, HasP9Vector] in {
// Big endian: the loaded register is used directly after the register-class
// copy; no permute is emitted.
def : Pat<(v2i64 (scalar_to_vector (i64 (load ixaddr:$src)))),
(v2i64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>;
def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddr:$src)))),
(v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>;

def : Pat<(v2f64 (scalar_to_vector (f64 (load ixaddr:$src)))),
(v2f64 (COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC))>;
def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddr:$src)))),
(v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC))>;
}
}

let Predicates = [IsBigEndian, HasP9Vector] in {

// (Un)Signed DWord vector extract -> QP
Expand Down Expand Up @@ -3932,3 +4004,4 @@ let AddedComplexity = 400 in {
(v4i32 (VEXTSH2W $A))>;
}
}

61 changes: 36 additions & 25 deletions llvm/test/CodeGen/PowerPC/VSX-XForm-Scalars.ll
@@ -1,35 +1,46 @@
; RUN: llc < %s -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \
; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-P8
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \
; RUN: | FileCheck %s --check-prefix=CHECK-P8
; RUN: llc < %s -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
; RUN: -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-P9
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs \
; RUN: | FileCheck %s --check-prefix=CHECK-P9

@a = external local_unnamed_addr global <4 x i32>, align 16
@pb = external local_unnamed_addr global float*, align 8

define void @testExpandPostRAPseudo(i32* nocapture readonly %ptr) {
; CHECK-P8-LABEL: testExpandPostRAPseudo:
; CHECK-P8: lxsiwax 34, 0, 3
; CHECK-P8-NEXT: xxspltw 34, 34, 1
; CHECK-P8-NEXT: stvx 2, 0, 4
; CHECK-P8: #APP
; CHECK-P8-NEXT: #Clobber Rigisters
; CHECK-P8-NEXT: #NO_APP
; CHECK-P8-NEXT: lis 4, 1024
; CHECK-P8-NEXT: lfiwax 0, 0, 3
; CHECK-P8: stfsx 0, 3, 4
; CHECK-P8-NEXT: blr

; CHECK-P9-LABEL: testExpandPostRAPseudo:
; CHECK-P9: lxvwsx 0, 0, 3
; CHECK-P9: stxvx 0, 0, 4
; CHECK-P9: #APP
; CHECK-P9-NEXT: #Clobber Rigisters
; CHECK-P9-NEXT: #NO_APP
; CHECK-P9-NEXT: lis 4, 1024
; CHECK-P9-NEXT: lfiwax 0, 0, 3
; CHECK-P9: stfsx 0, 3, 4
; CHECK-P9-NEXT: blr

; CHECK-P8-LABEL: testExpandPostRAPseudo:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8: lfiwzx f0, 0, r3
; CHECK-P8: ld r4, .LC0@toc@l(r4)
; CHECK-P8: xxpermdi vs0, f0, f0, 2
; CHECK-P8: xxspltw v2, vs0, 3
; CHECK-P8: stvx v2, 0, r4
; CHECK-P8: lis r4, 1024
; CHECK-P8: lfiwax f0, 0, r3
; CHECK-P8: addis r3, r2, .LC1@toc@ha
; CHECK-P8: ld r3, .LC1@toc@l(r3)
; CHECK-P8: xscvsxdsp f0, f0
; CHECK-P8: ld r3, 0(r3)
; CHECK-P8: stfsx f0, r3, r4
; CHECK-P8: blr
;
; CHECK-P9-LABEL: testExpandPostRAPseudo:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9: lfiwzx f0, 0, r3
; CHECK-P9: addis r4, r2, .LC0@toc@ha
; CHECK-P9: ld r4, .LC0@toc@l(r4)
; CHECK-P9: xxpermdi vs0, f0, f0, 2
; CHECK-P9: xxspltw vs0, vs0, 3
; CHECK-P9: stxvx vs0, 0, r4
; CHECK-P9: lis r4, 1024
; CHECK-P9: lfiwax f0, 0, r3
; CHECK-P9: addis r3, r2, .LC1@toc@ha
; CHECK-P9: ld r3, .LC1@toc@l(r3)
; CHECK-P9: xscvsxdsp f0, f0
; CHECK-P9: ld r3, 0(r3)
; CHECK-P9: stfsx f0, r3, r4
; CHECK-P9: blr
entry:
%0 = load i32, i32* %ptr, align 4
%splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
Expand Down
44 changes: 28 additions & 16 deletions llvm/test/CodeGen/PowerPC/build-vector-tests.ll
Expand Up @@ -109,8 +109,8 @@
;vector int spltRegVali(int val) { //
; return (vector int) val; //
;} //
;// P8: lxsiwax, xxspltw //
;// P9: lxvwsx //
;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
;vector int spltMemVali(int *ptr) { //
; return (vector int)*ptr; //
;} //
Expand Down Expand Up @@ -284,8 +284,8 @@
;vector unsigned int spltRegValui(unsigned int val) { //
; return (vector unsigned int) val; //
;} //
;// P8: lxsiwax, xxspltw //
;// P9: lxvwsx //
;// P8: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
;// P9: (LE) lfiwzx, xxpermdi, xxspltw (BE): lfiwzx, xxsldwi, xxspltw //
;vector unsigned int spltMemValui(unsigned int *ptr) { //
; return (vector unsigned int)*ptr; //
;} //
Expand Down Expand Up @@ -1202,15 +1202,21 @@ entry:
; P9LE-LABEL: spltMemVali
; P8BE-LABEL: spltMemVali
; P8LE-LABEL: spltMemVali
; P9BE: lxvwsx v2, 0, r3
; P9BE: lfiwzx f0, 0, r3
; P9BE: xxsldwi vs0, f0, f0, 1
; P9BE: xxspltw v2, vs0, 0
; P9BE: blr
; P9LE: lxvwsx v2, 0, r3
; P9LE: lfiwzx f0, 0, r3
; P9LE: xxpermdi vs0, f0, f0, 2
; P9LE: xxspltw v2, vs0, 3
; P9LE: blr
; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3
; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
; P8BE: lfiwzx f0, 0, r3
; P8BE: xxsldwi vs0, f0, f0, 1
; P8BE: xxspltw v2, vs0, 0
; P8BE: blr
; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3
; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
; P8LE: lfiwzx f0, 0, r3
; P8LE: xxpermdi vs0, f0, f0, 2
; P8LE: xxspltw v2, vs0, 3
; P8LE: blr
}

Expand Down Expand Up @@ -2338,15 +2344,21 @@ entry:
; P9LE-LABEL: spltMemValui
; P8BE-LABEL: spltMemValui
; P8LE-LABEL: spltMemValui
; P9BE: lxvwsx v2, 0, r3
; P9BE: lfiwzx f0, 0, r3
; P9BE: xxsldwi vs0, f0, f0, 1
; P9BE: xxspltw v2, vs0, 0
; P9BE: blr
; P9LE: lxvwsx v2, 0, r3
; P9LE: lfiwzx f0, 0, r3
; P9LE: xxpermdi vs0, f0, f0, 2
; P9LE: xxspltw v2, vs0, 3
; P9LE: blr
; P8BE: lxsiwax {{[vsf0-9]+}}, 0, r3
; P8BE: xxspltw v2, {{[vsf0-9]+}}, 1
; P8BE: lfiwzx f0, 0, r3
; P8BE: xxsldwi vs0, f0, f0, 1
; P8BE: xxspltw v2, vs0, 0
; P8BE: blr
; P8LE: lxsiwax {{[vsf0-9]+}}, 0, r3
; P8LE: xxspltw v2, {{[vsf0-9]+}}, 1
; P8LE: lfiwzx f0, 0, r3
; P8LE: xxpermdi vs0, f0, f0, 2
; P8LE: xxspltw v2, vs0, 3
; P8LE: blr
}

Expand Down
22 changes: 17 additions & 5 deletions llvm/test/CodeGen/PowerPC/load-v4i8-improved.ll
@@ -1,15 +1,27 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s \
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck --check-prefix=CHECK-LE \
; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck \
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s \
; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names | FileCheck \
; RUN: -implicit-check-not vmrg -implicit-check-not=vperm %s

define <16 x i8> @test(i32* %s, i32* %t) {
; CHECK-LE-LABEL: test:
; CHECK-LE: # %bb.0: # %entry
; CHECK-LE-NEXT: lfiwzx f0, 0, r3
; CHECK-LE-NEXT: xxpermdi vs0, f0, f0, 2
; CHECK-LE-NEXT: xxspltw v2, vs0, 3
; CHECK-LE-NEXT: blr

; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lfiwzx f0, 0, r3
; CHECK-NEXT: xxsldwi vs0, f0, f0, 1
; CHECK-NEXT: xxspltw v2, vs0, 0
; CHECK-NEXT: blr
entry:
%0 = bitcast i32* %s to <4 x i8>*
%1 = load <4 x i8>, <4 x i8>* %0, align 4
%2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x i8> %2
; CHECK-LABEL: test
; CHECK: lxsiwax 34, 0, 3
; CHECK: xxspltw 34, 34, 1
}

0 comments on commit 4a6ae60

Please sign in to comment.