[X86, AVX] replace vinsertf128 intrinsics with generic shuffles
We want to replace as much custom x86 shuffling via intrinsics
as possible because pushing the code down the generic shuffle
optimization path allows for better codegen and less complexity
in LLVM.

This is the sibling patch for the Clang half of this change:
http://reviews.llvm.org/D8088

Differential Revision: http://reviews.llvm.org/D8086

llvm-svn: 231794
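As an illustration of the transform (a sketch, not code from this patch; %a and %b are hypothetical values), a call such as

  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a, <4 x float> %b, i8 1)

is upgraded into a pair of generic shuffles, per the AutoUpgrade.cpp change below:

  ; widen the 128-bit operand to 256 bits; elements 4-7 read from undef
  %wide = shufflevector <4 x float> %b, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; result: low half of %a, high half from %wide's elements 0-3 (indices 8-11)
  %res = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>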
rotateright committed Mar 10, 2015
1 parent c989506 commit 19792fb
Showing 7 changed files with 98 additions and 124 deletions.
13 changes: 0 additions & 13 deletions llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1183,19 +1183,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
  def int_x86_avx_vextractf128_si_256 :
      GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;

  def int_x86_avx_vinsertf128_pd_256 :
      GCCBuiltin<"__builtin_ia32_vinsertf128_pd256">,
      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
                 llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx_vinsertf128_ps_256 :
      GCCBuiltin<"__builtin_ia32_vinsertf128_ps256">,
      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
                 llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx_vinsertf128_si_256 :
      GCCBuiltin<"__builtin_ia32_vinsertf128_si256">,
      Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
                 llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
}

// Vector convert
3 changes: 0 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4956,9 +4956,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
    setValue(&I, Res);
    return nullptr;
  }
  case Intrinsic::x86_avx_vinsertf128_pd_256:
  case Intrinsic::x86_avx_vinsertf128_ps_256:
  case Intrinsic::x86_avx_vinsertf128_si_256:
  case Intrinsic::x86_avx2_vinserti128: {
    EVT DestVT = TLI.getValueType(I.getType());
    EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
52 changes: 51 additions & 1 deletion llvm/lib/IR/AutoUpgrade.cpp
@@ -7,7 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
// This file implements the auto-upgrade helper functions
// This file implements the auto-upgrade helper functions.
// This is where deprecated IR intrinsics and other IR features are updated to
// current specifications.
//
//===----------------------------------------------------------------------===//

@@ -156,6 +158,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name.startswith("x86.avx2.pcmpeq.") ||
Name.startswith("x86.avx2.pcmpgt.") ||
Name.startswith("x86.avx.vpermil.") ||
Name == "x86.avx.vinsertf128.pd.256" ||
Name == "x86.avx.vinsertf128.ps.256" ||
Name == "x86.avx.vinsertf128.si.256" ||
Name == "x86.avx.movnt.dq.256" ||
Name == "x86.avx.movnt.pd.256" ||
Name == "x86.avx.movnt.ps.256" ||
@@ -626,6 +631,51 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
      }

      Rep = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
    } else if (Name == "llvm.x86.avx.vinsertf128.pd.256" ||
               Name == "llvm.x86.avx.vinsertf128.ps.256" ||
               Name == "llvm.x86.avx.vinsertf128.si.256") {
      Value *Op0 = CI->getArgOperand(0);
      Value *Op1 = CI->getArgOperand(1);
      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
      VectorType *VecTy = cast<VectorType>(CI->getType());
      unsigned NumElts = VecTy->getNumElements();

      // Mask off the high bits of the immediate value; hardware ignores those.
      Imm = Imm & 1;

      // Extend the second operand into a vector that is twice as big.
      Value *UndefV = UndefValue::get(Op1->getType());
      SmallVector<Constant*, 8> Idxs;
      for (unsigned i = 0; i != NumElts; ++i) {
        Idxs.push_back(Builder.getInt32(i));
      }
      Rep = Builder.CreateShuffleVector(Op1, UndefV, ConstantVector::get(Idxs));

      // Insert the second operand into the first operand.

      // Note that there is no guarantee that instruction lowering will actually
      // produce a vinsertf128 instruction for the created shuffles. In
      // particular, the 0 immediate case involves no lane changes, so it can
      // be handled as a blend.

      // Example of shuffle mask for 32-bit elements:
      // Imm = 1  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
      // Imm = 0  <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>

      SmallVector<Constant*, 8> Idxs2;
      // The low half of the result is either the low half of the 1st operand
      // or the low half of the 2nd operand (the inserted vector).
      for (unsigned i = 0; i != NumElts / 2; ++i) {
        unsigned Idx = Imm ? i : (i + NumElts);
        Idxs2.push_back(Builder.getInt32(Idx));
      }
      // The high half of the result is either the low half of the 2nd operand
      // (the inserted vector) or the high half of the 1st operand.
      for (unsigned i = NumElts / 2; i != NumElts; ++i) {
        unsigned Idx = Imm ? (i + NumElts / 2) : i;
        Idxs2.push_back(Builder.getInt32(Idx));
      }
      Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
    } else {
      bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
      if (Name == "llvm.x86.avx.vpermil.pd.256")
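When the masked immediate is 0 (including an out-of-range immediate such as 2, since the code applies Imm & 1), the second shuffle instead selects the inserted vector for the low half and keeps the high half of the first operand. A hedged sketch with hypothetical <8 x i32> values, matching the Imm = 0 mask in the comment above:

  ; i8 2 is masked to 0 before the mask is built
  %wide = shufflevector <4 x i32> %b, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; low half from %wide's elements 0-3 (indices 8-11), high half from %a (indices 4-7)
  %res = shufflevector <8 x i32> %a, <8 x i32> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>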
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s

; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.

define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
ret <8 x i32> %res
}

; Verify that the high bits of the immediate are masked off. This should be
; equivalent to a vinsertf128 $0, which should be optimized into a blend, so
; just check that it's not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vblendpd
%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
24 changes: 0 additions & 24 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone


define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
; CHECK: vinsertf128
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone


define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
; CHECK: vinsertf128
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone


define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK: vinsertf128
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone


define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vperm2f128
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
42 changes: 11 additions & 31 deletions llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s

; CHECK-LABEL: A:
; CHECK-NOT: vunpck
; CHECK: vinsertf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
ret <8 x float> %shuffle
}

; CHECK-LABEL: B:
; CHECK-NOT: vunpck
; CHECK: vinsertf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

; Just check that no crash happens
; CHECK-SSE: _insert_crash
; CHECK-LABEL: _insert_crash:
define void @insert_crash() nounwind {
allocas:
%v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -39,15 +40,15 @@ allocas:

;; DAG Combine must remove useless vinsertf128 instructions

; CHECK: DAGCombineA
; CHECK-LABEL: DAGCombineA:
; CHECK-NOT: vinsertf128 $1
define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
%1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %2
}

; CHECK: DAGCombineB
; CHECK-LABEL: DAGCombineB:
; CHECK: vpaddd %xmm
; CHECK-NOT: vinsertf128 $1
; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
ret <8 x i32> %2
}

; CHECK: insert_pd
define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
; CHECK: vinsertf128
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
ret <4 x double> %res
}

; CHECK: insert_undef_pd
; CHECK-LABEL: insert_undef_pd:
define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone


; CHECK: insert_ps
define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
; CHECK: vinsertf128
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
ret <8 x float> %res
}

; CHECK: insert_undef_ps
; CHECK-LABEL: insert_undef_ps:
define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone


; CHECK: insert_si
define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK: vinsertf128
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
ret <8 x i32> %res
}

; CHECK: insert_undef_si
; CHECK-LABEL: insert_undef_si:
define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

; rdar://10643481
; CHECK: vinsertf128_combine
; CHECK-LABEL: vinsertf128_combine:
define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-NOT: vmovaps
; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
}

; rdar://11076953
; CHECK: vinsertf128_ucombine
; CHECK-LABEL: vinsertf128_ucombine:
define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-NOT: vmovups
; CHECK: vinsertf128
52 changes: 0 additions & 52 deletions llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
; Merge two consecutive 16-byte subvector loads into a single 32-byte load
; if it's faster.

declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)

; Use the vinsertf128 intrinsic to model source code
; that explicitly uses AVX intrinsics.
define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads

; SANDYB: vmovups
; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq

; BTVER2: vmovups
; BTVER2-NEXT: retq

; HASWELL: vmovups
; HASWELL-NEXT: retq

%ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
%ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%v1 = load <4 x float>, <4 x float>* %ptr1, align 1
%v2 = load <4 x float>, <4 x float>* %ptr2, align 1
%shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
ret <8 x float> %v3
}

; Swap the operands of the shufflevector and vinsertf128 to ensure that the
; pattern still matches.
define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_swap

; SANDYB: vmovups
; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq

; BTVER2: vmovups
; BTVER2-NEXT: retq

; HASWELL: vmovups
; HASWELL-NEXT: retq

%ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%v1 = load <4 x float>, <4 x float>* %ptr1, align 1
%v2 = load <4 x float>, <4 x float>* %ptr2, align 1
%shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
ret <8 x float> %v3
}

; Replace the vinsertf128 intrinsic with a shufflevector as might be
; expected from auto-vectorized code.
define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_no_intrinsic

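The body of this test is truncated here; a hedged sketch of the shufflevector pattern it exercises (mirroring the deleted intrinsic-based tests above; names and offsets assumed) would be:

  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  ; a single generic shuffle concatenates the two 16-byte loads into one 32-byte vector
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3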