[X86, AVX] replace vinsertf128 intrinsics with generic shuffles
We want to replace as much custom x86 shuffling via intrinsics
as possible because pushing the code down the generic shuffle
optimization path allows for better codegen and less complexity
in LLVM.

This is the sibling patch for the Clang half of this change:
http://reviews.llvm.org/D8088

Differential Revision: http://reviews.llvm.org/D8086

llvm-svn: 231794
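As an illustration of the transform (a sketch, not code from this patch; %a and %b are hypothetical values), a call such as

  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a, <4 x float> %b, i8 1)

is upgraded into a pair of generic shuffles, per the AutoUpgrade.cpp change below:

  ; widen the 128-bit operand to 256 bits; elements 4-7 read from undef
  %wide = shufflevector <4 x float> %b, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; result: low half of %a, high half from %wide's elements 0-3 (indices 8-11)
  %res = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>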
rotateright committed Mar 10, 2015
1 parent c989506 commit 19792fb
Showing 7 changed files with 98 additions and 124 deletions.
13 changes: 0 additions & 13 deletions llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1183,19 +1183,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
  def int_x86_avx_vextractf128_si_256 :
      GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;

  def int_x86_avx_vinsertf128_pd_256 :
      GCCBuiltin<"__builtin_ia32_vinsertf128_pd256">,
      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
                 llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx_vinsertf128_ps_256 :
      GCCBuiltin<"__builtin_ia32_vinsertf128_ps256">,
      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
                 llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx_vinsertf128_si_256 :
      GCCBuiltin<"__builtin_ia32_vinsertf128_si256">,
      Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
                 llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
}

// Vector convert
3 changes: 0 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4956,9 +4956,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
    setValue(&I, Res);
    return nullptr;
  }
  case Intrinsic::x86_avx_vinsertf128_pd_256:
  case Intrinsic::x86_avx_vinsertf128_ps_256:
  case Intrinsic::x86_avx_vinsertf128_si_256:
  case Intrinsic::x86_avx2_vinserti128: {
    EVT DestVT = TLI.getValueType(I.getType());
    EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
52 changes: 51 additions & 1 deletion llvm/lib/IR/AutoUpgrade.cpp
@@ -7,7 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
// This file implements the auto-upgrade helper functions
// This file implements the auto-upgrade helper functions.
// This is where deprecated IR intrinsics and other IR features are updated to
// current specifications.
//
//===----------------------------------------------------------------------===//

@@ -156,6 +158,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name.startswith("x86.avx2.pcmpeq.") ||
Name.startswith("x86.avx2.pcmpgt.") ||
Name.startswith("x86.avx.vpermil.") ||
Name == "x86.avx.vinsertf128.pd.256" ||
Name == "x86.avx.vinsertf128.ps.256" ||
Name == "x86.avx.vinsertf128.si.256" ||
Name == "x86.avx.movnt.dq.256" ||
Name == "x86.avx.movnt.pd.256" ||
Name == "x86.avx.movnt.ps.256" ||
@@ -626,6 +631,51 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
      }

      Rep = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
    } else if (Name == "llvm.x86.avx.vinsertf128.pd.256" ||
               Name == "llvm.x86.avx.vinsertf128.ps.256" ||
               Name == "llvm.x86.avx.vinsertf128.si.256") {
      Value *Op0 = CI->getArgOperand(0);
      Value *Op1 = CI->getArgOperand(1);
      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
      VectorType *VecTy = cast<VectorType>(CI->getType());
      unsigned NumElts = VecTy->getNumElements();

      // Mask off the high bits of the immediate value; hardware ignores those.
      Imm = Imm & 1;

      // Extend the second operand into a vector that is twice as big.
      Value *UndefV = UndefValue::get(Op1->getType());
      SmallVector<Constant*, 8> Idxs;
      for (unsigned i = 0; i != NumElts; ++i) {
        Idxs.push_back(Builder.getInt32(i));
      }
      Rep = Builder.CreateShuffleVector(Op1, UndefV, ConstantVector::get(Idxs));

      // Insert the second operand into the first operand.

      // Note that there is no guarantee that instruction lowering will actually
      // produce a vinsertf128 instruction for the created shuffles. In
      // particular, the 0 immediate case involves no lane changes, so it can
      // be handled as a blend.

      // Example of shuffle mask for 32-bit elements:
      // Imm = 1  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
      // Imm = 0  <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>

      SmallVector<Constant*, 8> Idxs2;
      // The low half of the result is either the low half of the 1st operand
      // or the low half of the 2nd operand (the inserted vector).
      for (unsigned i = 0; i != NumElts / 2; ++i) {
        unsigned Idx = Imm ? i : (i + NumElts);
        Idxs2.push_back(Builder.getInt32(Idx));
      }
      // The high half of the result is either the low half of the 2nd operand
      // (the inserted vector) or the high half of the 1st operand.
      for (unsigned i = NumElts / 2; i != NumElts; ++i) {
        unsigned Idx = Imm ? (i + NumElts / 2) : i;
        Idxs2.push_back(Builder.getInt32(Idx));
      }
      Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
    } else {
      bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
      if (Name == "llvm.x86.avx.vpermil.pd.256")
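When the masked immediate is 0 (including an out-of-range immediate such as 2, since the code applies Imm & 1), the second shuffle instead selects the inserted vector for the low half and keeps the high half of the first operand. A hedged sketch with hypothetical <8 x i32> values, matching the Imm = 0 mask in the comment above:

  ; i8 2 is masked to 0 before the mask is built
  %wide = shufflevector <4 x i32> %b, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; low half from %wide's elements 0-3 (indices 8-11), high half from %a (indices 4-7)
  %res = shufflevector <8 x i32> %a, <8 x i32> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>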
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s

; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.

define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
ret <8 x i32> %res
}

; Verify that the high bits of the immediate are masked off. This should be
; equivalent to a vinsertf128 $0, which should be optimized into a blend, so
; just check that it's not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vblendpd
%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
24 changes: 0 additions & 24 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone


define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
; CHECK: vinsertf128
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone


define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
; CHECK: vinsertf128
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone


define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK: vinsertf128
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone


define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vperm2f128
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
42 changes: 11 additions & 31 deletions llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s

; CHECK-LABEL: A:
; CHECK-NOT: vunpck
; CHECK: vinsertf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
ret <8 x float> %shuffle
}

; CHECK-LABEL: B:
; CHECK-NOT: vunpck
; CHECK: vinsertf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

; Just check that no crash happens
; CHECK-SSE: _insert_crash
; CHECK-LABEL: _insert_crash:
define void @insert_crash() nounwind {
allocas:
%v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -39,15 +40,15 @@ allocas:

;; DAG Combine must remove useless vinsertf128 instructions

; CHECK: DAGCombineA
; CHECK-LABEL: DAGCombineA:
; CHECK-NOT: vinsertf128 $1
define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
%1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %2
}

; CHECK: DAGCombineB
; CHECK-LABEL: DAGCombineB:
; CHECK: vpaddd %xmm
; CHECK-NOT: vinsertf128 $1
; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
ret <8 x i32> %2
}

; CHECK: insert_pd
define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
; CHECK: vinsertf128
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
ret <4 x double> %res
}

; CHECK: insert_undef_pd
; CHECK-LABEL: insert_undef_pd:
define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone


; CHECK: insert_ps
define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
; CHECK: vinsertf128
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
ret <8 x float> %res
}

; CHECK: insert_undef_ps
; CHECK-LABEL: insert_undef_ps:
define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone


; CHECK: insert_si
define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK: vinsertf128
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
ret <8 x i32> %res
}

; CHECK: insert_undef_si
; CHECK-LABEL: insert_undef_si:
define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

; rdar://10643481
; CHECK: vinsertf128_combine
; CHECK-LABEL: vinsertf128_combine:
define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-NOT: vmovaps
; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
}

; rdar://11076953
; CHECK: vinsertf128_ucombine
; CHECK-LABEL: vinsertf128_ucombine:
define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-NOT: vmovups
; CHECK: vinsertf128
52 changes: 0 additions & 52 deletions llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
; Merge two consecutive 16-byte subvector loads into a single 32-byte load
; if it's faster.

declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)

; Use the vinsertf128 intrinsic to model source code
; that explicitly uses AVX intrinsics.
define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads

; SANDYB: vmovups
; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq

; BTVER2: vmovups
; BTVER2-NEXT: retq

; HASWELL: vmovups
; HASWELL-NEXT: retq

%ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
%ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%v1 = load <4 x float>, <4 x float>* %ptr1, align 1
%v2 = load <4 x float>, <4 x float>* %ptr2, align 1
%shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
ret <8 x float> %v3
}

; Swap the operands of the shufflevector and vinsertf128 to ensure that the
; pattern still matches.
define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_swap

; SANDYB: vmovups
; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq

; BTVER2: vmovups
; BTVER2-NEXT: retq

; HASWELL: vmovups
; HASWELL-NEXT: retq

%ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
%ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%v1 = load <4 x float>, <4 x float>* %ptr1, align 1
%v2 = load <4 x float>, <4 x float>* %ptr2, align 1
%shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
ret <8 x float> %v3
}

; Replace the vinsertf128 intrinsic with a shufflevector as might be
; expected from auto-vectorized code.
define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_no_intrinsic

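The body of this test is truncated here; a hedged sketch of the shufflevector pattern it exercises (mirroring the deleted intrinsic-based tests above; names and offsets assumed) would be:

  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  ; a single generic shuffle concatenates the two 16-byte loads into one 32-byte vector
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3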