Skip to content

Commit

Permalink
[X86][SSE] Enable commutation for SSE immediate blend instructions
Browse files Browse the repository at this point in the history
Patch to allow (v)blendps, (v)blendpd, (v)pblendw and vpblendd instructions to be commuted - swaps the src registers and inverts the blend mask.

This is primarily to improve memory folding (see new tests), but it also improves the quality of shuffles (see modified tests).

Differential Revision: http://reviews.llvm.org/D6015

llvm-svn: 221313
  • Loading branch information
RKSimon committed Nov 4, 2014
1 parent de4277a commit c9a0779
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 110 deletions.
49 changes: 49 additions & 0 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
Expand Up @@ -2420,6 +2420,41 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
MI->getOperand(3).setImm(Size-Amt);
return TargetInstrInfo::commuteInstruction(MI, NewMI);
}
// Immediate-controlled blends. Commuting one of these flips the bits of the
// blend-control immediate (operand 3) that are covered by a per-opcode mask,
// then defers to the generic TargetInstrInfo::commuteInstruction.
// NOTE(review): the generic routine is presumably what swaps the two source
// operands - confirm against TargetInstrInfo.
case X86::BLENDPDrri:
case X86::BLENDPSrri:
case X86::PBLENDWrri:
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
case X86::VBLENDPDYrri:
case X86::VBLENDPSYrri:
case X86::VPBLENDDrri:
case X86::VPBLENDWrri:
case X86::VPBLENDDYrri:
case X86::VPBLENDWYrri:{
// Mask is the set of immediate bits that get toggled for this opcode.
// NOTE(review): values look like "all meaningful control bits" of each
// instruction's imm8 - verify against the ISA reference before changing.
unsigned Mask;
switch (MI->getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::BLENDPDrri: Mask = 0x03; break;
case X86::BLENDPSrri: Mask = 0x0F; break;
case X86::PBLENDWrri: Mask = 0xFF; break;
case X86::VBLENDPDrri: Mask = 0x03; break;
case X86::VBLENDPSrri: Mask = 0x0F; break;
case X86::VBLENDPDYrri: Mask = 0x0F; break;
case X86::VBLENDPSYrri: Mask = 0xFF; break;
case X86::VPBLENDDrri: Mask = 0x0F; break;
case X86::VPBLENDWrri: Mask = 0xFF; break;
case X86::VPBLENDDYrri: Mask = 0xFF; break;
case X86::VPBLENDWYrri: Mask = 0xFF; break;
}
// Read the current immediate before MI is possibly replaced by a clone.
unsigned Imm = MI->getOperand(3).getImm();
if (NewMI) {
// A new instruction was requested: edit a clone so the original MI keeps
// its immediate, and clear NewMI so the call below is passed the clone
// with NewMI == false.
MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);
NewMI = false;
}
// XOR toggles exactly the bits set in Mask; any other bits of Imm are kept.
MI->getOperand(3).setImm(Mask ^ Imm);
return TargetInstrInfo::commuteInstruction(MI, NewMI);
}
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
Expand Down Expand Up @@ -2504,6 +2539,20 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
switch (MI->getOpcode()) {
case X86::BLENDPDrri:
case X86::BLENDPSrri:
case X86::PBLENDWrri:
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
case X86::VBLENDPDYrri:
case X86::VBLENDPSYrri:
case X86::VPBLENDDrri:
case X86::VPBLENDDYrri:
case X86::VPBLENDWrri:
case X86::VPBLENDWYrri:
SrcOpIdx1 = 1;
SrcOpIdx2 = 2;
return true;
case X86::VFMADDPDr231r:
case X86::VFMADDPSr231r:
case X86::VFMADDSDr231r:
Expand Down
56 changes: 28 additions & 28 deletions llvm/lib/Target/X86/X86InstrSSE.td
Expand Up @@ -7537,31 +7537,33 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,

let Predicates = [HasAVX] in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
VR128, loadv4f32, f128mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
int_x86_avx_blend_ps_256, VR256, loadv8f32,
f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
VR128, loadv2f64, f128mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
int_x86_avx_blend_pd_256,VR256, loadv4f64,
f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
VEX_4V, VEX_L;
}
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, loadv2i64, i128mem, 0,
DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
}

let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
VR128, loadv4f32, f128mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
int_x86_avx_blend_ps_256, VR256, loadv8f32,
f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
VR128, loadv2f64, f128mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
int_x86_avx_blend_pd_256,VR256, loadv4f64,
f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
VEX_4V, VEX_L;
}
defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
VR128, loadv2i64, i128mem, 0,
DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, loadv2i64, i128mem, 0,
DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
}

let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
Expand Down Expand Up @@ -7589,6 +7591,10 @@ let Predicates = [HasAVX2] in {

let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
VR128, memopv2i64, i128mem,
1, SSE_MPSADBW_ITINS>;
}
let ExeDomain = SSEPackedSingle in
defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
VR128, memopv4f32, f128mem,
Expand All @@ -7600,10 +7606,6 @@ let Constraints = "$src1 = $dst" in {
defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem,
1, SSE_INTALU_ITINS_BLEND_P>;
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
VR128, memopv2i64, i128mem,
1, SSE_MPSADBW_ITINS>;
}
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
VR128, memopv4f32, f128mem, 1,
Expand Down Expand Up @@ -8827,12 +8829,10 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
}

let isCommutable = 0 in {
defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
VR128, loadv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
VR256, loadv4i64, i256mem>, VEX_L;
}

def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
imm:$mask)),
Expand Down
31 changes: 11 additions & 20 deletions llvm/test/CodeGen/X86/combine-or.ll
Expand Up @@ -19,8 +19,7 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test2:
; CHECK: # BB#0:
; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
Expand All @@ -32,8 +31,7 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test3:
; CHECK: # BB#0:
; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
Expand All @@ -45,8 +43,7 @@ define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test4:
; CHECK: # BB#0:
; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
Expand Down Expand Up @@ -106,8 +103,7 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test9:
; CHECK: # BB#0:
; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
%and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
Expand All @@ -119,8 +115,7 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test10:
; CHECK: # BB#0:
; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <2 x i64> %a, <i64 0, i64 -1>
%and2 = and <2 x i64> %b, <i64 -1, i64 0>
Expand All @@ -132,8 +127,7 @@ define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test11:
; CHECK: # BB#0:
; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
%and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
Expand Down Expand Up @@ -230,12 +224,10 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test18:
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: xorps %xmm3, %xmm3
; CHECK-NEXT: blendps {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1]
; CHECK-NEXT: blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; CHECK-NEXT: orps %xmm0, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
Expand Down Expand Up @@ -295,8 +287,7 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: test_crash:
; CHECK: # BB#0:
; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
%shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
Expand Down
89 changes: 89 additions & 0 deletions llvm/test/CodeGen/X86/commute-blend-avx2.ll
@@ -0,0 +1,89 @@
; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s

; The loaded vector is the first blend operand; the test expects the load to
; be folded into vpblendw (shown as mem) with the lanes from %a kept in xmm0.
define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
  %1 = load <8 x i16>* %b
  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
  ret <8 x i16> %2

;CHECK-LABEL: commute_fold_vpblendw_128:
;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into the 256-bit vpblendw (shown as mem).
define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
  %1 = load <16 x i16>* %b
  %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
  ret <16 x i16> %2

;CHECK-LABEL: commute_fold_vpblendw_256:
;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
;CHECK-NEXT: retq
}
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into vpblendd (shown as mem).
define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
  %1 = load <4 x i32>* %b
  %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
  ret <4 x i32> %2

;CHECK-LABEL: commute_fold_vpblendd_128:
;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
;CHECK-NEXT: retq
}
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into the 256-bit vpblendd (shown as mem).
define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
  %1 = load <8 x i32>* %b
  %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
  ret <8 x i32> %2

;CHECK-LABEL: commute_fold_vpblendd_256:
;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
;CHECK-NEXT: retq
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into vblendps (shown as mem).
define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
  %1 = load <4 x float>* %b
  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
  ret <4 x float> %2

;CHECK-LABEL: commute_fold_vblendps_128:
;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into the 256-bit vblendps (shown as mem).
define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
  %1 = load <8 x float>* %b
  %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
  ret <8 x float> %2

;CHECK-LABEL: commute_fold_vblendps_256:
;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
;CHECK-NEXT: retq
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into vblendpd (shown as mem).
define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
  %1 = load <2 x double>* %b
  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
  ret <2 x double> %2

;CHECK-LABEL: commute_fold_vblendpd_128:
;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into the 256-bit vblendpd (shown as mem).
define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
  %1 = load <4 x double>* %b
  %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
  ret <4 x double> %2

;CHECK-LABEL: commute_fold_vblendpd_256:
;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
;CHECK-NEXT: retq
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
34 changes: 34 additions & 0 deletions llvm/test/CodeGen/X86/commute-blend-sse41.ll
@@ -0,0 +1,34 @@
; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s

; The loaded vector is the first blend operand; the test expects the load to
; be folded into pblendw (shown as mem) with the lanes from %a kept in xmm0.
define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
  %1 = load <8 x i16>* %b
  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
  ret <8 x i16> %2

;CHECK-LABEL: commute_fold_pblendw:
;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into blendps (shown as mem).
define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
  %1 = load <4 x float>* %b
  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
  ret <4 x float> %2

;CHECK-LABEL: commute_fold_blendps:
;CHECK: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone

; The loaded vector is the first blend operand; the test expects the load to
; be folded into blendpd (shown as mem).
define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
  %1 = load <2 x double>* %b
  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
  ret <2 x double> %2

;CHECK-LABEL: commute_fold_blendpd:
;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone

0 comments on commit c9a0779

Please sign in to comment.