13 changes: 8 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1023,7 +1023,8 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
}
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
bool HasWholeProgramVisibility) {
SetVector<Function *> Functions;
for (Function &F : M) {
if (!F.isIntrinsic())
@@ -1038,9 +1039,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID});
&AAUnderlyingObjects::ID, &AAIndirectCallInfo::ID});

AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = HasWholeProgramVisibility;
AC.Allowed = &Allowed;
AC.IsModulePass = true;
AC.DefaultInitializeLiveInternals = false;
@@ -1086,7 +1088,7 @@ class AMDGPUAttributorLegacy : public ModulePass {

bool runOnModule(Module &M) override {
AnalysisGetter AG(this);
return runImpl(M, AG, *TM);
return runImpl(M, AG, *TM, /*HasWholeProgramVisibility=*/false);
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1107,8 +1109,9 @@ PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
AnalysisGetter AG(FAM);

// TODO: Probably preserves CFG
return runImpl(M, AG, TM) ? PreservedAnalyses::none()
: PreservedAnalyses::all();
return runImpl(M, AG, TM, HasWholeProgramVisibility)
? PreservedAnalyses::none()
: PreservedAnalyses::all();
}

char AMDGPUAttributorLegacy::ID = 0;
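Note: runImpl now takes a HasWholeProgramVisibility flag that is forwarded into AttributorConfig::IsClosedWorldModule, and AAIndirectCallInfo joins the allowed attributes, which is what lets indirect calls be resolved against the set of known callees (see the amdgpu-attributor-no-agpr.ll update further down). The legacy pass conservatively passes false. A rough sketch of the pass-class plumbing this implies on the new-PM side; the actual declaration lives in the AMDGPU headers, and the member and parameter names here are assumptions:

#include "llvm/IR/PassManager.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
  TargetMachine &TM;
  // Set from the pass-builder callback; false means "open world", i.e. callees
  // outside the module may exist.
  bool HasWholeProgramVisibility;

public:
  AMDGPUAttributorPass(TargetMachine &TM, bool HasWholeProgramVisibility = false)
      : TM(TM), HasWholeProgramVisibility(HasWholeProgramVisibility) {}
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};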
15 changes: 9 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -735,12 +735,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});

// FIXME: Why is AMDGPUAttributor not in CGSCC?
PB.registerOptimizerLastEPCallback(
[this](ModulePassManager &MPM, OptimizationLevel Level) {
if (Level != OptimizationLevel::O0) {
MPM.addPass(AMDGPUAttributorPass(*this));
}
});
PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
OptimizationLevel Level,
ThinOrFullLTOPhase Phase) {
if (Level != OptimizationLevel::O0) {
MPM.addPass(AMDGPUAttributorPass(
*this, Phase == ThinOrFullLTOPhase::FullLTOPostLink ||
Phase == ThinOrFullLTOPhase::ThinLTOPostLink));
}
});

PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
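Note: the optimizer-last callback gains the ThinOrFullLTOPhase parameter, and the attributor is only told to assume a closed world for post-link (Thin or Full) LTO modules, since that is the first point where every callable function is actually present in the module. The same condition, pulled out into a standalone helper for clarity (the helper name is ours, not part of the patch):

#include "llvm/Passes/PassBuilder.h"
using namespace llvm;

// Only after the LTO link does the module see the whole program, so only then
// is a closed-world assumption sound for the AMDGPU attributor.
static bool isClosedWorld(ThinOrFullLTOPhase Phase) {
  return Phase == ThinOrFullLTOPhase::FullLTOPostLink ||
         Phase == ThinOrFullLTOPhase::ThinLTOPostLink;
}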
22 changes: 20 additions & 2 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1963,7 +1963,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
return LT.first * ST->getMVEVectorCostFactor(CostKind);

// Otherwise we use a legal convert followed by a min+max
// If we can, we use a legal convert followed by a min+max
if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
(ST->hasFP64() && LT.second == MVT::f64) ||
(ST->hasFullFP16() && LT.second == MVT::f16) ||
@@ -1984,7 +1984,25 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Cost += getIntrinsicInstrCost(Attrs2, CostKind);
return LT.first * Cost;
}
break;
// Otherwise we need to follow the default expansion that clamps the value
// using a float min/max with a fcmp+sel for nan handling when signed.
Type *FPTy = ICA.getArgTypes()[0];
Type *RetTy = ICA.getReturnType();
IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
Cost += getIntrinsicInstrCost(Attrs2, CostKind);
Cost +=
getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
RetTy, FPTy, TTI::CastContextHint::None, CostKind);
if (IsSigned) {
Type *CondTy = RetTy->getWithNewBitWidth(1);
Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
CmpInst::FCMP_UNO, CostKind);
Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
CmpInst::FCMP_UNO, CostKind);
}
return Cost;
}
}

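Note: instead of falling through to break, the ARM cost model now prices the generic lowering of llvm.fptosi.sat / llvm.fptoui.sat when no cheap native path applies: clamp with a float min/max, convert, and for the signed case add an fcmp+select that zeroes the result on NaN. A sketch of the accounting only, using placeholder cost fields rather than the LLVM TTI API:

// Illustrative arithmetic; the real code queries getIntrinsicInstrCost,
// getCastInstrCost and getCmpSelInstrCost for each piece.
struct ExpansionCosts {
  unsigned MinNum, MaxNum, Convert, FCmp, Select;
};

unsigned fpToIntSatDefaultCost(const ExpansionCosts &C, bool IsSigned) {
  unsigned Cost = C.MinNum + C.MaxNum + C.Convert; // clamp, then convert
  if (IsSigned)
    Cost += C.FCmp + C.Select; // NaN input mapped to 0 via fcmp uno + select
  return Cost;
}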
5 changes: 1 addition & 4 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1500,10 +1500,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// Don't try to simplify calls without uses. It will not do anything useful,
// but will result in the following folds being skipped.
if (!CI.use_empty()) {
SmallVector<Value *, 4> Args;
Args.reserve(CI.arg_size());
for (Value *Op : CI.args())
Args.push_back(Op);
SmallVector<Value *, 8> Args(CI.args());
if (Value *V = simplifyCall(&CI, CI.getCalledOperand(), Args,
SQ.getWithInstruction(&CI)))
return replaceInstUsesWith(CI, V);
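Note: pure cleanup — the reserve-and-push_back loop is replaced with SmallVector's range constructor (and the inline capacity bumped from 4 to 8). The same idiom in isolation, with a helper name of our own choosing:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Build the argument list in one step from the call's argument range; this is
// exactly what the replacement line above relies on.
static SmallVector<Value *, 8> collectCallArgs(CallBase &CB) {
  return SmallVector<Value *, 8>(CB.args());
}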
2 changes: 1 addition & 1 deletion llvm/test/Bindings/llvm-c/echo.ll
@@ -70,7 +70,7 @@ define void @types() {
%9 = alloca [3 x i22], align 4
%10 = alloca ptr addrspace(5), align 8
%11 = alloca <5 x ptr>, align 64
%12 = alloca x86_mmx, align 8
%12 = alloca <1 x i64>, align 8
ret void
}

2 changes: 0 additions & 2 deletions llvm/test/Bitcode/compatibility.ll
@@ -1112,8 +1112,6 @@ define void @typesystem() {
; CHECK: %t5 = alloca x86_fp80
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca ptr
; CHECK: %t8 = alloca ptr
%t9 = alloca <4 x i32>
14 changes: 13 additions & 1 deletion llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -231,7 +231,19 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
; CHECK-NEXT: call void [[FPTR]]()
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
; CHECK: 2:
; CHECK-NEXT: call void @also_empty()
; CHECK-NEXT: br label [[TMP6:%.*]]
; CHECK: 3:
; CHECK-NEXT: br i1 true, label [[TMP4:%.*]], label [[TMP5:%.*]]
; CHECK: 4:
; CHECK-NEXT: call void @empty()
; CHECK-NEXT: br label [[TMP6]]
; CHECK: 5:
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: ret void
;
%fptr = select i1 %cond, ptr @empty, ptr @also_empty
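Note: the updated CHECK lines show the effect of running the attributor with AAIndirectCallInfo under the closed-world assumption: the indirect call through %fptr is rewritten into a compare against @also_empty plus guarded direct calls, with the final br i1 true reflecting that no other callee remains. A C++-level sketch of the same control flow (the real transformation operates on the IR, not on source):

static void empty() {}
static void also_empty() {}

void indirect_calls_none_agpr(bool cond) {
  void (*fptr)() = cond ? empty : also_empty;
  // Only two callees are possible, so the indirect call becomes guarded
  // direct calls.
  if (fptr == also_empty)
    also_empty();
  else
    empty();
}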
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll
@@ -25,10 +25,8 @@ define void @test(<1 x i64> %c64, <1 x i64> %mask1, ptr %P) {
; CHECK-NEXT: popl %edi
; CHECK-NEXT: retl
entry:
%tmp4 = bitcast <1 x i64> %mask1 to x86_mmx ; <x86_mmx> [#uses=1]
%tmp6 = bitcast <1 x i64> %c64 to x86_mmx ; <x86_mmx> [#uses=1]
tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp4, x86_mmx %tmp6, ptr %P )
tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %mask1, <1 x i64> %c64, ptr %P )
ret void
}

declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr)
declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr)
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | FileCheck %s

@R = external global x86_mmx ; <ptr> [#uses=1]
@R = external global <1 x i64> ; <ptr> [#uses=1]

define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
; CHECK-LABEL: foo:
@@ -14,13 +14,11 @@ define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
; CHECK-NEXT: emms
; CHECK-NEXT: retq
entry:
%tmp4 = bitcast <1 x i64> %B to x86_mmx ; <<4 x i16>> [#uses=1]
%tmp6 = bitcast <1 x i64> %A to x86_mmx ; <<4 x i16>> [#uses=1]
%tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 ) ; <x86_mmx> [#uses=1]
store x86_mmx %tmp7, ptr @R
%tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w( <1 x i64> %A, <1 x i64> %B ) ; <<1 x i64>> [#uses=1]
store <1 x i64> %tmp7, ptr @R
tail call void @llvm.x86.mmx.emms( )
ret void
}

declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
@@ -5,15 +5,15 @@ entry:
tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
%tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <x86_mmx> [#uses=1]
%tmp1 = tail call <1 x i64> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <<1 x i64>> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
%tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind ; <i32> [#uses=1]
%tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef ) nounwind ; <i32> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind
tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef, i32 undef, i32 %tmp3 ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
%tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind ; <i32> [#uses=0]
%tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> %tmp1 ) nounwind ; <i32> [#uses=0]
ret i32 undef
}
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
@@ -17,13 +17,13 @@ entry:
br i1 false, label %bb.nph144.split, label %bb133

bb.nph144.split: ; preds = %entry
%tmp = bitcast <8 x i8> zeroinitializer to x86_mmx
%tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx
tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, ptr null ) nounwind
%tmp = bitcast <8 x i8> zeroinitializer to <1 x i64>
%tmp2 = bitcast <8 x i8> zeroinitializer to <1 x i64>
tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp, <1 x i64> %tmp2, ptr null ) nounwind
unreachable

bb133: ; preds = %entry
ret void
}

declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -26,7 +26,7 @@ entry:

; This is how to get MMX instructions.

define <2 x double> @a2(x86_mmx %x) nounwind {
define <2 x double> @a2(<1 x i64> %x) nounwind {
; CHECK-LABEL: a2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
@@ -42,11 +42,11 @@ define <2 x double> @a2(x86_mmx %x) nounwind {
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
%y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x)
%y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %x)
ret <2 x double> %y
}

define x86_mmx @b2(<2 x double> %x) nounwind {
define <1 x i64> @b2(<2 x double> %x) nounwind {
; CHECK-LABEL: b2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
@@ -61,9 +61,9 @@ define x86_mmx @b2(<2 x double> %x) nounwind {
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
%y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
ret x86_mmx %y
%y = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
ret <1 x i64> %y
}

declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>)
declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>)
declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>)
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
@@ -3,22 +3,22 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-macosx10.6.6"

%0 = type { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx }
%0 = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }

define i32 @pixman_fill_mmx(ptr nocapture %bits, i32 %stride, i32 %bpp, i32 %x, i32 %y, i32 %width, i32 %height, i32 %xor) nounwind ssp {
entry:
%conv = zext i32 %xor to i64
%shl = shl nuw i64 %conv, 32
%or = or i64 %shl, %conv
%0 = bitcast i64 %or to x86_mmx
%0 = bitcast i64 %or to <1 x i64>
; CHECK: movq [[MMXR:%mm[0-7],]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
%1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(x86_mmx %0) nounwind, !srcloc !0
%1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %0) nounwind, !srcloc !0
%asmresult = extractvalue %0 %1, 0
%asmresult6 = extractvalue %0 %1, 1
%asmresult7 = extractvalue %0 %1, 2
@@ -34,7 +34,7 @@ entry:
; CHECK-NEXT: movq {{%mm[0-7]}},
; CHECK-NEXT: movq {{%mm[0-7]}},
; CHECK-NEXT: movq {{%mm[0-7]}},
tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, x86_mmx %0, x86_mmx %asmresult, x86_mmx %asmresult6, x86_mmx %asmresult7, x86_mmx %asmresult8, x86_mmx %asmresult9, x86_mmx %asmresult10, x86_mmx %asmresult11) nounwind, !srcloc !1
tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, <1 x i64> %0, <1 x i64> %asmresult, <1 x i64> %asmresult6, <1 x i64> %asmresult7, <1 x i64> %asmresult8, <1 x i64> %asmresult9, <1 x i64> %asmresult10, <1 x i64> %asmresult11) nounwind, !srcloc !1
tail call void @llvm.x86.mmx.emms() nounwind
ret i32 1
}
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -1011,7 +1011,7 @@ define float @broadcast_lifetime() nounwind {
ret float %7
}

define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86: ## %bb.0: ## %bb
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
@@ -1023,7 +1023,7 @@ define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
bb:
%tmp1 = bitcast x86_mmx %tmp to i64
%tmp1 = bitcast <1 x i64> %tmp to i64
%tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
%tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1449,7 +1449,7 @@ eintry:
ret void
}

define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86: ## %bb.0: ## %bb
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
@@ -1466,7 +1466,7 @@ define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
; X64-AVX512VL-NEXT: vpbroadcastq %rdi, %xmm0
; X64-AVX512VL-NEXT: retq
bb:
%tmp1 = bitcast x86_mmx %tmp to i64
%tmp1 = bitcast <1 x i64> %tmp to i64
%tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
%tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
34 changes: 17 additions & 17 deletions llvm/test/CodeGen/X86/bitcast-mmx.ll
@@ -17,9 +17,9 @@ define i32 @t0(i64 %x) nounwind {
; X64-NEXT: retq
entry:
%0 = bitcast i64 %x to <4 x i16>
%1 = bitcast <4 x i16> %0 to x86_mmx
%2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 -18)
%3 = bitcast x86_mmx %2 to <4 x i16>
%1 = bitcast <4 x i16> %0 to <1 x i64>
%2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 -18)
%3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
%6 = bitcast i64 %5 to <2 x i32>
Expand Down Expand Up @@ -52,9 +52,9 @@ define i64 @t1(i64 %x, i32 %n) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = bitcast i64 %x to x86_mmx
%1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %n)
%2 = bitcast x86_mmx %1 to i64
%0 = bitcast i64 %x to <1 x i64>
%1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %n)
%2 = bitcast <1 x i64> %1 to i64
ret i64 %2
}

@@ -88,11 +88,11 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
entry:
%0 = insertelement <2 x i32> undef, i32 %w, i32 0
%1 = insertelement <2 x i32> %0, i32 0, i32 1
%2 = bitcast <2 x i32> %1 to x86_mmx
%3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %2, i32 %n)
%4 = bitcast i64 %x to x86_mmx
%5 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %4, x86_mmx %3)
%6 = bitcast x86_mmx %5 to i64
%2 = bitcast <2 x i32> %1 to <1 x i64>
%3 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %2, i32 %n)
%4 = bitcast i64 %x to <1 x i64>
%5 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %4, <1 x i64> %3)
%6 = bitcast <1 x i64> %5 to i64
ret i64 %6
}

@@ -123,14 +123,14 @@ define i64 @t3(ptr %y, ptr %n) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %y, align 8
%0 = load <1 x i64>, ptr %y, align 8
%1 = load i32, ptr %n, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}

declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
@@ -6,9 +6,9 @@

define <2 x i32> @test_paddw(<2 x i32> %a) nounwind readnone {
entry:
%0 = bitcast <2 x i32> %a to x86_mmx
%1 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %0, x86_mmx %0)
%2 = bitcast x86_mmx %1 to <2 x i32>
%0 = bitcast <2 x i32> %a to <1 x i64>
%1 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %0, <1 x i64> %0)
%2 = bitcast <1 x i64> %1 to <2 x i32>
ret <2 x i32> %2
}

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/fast-isel-bc.ll
@@ -4,7 +4,7 @@

; PR4684

declare void @func2(x86_mmx)
declare void @func2(<1 x i64>)

; This isn't spectacular, but it's MMX code at -O0...

@@ -28,7 +28,7 @@ define void @func1() nounwind {
; X64-NEXT: callq _func2
; X64-NEXT: popq %rax
; X64-NEXT: retq
%tmp0 = bitcast <2 x i32> <i32 0, i32 2> to x86_mmx
call void @func2(x86_mmx %tmp0)
%tmp0 = bitcast <2 x i32> <i32 0, i32 2> to <1 x i64>
call void @func2(<1 x i64> %tmp0)
ret void
}
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -104,12 +104,12 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
; ALL-NEXT: movntq %mm0, (%rsi)
; ALL-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a0
%1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3)
store x86_mmx %1, ptr %a1, align 8, !nontemporal !1
%0 = load <1 x i64>, ptr %a0
%1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3)
store <1 x i64> %1, ptr %a1, align 8, !nontemporal !1
ret void
}
declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone

;
; 128-bit Vector Stores
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -14,12 +14,12 @@ define void @t3() nounwind {
; X86-64-NEXT: xorl %eax, %eax
; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL
%tmp3 = load <8 x i8>, ptr @g_v8qi, align 8
%tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
%tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
%tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64>
%tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind
ret void
}

define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
define void @t4(<1 x i64> %v1, <1 x i64> %v2) nounwind {
; X86-64-LABEL: t4:
; X86-64: ## %bb.0:
; X86-64-NEXT: movq %rdi, %xmm0
@@ -28,11 +28,11 @@ define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
; X86-64-NEXT: movq %xmm1, %rdi
; X86-64-NEXT: xorl %eax, %eax
; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL
%v1a = bitcast x86_mmx %v1 to <8 x i8>
%v2b = bitcast x86_mmx %v2 to <8 x i8>
%v1a = bitcast <1 x i64> %v1 to <8 x i8>
%v2b = bitcast <1 x i64> %v2 to <8 x i8>
%tmp3 = add <8 x i8> %v1a, %v2b
%tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
%tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
%tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64>
%tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind
ret void
}

11 changes: 5 additions & 6 deletions llvm/test/CodeGen/X86/mmx-arg-passing.ll
@@ -8,9 +8,9 @@
; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7].
; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs.

@u1 = external global x86_mmx
@u1 = external global <1 x i64>

define void @t1(x86_mmx %v1) nounwind {
define void @t1(<1 x i64> %v1) nounwind {
; X86-32-LABEL: t1:
; X86-32: ## %bb.0:
; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -25,11 +25,11 @@ define void @t1(x86_mmx %v1) nounwind {
; X86-64-NEXT: movq _u1@GOTPCREL(%rip), %rax
; X86-64-NEXT: movq %rdi, (%rax)
; X86-64-NEXT: retq
store x86_mmx %v1, ptr @u1, align 8
store <1 x i64> %v1, ptr @u1, align 8
ret void
}

@u2 = external global x86_mmx
@u2 = external global <1 x i64>

define void @t2(<1 x i64> %v1) nounwind {
; X86-32-LABEL: t2:
@@ -46,7 +46,6 @@ define void @t2(<1 x i64> %v1) nounwind {
; X86-64-NEXT: movq _u2@GOTPCREL(%rip), %rax
; X86-64-NEXT: movq %rdi, (%rax)
; X86-64-NEXT: retq
%tmp = bitcast <1 x i64> %v1 to x86_mmx
store x86_mmx %tmp, ptr @u2, align 8
store <1 x i64> %v1, ptr @u2, align 8
ret void
}
307 changes: 153 additions & 154 deletions llvm/test/CodeGen/X86/mmx-arith.ll

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
@@ -4,9 +4,9 @@

define void @bar() {
entry:
%0 = bitcast double 0.0 to x86_mmx
%1 = call x86_mmx @foo(x86_mmx %0)
%0 = bitcast double 0.0 to <1 x i64>
%1 = call <1 x i64> @foo(<1 x i64> %0)
ret void
}

declare x86_mmx @foo(x86_mmx)
declare <1 x i64> @foo(<1 x i64>)
50 changes: 23 additions & 27 deletions llvm/test/CodeGen/X86/mmx-bitcast.ll
@@ -8,9 +8,9 @@ define i64 @t0(ptr %p) {
; CHECK-NEXT: paddq %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%t = load x86_mmx, ptr %p
%u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t)
%s = bitcast x86_mmx %u to i64
%t = load <1 x i64>, ptr %p
%u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %t)
%s = bitcast <1 x i64> %u to i64
ret i64 %s
}

@@ -21,9 +21,9 @@ define i64 @t1(ptr %p) {
; CHECK-NEXT: paddd %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%t = load x86_mmx, ptr %p
%u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t)
%s = bitcast x86_mmx %u to i64
%t = load <1 x i64>, ptr %p
%u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %t)
%s = bitcast <1 x i64> %u to i64
ret i64 %s
}

@@ -34,9 +34,9 @@ define i64 @t2(ptr %p) {
; CHECK-NEXT: paddw %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%t = load x86_mmx, ptr %p
%u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t)
%s = bitcast x86_mmx %u to i64
%t = load <1 x i64>, ptr %p
%u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %t)
%s = bitcast <1 x i64> %u to i64
ret i64 %s
}

@@ -47,13 +47,13 @@ define i64 @t3(ptr %p) {
; CHECK-NEXT: paddb %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%t = load x86_mmx, ptr %p
%u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t)
%s = bitcast x86_mmx %u to i64
%t = load <1 x i64>, ptr %p
%u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %t)
%s = bitcast <1 x i64> %u to i64
ret i64 %s
}

@R = external global x86_mmx
@R = external global <1 x i64>

define void @t4(<1 x i64> %A, <1 x i64> %B) {
; CHECK-LABEL: t4:
@@ -66,10 +66,8 @@ define void @t4(<1 x i64> %A, <1 x i64> %B) {
; CHECK-NEXT: emms
; CHECK-NEXT: retq
entry:
%tmp2 = bitcast <1 x i64> %A to x86_mmx
%tmp3 = bitcast <1 x i64> %B to x86_mmx
%tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp2, x86_mmx %tmp3)
store x86_mmx %tmp7, ptr @R
%tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %A, <1 x i64> %B)
store <1 x i64> %tmp7, ptr @R
tail call void @llvm.x86.mmx.emms()
ret void
}
@@ -88,7 +86,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone {
ret i64 %conv
}

declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)

define <1 x i64> @t6(i64 %t) {
; CHECK-LABEL: t6:
@@ -98,16 +96,14 @@ define <1 x i64> @t6(i64 %t) {
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%t1 = insertelement <1 x i64> undef, i64 %t, i32 0
%t0 = bitcast <1 x i64> %t1 to x86_mmx
%t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48)
%t3 = bitcast x86_mmx %t2 to <1 x i64>
ret <1 x i64> %t3
%t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48)
ret <1 x i64> %t2
}

declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()

134 changes: 67 additions & 67 deletions llvm/test/CodeGen/X86/mmx-build-vector.ll
@@ -8,7 +8,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefix=X64

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)

;
; v2i32
@@ -35,9 +35,9 @@ define void @build_v2i32_01(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -58,9 +58,9 @@ define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 0, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -92,9 +92,9 @@ define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 undef, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -119,9 +119,9 @@ define void @build_v2i32_z1(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -153,9 +153,9 @@ define void @build_v2i32_00(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a0, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -194,9 +194,9 @@ define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 %a2, i32 2
%4 = insertelement <4 x i16> %3, i16 %a3, i32 3
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
store x86_mmx %6, ptr%p0
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
store <1 x i64> %6, ptr%p0
ret void
}

@@ -229,9 +229,9 @@ define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 0, i32 2
%4 = insertelement <4 x i16> %3, i16 0, i32 3
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
store x86_mmx %6, ptr%p0
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
store <1 x i64> %6, ptr%p0
ret void
}

@@ -254,9 +254,9 @@ define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 undef, i32 1
%3 = insertelement <4 x i16> %2, i16 undef, i32 2
%4 = insertelement <4 x i16> %3, i16 0, i32 3
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
store x86_mmx %6, ptr%p0
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
store <1 x i64> %6, ptr%p0
ret void
}

@@ -281,9 +281,9 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 0, i32 1
%3 = insertelement <4 x i16> %2, i16 undef, i32 2
%4 = insertelement <4 x i16> %3, i16 0, i32 3
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
store x86_mmx %6, ptr%p0
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
store <1 x i64> %6, ptr%p0
ret void
}

@@ -316,9 +316,9 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 %a2, i32 2
%4 = insertelement <4 x i16> %3, i16 undef, i32 3
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
store x86_mmx %6, ptr%p0
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
store <1 x i64> %6, ptr%p0
ret void
}

@@ -353,9 +353,9 @@ define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 undef, i32 1
%3 = insertelement <4 x i16> %2, i16 %a0, i32 2
%4 = insertelement <4 x i16> %3, i16 %a0, i32 3
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
store x86_mmx %6, ptr%p0
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
store <1 x i64> %6, ptr%p0
ret void
}

@@ -414,9 +414,9 @@ define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 %a5, i32 5
%7 = insertelement <8 x i8> %6, i8 %a6, i32 6
%8 = insertelement <8 x i8> %7, i8 %a7, i32 7
%9 = bitcast <8 x i8> %8 to x86_mmx
%10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
store x86_mmx %10, ptr%p0
%9 = bitcast <8 x i8> %8 to <1 x i64>
%10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
store <1 x i64> %10, ptr%p0
ret void
}

@@ -469,9 +469,9 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 %a5, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 %a7, i32 7
%9 = bitcast <8 x i8> %8 to x86_mmx
%10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
store x86_mmx %10, ptr%p0
%9 = bitcast <8 x i8> %8 to <1 x i64>
%10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
store <1 x i64> %10, ptr%p0
ret void
}

@@ -522,9 +522,9 @@ define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 0, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 undef, i32 7
%9 = bitcast <8 x i8> %8 to x86_mmx
%10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
store x86_mmx %10, ptr%p0
%9 = bitcast <8 x i8> %8 to <1 x i64>
%10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
store <1 x i64> %10, ptr%p0
ret void
}

@@ -551,9 +551,9 @@ define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 0, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 0, i32 7
%9 = bitcast <8 x i8> %8 to x86_mmx
%10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
store x86_mmx %10, ptr%p0
%9 = bitcast <8 x i8> %8 to <1 x i64>
%10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
store <1 x i64> %10, ptr%p0
ret void
}

@@ -582,9 +582,9 @@ define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 0, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 undef, i32 7
%9 = bitcast <8 x i8> %8 to x86_mmx
%10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
store x86_mmx %10, ptr%p0
%9 = bitcast <8 x i8> %8 to <1 x i64>
%10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
store <1 x i64> %10, ptr%p0
ret void
}

@@ -626,9 +626,9 @@ define void @build_v8i8_00000000(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 %a0, i32 5
%7 = insertelement <8 x i8> %6, i8 %a0, i32 6
%8 = insertelement <8 x i8> %7, i8 %a0, i32 7
%9 = bitcast <8 x i8> %8 to x86_mmx
%10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
store x86_mmx %10, ptr%p0
%9 = bitcast <8 x i8> %8 to <1 x i64>
%10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
store <1 x i64> %10, ptr%p0
ret void
}

@@ -669,9 +669,9 @@ define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x float> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -707,9 +707,9 @@ define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float 0.0, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x float> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -742,9 +742,9 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float undef, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x float> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -780,9 +780,9 @@ define void @build_v2f32_z1(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float 0.0, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x float> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}

@@ -815,8 +815,8 @@ define void @build_v2f32_00(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float %a0, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
%4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
store x86_mmx %4, ptr%p0
%3 = bitcast <2 x float> %2 to <1 x i64>
%4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
store <1 x i64> %4, ptr%p0
ret void
}
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/X86/mmx-coalescing.ll
@@ -42,9 +42,9 @@ entry:
%SA2 = getelementptr inbounds %SA, ptr %pSA, i64 0, i32 4
%v3 = load ptr, ptr %SA2, align 8
%v4 = bitcast <1 x i64> %v0 to <4 x i16>
%v5 = bitcast <4 x i16> %v4 to x86_mmx
%v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18)
%v7 = bitcast x86_mmx %v6 to <4 x i16>
%v5 = bitcast <4 x i16> %v4 to <1 x i64>
%v6 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v5, i8 -18)
%v7 = bitcast <1 x i64> %v6 to <4 x i16>
%v8 = bitcast <4 x i16> %v7 to <1 x i64>
%v9 = extractelement <1 x i64> %v8, i32 0
%v10 = bitcast i64 %v9 to <2 x i32>
@@ -55,18 +55,18 @@ entry:
if.A:
%pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ]
%v17 = extractelement <1 x i64> %pa, i32 0
%v18 = bitcast i64 %v17 to x86_mmx
%v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2
%v20 = bitcast x86_mmx %v19 to i64
%v18 = bitcast i64 %v17 to <1 x i64>
%v19 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %v18, i32 %B) #2
%v20 = bitcast <1 x i64> %v19 to i64
%v21 = insertelement <1 x i64> undef, i64 %v20, i32 0
%cmp3 = icmp eq i64 %v20, 0
br i1 %cmp3, label %if.C, label %merge

if.B:
%v34 = bitcast <1 x i64> %v8 to <4 x i16>
%v35 = bitcast <4 x i16> %v34 to x86_mmx
%v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18)
%v37 = bitcast x86_mmx %v36 to <4 x i16>
%v35 = bitcast <4 x i16> %v34 to <1 x i64>
%v36 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v35, i8 -18)
%v37 = bitcast <1 x i64> %v36 to <4 x i16>
%v38 = bitcast <4 x i16> %v37 to <1 x i64>
br label %if.C

Expand All @@ -80,9 +80,9 @@ if.C:
merge:
%vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ]
%v130 = bitcast <1 x i64> %vy to <4 x i16>
%v131 = bitcast <4 x i16> %v130 to x86_mmx
%v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18)
%v133 = bitcast x86_mmx %v132 to <4 x i16>
%v131 = bitcast <4 x i16> %v130 to <1 x i64>
%v132 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v131, i8 -18)
%v133 = bitcast <1 x i64> %v132 to <4 x i16>
%v134 = bitcast <4 x i16> %v133 to <1 x i64>
%v135 = extractelement <1 x i64> %v134, i32 0
%v136 = bitcast i64 %v135 to <2 x i32>
Expand All @@ -91,5 +91,5 @@ merge:
}


declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
62 changes: 31 additions & 31 deletions llvm/test/CodeGen/X86/mmx-cvt.ll
@@ -23,9 +23,9 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
%6 = bitcast i64 %5 to x86_mmx
%7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
%8 = bitcast x86_mmx %7 to i64
%6 = bitcast i64 %5 to <1 x i64>
%7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
%8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -49,9 +49,9 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
%6 = bitcast i64 %5 to x86_mmx
%7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
%8 = bitcast x86_mmx %7 to i64
%6 = bitcast i64 %5 to <1 x i64>
%7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
%8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -73,9 +73,9 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
%3 = fptosi <2 x double> %0 to <2 x i32>
%4 = bitcast <2 x i32> %3 to x86_mmx
%5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
%6 = bitcast x86_mmx %5 to i64
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %4, <1 x i64> %4)
%6 = bitcast <1 x i64> %5 to i64
%7 = insertelement <1 x i64> undef, i64 %6, i32 0
store <1 x i64> %7, ptr %1
ret void
@@ -99,9 +99,9 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
%6 = bitcast i64 %5 to x86_mmx
%7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
%8 = bitcast x86_mmx %7 to i64
%6 = bitcast i64 %5 to <1 x i64>
%7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
%8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -125,9 +125,9 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
%6 = bitcast i64 %5 to x86_mmx
%7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
%8 = bitcast x86_mmx %7 to i64
%6 = bitcast i64 %5 to <1 x i64>
%7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
%8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -150,9 +150,9 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
; X64-NEXT: retq
%3 = fptosi <4 x float> %0 to <4 x i32>
%4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%5 = bitcast <2 x i32> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
%7 = bitcast x86_mmx %6 to i64
%5 = bitcast <2 x i32> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
%7 = bitcast <1 x i64> %6 to i64
%8 = insertelement <1 x i64> undef, i64 %7, i32 0
store <1 x i64> %8, ptr %1
ret void
@@ -176,9 +176,9 @@ define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind {
%3 = fptosi <4 x float> %0 to <4 x i32>
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
%6 = bitcast i64 %5 to x86_mmx
%7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
%8 = bitcast x86_mmx %7 to i64
%6 = bitcast i64 %5 to <1 x i64>
%7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
%8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -210,9 +210,9 @@ define <2 x double> @sitofp_v2i32_v2f64(ptr) nounwind {
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-NEXT: retq
%2 = load x86_mmx, ptr %0, align 8
%3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
%4 = bitcast x86_mmx %3 to i64
%2 = load <1 x i64>, ptr %0, align 8
%3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
%4 = bitcast <1 x i64> %3 to i64
%5 = insertelement <2 x i64> undef, i64 %4, i32 0
%6 = bitcast <2 x i64> %5 to <4 x i32>
%7 = shufflevector <4 x i32> %6, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
@@ -237,9 +237,9 @@ define <4 x float> @sitofp_v2i32_v2f32(ptr) nounwind {
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%2 = load x86_mmx, ptr %0, align 8
%3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
%4 = bitcast x86_mmx %3 to <2 x i32>
%2 = load <1 x i64>, ptr %0, align 8
%3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
%4 = bitcast <1 x i64> %3 to <2 x i32>
%5 = shufflevector <2 x i32> %4, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = sitofp <4 x i32> %5 to <4 x float>
ret <4 x float> %6
@@ -269,17 +269,17 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind {
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%2 = load x86_mmx, ptr %0, align 8
%3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
%4 = bitcast x86_mmx %3 to i64
%2 = load <1 x i64>, ptr %0, align 8
%3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
%4 = bitcast <1 x i64> %3 to i64
%5 = insertelement <2 x i64> undef, i64 %4, i32 0
%6 = insertelement <2 x i64> %5, i64 0, i32 1
%7 = bitcast <2 x i64> %6 to <4 x i32>
%8 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %7)
ret <4 x float> %8
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
168 changes: 84 additions & 84 deletions llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -29,13 +29,13 @@ define i64 @t0(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)

define i64 @t1(ptr %a, ptr %b) nounwind {
; X86-LABEL: t1:
@@ -64,13 +64,13 @@ define i64 @t1(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32)

define i64 @t2(ptr %a, ptr %b) nounwind {
; X86-LABEL: t2:
@@ -99,13 +99,13 @@ define i64 @t2(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32)

define i64 @t3(ptr %a, ptr %b) nounwind {
; X86-LABEL: t3:
@@ -134,13 +134,13 @@ define i64 @t3(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32)

define i64 @t4(ptr %a, ptr %b) nounwind {
; X86-LABEL: t4:
@@ -169,13 +169,13 @@ define i64 @t4(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32)

define i64 @t5(ptr %a, ptr %b) nounwind {
; X86-LABEL: t5:
@@ -204,13 +204,13 @@ define i64 @t5(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32)

define i64 @t6(ptr %a, ptr %b) nounwind {
; X86-LABEL: t6:
@@ -239,13 +239,13 @@ define i64 @t6(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32)

define i64 @t7(ptr %a, ptr %b) nounwind {
; X86-LABEL: t7:
@@ -274,15 +274,15 @@ define i64 @t7(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a, align 8
%0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %0, i32 %1)
%3 = bitcast x86_mmx %2 to i64
%2 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %0, i32 %1)
%3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32)

define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
define i64 @tt0(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt0:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -312,16 +312,16 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()

define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
define i64 @tt1(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt1:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -351,15 +351,15 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)

define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
define i64 @tt2(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt2:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -389,15 +389,15 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)

define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
define i64 @tt3(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt3:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -427,15 +427,15 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)

define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
define i64 @tt4(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt4:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -465,15 +465,15 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>)

define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
define i64 @tt5(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt5:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -503,15 +503,15 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)

define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
define i64 @tt6(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt6:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -541,15 +541,15 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>)

define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
define i64 @tt7(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt7:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -579,15 +579,15 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>)

define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
define i64 @tt8(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt8:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -617,13 +617,13 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%v = load x86_mmx, ptr %q
%u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v)
%s = bitcast x86_mmx %u to i64
%v = load <1 x i64>, ptr %q
%u = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %t, <1 x i64> %v)
%s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>)

define void @test_psrlq_by_volatile_shift_amount(ptr %t) nounwind {
; X86-LABEL: test_psrlq_by_volatile_shift_amount:
@@ -653,8 +653,8 @@ entry:
call void @llvm.lifetime.start(i64 4, ptr nonnull %0)
store volatile i32 1, ptr %0, align 4
%1 = load volatile i32, ptr %0, align 4
%2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx bitcast (<1 x i64> <i64 255> to x86_mmx), i32 %1)
store x86_mmx %2, ptr %t, align 8
%2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> <i64 255>, i32 %1)
store <1 x i64> %2, ptr %t, align 8
call void @llvm.lifetime.end(i64 4, ptr nonnull %0)
ret void
}
@@ -663,7 +663,7 @@ declare void @llvm.lifetime.start(i64, ptr nocapture)
declare void @llvm.lifetime.end(i64, ptr nocapture)

; Make sure we shrink this vector load and fold it.
define x86_mmx @vec_load(ptr %x) {
define <1 x i64> @vec_load(ptr %x) {
; X86-LABEL: vec_load:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
@@ -694,10 +694,10 @@ define x86_mmx @vec_load(ptr %x) {
%y = extractelement <4 x float> %z, i32 0
%a = insertelement <2 x float> undef, float %y, i32 0
%b = insertelement <2 x float> %a, float %y, i32 1
%c = bitcast <2 x float> %b to x86_mmx
%d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
ret x86_mmx %d
%c = bitcast <2 x float> %b to <1 x i64>
%d = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %c, <1 x i64> %c)
ret <1 x i64> %d
}

declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>)

52 changes: 26 additions & 26 deletions llvm/test/CodeGen/X86/mmx-fold-zero.ll
@@ -115,32 +115,32 @@ define double @mmx_zero(double, double, double, double) nounwind {
; X64-LARGE-NEXT: paddw %mm2, %mm0
; X64-LARGE-NEXT: movq2dq %mm0, %xmm0
; X64-LARGE-NEXT: retq
%5 = bitcast double %0 to x86_mmx
%6 = bitcast double %1 to x86_mmx
%7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6)
%8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
%9 = bitcast double %2 to x86_mmx
%10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9)
%11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10)
%12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11)
%13 = bitcast double %3 to x86_mmx
%14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13)
%15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9)
%16 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13)
%17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10)
%18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11)
%19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8)
%20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7)
%21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
%22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12)
%23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15)
%24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6)
%25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16)
%26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17)
%27 = bitcast x86_mmx %26 to double
%5 = bitcast double %0 to <1 x i64>
%6 = bitcast double %1 to <1 x i64>
%7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %6)
%8 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %7, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
%9 = bitcast double %2 to <1 x i64>
%10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %8, <1 x i64> %9)
%11 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %5, <1 x i64> %10)
%12 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %6, <1 x i64> %11)
%13 = bitcast double %3 to <1 x i64>
%14 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %12, <1 x i64> %13)
%15 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %14, <1 x i64> %9)
%16 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %15, <1 x i64> %13)
%17 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %16, <1 x i64> %10)
%18 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %17, <1 x i64> %11)
%19 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %18, <1 x i64> %8)
%20 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %19, <1 x i64> %7)
%21 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %20, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
%22 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %21, <1 x i64> %12)
%23 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %22, <1 x i64> %15)
%24 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %23, <1 x i64> %6)
%25 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %24, <1 x i64> %16)
%26 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %25, <1 x i64> %17)
%27 = bitcast <1 x i64> %26 to double
ret double %27
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>)
865 changes: 431 additions & 434 deletions llvm/test/CodeGen/X86/mmx-intrinsics.ll

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions llvm/test/CodeGen/X86/mmx-only.ll
@@ -3,18 +3,18 @@

; Test that turning off sse doesn't turn off mmx.

declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone

define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone {
; CHECK-LABEL: @test88
; CHECK: pcmpgtd
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
%mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
%mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
%2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
%3 = bitcast x86_mmx %2 to <2 x i32>
%mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
%mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
%2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
%3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
@@ -1,18 +1,18 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+mmx,+fma,+f16c,+avx512f -stop-after finalize-isel -o - %s | FileCheck %s
; This test ensures that the MXCSR is implicitly used by MMX FP instructions.

define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
define <1 x i64> @mxcsr_mmx(<4 x float> %a0) {
; CHECK: MMX_CVTPS2PIrr %{{[0-9]}}, implicit $mxcsr
; CHECK: MMX_CVTPI2PSrr %{{[0-9]}}, killed %{{[0-9]}}, implicit $mxcsr
; CHECK: MMX_CVTTPS2PIrr killed %{{[0-9]}}, implicit $mxcsr
; CHECK: MMX_CVTPI2PDrr killed %{{[0-9]$}}
; CHECK: MMX_CVTPD2PIrr killed %{{[0-9]}}, implicit $mxcsr
%1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
%2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %1)
%3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2)
%4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %3)
%5 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
ret x86_mmx %5
%1 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
%2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %1)
%3 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %2)
%4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %3)
%5 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
ret <1 x i64> %5
}

define half @mxcsr_f16c(float %a) {
@@ -41,11 +41,11 @@ define <8 x double> @mxcsr_fma_sae(<8 x double> %a, <8 x double> %b, <8 x double
ret <8 x double> %res
}

declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>)
declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>)
declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>)
declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>)
declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>)
declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>)
declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>)
declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/nontemporal.ll
@@ -193,11 +193,11 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
; X64-NEXT: movntq %mm0, (%rsi)
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a0
%1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3)
store x86_mmx %1, ptr %a1, align 8, !nontemporal !0
%0 = load <1 x i64>, ptr %a0
%1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3)
store <1 x i64> %1, ptr %a1, align 8, !nontemporal !0
ret void
}
declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone

!0 = !{i32 1}
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/pr13859.ll
@@ -13,8 +13,7 @@ entry:
%a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1
%a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2
%a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3
%a40 = bitcast <4 x i16> %a39 to x86_mmx
%a41 = bitcast x86_mmx %a40 to <1 x i64>
%a40 = bitcast <4 x i16> %a39 to <1 x i64>

%a47 = trunc i32 %a32 to i1
br i1 %a47, label %a48, label %a49
Expand All @@ -23,6 +22,6 @@ a48:
unreachable

a49:
store <1 x i64> %a41, ptr %dest, align 8 ; !!!
store <1 x i64> %a40, ptr %dest, align 8 ; !!!
ret void
}
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/pr23246.ll
@@ -6,15 +6,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; PR23246
; We're really only interested in doing something sane with the shuffle.

define <2 x i64> @test(x86_mmx %a) #0 {
define <2 x i64> @test(<1 x i64> %a) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: retq
entry:
%b = bitcast x86_mmx %a to <1 x i64>
%s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
%s = shufflevector <1 x i64> %a, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
ret <2 x i64> %s
}

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/pr29222.ll
@@ -62,9 +62,9 @@ define i32 @PR29222(i32) nounwind {
; X64-AVX-NEXT: retq
%2 = insertelement <2 x i32> undef, i32 %0, i32 0
%3 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
%4 = bitcast <2 x i32> %3 to x86_mmx
%5 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %4, x86_mmx %4)
%6 = bitcast x86_mmx %5 to i64
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %4, <1 x i64> %4)
%6 = bitcast <1 x i64> %5 to i64
%7 = insertelement <2 x i64> undef, i64 %6, i32 0
%8 = bitcast <2 x i64> %7 to <8 x i16>
%9 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %8, <8 x i16> undef)
@@ -73,5 +73,5 @@ define i32 @PR29222(i32) nounwind {
ret i32 %11
}

declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>)
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/pr35982.ll
@@ -35,9 +35,9 @@ define float @PR35982_emms(<1 x i64>) nounwind {
%2 = bitcast <1 x i64> %0 to <2 x i32>
%3 = extractelement <2 x i32> %2, i32 0
%4 = extractelement <1 x i64> %0, i32 0
%5 = bitcast i64 %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %5, x86_mmx %5)
%7 = bitcast x86_mmx %6 to <2 x i32>
%5 = bitcast i64 %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %5, <1 x i64> %5)
%7 = bitcast <1 x i64> %6 to <2 x i32>
%8 = extractelement <2 x i32> %7, i32 0
tail call void @llvm.x86.mmx.emms()
%9 = sitofp i32 %3 to float
@@ -46,5 +46,5 @@ define float @PR35982_emms(<1 x i64>) nounwind {
ret float %11
}

declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/X86/select-mmx.ll
@@ -51,9 +51,9 @@ define i64 @test47(i64 %arg) {
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
%cond = icmp eq i64 %arg, 0
%slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
%psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
%retc = bitcast x86_mmx %psll to i64
%slct = select i1 %cond, <1 x i64> bitcast (i64 7 to <1 x i64>), <1 x i64> bitcast (i64 0 to <1 x i64>)
%psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct)
%retc = bitcast <1 x i64> %psll to i64
ret i64 %retc
}

@@ -104,13 +104,13 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
%cond = icmp eq i64 %arg, 0
%xmmx = bitcast i64 %x to x86_mmx
%ymmx = bitcast i64 %y to x86_mmx
%slct = select i1 %cond, x86_mmx %xmmx, x86_mmx %ymmx
%psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
%retc = bitcast x86_mmx %psll to i64
%xmmx = bitcast i64 %x to <1 x i64>
%ymmx = bitcast i64 %y to <1 x i64>
%slct = select i1 %cond, <1 x i64> %xmmx, <1 x i64> %ymmx
%psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct)
%retc = bitcast <1 x i64> %psll to i64
ret i64 %retc
}

declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx)
declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)

780 changes: 390 additions & 390 deletions llvm/test/CodeGen/X86/stack-folding-mmx.ll

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions llvm/test/CodeGen/X86/vec_extract-mmx.ll
@@ -20,9 +20,9 @@ define i32 @test0(ptr %v4) nounwind {
entry:
%v5 = load <1 x i64>, ptr %v4, align 8
%v12 = bitcast <1 x i64> %v5 to <4 x i16>
%v13 = bitcast <4 x i16> %v12 to x86_mmx
%v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
%v15 = bitcast x86_mmx %v14 to <4 x i16>
%v13 = bitcast <4 x i16> %v12 to <1 x i64>
%v14 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v13, i8 -18)
%v15 = bitcast <1 x i64> %v14 to <4 x i16>
%v16 = bitcast <4 x i16> %v15 to <1 x i64>
%v17 = extractelement <1 x i64> %v16, i32 0
%v18 = bitcast i64 %v17 to <2 x i32>
@@ -52,12 +52,12 @@ entry:
%0 = load i32, ptr %ptr, align 4
%1 = insertelement <2 x i32> undef, i32 %0, i32 0
%2 = insertelement <2 x i32> %1, i32 0, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
%4 = bitcast x86_mmx %3 to i64
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = bitcast <1 x i64> %3 to i64
%5 = bitcast i64 %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to x86_mmx
%7 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %6, i8 -24)
%8 = bitcast x86_mmx %7 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %6, i8 -24)
%8 = bitcast <1 x i64> %7 to <4 x i16>
%9 = bitcast <4 x i16> %8 to <1 x i64>
%10 = extractelement <1 x i64> %9, i32 0
%11 = bitcast i64 %10 to <2 x i32>
@@ -82,9 +82,9 @@ define i32 @test2(ptr nocapture readonly %ptr) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
%0 = load x86_mmx, ptr %ptr, align 8
%1 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %0, i8 -24)
%2 = bitcast x86_mmx %1 to <4 x i16>
%0 = load <1 x i64>, ptr %ptr, align 8
%1 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %0, i8 -24)
%2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
%5 = bitcast i64 %4 to <2 x i32>
Expand All @@ -93,7 +93,7 @@ entry:
ret i32 %6
}

define i32 @test3(x86_mmx %a) nounwind {
define i32 @test3(<1 x i64> %a) nounwind {
; X86-LABEL: test3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -104,13 +104,13 @@ define i32 @test3(x86_mmx %a) nounwind {
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%tmp0 = bitcast x86_mmx %a to <2 x i32>
%tmp0 = bitcast <1 x i64> %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 0
ret i32 %tmp1
}

; Verify we don't muck with extractelts from the upper lane.
define i32 @test4(x86_mmx %a) nounwind {
define i32 @test4(<1 x i64> %a) nounwind {
; X86-LABEL: test4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -122,10 +122,10 @@ define i32 @test4(x86_mmx %a) nounwind {
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
%tmp0 = bitcast x86_mmx %a to <2 x i32>
%tmp0 = bitcast <1 x i64> %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 1
ret i32 %tmp1
}

declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
declare void @llvm.x86.mmx.emms()
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -26,8 +26,8 @@ define void @t1(i32 %a, ptr %P) nounwind {
%tmp12 = shl i32 %a, 12
%tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
%tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
%tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
store x86_mmx %tmp23, ptr %P
%tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
store <1 x i64> %tmp23, ptr %P
ret void
}

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -5,7 +5,7 @@
; MMX insertelement is not available; these are promoted to xmm.
; (Without SSE they are split to two ints, and the code is much better.)

define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
define <1 x i64> @mmx_movzl(<1 x i64> %x) nounwind {
; X86-LABEL: mmx_movzl:
; X86: ## %bb.0:
; X86-NEXT: movl $32, %eax
@@ -16,9 +16,9 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X64: ## %bb.0:
; X64-NEXT: movl $32, %eax
; X64-NEXT: retq
%tmp = bitcast x86_mmx %x to <2 x i32>
%tmp = bitcast <1 x i64> %x to <2 x i32>
%tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
%tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1
%tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
ret x86_mmx %tmp9
%tmp9 = bitcast <2 x i32> %tmp8 to <1 x i64>
ret <1 x i64> %tmp9
}
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vec_insert-mmx.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64

; This is not an MMX operation; promoted to xmm.
define x86_mmx @t0(i32 %A) nounwind {
define <1 x i64> @t0(i32 %A) nounwind {
; X86-LABEL: t0:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -17,8 +17,8 @@ define x86_mmx @t0(i32 %A) nounwind {
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: retq
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
%tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
ret x86_mmx %tmp4
%tmp4 = bitcast <2 x i32> %tmp3 to <1 x i64>
ret <1 x i64> %tmp4
}

define <8 x i8> @t1(i8 zeroext %x) nounwind {
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -52,9 +52,9 @@ entry:
%tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16>
%tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 >
%tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8>
%tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx
%tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx
tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, ptr null)
%tmp556 = bitcast <8 x i8> %tmp555 to <1 x i64>
%tmp557 = bitcast <8 x i8> zeroinitializer to <1 x i64>
tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp557, <1 x i64> %tmp556, ptr null)
ret void
}

@@ -115,19 +115,19 @@ define <4 x float> @pr35869() nounwind {
; X64-NEXT: punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
; X64-NEXT: cvtpi2ps %mm0, %xmm0
; X64-NEXT: retq
%1 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx bitcast (<8 x i8> <i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> to x86_mmx), x86_mmx bitcast (<8 x i8> zeroinitializer to x86_mmx))
%2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx bitcast (<4 x i16> zeroinitializer to x86_mmx), x86_mmx %1)
%3 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %1, x86_mmx %2)
%4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, x86_mmx %3)
%1 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> bitcast (<8 x i8> <i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> to <1 x i64>), <1 x i64> bitcast (<8 x i8> zeroinitializer to <1 x i64>))
%2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> bitcast (<4 x i16> zeroinitializer to <1 x i64>), <1 x i64> %1)
%3 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %1, <1 x i64> %2)
%4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, <1 x i64> %3)
%5 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%6 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %1, x86_mmx %2)
%7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, x86_mmx %6)
%6 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %1, <1 x i64> %2)
%7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, <1 x i64> %6)
ret <4 x float> %7
}

declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr)
declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx)
declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr)
declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>)
declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>)
declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>)
70 changes: 35 additions & 35 deletions llvm/test/CodeGen/X86/x86-64-psub.ll
@@ -32,11 +32,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
%3 = bitcast <8 x i8> %2 to x86_mmx
%3 = bitcast <8 x i8> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
%5 = bitcast <8 x i8> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %3, x86_mmx %5) nounwind
%7 = bitcast x86_mmx %6 to <8 x i8>
%5 = bitcast <8 x i8> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %3, <1 x i64> %5) nounwind
%7 = bitcast <1 x i64> %6 to <8 x i8>
%8 = bitcast <8 x i8> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -66,11 +66,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
%3 = bitcast <4 x i16> %2 to x86_mmx
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %3, x86_mmx %5) nounwind
%7 = bitcast x86_mmx %6 to <4 x i16>
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %3, <1 x i64> %5) nounwind
%7 = bitcast <1 x i64> %6 to <4 x i16>
%8 = bitcast <4 x i16> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -100,11 +100,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <2 x i32>
%3 = bitcast <2 x i32> %2 to x86_mmx
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <2 x i32>
%5 = bitcast <2 x i32> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %3, x86_mmx %5) nounwind
%7 = bitcast x86_mmx %6 to <2 x i32>
%5 = bitcast <2 x i32> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %3, <1 x i64> %5) nounwind
%7 = bitcast <1 x i64> %6 to <2 x i32>
%8 = bitcast <2 x i32> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -134,11 +134,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
%3 = bitcast <8 x i8> %2 to x86_mmx
%3 = bitcast <8 x i8> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
%5 = bitcast <8 x i8> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %3, x86_mmx %5) nounwind
%7 = bitcast x86_mmx %6 to <8 x i8>
%5 = bitcast <8 x i8> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %3, <1 x i64> %5) nounwind
%7 = bitcast <1 x i64> %6 to <8 x i8>
%8 = bitcast <8 x i8> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -168,11 +168,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
%3 = bitcast <4 x i16> %2 to x86_mmx
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %3, x86_mmx %5) nounwind
%7 = bitcast x86_mmx %6 to <4 x i16>
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %3, <1 x i64> %5) nounwind
%7 = bitcast <1 x i64> %6 to <4 x i16>
%8 = bitcast <4 x i16> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -202,11 +202,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
%3 = bitcast <8 x i8> %2 to x86_mmx
%3 = bitcast <8 x i8> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
%5 = bitcast <8 x i8> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %3, x86_mmx %5) nounwind
%7 = bitcast x86_mmx %6 to <8 x i8>
%5 = bitcast <8 x i8> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %3, <1 x i64> %5) nounwind
%7 = bitcast <1 x i64> %6 to <8 x i8>
%8 = bitcast <8 x i8> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -236,26 +236,26 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
%3 = bitcast <4 x i16> %2 to x86_mmx
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
%5 = bitcast <4 x i16> %4 to x86_mmx
%6 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %3, x86_mmx %5) nounwind
%7 = bitcast x86_mmx %6 to <4 x i16>
%5 = bitcast <4 x i16> %4 to <1 x i64>
%6 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %3, <1 x i64> %5) nounwind
%7 = bitcast <1 x i64> %6 to <4 x i16>
%8 = bitcast <4 x i16> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
}

declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone

declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone

declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone

declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone

declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone

declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone

declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone