From 652842ec9ac1a6730335ad89827eb4133c0253fd Mon Sep 17 00:00:00 2001
From: Simon Dardis
Date: Thu, 2 Nov 2017 12:47:22 +0000
Subject: [PATCH 001/238] [mips] Use register scavenging with MSA.

MSA stores and loads to the stack are more likely to require an emergency
GPR spill slot due to the smaller offsets available with those instructions.

Handle this by overestimating the size of the stack: determine the largest
possible offset by presuming that all callee-saved registers are spilled,
account for the incoming arguments, and use that estimate when deciding
whether an emergency spill slot is required.

Reviewers: atanasyan

Differential Revision: https://reviews.llvm.org/D39056

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317204 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsFrameLowering.cpp     |  35 ++--
 lib/Target/Mips/MipsSEFrameLowering.cpp   |   8 +-
 test/CodeGen/Mips/msa/emergency-spill.mir | 221 ++++++++++++++++++++++
 test/CodeGen/Mips/msa/frameindex.ll       |  49 +++--
 4 files changed, 272 insertions(+), 41 deletions(-)
 create mode 100644 test/CodeGen/Mips/msa/emergency-spill.mir

diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index ef05166503b24..27a85970da6f8 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -107,38 +107,31 @@ bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
   return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
 }
 
+// Estimate the size of the stack, including the incoming arguments. We need to
+// account for register spills, local objects, the reserved call frame and
+// incoming arguments. This gives the largest possible positive offset from
+// $sp, which decides whether an emergency spill slot for stack addresses is
+// required.
 uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
 
-  int64_t Offset = 0;
+  int64_t Size = 0;
 
-  // Iterate over fixed sized objects.
+  // Iterate over fixed sized objects which are incoming arguments.
   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
-    Offset = std::max(Offset, -MFI.getObjectOffset(I));
+    if (MFI.getObjectOffset(I) > 0)
+      Size += MFI.getObjectSize(I);
 
   // Conservatively assume all callee-saved registers will be saved.
   for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
-    unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
-    Offset = alignTo(Offset + Size, Size);
+    unsigned RegSize = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
+    Size = alignTo(Size + RegSize, RegSize);
   }
 
-  unsigned MaxAlign = MFI.getMaxAlignment();
-
-  // Check that MaxAlign is not zero if there is a stack object that is not a
-  // callee-saved spill.
-  assert(!MFI.getObjectIndexEnd() || MaxAlign);
-
-  // Iterate over other objects.
-  for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
-    Offset = alignTo(Offset + MFI.getObjectSize(I), MaxAlign);
-
-  // Call frame.
-  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
-    Offset = alignTo(Offset + MFI.getMaxCallFrameSize(),
-                     std::max(MaxAlign, getStackAlignment()));
-
-  return alignTo(Offset, getStackAlignment());
+  // Get the size of the rest of the frame objects and any possible reserved
+  // call frame, accounting for alignment.
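+  // Editorial aside (not part of the original commit): the estimate matters
+  // because MSA loads and stores encode a signed 10-bit offset scaled by the
+  // element size, so ld.b/st.b can only reach [-512, 511] bytes from $sp and
+  // ld.d/st.d only [-4096, 4088], versus [-32768, 32767] for ordinary GPR
+  // loads and stores. Overestimating here errs on the side of reserving the
+  // emergency spill slot that the register scavenger needs once an offset
+  // goes out of range.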
+  return Size + MFI.estimateStackSize(MF);
 }
 
 // Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 0b19b18449e08..ca19089c9120b 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -893,10 +893,12 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
   }
 
   // Set scavenging frame index if necessary.
-  uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
-                         estimateStackSize(MF);
+  uint64_t MaxSPOffset = estimateStackSize(MF);
 
-  if (isInt<16>(MaxSPOffset))
+  // MSA has a minimum signed offset of 10 bits. If there is a variable-sized
+  // object on the stack, the estimate cannot account for it.
+  if (isIntN(STI.hasMSA() ? 10 : 16, MaxSPOffset) &&
+      !MF.getFrameInfo().hasVarSizedObjects())
     return;
 
   const TargetRegisterClass &RC =
diff --git a/test/CodeGen/Mips/msa/emergency-spill.mir b/test/CodeGen/Mips/msa/emergency-spill.mir
new file mode 100644
index 0000000000000..502b60f673e29
--- /dev/null
+++ b/test/CodeGen/Mips/msa/emergency-spill.mir
@@ -0,0 +1,221 @@
+# RUN: llc %s -start-after=shrink-wrap -march=mips64 -mcpu=mips64r6 -mattr=+fp64,+msa -o /dev/null
+
+# Test that the estimated size of the stack leads to the creation of an
+# emergency spill slot when MSA is in use. Previously, this test case would
+# fail during register scavenging due to the lack of a spill slot.
+--- |
+  define inreg { i64, i64 } @test(i64 inreg %a.coerce0, i64 inreg %a.coerce1, i64 inreg %b.coerce0, i64 inreg %b.coerce1, i32 signext %c) #0 {
+  entry:
+    %retval = alloca <16 x i8>, align 16
+    %a = alloca <16 x i8>, align 16
+    %b = alloca <16 x i8>, align 16
+    %a.addr = alloca <16 x i8>, align 16
+    %b.addr = alloca <16 x i8>, align 16
+    %c.addr = alloca i32, align 4
+    %g = alloca <16 x i8>*, align 8
+    %d = alloca i8*, align 8
+    %0 = bitcast <16 x i8>* %a to { i64, i64 }*
+    %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 0
+    store i64 %a.coerce0, i64* %1, align 16
+    %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 1
+    store i64 %a.coerce1, i64* %2, align 8
+    %a1 = load <16 x i8>, <16 x i8>* %a, align 16
+    %3 = bitcast <16 x i8>* %b to { i64, i64 }*
+    %4 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 0
+    store i64 %b.coerce0, i64* %4, align 16
+    %5 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 1
+    store i64 %b.coerce1, i64* %5, align 8
+    %b2 = load <16 x i8>, <16 x i8>* %b, align 16
+    store <16 x i8> %a1, <16 x i8>* %a.addr, align 16
+    store <16 x i8> %b2, <16 x i8>* %b.addr, align 16
+    store i32 %c, i32* %c.addr, align 4
+    %6 = alloca i8, i64 6400, align 16
+    %7 = bitcast i8* %6 to <16 x i8>*
+    store <16 x i8>* %7, <16 x i8>** %g, align 8
+    %8 = load <16 x i8>*, <16 x i8>** %g, align 8
+    call void @h(<16 x i8>* %b.addr, <16 x i8>* %8)
+    %9 = load <16 x i8>*, <16 x i8>** %g, align 8
+    %10 = bitcast <16 x i8>* %9 to i8*
+    store i8* %10, i8** %d, align 8
+    %11 = load <16 x i8>, <16 x i8>* %a.addr, align 16
+    %12 = load i8*, i8** %d, align 8
+    %arrayidx = getelementptr inbounds i8, i8* %12, i64 0
+    %13 = load i8, i8* %arrayidx, align 1
+    %conv = sext i8 %13 to i32
+    %14 = call <16 x i8> @llvm.mips.fill.b(i32 %conv)
+    %add = add <16 x i8> %11, %14
+    %15 = load i8*, i8** %d, align 8
+    %arrayidx3 = getelementptr inbounds i8, i8* %15, i64 1
+    %16 = load i8, i8* %arrayidx3, align 1
+    %conv4 = sext i8 %16 to i32
+    %17 = call <16 x i8> @llvm.mips.fill.b(i32 %conv4)
+ 
%add5 = add <16 x i8> %add, %17 + %18 = load <16 x i8>, <16 x i8>* %b.addr, align 16 + %add6 = add <16 x i8> %18, %add5 + store <16 x i8> %add6, <16 x i8>* %b.addr, align 16 + %19 = load <16 x i8>, <16 x i8>* %b.addr, align 16 + store <16 x i8> %19, <16 x i8>* %retval, align 16 + %20 = bitcast <16 x i8>* %retval to { i64, i64 }* + %21 = load { i64, i64 }, { i64, i64 }* %20, align 16 + ret { i64, i64 } %21 + } + + declare void @h(<16 x i8>*, <16 x i8>*) + + declare <16 x i8> @llvm.mips.fill.b(i32) + + declare void @llvm.stackprotector(i8*, i8**) + +... +--- +name: test +alignment: 3 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%a0_64', virtual-reg: '' } + - { reg: '%a1_64', virtual-reg: '' } + - { reg: '%a2_64', virtual-reg: '' } + - { reg: '%a3_64', virtual-reg: '' } + - { reg: '%t0_64', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 16 + adjustsStack: false + hasCalls: true + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: + - { id: 0, name: retval, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 1, name: a, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 2, name: b, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 3, name: a.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 4, name: b.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 5, name: c.addr, type: default, offset: 0, size: 4, alignment: 4, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 6, name: g, type: default, offset: 0, size: 8, alignment: 8, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 7, name: d, type: default, offset: 0, size: 8, alignment: 8, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 8, name: '', type: default, offset: 0, size: 6400, + alignment: 16, stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } +constants: +body: | + bb.0.entry: + liveins: %a0_64, %a1_64, %a2_64, %a3_64, %t0_64 + + SD killed %a0_64, %stack.1.a, 0 :: (store 8 into %ir.1, align 16) + SD killed %a1_64, %stack.1.a, 8 :: (store 8 into %ir.2) + %w0 = LD_B %stack.1.a, 0 :: (dereferenceable load 16 from %ir.a) + SD killed %a2_64, %stack.2.b, 0 :: (store 8 into %ir.4, align 16) + SD killed %a3_64, %stack.2.b, 8 :: (store 8 into %ir.5) + %w1 = 
LD_B %stack.2.b, 0 :: (dereferenceable load 16 from %ir.b) + ST_B killed %w0, %stack.3.a.addr, 0 :: (store 16 into %ir.a.addr) + ST_B killed %w1, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr) + SW %t0, %stack.5.c.addr, 0, implicit killed %t0_64 :: (store 4 into %ir.c.addr) + %at_64 = LEA_ADDiu64 %stack.8, 0 + SD killed %at_64, %stack.6.g, 0 :: (store 8 into %ir.g) + %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + %a0_64 = LEA_ADDiu64 %stack.4.b.addr, 0 + JAL @h, csr_n64, implicit-def dead %ra, implicit %a0_64, implicit %a1_64, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + %at_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %v0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %v1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t8_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t9_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %ra_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %w0 = LD_B %stack.3.a.addr, 0 :: (dereferenceable load 16 from %ir.a.addr) + SD %at_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %v0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %v1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t4_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t5_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t6_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t7_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s4_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s5_64, 
%stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t8_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t9_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %ra_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    %at_64 = LD %stack.7.d, 0 :: (dereferenceable load 8 from %ir.d)
+    %v0 = LB %at_64, 0 :: (load 1 from %ir.arrayidx)
+    %w1 = FILL_B killed %v0
+    %w0 = ADDV_B killed %w0, killed %w1
+    %at = LB killed %at_64, 1 :: (load 1 from %ir.arrayidx3)
+    %w1 = FILL_B killed %at
+    %w0 = ADDV_B killed %w0, killed %w1
+    %w1 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+    %w0 = ADDV_B killed %w1, killed %w0
+    ST_B killed %w0, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+    %w0 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+    ST_B killed %w0, %stack.0.retval, 0 :: (store 16 into %ir.retval)
+    %v0_64 = LD %stack.0.retval, 0 :: (dereferenceable load 8 from %ir.20, align 16)
+    %v1_64 = LD %stack.0.retval, 8 :: (dereferenceable load 8 from %ir.20 + 8, align 16)
+    RetRA implicit %v0_64, implicit %v1_64
+
...
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index f903381f9ef03..9c2228d3bf639 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -18,7 +18,8 @@ define void @loadstore_v16i8_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_under_simm10:
 
   %1 = alloca <16 x i8>
-  %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes
+  %2 = alloca [492 x i8] ; Push the frame--accounting for the emergency spill
+                         ; slot--right up to 512 bytes
 
   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp)
@@ -33,7 +34,8 @@ define void @loadstore_v16i8_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_over_simm10:
 
   %1 = alloca <16 x i8>
-  %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
+  %2 = alloca [497 x i8] ; Push the frame--accounting for the emergency spill
+                         ; slot--just over 512 bytes
 
   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
@@ -50,7 +52,8 @@ define void @loadstore_v16i8_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_under_simm16:
 
   %1 = alloca <16 x i8>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes
 
   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -69,7 +72,8 @@ define void @loadstore_v16i8_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v16i8_just_over_simm16:
 
   %1 = alloca <16 x i8>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes
 
   %3 = load volatile <16 x i8>, <16 x i8>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -121,7 +125,8 @@ define void @loadstore_v8i16_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_under_simm10:
 
   %1 = alloca <8 x i16>
-  %2 = alloca [1008 x i8] ; Push the frame right up to 1024 bytes
+  %2 = alloca [1004 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--right up to 1024 bytes
 
   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 1008($sp)
@@ -136,7 +141,8 @@ define void @loadstore_v8i16_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_over_simm10:
 
   %1 = alloca <8 x i16>
-  %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
+  %2 = alloca [1009 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--just over 1024 bytes
 
   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
@@ -153,7 +159,8 @@ define void @loadstore_v8i16_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_under_simm16:
 
   %1 = alloca <8 x i16>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes
 
   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -172,7 +179,8 @@ define void @loadstore_v8i16_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v8i16_just_over_simm16:
 
   %1 = alloca <8 x i16>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes
 
   %3 = load volatile <8 x i16>, <8 x i16>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -224,7 +232,8 @@ define void @loadstore_v4i32_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_under_simm10:
 
   %1 = alloca <4 x i32>
-  %2 = alloca [2032 x i8] ; Push the frame right up to 2048 bytes
+  %2 = alloca [2028 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--right up to 2048 bytes
 
   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 2032($sp)
@@ -239,7 +248,8 @@ define void @loadstore_v4i32_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_over_simm10:
 
   %1 = alloca <4 x i32>
-  %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
+  %2 = alloca [2033 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--just over 2048 bytes
 
   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
@@ -256,7 +266,8 @@ define void @loadstore_v4i32_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_under_simm16:
 
   %1 = alloca <4 x i32>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes
 
   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -275,7 +286,8 @@ define void @loadstore_v4i32_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v4i32_just_over_simm16:
 
   %1 = alloca <4 x i32>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes
 
   %3 = load volatile <4 x i32>, <4 x i32>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -327,8 +339,8 @@ define void @loadstore_v2i64_just_under_simm10() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_under_simm10:
 
   %1 = alloca <2 x i64>
-  %2 = alloca [4080 x i8] ; Push the frame right up to 4096 bytes
-
+  %2 = alloca [4076 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--right up to 4096 bytes
   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 4080($sp)
   store volatile <2 x i64> %3, <2 x i64>* %1
@@ -342,7 +354,8 @@ define void @loadstore_v2i64_just_over_simm10() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_over_simm10:
 
   %1 = alloca <2 x i64>
-  %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
+  %2 = alloca [4081 x i8] ; Push the frame--accounting for the emergency spill
+                          ; slot--just over 4096 bytes
 
   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
@@ -359,7 +372,8 @@ define void @loadstore_v2i64_just_under_simm16() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_under_simm16:
 
   %1 = alloca <2 x i64>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--right up to 32768 bytes
 
   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@@ -378,7 +392,8 @@ define void @loadstore_v2i64_just_over_simm16() nounwind {
   ; MIPS32-AE: loadstore_v2i64_just_over_simm16:
 
   %1 = alloca <2 x i64>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill
+                           ; slot--just over 32768 bytes
 
   %3 = load volatile <2 x i64>, <2 x i64>* %1
   ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768

From a223e9099142f78a1c2463b83d5351cdfa3d2fc1 Mon Sep 17 00:00:00 2001
From: Ayman Musa
Date: Thu, 2 Nov 2017 13:07:06 +0000
Subject: [PATCH 002/238] [X86] Fix bug in legalize vector types - Split large loads

When splitting a large load into smaller, legally-typed loads, the last load
should be padded to match the size of the previous load so that a
CONCAT_VECTORS node can reunite them. The code currently pads the last load
to match the size of the first load instead of the previous one.

Differential Revision: https://reviews.llvm.org/D38495

Change-Id: Ib60b55ed26ce901fabf68108daf52683fbd5013f
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317206 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   2 +-
 test/CodeGen/X86/pr34653.ll                   | 209 ++++++++++++++++++
 test/CodeGen/X86/pr34657.ll                   |  20 ++
 3 files changed, 230 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/X86/pr34653.ll
 create mode 100644 test/CodeGen/X86/pr34657.ll

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5d6c4998ecd5c..b55414b51b8b8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3844,7 +3844,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     }
     LdOps.push_back(L);
-
+    LdOp = L;
 
     LdWidth -= NewVTWidth;
   }

diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
new file mode 100644
index 0000000000000..4b16ffd33d501
--- /dev/null
+++ b/test/CodeGen/X86/pr34653.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx512f -o - | FileCheck %s
+
+declare fastcc <38 x double> @test()
+
+define void @pr34653() {
+; CHECK-LABEL: pr34653:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-512, %rsp # imm = 0xFE00
+; CHECK-NEXT:    subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    callq test
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm2
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT:    vmovaps %xmm3, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; 
CHECK-NEXT: vmovaps %xmm5, %xmm6 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10 +; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11 +; CHECK-NEXT: vmovaps %xmm11, %xmm12 +; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13 +; CHECK-NEXT: vmovaps %xmm13, %xmm14 +; CHECK-NEXT: vmovaps %xmm10, %xmm15 +; CHECK-NEXT: vmovaps %xmm15, %xmm2 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0] +; CHECK-NEXT: # kill: %YMM10 %YMM10 %ZMM10 +; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm10, %xmm0 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM9 %YMM9 %ZMM9 +; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm9, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM8 %YMM8 %ZMM8 +; CHECK-NEXT: vextractf128 $1, 
%ymm8, %xmm8 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm8, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM7 %YMM7 %ZMM7 +; CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm7, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; 
CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %v = call fastcc <38 x double> @test() + %v.0 = extractelement <38 x double> %v, i32 0 + ret void +} + diff --git a/test/CodeGen/X86/pr34657.ll b/test/CodeGen/X86/pr34657.ll new file mode 100644 index 0000000000000..a63bc2a08dde4 --- /dev/null +++ b/test/CodeGen/X86/pr34657.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s + +define <112 x i8> @pr34657() local_unnamed_addr { +; CHECK-LABEL: pr34657 +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vmovups (%rax), %xmm0 +; CHECK-NEXT: vmovups (%rax), %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovups (%rax), %zmm2 +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm2, (%rdi) +; CHECK-NEXT: vextractf32x4 $2, %zmm0, 96(%rdi) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %wide.vec51 = load <112 x i8>, <112 x i8>* undef, align 2 + ret <112 x i8> %wide.vec51 +} From f08c3d1d13d0fdc28dff010a88bd9f960c5ea7a9 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Thu, 2 Nov 2017 
15:02:51 +0000 Subject: [PATCH 003/238] [ExpandMemCmp] Split ExpandMemCmp from CodeGen into its own pass. Summary: This is mostly a noop (most of the test diffs are renamed blocks). There are a few temporary register renames (eax<->ecx) and a few blocks are shuffled around. See the discussion in PR33325 for more details. Reviewers: spatel Subscribers: mgorny Differential Revision: https://reviews.llvm.org/D39456 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317211 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 + include/llvm/LinkAllPasses.h | 1 + include/llvm/Transforms/Scalar.h | 8 +- lib/CodeGen/CodeGenPrepare.cpp | 710 --------------- lib/CodeGen/TargetPassConfig.cpp | 10 +- lib/Transforms/Scalar/CMakeLists.txt | 1 + lib/Transforms/Scalar/ExpandMemCmp.cpp | 828 ++++++++++++++++++ lib/Transforms/Scalar/Scalar.cpp | 1 + test/CodeGen/Generic/llc-start-stop.ll | 6 +- test/CodeGen/X86/memcmp-optsize.ll | 224 +++-- test/CodeGen/X86/memcmp.ll | 240 +++-- .../Transforms/ExpandMemCmp/X86/lit.local.cfg | 3 + .../X86/memcmp.ll | 519 +++++------ 13 files changed, 1352 insertions(+), 1200 deletions(-) create mode 100644 lib/Transforms/Scalar/ExpandMemCmp.cpp create mode 100644 test/Transforms/ExpandMemCmp/X86/lit.local.cfg rename test/Transforms/{CodeGenPrepare => ExpandMemCmp}/X86/memcmp.ll (56%) diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index c3ad8fe41af80..67a077081f77f 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -128,6 +128,7 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); +void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 765e63926daec..ce70f53ccb043 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -180,6 +180,7 @@ namespace { (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); + (void) llvm::createExpandMemCmpPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 8ef65774a93ef..4b365858787e5 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -422,10 +422,16 @@ Pass *createLowerGuardIntrinsicPass(); //===----------------------------------------------------------------------===// // -// MergeICmps - Merge integer comparison chains +// MergeICmps - Merge integer comparison chains into a memcmp // Pass *createMergeICmpsPass(); +//===----------------------------------------------------------------------===// +// +// ExpandMemCmp - Expand memcmp() to load/stores. 
+// +Pass *createExpandMemCmpPass(); + //===----------------------------------------------------------------------===// // // ValuePropagation - Propagate CFG-derived value information diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 51f2a320b299f..973ddebd987cf 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -123,12 +123,6 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); -STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); -STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); -STATISTIC(NumMemCmpGreaterThanMax, - "Number of memcmp calls with size greater than max size"); -STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); - static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -189,11 +183,6 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, cl::desc("Enable merging of redundant sexts when one is dominating" " the other."), cl::init(true)); -static cl::opt MemCmpNumLoadsPerBlock( - "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), - cl::desc("The number of loads per basic block for inline expansion of " - "memcmp that is only being compared against zero.")); - namespace { using SetOfInstrs = SmallPtrSet; @@ -1697,699 +1686,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return true; } -namespace { - -// This class provides helper functions to expand a memcmp library call into an -// inline expansion. -class MemCmpExpansion { - struct ResultBlock { - BasicBlock *BB = nullptr; - PHINode *PhiSrc1 = nullptr; - PHINode *PhiSrc2 = nullptr; - - ResultBlock() = default; - }; - - CallInst *const CI; - ResultBlock ResBlock; - const uint64_t Size; - unsigned MaxLoadSize; - uint64_t NumLoadsNonOneByte; - const uint64_t NumLoadsPerBlock; - std::vector LoadCmpBlocks; - BasicBlock *EndBlock; - PHINode *PhiRes; - const bool IsUsedForZeroCmp; - const DataLayout &DL; - IRBuilder<> Builder; - // Represents the decomposition in blocks of the expansion. For example, - // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and - // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}. - // TODO(courbet): Involve the target more in this computation. On X86, 7 - // bytes can be done more efficiently with two overlaping 4-byte loads than - // covering the interval with [{4, 0},{2, 4},{1, 6}}. - struct LoadEntry { - LoadEntry(unsigned LoadSize, uint64_t Offset) - : LoadSize(LoadSize), Offset(Offset) { - assert(Offset % LoadSize == 0 && "invalid load entry"); - } - - uint64_t getGEPIndex() const { return Offset / LoadSize; } - - // The size of the load for this block, in bytes. - const unsigned LoadSize; - // The offset of this load WRT the base pointer, in bytes. 
- const uint64_t Offset; - }; - SmallVector LoadSequence; - - void createLoadCmpBlocks(); - void createResultBlock(); - void setupResultBlockPHINodes(); - void setupEndBlockPHINodes(); - Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareBlock(unsigned BlockIndex); - void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); - void emitMemCmpResultBlock(); - Value *getMemCmpExpansionZeroCase(); - Value *getMemCmpEqZeroOneBlock(); - Value *getMemCmpOneBlock(); - - public: - MemCmpExpansion(CallInst *CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - unsigned NumLoadsPerBlock, const DataLayout &DL); - - unsigned getNumBlocks(); - uint64_t getNumLoads() const { return LoadSequence.size(); } - - Value *getMemCmpExpansion(); -}; - -} // end anonymous namespace - -// Initialize the basic block structure required for expansion of memcmp call -// with given maximum load size and memcmp size parameter. -// This structure includes: -// 1. A list of load compare blocks - LoadCmpBlocks. -// 2. An EndBlock, split from original instruction point, which is the block to -// return from. -// 3. ResultBlock, block to branch to for early exit when a -// LoadCmpBlock finds a difference. -MemCmpExpansion::MemCmpExpansion( - CallInst *const CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) - : CI(CI), - Size(Size), - MaxLoadSize(0), - NumLoadsNonOneByte(0), - NumLoadsPerBlock(NumLoadsPerBlock), - IsUsedForZeroCmp(IsUsedForZeroCmp), - DL(TheDataLayout), - Builder(CI) { - assert(Size > 0 && "zero blocks"); - // Scale the max size down if the target can load more bytes than we need. - size_t LoadSizeIndex = 0; - while (LoadSizeIndex < Options.LoadSizes.size() && - Options.LoadSizes[LoadSizeIndex] > Size) { - ++LoadSizeIndex; - } - this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; - // Compute the decomposition. - uint64_t CurSize = Size; - uint64_t Offset = 0; - while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { - const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; - assert(LoadSize > 0 && "zero load size"); - const uint64_t NumLoadsForThisSize = CurSize / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - LoadSequence.clear(); - return; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) { - ++NumLoadsNonOneByte; - } - CurSize = CurSize % LoadSize; - } - ++LoadSizeIndex; - } - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); -} - -unsigned MemCmpExpansion::getNumBlocks() { - if (IsUsedForZeroCmp) - return getNumLoads() / NumLoadsPerBlock + - (getNumLoads() % NumLoadsPerBlock != 0 ? 
1 : 0); - return getNumLoads(); -} - -void MemCmpExpansion::createLoadCmpBlocks() { - for (unsigned i = 0; i < getNumBlocks(); i++) { - BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", - EndBlock->getParent(), EndBlock); - LoadCmpBlocks.push_back(BB); - } -} - -void MemCmpExpansion::createResultBlock() { - ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", - EndBlock->getParent(), EndBlock); -} - -// This function creates the IR instructions for loading and comparing 1 byte. -// It loads 1 byte from each source of the memcmp parameters with the given -// GEPIndex. It then subtracts the two loaded values and adds this result to the -// final phi node for selecting the memcmp result. -void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned GEPIndex) { - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using the GEPIndex. - if (GEPIndex != 0) { - Source1 = Builder.CreateGEP(LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, GEPIndex)); - Source2 = Builder.CreateGEP(LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, GEPIndex)); - } - - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]); - - if (BlockIndex < (LoadCmpBlocks.size() - 1)) { - // Early exit branch if difference found to EndBlock. Otherwise, continue to - // next LoadCmpBlock, - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); - BranchInst *CmpBr = - BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp); - Builder.Insert(CmpBr); - } else { - // The last block has an unconditional branch to EndBlock. - BranchInst *CmpBr = BranchInst::Create(EndBlock); - Builder.Insert(CmpBr); - } -} - -/// Generate an equality comparison for one or more pairs of loaded values. -/// This is used in the case where the memcmp() call is compared equal or not -/// equal to zero. -Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, - unsigned &LoadIndex) { - assert(LoadIndex < getNumLoads() && - "getCompareLoadPairs() called with no remaining loads"); - std::vector XorList, OrList; - Value *Diff; - - const unsigned NumLoads = - std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock); - - // For a single-block expansion, start inserting before the memcmp call. - if (LoadCmpBlocks.empty()) - Builder.SetInsertPoint(CI); - else - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - - Value *Cmp = nullptr; - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. The type for the combinations is the largest load - // type. - IntegerType *const MaxLoadType = - NumLoads == 1 ? 
nullptr - : IntegerType::get(CI->getContext(), MaxLoadSize * 8); - for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) { - const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex]; - - IntegerType *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Get a constant or load a value for each source address. - Value *LoadSrc1 = nullptr; - if (auto *Source1C = dyn_cast(Source1)) - LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); - if (!LoadSrc1) - LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - - Value *LoadSrc2 = nullptr; - if (auto *Source2C = dyn_cast(Source2)) - LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); - if (!LoadSrc2) - LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (NumLoads != 1) { - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. - Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); - Diff = Builder.CreateZExt(Diff, MaxLoadType); - XorList.push_back(Diff); - } else { - // If there's only one load per block, we just compare the loaded values. - Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); - } - } - - auto pairWiseOr = [&](std::vector &InList) -> std::vector { - std::vector OutList; - for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { - Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); - OutList.push_back(Or); - } - if (InList.size() % 2 != 0) - OutList.push_back(InList.back()); - return OutList; - }; - - if (!Cmp) { - // Pairwise OR the XOR results. - OrList = pairWiseOr(XorList); - - // Pairwise OR the OR results until one result left. - while (OrList.size() != 1) { - OrList = pairWiseOr(OrList); - } - Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); - } - - return Cmp; -} - -void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex) { - Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex); - - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, - // continue to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). 
- if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function creates the IR intructions for loading and comparing using the -// given LoadSize. It loads the number of bytes specified by LoadSize from each -// source of the memcmp parameters. It then does a subtract to see if there was -// a difference in the loaded values. If a difference is found, it branches -// with an early exit to the ResultBlock for calculating which source was -// larger. Otherwise, it falls through to the either the next LoadCmpBlock or -// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with -// a special case through emitLoadCompareByteBlock. The special handling can -// simply subtract the loaded values and add it to the result phi node. -void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { - // There is one load per block in this case, BlockIndex == LoadIndex. - const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; - - if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, - CurLoadEntry.getGEPIndex()); - return; - } - - Type *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian()) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - - // Add the loaded values to the phi nodes for calculating memcmp result only - // if result is not used in a zero equality. - if (!IsUsedForZeroCmp) { - ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); - ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); - } - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, continue - // to next LoadCmpBlock or EndBlock. 
- BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). - if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function populates the ResultBlock with a sequence to calculate the -// memcmp result. It compares the two loaded source values and returns -1 if -// src1 < src2 and 1 if src1 > src2. -void MemCmpExpansion::emitMemCmpResultBlock() { - // Special case: if memcmp result is used in a zero equality, result does not - // need to be calculated and can simply return 1. - if (IsUsedForZeroCmp) { - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); - PhiRes->addIncoming(Res, ResBlock.BB); - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - return; - } - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, - ResBlock.PhiSrc2); - - Value *Res = - Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), - ConstantInt::get(Builder.getInt32Ty(), 1)); - - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - PhiRes->addIncoming(Res, ResBlock.BB); -} - -void MemCmpExpansion::setupResultBlockPHINodes() { - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - Builder.SetInsertPoint(ResBlock.BB); - // Note: this assumes one load per block. - ResBlock.PhiSrc1 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); - ResBlock.PhiSrc2 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); -} - -void MemCmpExpansion::setupEndBlockPHINodes() { - Builder.SetInsertPoint(&EndBlock->front()); - PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); -} - -Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { - unsigned LoadIndex = 0; - // This loop populates each of the LoadCmpBlocks with the IR sequence to - // handle multiple loads per block. - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlockMultipleLoads(I, LoadIndex); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -/// A memcmp expansion that compares equality with 0 and only has one block of -/// load and compare can bypass the compare, branch, and phi IR that is required -/// in the general case. -Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { - unsigned LoadIndex = 0; - Value *Cmp = getCompareLoadPairs(0, LoadIndex); - assert(LoadIndex == getNumLoads() && "some entries were not consumed"); - return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); -} - -/// A memcmp expansion that only has one block of load and compare can bypass -/// the compare, branch, and phi IR that is required in the general case. -Value *MemCmpExpansion::getMemCmpOneBlock() { - assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); - - Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. 
- if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian() && Size != 1) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (Size < 4) { - // The i8 and i16 cases don't need compares. We zext the loaded values and - // subtract them to get the suitable negative, zero, or positive i32 result. - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); - return Builder.CreateSub(LoadSrc1, LoadSrc2); - } - - // The result of memcmp is negative, zero, or positive, so produce that by - // subtracting 2 extended compare bits: sub (ugt, ult). - // If a target prefers to use selects to get -1/0/1, they should be able - // to transform this later. The inverse transform (going from selects to math) - // may not be possible in the DAG because the selects got converted into - // branches before we got there. - Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); - Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); - Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); - Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); - return Builder.CreateSub(ZextUGT, ZextULT); -} - -// This function expands the memcmp call into an inline expansion and returns -// the memcmp result. -Value *MemCmpExpansion::getMemCmpExpansion() { - // A memcmp with zero-comparison with only one block of load and compare does - // not need to set up any extra blocks. This case could be handled in the DAG, - // but since we have all of the machinery to flexibly expand any memcpy here, - // we choose to handle this case too to avoid fragmented lowering. - if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); - createResultBlock(); - - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); - - // Create the number of required load compare basic blocks. - createLoadCmpBlocks(); - - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); - } - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - if (IsUsedForZeroCmp) - return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() - : getMemCmpExpansionZeroCase(); - - // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). - if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); - - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlock(I); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -// This function checks to see if an expansion of memcmp can be generated. 
-// It checks for constant compare size that is less than the max inline size.
-// If an expansion cannot occur, returns false to leave as a library call.
-// Otherwise, the library call is replaced with a new IR instruction sequence.
-/// We want to transform:
-/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
-/// To:
-/// loadbb:
-/// %0 = bitcast i32* %buffer2 to i8*
-/// %1 = bitcast i32* %buffer1 to i8*
-/// %2 = bitcast i8* %1 to i64*
-/// %3 = bitcast i8* %0 to i64*
-/// %4 = load i64, i64* %2
-/// %5 = load i64, i64* %3
-/// %6 = call i64 @llvm.bswap.i64(i64 %4)
-/// %7 = call i64 @llvm.bswap.i64(i64 %5)
-/// %8 = sub i64 %6, %7
-/// %9 = icmp ne i64 %8, 0
-/// br i1 %9, label %res_block, label %loadbb1
-/// res_block: ; preds = %loadbb2,
-/// %loadbb1, %loadbb
-/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
-/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
-/// %10 = icmp ult i64 %phi.src1, %phi.src2
-/// %11 = select i1 %10, i32 -1, i32 1
-/// br label %endblock
-/// loadbb1: ; preds = %loadbb
-/// %12 = bitcast i32* %buffer2 to i8*
-/// %13 = bitcast i32* %buffer1 to i8*
-/// %14 = bitcast i8* %13 to i32*
-/// %15 = bitcast i8* %12 to i32*
-/// %16 = getelementptr i32, i32* %14, i32 2
-/// %17 = getelementptr i32, i32* %15, i32 2
-/// %18 = load i32, i32* %16
-/// %19 = load i32, i32* %17
-/// %20 = call i32 @llvm.bswap.i32(i32 %18)
-/// %21 = call i32 @llvm.bswap.i32(i32 %19)
-/// %22 = zext i32 %20 to i64
-/// %23 = zext i32 %21 to i64
-/// %24 = sub i64 %22, %23
-/// %25 = icmp ne i64 %24, 0
-/// br i1 %25, label %res_block, label %loadbb2
-/// loadbb2: ; preds = %loadbb1
-/// %26 = bitcast i32* %buffer2 to i8*
-/// %27 = bitcast i32* %buffer1 to i8*
-/// %28 = bitcast i8* %27 to i16*
-/// %29 = bitcast i8* %26 to i16*
-/// %30 = getelementptr i16, i16* %28, i16 6
-/// %31 = getelementptr i16, i16* %29, i16 6
-/// %32 = load i16, i16* %30
-/// %33 = load i16, i16* %31
-/// %34 = call i16 @llvm.bswap.i16(i16 %32)
-/// %35 = call i16 @llvm.bswap.i16(i16 %33)
-/// %36 = zext i16 %34 to i64
-/// %37 = zext i16 %35 to i64
-/// %38 = sub i64 %36, %37
-/// %39 = icmp ne i64 %38, 0
-/// br i1 %39, label %res_block, label %loadbb3
-/// loadbb3: ; preds = %loadbb2
-/// %40 = bitcast i32* %buffer2 to i8*
-/// %41 = bitcast i32* %buffer1 to i8*
-/// %42 = getelementptr i8, i8* %41, i8 14
-/// %43 = getelementptr i8, i8* %40, i8 14
-/// %44 = load i8, i8* %42
-/// %45 = load i8, i8* %43
-/// %46 = zext i8 %44 to i32
-/// %47 = zext i8 %45 to i32
-/// %48 = sub i32 %46, %47
-/// br label %endblock
-/// endblock: ; preds = %res_block,
-/// %loadbb3
-/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
-/// ret i32 %phi.res
-static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL) {
-  NumMemCmpCalls++;
-
-  // Early exit from expansion if -Oz.
-  if (CI->getFunction()->optForMinSize())
-    return false;
-
-  // Early exit from expansion if size is not a constant.
-  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-  if (!SizeCast) {
-    NumMemCmpNotConstant++;
-    return false;
-  }
-  const uint64_t SizeVal = SizeCast->getZExtValue();
-
-  if (SizeVal == 0) {
-    return false;
-  }
-
-  // TTI call to check if target would like to expand memcmp. Also, get the
-  // available load sizes.
-  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
-  if (!Options) return false;
-
-  const unsigned MaxNumLoads =
-      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
-
-  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
-                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
-
-  // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() == 0) {
-    NumMemCmpGreaterThanMax++;
-    return false;
-  }
-
-  NumMemCmpInlined++;
-
-  Value *Res = Expansion.getMemCmpExpansion();
-
-  // Replace call with result of expansion and erase call.
-  CI->replaceAllUsesWith(Res);
-  CI->eraseFromParent();
-
-  return true;
-}
-
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -2542,12 +1838,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
     return true;
   }
 
-  LibFunc Func;
-  if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
-      Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
-    ModifiedDT = true;
-    return true;
-  }
   return false;
 }
 
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index c5101b1ecfc22..59e88ba3bdae4 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -600,8 +600,14 @@ void TargetPassConfig::addIRPasses() {
       addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
-  if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
-    addPass(createMergeICmpsPass());
+  if (getOptLevel() != CodeGenOpt::None) {
+    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
+    // loads and compares. ExpandMemCmpPass then tries to expand those calls
+    // into optimally-sized loads and compares. The transforms are enabled by a
+    // target lowering hook.
+    if (EnableMergeICmps)
+      addPass(createMergeICmpsPass());
+    addPass(createExpandMemCmpPass());
   }
 
   // Run GC lowering passes for builtin collectors
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index d79ae851005d3..164163d213126 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMScalarOpts
   DeadStoreElimination.cpp
   DivRemPairs.cpp
   EarlyCSE.cpp
+  ExpandMemCmp.cpp
   FlattenCFGPass.cpp
   Float2Int.cpp
   GuardWidening.cpp
diff --git a/lib/Transforms/Scalar/ExpandMemCmp.cpp b/lib/Transforms/Scalar/ExpandMemCmp.cpp
new file mode 100644
index 0000000000000..0cd8c11422f7e
--- /dev/null
+++ b/lib/Transforms/Scalar/ExpandMemCmp.cpp
@@ -0,0 +1,828 @@
+//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to expand memcmp() calls into optimally-sized loads and
+// compares for the target, replacing the library call when the compared
+// size is a known constant.
+//
+//===----------------------------------------------------------------------===//
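For intuition, here is a minimal standalone C++ sketch of the greedy decomposition
strategy the new pass uses: a constant memcmp size is covered with the largest
allowed load sizes first. It is illustrative only and not part of the patch; the
names Load, decompose, and Sizes are hypothetical, and the allowed sizes are
assumed to be sorted in decreasing order.

  #include <cstdint>
  #include <vector>

  struct Load { unsigned Size; uint64_t Offset; };

  // Greedily cover `Size` bytes with the allowed load sizes, largest first.
  // E.g. Sizes = {16, 8, 4, 2, 1} and Size = 33 yields {16,0}, {16,16},
  // {1,32}, matching the LoadSequence example in the comments below.
  static std::vector<Load> decompose(uint64_t Size,
                                     const std::vector<unsigned> &Sizes) {
    std::vector<Load> Seq;
    uint64_t Offset = 0;
    for (unsigned LoadSize : Sizes) {
      while (Size >= LoadSize) {
        Seq.push_back({LoadSize, Offset});
        Offset += LoadSize;
        Size -= LoadSize;
      }
    }
    return Seq;
  }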
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "expandmemcmp"
+
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
+namespace {
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB = nullptr;
+    PHINode *PhiSrc1 = nullptr;
+    PHINode *PhiSrc2 = nullptr;
+
+    ResultBlock() = default;
+  };
+
+  CallInst *const CI;
+  ResultBlock ResBlock;
+  const uint64_t Size;
+  unsigned MaxLoadSize;
+  uint64_t NumLoadsNonOneByte;
+  const uint64_t NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  const bool IsUsedForZeroCmp;
+  const DataLayout &DL;
+  IRBuilder<> Builder;
+  // Represents the decomposition in blocks of the expansion. For example,
+  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
+  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
+  // TODO(courbet): Involve the target more in this computation. On X86, 7
+  // bytes can be done more efficiently with two overlapping 4-byte loads than
+  // covering the interval with [{4, 0},{2, 4},{1, 6}].
+  struct LoadEntry {
+    LoadEntry(unsigned LoadSize, uint64_t Offset)
+        : LoadSize(LoadSize), Offset(Offset) {
+      assert(Offset % LoadSize == 0 && "invalid load entry");
+    }
+
+    uint64_t getGEPIndex() const { return Offset / LoadSize; }
+
+    // The size of the load for this block, in bytes.
+    const unsigned LoadSize;
+    // The offset of this load WRT the base pointer, in bytes.
+    const uint64_t Offset;
+  };
+  SmallVector<LoadEntry, 8> LoadSequence;
+
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+  void emitLoadCompareBlock(unsigned BlockIndex);
+  void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                         unsigned &LoadIndex);
+  void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
+  void emitMemCmpResultBlock();
+  Value *getMemCmpExpansionZeroCase();
+  Value *getMemCmpEqZeroOneBlock();
+  Value *getMemCmpOneBlock();
+
+ public:
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+                  unsigned NumLoadsPerBlock, const DataLayout &DL);
+
+  unsigned getNumBlocks();
+  uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+  Value *getMemCmpExpansion();
+};
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(
+    CallInst *const CI, uint64_t Size,
+    const TargetTransformInfo::MemCmpExpansionOptions &Options,
+    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+    : CI(CI),
+      Size(Size),
+      MaxLoadSize(0),
+      NumLoadsNonOneByte(0),
+      NumLoadsPerBlock(NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
+      DL(TheDataLayout),
+      Builder(CI) {
+  assert(Size > 0 && "zero blocks");
+  // Scale the max size down if the target can load more bytes than we need.
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
+  }
+  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
+  // Compute the decomposition.
+  uint64_t CurSize = Size;
+  uint64_t Offset = 0;
+  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
+    assert(LoadSize > 0 && "zero load size");
+    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+      // Do not expand if the total number of loads is larger than what the
+      // target allows. Note that it's important that we exit before completing
+      // the expansion to avoid using a ton of memory to store the expansion for
+      // large sizes.
+      LoadSequence.clear();
+      return;
+    }
+    if (NumLoadsForThisSize > 0) {
+      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+        LoadSequence.push_back({LoadSize, Offset});
+        Offset += LoadSize;
+      }
+      if (LoadSize > 1) {
+        ++NumLoadsNonOneByte;
+      }
+      CurSize = CurSize % LoadSize;
+    }
+    ++LoadSizeIndex;
+  }
+  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+}
+
+unsigned MemCmpExpansion::getNumBlocks() {
+  if (IsUsedForZeroCmp)
+    return getNumLoads() / NumLoadsPerBlock +
+           (getNumLoads() % NumLoadsPerBlock != 0 ?
1 : 0);
+  return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < getNumBlocks(); i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+                                               unsigned GEPIndex) {
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex.
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
+
+  if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock. Otherwise, continue to
+    // next LoadCmpBlock.
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock.
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+                                            unsigned &LoadIndex) {
+  assert(LoadIndex < getNumLoads() &&
+         "getCompareLoadPairs() called with no remaining loads");
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  const unsigned NumLoads =
+      std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+  Value *Cmp = nullptr;
+  // If we have multiple loads per block, we need to generate a composite
+  // comparison using xor+or. The type for the combinations is the largest load
+  // type.
+  IntegerType *const MaxLoadType =
+      NumLoads == 1 ?
nullptr
+                    : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+    const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+
+    IntegerType *LoadSizeType =
+        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*.
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using a GEP.
+    if (CurLoadEntry.Offset != 0) {
+      Source1 = Builder.CreateGEP(
+          LoadSizeType, Source1,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+      Source2 = Builder.CreateGEP(
+          LoadSizeType, Source2,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    }
+
+    // Get a constant or load a value for each source address.
+    Value *LoadSrc1 = nullptr;
+    if (auto *Source1C = dyn_cast<Constant>(Source1))
+      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+    if (!LoadSrc1)
+      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+    Value *LoadSrc2 = nullptr;
+    if (auto *Source2C = dyn_cast<Constant>(Source2))
+      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+    if (!LoadSrc2)
+      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+    if (NumLoads != 1) {
+      if (LoadSizeType != MaxLoadType) {
+        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+      }
+      // If we have multiple loads per block, we need to generate a composite
+      // comparison using xor+or.
+      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+      Diff = Builder.CreateZExt(Diff, MaxLoadType);
+      XorList.push_back(Diff);
+    } else {
+      // If there's only one load per block, we just compare the loaded values.
+      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+    }
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  if (!Cmp) {
+    // Pairwise OR the XOR results.
+    OrList = pairWiseOr(XorList);
+
+    // Pairwise OR the OR results until one result left.
+    while (OrList.size() != 1) {
+      OrList = pairWiseOr(OrList);
+    }
+    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+  }
+
+  return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                                        unsigned &LoadIndex) {
+  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise,
+  // continue to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
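The xor+or reduction above folds the equality of several load pairs into a
single zero test, trading per-pair branches for cheap bitwise operations. A
standalone C++ model for two 8-byte pairs (illustrative only, not part of the
patch; blocksEqual is a hypothetical name):

  #include <cstdint>

  // Model of getCompareLoadPairs() with two 8-byte load pairs: XOR each pair,
  // OR the XOR results, and test the combined value against zero once instead
  // of comparing and branching per pair.
  static bool blocksEqual(uint64_t A0, uint64_t B0, uint64_t A1, uint64_t B1) {
    uint64_t X0 = A0 ^ B0; // nonzero iff the first pair differs
    uint64_t X1 = A1 ^ B1; // nonzero iff the second pair differs
    return (X0 | X1) == 0; // a single compare covers both pairs
  }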
+  // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function creates the IR instructions for loading and comparing using
+// the given LoadSize. It loads the number of bytes specified by LoadSize from
+// each source of the memcmp parameters. It then does a subtract to see if
+// there was a difference in the loaded values. If a difference is found, it
+// branches with an early exit to the ResultBlock for calculating which source
+// was larger. Otherwise, it falls through to either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled
+// with a special case through emitLoadCompareByteBlock. The special handling
+// can simply subtract the loaded values and add the difference to the result
+// phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+  // There is one load per block in this case, BlockIndex == LoadIndex.
+  const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+  if (CurLoadEntry.LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
+                                              CurLoadEntry.getGEPIndex());
+    return;
+  }
+
+  Type *LoadSizeType =
+      IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using a GEP.
+  if (CurLoadEntry.Offset != 0) {
+    Source1 = Builder.CreateGEP(
+        LoadSizeType, Source1,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    Source2 = Builder.CreateGEP(
+        LoadSizeType, Source2,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+  }
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian()) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (LoadSizeType != MaxLoadType) {
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+  }
+
+  // Add the loaded values to the phi nodes for calculating memcmp result only
+  // if result is not used in a zero equality.
+  if (!IsUsedForZeroCmp) {
+    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
+    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise, continue
+  // to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock() {
+  // Special case: if memcmp result is used in a zero equality, result does not
+  // need to be calculated and can simply return 1.
+  if (IsUsedForZeroCmp) {
+    BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+    Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+    Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+    PhiRes->addIncoming(Res, ResBlock.BB);
+    BranchInst *NewBr = BranchInst::Create(EndBlock);
+    Builder.Insert(NewBr);
+    return;
+  }
+  BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+  Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+                                  ResBlock.PhiSrc2);
+
+  Value *Res =
+      Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+                           ConstantInt::get(Builder.getInt32Ty(), 1));
+
+  BranchInst *NewBr = BranchInst::Create(EndBlock);
+  Builder.Insert(NewBr);
+  PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  Builder.SetInsertPoint(ResBlock.BB);
+  // Note: this assumes one load per block.
+  ResBlock.PhiSrc1 =
+      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
+  ResBlock.PhiSrc2 =
+      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+  Builder.SetInsertPoint(&EndBlock->front());
+  PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
+  unsigned LoadIndex = 0;
+  // This loop populates each of the LoadCmpBlocks with the IR sequence to
+  // handle multiple loads per block.
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlockMultipleLoads(I, LoadIndex);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
+  unsigned LoadIndex = 0;
+  Value *Cmp = getCompareLoadPairs(0, LoadIndex);
+  assert(LoadIndex == getNumLoads() && "some entries were not consumed");
+  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
+/// A memcmp expansion that only has one block of load and compare can bypass
+/// the compare, branch, and phi IR that is required in the general case.
+Value *MemCmpExpansion::getMemCmpOneBlock() {
+  assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
+
+  Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
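getMemCmpOneBlock() continues below with a byte swap of each load on
little-endian targets, followed by the branchless sub (ugt, ult) idiom. A
standalone C++ model of both steps (illustrative only, not part of the patch;
it assumes a little-endian host and the Clang/GCC __builtin_bswap64 builtin,
and oneBlockMemcmp8 is a hypothetical name):

  #include <cstdint>
  #include <cstring>

  // Model of the one-block expansion for memcmp(a, b, 8): wide loads, byte
  // swaps so that unsigned integer order matches memcmp's byte-wise order,
  // then (a > b) - (a < b), which is exactly the sub (ugt, ult) pattern and
  // yields 1, -1, or 0 without a branch.
  static int oneBlockMemcmp8(const void *A, const void *B) {
    uint64_t VA, VB;
    std::memcpy(&VA, A, 8);       // IR: load i64
    std::memcpy(&VB, B, 8);       // IR: load i64
    VA = __builtin_bswap64(VA);   // IR: call @llvm.bswap.i64
    VB = __builtin_bswap64(VB);
    return (VA > VB) - (VA < VB); // IR: sub (ugt, ult)
  }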
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian() && Size != 1) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (Size < 4) {
+    // The i8 and i16 cases don't need compares. We zext the loaded values and
+    // subtract them to get the suitable negative, zero, or positive i32 result.
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
+    return Builder.CreateSub(LoadSrc1, LoadSrc2);
+  }
+
+  // The result of memcmp is negative, zero, or positive, so produce that by
+  // subtracting 2 extended compare bits: sub (ugt, ult).
+  // If a target prefers to use selects to get -1/0/1, it should be able
+  // to transform this later. The inverse transform (going from selects to math)
+  // may not be possible in the DAG because the selects got converted into
+  // branches before we got there.
+  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+  return Builder.CreateSub(ZextUGT, ZextULT);
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion() {
+  // A memcmp with a zero-equality comparison and only one block of load and
+  // compare does not need to set up any extra blocks. This case could be
+  // handled in the DAG, but since we have all of the machinery to flexibly
+  // expand any memcmp here, we choose to handle this case too to avoid
+  // fragmented lowering.
+  if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
+    BasicBlock *StartBlock = CI->getParent();
+    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+    setupEndBlockPHINodes();
+    createResultBlock();
+
+    // If return value of memcmp is not used in a zero equality, we need to
+    // calculate which source was larger. The calculation requires the
+    // two loaded source values of each load compare block.
+    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+    if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+
+    // Create the number of required load compare basic blocks.
+    createLoadCmpBlocks();
+
+    // Update the terminator added by splitBasicBlock to branch to the first
+    // LoadCmpBlock.
+    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+  }
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  if (IsUsedForZeroCmp)
+    return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
+                               : getMemCmpExpansionZeroCase();
+
+  // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
+  if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
+
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlock(I);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced with a new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = call i64 @llvm.bswap.i64(i64 %4)
+/// %7 = call i64 @llvm.bswap.i64(i64 %5)
+/// %8 = sub i64 %6, %7
+/// %9 = icmp ne i64 %8, 0
+/// br i1 %9, label %res_block, label %loadbb1
+/// res_block: ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+/// %10 = icmp ult i64 %phi.src1, %phi.src2
+/// %11 = select i1 %10, i32 -1, i32 1
+/// br label %endblock
+/// loadbb1: ; preds = %loadbb
+/// %12 = bitcast i32* %buffer2 to i8*
+/// %13 = bitcast i32* %buffer1 to i8*
+/// %14 = bitcast i8* %13 to i32*
+/// %15 = bitcast i8* %12 to i32*
+/// %16 = getelementptr i32, i32* %14, i32 2
+/// %17 = getelementptr i32, i32* %15, i32 2
+/// %18 = load i32, i32* %16
+/// %19 = load i32, i32* %17
+/// %20 = call i32 @llvm.bswap.i32(i32 %18)
+/// %21 = call i32 @llvm.bswap.i32(i32 %19)
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = sub i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %res_block, label %loadbb2
+/// loadbb2: ; preds = %loadbb1
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = bitcast i8* %27 to i16*
+/// %29 = bitcast i8* %26 to i16*
+/// %30 = getelementptr i16, i16* %28, i16 6
+/// %31 = getelementptr i16, i16* %29, i16 6
+/// %32 = load i16, i16* %30
+/// %33 = load i16, i16* %31
+/// %34 = call i16 @llvm.bswap.i16(i16 %32)
+/// %35 = call i16 @llvm.bswap.i16(i16 %33)
+/// %36 = zext i16 %34 to i64
+/// %37 = zext i16 %35 to i64
+/// %38 = sub i64 %36, %37
+/// %39 = icmp ne i64 %38, 0
+/// br i1 %39, label %res_block, label %loadbb3
+/// loadbb3: ; preds = %loadbb2
+/// %40 = bitcast i32* %buffer2 to i8*
+/// %41 = bitcast i32* %buffer1 to i8*
+/// %42 = getelementptr i8, i8* %41, i8 14
+/// %43 = getelementptr i8, i8* %40, i8 14
+/// %44 = load i8, i8* %42
+/// %45 = load i8, i8* %43
+/// %46 = zext i8 %44 to i32
+/// %47 = zext i8 %45 to i32
+/// %48 = sub i32 %46, %47
+/// br label %endblock
+/// endblock: ; preds = %res_block,
+/// %loadbb3
+/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+/// ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+
+  // Early exit from expansion if -Oz.
+  if (CI->getFunction()->optForMinSize())
+    return false;
+
+  // Early exit from expansion if size is not a constant.
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+  const uint64_t SizeVal = SizeCast->getZExtValue();
+
+  if (SizeVal == 0) {
+    return false;
+  }
+
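The IR shown above corresponds to the following standalone scalar model of the
chained load/compare blocks (illustrative only, not part of the patch;
LoadStep, loadBE, and chainedMemcmp are hypothetical names, and each step is
assumed to load at most 8 bytes):

  #include <cstdint>
  #include <vector>

  struct LoadStep { unsigned Size; uint64_t Offset; };

  // Pack N bytes most-significant-first, so that unsigned comparison of the
  // packed values matches memcmp's byte-wise order (what the bswap calls
  // achieve in the IR above).
  static uint64_t loadBE(const uint8_t *P, unsigned N) {
    uint64_t V = 0;
    for (unsigned I = 0; I < N; ++I)
      V = (V << 8) | P[I];
    return V;
  }

  // Scalar model of the control flow in the IR example: each "loadbb"
  // compares one load pair and branches to the shared res_block on the first
  // difference; reaching the end corresponds to the endblock phi of 0.
  static int chainedMemcmp(const uint8_t *A, const uint8_t *B,
                           const std::vector<LoadStep> &Seq) {
    for (const LoadStep &S : Seq) {   // one iteration per loadbb
      uint64_t VA = loadBE(A + S.Offset, S.Size);
      uint64_t VB = loadBE(B + S.Offset, S.Size);
      if (VA != VB)                   // early exit to res_block
        return VA < VB ? -1 : 1;
    }
    return 0;
  }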
+  // TTI call to check if target would like to expand memcmp. Also, get the
+  // available load sizes.
+  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
+  if (!Options) return false;
+
+  const unsigned MaxNumLoads =
+      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
+
+  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
+                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
+
+  // Don't expand if this will require more loads than desired by the target.
+  if (Expansion.getNumLoads() == 0) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  Value *Res = Expansion.getMemCmpExpansion();
+
+  // Replace call with result of expansion and erase call.
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+
+  return true;
+}
+
+class ExpandMemCmpPass : public FunctionPass {
+public:
+  static char ID;
+
+  ExpandMemCmpPass() : FunctionPass(ID) {
+    initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F)) return false;
+
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC) {
+      return false;
+    }
+    const TargetLowering* TL =
+        TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
+
+    const TargetLibraryInfo *TLI =
+        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto PA = runImpl(F, TLI, TTI, TL);
+    return !PA.areAllPreserved();
+  }
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+                            const TargetTransformInfo *TTI,
+                            const TargetLowering* TL);
+  // Returns true if a change was made.
+  bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
+                  const TargetTransformInfo *TTI, const TargetLowering* TL,
+                  const DataLayout& DL);
+};
+
+bool ExpandMemCmpPass::runOnBlock(
+    BasicBlock &BB, const TargetLibraryInfo *TLI,
+    const TargetTransformInfo *TTI, const TargetLowering* TL,
+    const DataLayout& DL) {
+  for (Instruction& I : BB) {
+    CallInst *CI = dyn_cast<CallInst>(&I);
+    if (!CI) {
+      continue;
+    }
+    LibFunc Func;
+    if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
+        Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+PreservedAnalyses ExpandMemCmpPass::runImpl(
+    Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+    const TargetLowering* TL) {
+  const DataLayout& DL = F.getParent()->getDataLayout();
+  bool MadeChanges = false;
+  for (auto BBIt = F.begin(); BBIt != F.end();) {
+    if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
+      MadeChanges = true;
+      // If changes were made, restart the function from the beginning, since
+      // the structure of the function was changed.
+      BBIt = F.begin();
+    } else {
+      ++BBIt;
+    }
+  }
+  return MadeChanges ?
PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +} // namespace + +char ExpandMemCmpPass::ID = 0; +INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp", + "Expand memcmp() to load/stores", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", + "Expand memcmp() to load/stores", false, false) + +Pass *llvm::createExpandMemCmpPass() { + return new ExpandMemCmpPass(); +} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index c1034ace20685..4b694cecea6f5 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,6 +48,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNewGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); + initializeExpandMemCmpPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll index 85b69c37aa01e..9056e2cab49db 100644 --- a/test/CodeGen/Generic/llc-start-stop.ll +++ b/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -machine-branch-prob -gc-lowering +; START-AFTER: -machine-branch-prob -expandmemcmp ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Lower Garbage Collection Instructions +; START-AFTER-NEXT: Expand memcmp() to load/stores ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Lower Garbage Collection Instructions +; START-BEFORE-NEXT: Expand memcmp() to load/stores ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 77d9fa69182b8..3f5eeba7055cd 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB5_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB5_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: 
testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB5_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: .LBB5_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB10_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB10_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB10_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: .LBB10_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx @@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: .LBB11_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al @@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 
-; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: incl %eax ; X86-NEXT: .LBB13_3: # %endblock @@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB14_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: .LBB14_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx 
-; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB20_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB20_3 -; X64-SSE2-NEXT: .LBB20_1: # %res_block +; X64-SSE2-NEXT: .LBB20_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB20_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB20_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_1: # %res_block +; X64-AVX2-NEXT: .LBB20_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB20_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB21_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_1: # %res_block +; X64-SSE2-NEXT: .LBB21_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB21_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB21_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; 
X64-AVX2-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX2-NEXT: je .LBB21_3 -; X64-AVX2-NEXT: .LBB21_1: # %res_block +; X64-AVX2-NEXT: .LBB21_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB21_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB23_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB23_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB23_3 -; X86-SSE2-NEXT: .LBB23_1: # %res_block +; X86-SSE2-NEXT: .LBB23_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB23_3: # %endblock @@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB24_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB24_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB24_3 -; X86-SSE2-NEXT: .LBB24_1: # %res_block +; X86-SSE2-NEXT: .LBB24_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB24_3: # %endblock @@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: 
# BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB24_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB24_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB24_3 -; X64-SSE2-NEXT: .LBB24_1: # %res_block +; X64-SSE2-NEXT: .LBB24_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB24_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB26_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB26_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB26_3 -; X64-AVX2-NEXT: .LBB26_1: # %res_block +; X64-AVX2-NEXT: .LBB26_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB26_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB27_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB27_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB27_3 -; X64-AVX2-NEXT: .LBB27_1: # %res_block +; X64-AVX2-NEXT: .LBB27_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB27_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 393e4c42d8b94..84fd45b0a08cb 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB7_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB7_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB7_3 -; X86-NEXT: .LBB7_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB7_2: # 
%res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB7_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB7_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB7_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB7_3 -; X64-NEXT: .LBB7_1: # %res_block +; X64-NEXT: .LBB7_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB7_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB12_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB12_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB12_3 -; X64-NEXT: .LBB12_1: # %res_block +; X64-NEXT: .LBB12_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB12_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB14_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB14_3 -; X86-NEXT: .LBB14_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB14_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB14_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB15_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB15_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB15_3 -; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: .LBB15_2: # %res_block ; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB15_3: # %endblock ; X86-NEXT: testl %eax, %eax @@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB17_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB17_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB18_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx 
; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB18_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB22_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB22_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB22_3 -; X64-SSE2-NEXT: .LBB22_1: # %res_block +; X64-SSE2-NEXT: .LBB22_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB22_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB22_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB22_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: movq 16(%rdi), %rcx ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX-NEXT: je .LBB22_3 -; X64-AVX-NEXT: .LBB22_1: # %res_block +; X64-AVX-NEXT: .LBB22_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB22_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB23_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB23_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; 
X64-AVX-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX-NEXT: je .LBB23_3 -; X64-AVX-NEXT: .LBB23_1: # %res_block +; X64-AVX-NEXT: .LBB23_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB23_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB25_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB25_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB25_3 -; X86-SSE2-NEXT: .LBB25_1: # %res_block +; X86-SSE2-NEXT: .LBB25_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB25_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB25_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB25_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB25_3 -; X64-SSE2-NEXT: .LBB25_1: # %res_block +; X64-SSE2-NEXT: .LBB25_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB25_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB25_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB25_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB25_3 -; X64-AVX1-NEXT: .LBB25_1: # %res_block +; X64-AVX1-NEXT: .LBB25_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB25_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB26_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB26_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB26_3 -; X86-SSE2-NEXT: .LBB26_1: # %res_block +; X86-SSE2-NEXT: .LBB26_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB26_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB26_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB26_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB26_3 -; X64-SSE2-NEXT: .LBB26_1: # %res_block +; X64-SSE2-NEXT: .LBB26_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB26_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB26_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB26_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB26_3 -; X64-AVX1-NEXT: .LBB26_1: # %res_block +; X64-AVX1-NEXT: .LBB26_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB26_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB28_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB28_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB28_3 -; X64-AVX2-NEXT: .LBB28_1: # %res_block +; X64-AVX2-NEXT: .LBB28_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB28_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; 
X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB29_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB29_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB29_3 -; X64-AVX2-NEXT: .LBB29_1: # %res_block +; X64-AVX2-NEXT: .LBB29_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB29_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg new file mode 100644 index 0000000000000..e71f3cc4c41e7 --- /dev/null +++ b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll similarity index 56% rename from test/Transforms/CodeGenPrepare/X86/memcmp.ll rename to test/Transforms/ExpandMemCmp/X86/memcmp.ll index a4f635c956df9..1abfb20f36961 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 +; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 +; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) @@ -23,30 +23,33 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp3( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16* +; 
ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2 ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; ALL-NEXT: br label [[ENDBLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) @@ -74,30 +77,33 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4 ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; 
ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; ALL-NEXT: br label [[ENDBLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) @@ -106,36 +112,37 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* ; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 -; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 ; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) ; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 -; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] -; ALL-NEXT: br i1 [[TMP19]], 
label [[ENDBLOCK]], label [[RES_BLOCK]] +; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 +; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] +; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) @@ -153,34 +160,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32-NEXT: br label [[LOADBB:%.*]] ; X32: res_block: -; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* ; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 -; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1 ; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] -; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X32: endblock: -; X32-NEXT: 
[[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( @@ -207,30 +215,33 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8 ; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] ; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; X64-NEXT: br label [[ENDBLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) @@ -243,36 +254,37 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp10( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br 
label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* ; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 -; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4 ; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) ; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) @@ -294,36 +306,37 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] 
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* ; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 -; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 ; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) ; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) @@ -363,34 +376,35 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp16( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 
[[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* ; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 -; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 ; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) -; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] -; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) @@ -417,22 +431,23 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP6:%.*]] = 
getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 ; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -465,22 +480,23 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 ; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -495,24 +511,25 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] 
to i16* -; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* ; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 ; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -540,24 +557,25 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32-NEXT: br label [[LOADBB:%.*]] ; X32: res_block: ; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* ; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 -; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 ; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X32: endblock: ; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -589,22 +607,23 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: 
res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 ; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -625,24 +644,25 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq10( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* ; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 ; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -676,24 +696,25 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = 
load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* ; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 ; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 From c0222867301e7d88ec925dea7d306468ff3ea172 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Thu, 2 Nov 2017 15:53:10 +0000 Subject: [PATCH 004/238] Revert "[ExpandMemCmp] Split ExpandMemCmp from CodeGen into its own pass." undefined reference to `llvm::TargetPassConfig::ID' on clang-ppc64le-linux-multistage This reverts commit eea333c33fa73ad225ef28607795984829f65688. 
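For reference, the layering issue is the usual one: getAnalysisIfAvailable<T>()
takes the address of T::ID, so a pass living under lib/Transforms/Scalar that
queries TargetPassConfig picks up a link-time dependency on a symbol defined in
lib/CodeGen. A minimal sketch of the pattern involved (pass name hypothetical,
not code from this patch):

    #include "llvm/CodeGen/TargetPassConfig.h"
    #include "llvm/Pass.h"
    using namespace llvm;

    struct SketchPass : FunctionPass {
      static char ID;
      SketchPass() : FunctionPass(ID) {}
      bool runOnFunction(Function &F) override {
        // Instantiating this template references &TargetPassConfig::ID,
        // which is defined in libLLVMCodeGen, not libLLVMScalarOpts.
        auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
        if (!TPC)
          return false;
        return false; // sketch only; no transformation performed
      }
    };
    char SketchPass::ID = 0;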
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317213 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 1 - include/llvm/LinkAllPasses.h | 1 - include/llvm/Transforms/Scalar.h | 8 +- lib/CodeGen/CodeGenPrepare.cpp | 710 +++++++++++++++ lib/CodeGen/TargetPassConfig.cpp | 10 +- lib/Transforms/Scalar/CMakeLists.txt | 1 - lib/Transforms/Scalar/ExpandMemCmp.cpp | 828 ------------------ lib/Transforms/Scalar/Scalar.cpp | 1 - test/CodeGen/Generic/llc-start-stop.ll | 6 +- test/CodeGen/X86/memcmp-optsize.ll | 224 ++--- test/CodeGen/X86/memcmp.ll | 240 ++--- .../X86/memcmp.ll | 519 ++++++----- .../Transforms/ExpandMemCmp/X86/lit.local.cfg | 3 - 13 files changed, 1200 insertions(+), 1352 deletions(-) delete mode 100644 lib/Transforms/Scalar/ExpandMemCmp.cpp rename test/Transforms/{ExpandMemCmp => CodeGenPrepare}/X86/memcmp.ll (56%) delete mode 100644 test/Transforms/ExpandMemCmp/X86/lit.local.cfg diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 67a077081f77f..c3ad8fe41af80 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -128,7 +128,6 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); -void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index ce70f53ccb043..765e63926daec 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -180,7 +180,6 @@ namespace { (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); - (void) llvm::createExpandMemCmpPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 4b365858787e5..8ef65774a93ef 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -422,16 +422,10 @@ Pass *createLowerGuardIntrinsicPass(); //===----------------------------------------------------------------------===// // -// MergeICmps - Merge integer comparison chains into a memcmp +// MergeICmps - Merge integer comparison chains // Pass *createMergeICmpsPass(); -//===----------------------------------------------------------------------===// -// -// ExpandMemCmp - Expand memcmp() to load/stores. 
-//
-Pass *createExpandMemCmpPass();
-
 //===----------------------------------------------------------------------===//
 //
 // ValuePropagation - Propagate CFG-derived value information
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 973ddebd987cf..51f2a320b299f 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -123,6 +123,12 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
     "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
     cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -183,6 +189,11 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
     cl::desc("Enable merging of redundant sexts when one is dominating"
              " the other."), cl::init(true));
 
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
 namespace {
 
 using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
@@ -1686,6 +1697,699 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
   return true;
 }
 
+namespace {
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB = nullptr;
+    PHINode *PhiSrc1 = nullptr;
+    PHINode *PhiSrc2 = nullptr;
+
+    ResultBlock() = default;
+  };
+
+  CallInst *const CI;
+  ResultBlock ResBlock;
+  const uint64_t Size;
+  unsigned MaxLoadSize;
+  uint64_t NumLoadsNonOneByte;
+  const uint64_t NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  const bool IsUsedForZeroCmp;
+  const DataLayout &DL;
+  IRBuilder<> Builder;
+  // Represents the decomposition in blocks of the expansion. For example,
+  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
+  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
+  // TODO(courbet): Involve the target more in this computation. On X86, 7
+  // bytes can be done more efficiently with two overlapping 4-byte loads than
+  // covering the interval with [{4, 0},{2, 4},{1, 6}].
+  struct LoadEntry {
+    LoadEntry(unsigned LoadSize, uint64_t Offset)
+        : LoadSize(LoadSize), Offset(Offset) {
+      assert(Offset % LoadSize == 0 && "invalid load entry");
+    }
+
+    uint64_t getGEPIndex() const { return Offset / LoadSize; }
+
+    // The size of the load for this block, in bytes.
+    const unsigned LoadSize;
+    // The offset of this load WRT the base pointer, in bytes.
+    const uint64_t Offset;
+  };
+  SmallVector<LoadEntry, 8> LoadSequence;
+
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+  void emitLoadCompareBlock(unsigned BlockIndex);
+  void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                         unsigned &LoadIndex);
+  void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
+  void emitMemCmpResultBlock();
+  Value *getMemCmpExpansionZeroCase();
+  Value *getMemCmpEqZeroOneBlock();
+  Value *getMemCmpOneBlock();
+
+ public:
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+                  unsigned NumLoadsPerBlock, const DataLayout &DL);
+
+  unsigned getNumBlocks();
+  uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+  Value *getMemCmpExpansion();
+};
+
+} // end anonymous namespace
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(
+    CallInst *const CI, uint64_t Size,
+    const TargetTransformInfo::MemCmpExpansionOptions &Options,
+    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+    : CI(CI),
+      Size(Size),
+      MaxLoadSize(0),
+      NumLoadsNonOneByte(0),
+      NumLoadsPerBlock(NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
+      DL(TheDataLayout),
+      Builder(CI) {
+  assert(Size > 0 && "zero blocks");
+  // Scale the max size down if the target can load more bytes than we need.
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
+  }
+  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
+  // Compute the decomposition.
+  uint64_t CurSize = Size;
+  uint64_t Offset = 0;
+  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
+    assert(LoadSize > 0 && "zero load size");
+    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+      // Do not expand if the total number of loads is larger than what the
+      // target allows. Note that it's important that we exit before completing
+      // the expansion to avoid using a ton of memory to store the expansion for
+      // large sizes.
+      LoadSequence.clear();
+      return;
+    }
+    if (NumLoadsForThisSize > 0) {
+      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+        LoadSequence.push_back({LoadSize, Offset});
+        Offset += LoadSize;
+      }
+      if (LoadSize > 1) {
+        ++NumLoadsNonOneByte;
+      }
+      CurSize = CurSize % LoadSize;
+    }
+    ++LoadSizeIndex;
+  }
+  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+}
+
+unsigned MemCmpExpansion::getNumBlocks() {
+  if (IsUsedForZeroCmp)
+    return getNumLoads() / NumLoadsPerBlock +
+           (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
+  return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < getNumBlocks(); i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+                                               unsigned GEPIndex) {
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex.
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
+
+  if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock. Otherwise, continue to
+    // next LoadCmpBlock.
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock.
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+                                            unsigned &LoadIndex) {
+  assert(LoadIndex < getNumLoads() &&
+         "getCompareLoadPairs() called with no remaining loads");
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  const unsigned NumLoads =
+      std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+  Value *Cmp = nullptr;
+  // If we have multiple loads per block, we need to generate a composite
+  // comparison using xor+or. The type for the combinations is the largest load
+  // type.
+  IntegerType *const MaxLoadType =
+      NumLoads == 1 ? nullptr
+                    : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+    const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+
+    IntegerType *LoadSizeType =
+        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*.
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using a GEP.
+    if (CurLoadEntry.Offset != 0) {
+      Source1 = Builder.CreateGEP(
+          LoadSizeType, Source1,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+      Source2 = Builder.CreateGEP(
+          LoadSizeType, Source2,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    }
+
+    // Get a constant or load a value for each source address.
+    Value *LoadSrc1 = nullptr;
+    if (auto *Source1C = dyn_cast<Constant>(Source1))
+      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+    if (!LoadSrc1)
+      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+    Value *LoadSrc2 = nullptr;
+    if (auto *Source2C = dyn_cast<Constant>(Source2))
+      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+    if (!LoadSrc2)
+      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+    if (NumLoads != 1) {
+      if (LoadSizeType != MaxLoadType) {
+        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+      }
+      // If we have multiple loads per block, we need to generate a composite
+      // comparison using xor+or.
+      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+      Diff = Builder.CreateZExt(Diff, MaxLoadType);
+      XorList.push_back(Diff);
+    } else {
+      // If there's only one load per block, we just compare the loaded values.
+      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+    }
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  if (!Cmp) {
+    // Pairwise OR the XOR results.
+    OrList = pairWiseOr(XorList);
+
+    // Pairwise OR the OR results until one result left.
+    while (OrList.size() != 1) {
+      OrList = pairWiseOr(OrList);
+    }
+    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+  }
+
+  return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                                        unsigned &LoadIndex) {
+  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise,
+  // continue to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function creates the IR instructions for loading and comparing using the
+// given LoadSize. It loads the number of bytes specified by LoadSize from each
+// source of the memcmp parameters. It then does a subtract to see if there was
+// a difference in the loaded values. If a difference is found, it branches
+// with an early exit to the ResultBlock for calculating which source was
+// larger. Otherwise, it falls through to either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
+// a special case through emitLoadCompareByteBlock. The special handling can
+// simply subtract the loaded values and add it to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+  // There is one load per block in this case, BlockIndex == LoadIndex.
+  const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+  if (CurLoadEntry.LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
+                                              CurLoadEntry.getGEPIndex());
+    return;
+  }
+
+  Type *LoadSizeType =
+      IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using a GEP.
+  if (CurLoadEntry.Offset != 0) {
+    Source1 = Builder.CreateGEP(
+        LoadSizeType, Source1,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    Source2 = Builder.CreateGEP(
+        LoadSizeType, Source2,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+  }
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian()) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (LoadSizeType != MaxLoadType) {
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+  }
+
+  // Add the loaded values to the phi nodes for calculating memcmp result only
+  // if result is not used in a zero equality.
+  if (!IsUsedForZeroCmp) {
+    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
+    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise, continue
+  // to next LoadCmpBlock or EndBlock.
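+  // (Note on the bswaps above: memcmp must order lexicographically, i.e. by
+  // the first differing byte. For example, with a = {0x01, 0x02} and
+  // b = {0x02, 0x01}, memcmp(a, b, 2) < 0; raw little-endian i16 loads give
+  // 0x0201 > 0x0102, inverting the result, while the byte-swapped values
+  // 0x0102 < 0x0201 order correctly in the unsigned compare emitted by
+  // emitMemCmpResultBlock().)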
+ BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes). + if (BlockIndex == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock() { + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. + if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + // Note: this assumes one load per block. + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { + unsigned LoadIndex = 0; + // This loop populates each of the LoadCmpBlocks with the IR sequence to + // handle multiple loads per block. + for (unsigned I = 0; I < getNumBlocks(); ++I) { + emitLoadCompareBlockMultipleLoads(I, LoadIndex); + } + + emitMemCmpResultBlock(); + return PhiRes; +} + +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. +Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { + unsigned LoadIndex = 0; + Value *Cmp = getCompareLoadPairs(0, LoadIndex); + assert(LoadIndex == getNumLoads() && "some entries were not consumed"); + return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + +/// A memcmp expansion that only has one block of load and compare can bypass +/// the compare, branch, and phi IR that is required in the general case. +Value *MemCmpExpansion::getMemCmpOneBlock() { + assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType*. 
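+  // (For example, a memcmp(a, b, 4) call taking this path is lowered to a
+  // single pair of i32 loads, since Size * 8 == 32; no extra basic blocks
+  // are created.)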
+ if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Load LoadSizeType from the base address. + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian() && Size != 1) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (Size < 4) { + // The i8 and i16 cases don't need compares. We zext the loaded values and + // subtract them to get the suitable negative, zero, or positive i32 result. + LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); + return Builder.CreateSub(LoadSrc1, LoadSrc2); + } + + // The result of memcmp is negative, zero, or positive, so produce that by + // subtracting 2 extended compare bits: sub (ugt, ult). + // If a target prefers to use selects to get -1/0/1, they should be able + // to transform this later. The inverse transform (going from selects to math) + // may not be possible in the DAG because the selects got converted into + // branches before we got there. + Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); + Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); + Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); + Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); + return Builder.CreateSub(ZextUGT, ZextULT); +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion() { + // A memcmp with zero-comparison with only one block of load and compare does + // not need to set up any extra blocks. This case could be handled in the DAG, + // but since we have all of the machinery to flexibly expand any memcpy here, + // we choose to handle this case too to avoid fragmented lowering. + if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); + + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); + } + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + if (IsUsedForZeroCmp) + return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() + : getMemCmpExpansionZeroCase(); + + // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). + if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); + + for (unsigned I = 0; I < getNumBlocks(); ++I) { + emitLoadCompareBlock(I); + } + + emitMemCmpResultBlock(); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. 
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced with a new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+///  %0 = bitcast i32* %buffer2 to i8*
+///  %1 = bitcast i32* %buffer1 to i8*
+///  %2 = bitcast i8* %1 to i64*
+///  %3 = bitcast i8* %0 to i64*
+///  %4 = load i64, i64* %2
+///  %5 = load i64, i64* %3
+///  %6 = call i64 @llvm.bswap.i64(i64 %4)
+///  %7 = call i64 @llvm.bswap.i64(i64 %5)
+///  %8 = sub i64 %6, %7
+///  %9 = icmp ne i64 %8, 0
+///  br i1 %9, label %res_block, label %loadbb1
+/// res_block:                                        ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+///  %10 = icmp ult i64 %phi.src1, %phi.src2
+///  %11 = select i1 %10, i32 -1, i32 1
+///  br label %endblock
+/// loadbb1:                                          ; preds = %loadbb
+///  %12 = bitcast i32* %buffer2 to i8*
+///  %13 = bitcast i32* %buffer1 to i8*
+///  %14 = bitcast i8* %13 to i32*
+///  %15 = bitcast i8* %12 to i32*
+///  %16 = getelementptr i32, i32* %14, i32 2
+///  %17 = getelementptr i32, i32* %15, i32 2
+///  %18 = load i32, i32* %16
+///  %19 = load i32, i32* %17
+///  %20 = call i32 @llvm.bswap.i32(i32 %18)
+///  %21 = call i32 @llvm.bswap.i32(i32 %19)
+///  %22 = zext i32 %20 to i64
+///  %23 = zext i32 %21 to i64
+///  %24 = sub i64 %22, %23
+///  %25 = icmp ne i64 %24, 0
+///  br i1 %25, label %res_block, label %loadbb2
+/// loadbb2:                                          ; preds = %loadbb1
+///  %26 = bitcast i32* %buffer2 to i8*
+///  %27 = bitcast i32* %buffer1 to i8*
+///  %28 = bitcast i8* %27 to i16*
+///  %29 = bitcast i8* %26 to i16*
+///  %30 = getelementptr i16, i16* %28, i16 6
+///  %31 = getelementptr i16, i16* %29, i16 6
+///  %32 = load i16, i16* %30
+///  %33 = load i16, i16* %31
+///  %34 = call i16 @llvm.bswap.i16(i16 %32)
+///  %35 = call i16 @llvm.bswap.i16(i16 %33)
+///  %36 = zext i16 %34 to i64
+///  %37 = zext i16 %35 to i64
+///  %38 = sub i64 %36, %37
+///  %39 = icmp ne i64 %38, 0
+///  br i1 %39, label %res_block, label %loadbb3
+/// loadbb3:                                          ; preds = %loadbb2
+///  %40 = bitcast i32* %buffer2 to i8*
+///  %41 = bitcast i32* %buffer1 to i8*
+///  %42 = getelementptr i8, i8* %41, i8 14
+///  %43 = getelementptr i8, i8* %40, i8 14
+///  %44 = load i8, i8* %42
+///  %45 = load i8, i8* %43
+///  %46 = zext i8 %44 to i32
+///  %47 = zext i8 %45 to i32
+///  %48 = sub i32 %46, %47
+///  br label %endblock
+/// endblock:                                         ; preds = %res_block,
+/// %loadbb3
+///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+///  ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+
+  // Early exit from expansion if -Oz.
+  if (CI->getFunction()->optForMinSize())
+    return false;
+
+  // Early exit from expansion if size is not a constant.
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+  const uint64_t SizeVal = SizeCast->getZExtValue();
+
+  if (SizeVal == 0) {
+    return false;
+  }
+
+  // TTI call to check if target would like to expand memcmp. Also, get the
+  // available load sizes.
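+  // (Illustrative only: on an x86-64 target the returned Options.LoadSizes
+  // might be {8, 4, 2, 1}, from which the MemCmpExpansion constructor
+  // greedily builds LoadSequence; e.g. a size of 15 with {8, 4, 2, 1}
+  // decomposes into [{8, 0}, {4, 8}, {2, 12}, {1, 14}] -- exactly the shape
+  // of the IR in the comment above.)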
+ const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); + const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp); + if (!Options) return false; + + const unsigned MaxNumLoads = + TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize()); + + MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads, + IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL); + + // Don't expand if this will require more loads than desired by the target. + if (Expansion.getNumLoads() == 0) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + Value *Res = Expansion.getMemCmpExpansion(); + + // Replace call with result of expansion and erase call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -1838,6 +2542,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { return true; } + LibFunc Func; + if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) && + Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) { + ModifiedDT = true; + return true; + } return false; } diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 59e88ba3bdae4..c5101b1ecfc22 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -600,14 +600,8 @@ void TargetPassConfig::addIRPasses() { addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n")); } - if (getOptLevel() != CodeGenOpt::None) { - // The MergeICmpsPass tries to create memcmp calls by grouping sequences of - // loads and compares. ExpandMemCmpPass then tries to expand those calls - // into optimally-sized loads and compares. The transforms are enabled by a - // target lowering hook. - if (EnableMergeICmps) - addPass(createMergeICmpsPass()); - addPass(createExpandMemCmpPass()); + if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) { + addPass(createMergeICmpsPass()); } // Run GC lowering passes for builtin collectors diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 164163d213126..d79ae851005d3 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -9,7 +9,6 @@ add_llvm_library(LLVMScalarOpts DeadStoreElimination.cpp DivRemPairs.cpp EarlyCSE.cpp - ExpandMemCmp.cpp FlattenCFGPass.cpp Float2Int.cpp GuardWidening.cpp diff --git a/lib/Transforms/Scalar/ExpandMemCmp.cpp b/lib/Transforms/Scalar/ExpandMemCmp.cpp deleted file mode 100644 index 0cd8c11422f7e..0000000000000 --- a/lib/Transforms/Scalar/ExpandMemCmp.cpp +++ /dev/null @@ -1,828 +0,0 @@ -//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to partially inline the fast path of well-known library -// functions, such as using square-root instructions for cases where sqrt() -// does not need to set errno. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" - -using namespace llvm; - -#define DEBUG_TYPE "expandmemcmp" - -STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); -STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); -STATISTIC(NumMemCmpGreaterThanMax, - "Number of memcmp calls with size greater than max size"); -STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); - -static cl::opt MemCmpNumLoadsPerBlock( - "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), - cl::desc("The number of loads per basic block for inline expansion of " - "memcmp that is only being compared against zero.")); - -namespace { - - -// This class provides helper functions to expand a memcmp library call into an -// inline expansion. -class MemCmpExpansion { - struct ResultBlock { - BasicBlock *BB = nullptr; - PHINode *PhiSrc1 = nullptr; - PHINode *PhiSrc2 = nullptr; - - ResultBlock() = default; - }; - - CallInst *const CI; - ResultBlock ResBlock; - const uint64_t Size; - unsigned MaxLoadSize; - uint64_t NumLoadsNonOneByte; - const uint64_t NumLoadsPerBlock; - std::vector LoadCmpBlocks; - BasicBlock *EndBlock; - PHINode *PhiRes; - const bool IsUsedForZeroCmp; - const DataLayout &DL; - IRBuilder<> Builder; - // Represents the decomposition in blocks of the expansion. For example, - // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and - // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}. - // TODO(courbet): Involve the target more in this computation. On X86, 7 - // bytes can be done more efficiently with two overlaping 4-byte loads than - // covering the interval with [{4, 0},{2, 4},{1, 6}}. - struct LoadEntry { - LoadEntry(unsigned LoadSize, uint64_t Offset) - : LoadSize(LoadSize), Offset(Offset) { - assert(Offset % LoadSize == 0 && "invalid load entry"); - } - - uint64_t getGEPIndex() const { return Offset / LoadSize; } - - // The size of the load for this block, in bytes. - const unsigned LoadSize; - // The offset of this load WRT the base pointer, in bytes. 
- const uint64_t Offset; - }; - SmallVector LoadSequence; - - void createLoadCmpBlocks(); - void createResultBlock(); - void setupResultBlockPHINodes(); - void setupEndBlockPHINodes(); - Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareBlock(unsigned BlockIndex); - void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); - void emitMemCmpResultBlock(); - Value *getMemCmpExpansionZeroCase(); - Value *getMemCmpEqZeroOneBlock(); - Value *getMemCmpOneBlock(); - - public: - MemCmpExpansion(CallInst *CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - unsigned NumLoadsPerBlock, const DataLayout &DL); - - unsigned getNumBlocks(); - uint64_t getNumLoads() const { return LoadSequence.size(); } - - Value *getMemCmpExpansion(); -}; - -// Initialize the basic block structure required for expansion of memcmp call -// with given maximum load size and memcmp size parameter. -// This structure includes: -// 1. A list of load compare blocks - LoadCmpBlocks. -// 2. An EndBlock, split from original instruction point, which is the block to -// return from. -// 3. ResultBlock, block to branch to for early exit when a -// LoadCmpBlock finds a difference. -MemCmpExpansion::MemCmpExpansion( - CallInst *const CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) - : CI(CI), - Size(Size), - MaxLoadSize(0), - NumLoadsNonOneByte(0), - NumLoadsPerBlock(NumLoadsPerBlock), - IsUsedForZeroCmp(IsUsedForZeroCmp), - DL(TheDataLayout), - Builder(CI) { - assert(Size > 0 && "zero blocks"); - // Scale the max size down if the target can load more bytes than we need. - size_t LoadSizeIndex = 0; - while (LoadSizeIndex < Options.LoadSizes.size() && - Options.LoadSizes[LoadSizeIndex] > Size) { - ++LoadSizeIndex; - } - this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; - // Compute the decomposition. - uint64_t CurSize = Size; - uint64_t Offset = 0; - while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { - const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; - assert(LoadSize > 0 && "zero load size"); - const uint64_t NumLoadsForThisSize = CurSize / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - LoadSequence.clear(); - return; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) { - ++NumLoadsNonOneByte; - } - CurSize = CurSize % LoadSize; - } - ++LoadSizeIndex; - } - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); -} - -unsigned MemCmpExpansion::getNumBlocks() { - if (IsUsedForZeroCmp) - return getNumLoads() / NumLoadsPerBlock + - (getNumLoads() % NumLoadsPerBlock != 0 ? 
1 : 0); - return getNumLoads(); -} - -void MemCmpExpansion::createLoadCmpBlocks() { - for (unsigned i = 0; i < getNumBlocks(); i++) { - BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", - EndBlock->getParent(), EndBlock); - LoadCmpBlocks.push_back(BB); - } -} - -void MemCmpExpansion::createResultBlock() { - ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", - EndBlock->getParent(), EndBlock); -} - -// This function creates the IR instructions for loading and comparing 1 byte. -// It loads 1 byte from each source of the memcmp parameters with the given -// GEPIndex. It then subtracts the two loaded values and adds this result to the -// final phi node for selecting the memcmp result. -void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned GEPIndex) { - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using the GEPIndex. - if (GEPIndex != 0) { - Source1 = Builder.CreateGEP(LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, GEPIndex)); - Source2 = Builder.CreateGEP(LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, GEPIndex)); - } - - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]); - - if (BlockIndex < (LoadCmpBlocks.size() - 1)) { - // Early exit branch if difference found to EndBlock. Otherwise, continue to - // next LoadCmpBlock, - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); - BranchInst *CmpBr = - BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp); - Builder.Insert(CmpBr); - } else { - // The last block has an unconditional branch to EndBlock. - BranchInst *CmpBr = BranchInst::Create(EndBlock); - Builder.Insert(CmpBr); - } -} - -/// Generate an equality comparison for one or more pairs of loaded values. -/// This is used in the case where the memcmp() call is compared equal or not -/// equal to zero. -Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, - unsigned &LoadIndex) { - assert(LoadIndex < getNumLoads() && - "getCompareLoadPairs() called with no remaining loads"); - std::vector XorList, OrList; - Value *Diff; - - const unsigned NumLoads = - std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock); - - // For a single-block expansion, start inserting before the memcmp call. - if (LoadCmpBlocks.empty()) - Builder.SetInsertPoint(CI); - else - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - - Value *Cmp = nullptr; - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. The type for the combinations is the largest load - // type. - IntegerType *const MaxLoadType = - NumLoads == 1 ? 
nullptr - : IntegerType::get(CI->getContext(), MaxLoadSize * 8); - for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) { - const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex]; - - IntegerType *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Get a constant or load a value for each source address. - Value *LoadSrc1 = nullptr; - if (auto *Source1C = dyn_cast(Source1)) - LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL); - if (!LoadSrc1) - LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - - Value *LoadSrc2 = nullptr; - if (auto *Source2C = dyn_cast(Source2)) - LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL); - if (!LoadSrc2) - LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (NumLoads != 1) { - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. - Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); - Diff = Builder.CreateZExt(Diff, MaxLoadType); - XorList.push_back(Diff); - } else { - // If there's only one load per block, we just compare the loaded values. - Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2); - } - } - - auto pairWiseOr = [&](std::vector &InList) -> std::vector { - std::vector OutList; - for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { - Value *Or = Builder.CreateOr(InList[i], InList[i + 1]); - OutList.push_back(Or); - } - if (InList.size() % 2 != 0) - OutList.push_back(InList.back()); - return OutList; - }; - - if (!Cmp) { - // Pairwise OR the XOR results. - OrList = pairWiseOr(XorList); - - // Pairwise OR the OR results until one result left. - while (OrList.size() != 1) { - OrList = pairWiseOr(OrList); - } - Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); - } - - return Cmp; -} - -void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex) { - Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex); - - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, - // continue to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). 
- if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function creates the IR intructions for loading and comparing using the -// given LoadSize. It loads the number of bytes specified by LoadSize from each -// source of the memcmp parameters. It then does a subtract to see if there was -// a difference in the loaded values. If a difference is found, it branches -// with an early exit to the ResultBlock for calculating which source was -// larger. Otherwise, it falls through to the either the next LoadCmpBlock or -// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with -// a special case through emitLoadCompareByteBlock. The special handling can -// simply subtract the loaded values and add it to the result phi node. -void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { - // There is one load per block in this case, BlockIndex == LoadIndex. - const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; - - if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, - CurLoadEntry.getGEPIndex()); - return; - } - - Type *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian()) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - - // Add the loaded values to the phi nodes for calculating memcmp result only - // if result is not used in a zero equality. - if (!IsUsedForZeroCmp) { - ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); - ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); - } - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, continue - // to next LoadCmpBlock or EndBlock. 
- BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). - if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function populates the ResultBlock with a sequence to calculate the -// memcmp result. It compares the two loaded source values and returns -1 if -// src1 < src2 and 1 if src1 > src2. -void MemCmpExpansion::emitMemCmpResultBlock() { - // Special case: if memcmp result is used in a zero equality, result does not - // need to be calculated and can simply return 1. - if (IsUsedForZeroCmp) { - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); - PhiRes->addIncoming(Res, ResBlock.BB); - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - return; - } - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, - ResBlock.PhiSrc2); - - Value *Res = - Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), - ConstantInt::get(Builder.getInt32Ty(), 1)); - - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - PhiRes->addIncoming(Res, ResBlock.BB); -} - -void MemCmpExpansion::setupResultBlockPHINodes() { - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - Builder.SetInsertPoint(ResBlock.BB); - // Note: this assumes one load per block. - ResBlock.PhiSrc1 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); - ResBlock.PhiSrc2 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); -} - -void MemCmpExpansion::setupEndBlockPHINodes() { - Builder.SetInsertPoint(&EndBlock->front()); - PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); -} - -Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { - unsigned LoadIndex = 0; - // This loop populates each of the LoadCmpBlocks with the IR sequence to - // handle multiple loads per block. - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlockMultipleLoads(I, LoadIndex); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -/// A memcmp expansion that compares equality with 0 and only has one block of -/// load and compare can bypass the compare, branch, and phi IR that is required -/// in the general case. -Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { - unsigned LoadIndex = 0; - Value *Cmp = getCompareLoadPairs(0, LoadIndex); - assert(LoadIndex == getNumLoads() && "some entries were not consumed"); - return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); -} - -/// A memcmp expansion that only has one block of load and compare can bypass -/// the compare, branch, and phi IR that is required in the general case. -Value *MemCmpExpansion::getMemCmpOneBlock() { - assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); - - Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. 
- if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian() && Size != 1) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (Size < 4) { - // The i8 and i16 cases don't need compares. We zext the loaded values and - // subtract them to get the suitable negative, zero, or positive i32 result. - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty()); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty()); - return Builder.CreateSub(LoadSrc1, LoadSrc2); - } - - // The result of memcmp is negative, zero, or positive, so produce that by - // subtracting 2 extended compare bits: sub (ugt, ult). - // If a target prefers to use selects to get -1/0/1, they should be able - // to transform this later. The inverse transform (going from selects to math) - // may not be possible in the DAG because the selects got converted into - // branches before we got there. - Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2); - Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2); - Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty()); - Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty()); - return Builder.CreateSub(ZextUGT, ZextULT); -} - -// This function expands the memcmp call into an inline expansion and returns -// the memcmp result. -Value *MemCmpExpansion::getMemCmpExpansion() { - // A memcmp with zero-comparison with only one block of load and compare does - // not need to set up any extra blocks. This case could be handled in the DAG, - // but since we have all of the machinery to flexibly expand any memcpy here, - // we choose to handle this case too to avoid fragmented lowering. - if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); - createResultBlock(); - - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); - - // Create the number of required load compare basic blocks. - createLoadCmpBlocks(); - - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); - } - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - if (IsUsedForZeroCmp) - return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() - : getMemCmpExpansionZeroCase(); - - // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). - if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); - - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlock(I); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -// This function checks to see if an expansion of memcmp can be generated. 
-// It checks for constant compare size that is less than the max inline size.
-// If an expansion cannot occur, returns false to leave as a library call.
-// Otherwise, the library call is replaced with a new IR instruction sequence.
-/// We want to transform:
-/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
-/// To:
-/// loadbb:
-///  %0 = bitcast i32* %buffer2 to i8*
-///  %1 = bitcast i32* %buffer1 to i8*
-///  %2 = bitcast i8* %1 to i64*
-///  %3 = bitcast i8* %0 to i64*
-///  %4 = load i64, i64* %2
-///  %5 = load i64, i64* %3
-///  %6 = call i64 @llvm.bswap.i64(i64 %4)
-///  %7 = call i64 @llvm.bswap.i64(i64 %5)
-///  %8 = sub i64 %6, %7
-///  %9 = icmp ne i64 %8, 0
-///  br i1 %9, label %res_block, label %loadbb1
-/// res_block:                                        ; preds = %loadbb2,
-/// %loadbb1, %loadbb
-///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
-///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
-///  %10 = icmp ult i64 %phi.src1, %phi.src2
-///  %11 = select i1 %10, i32 -1, i32 1
-///  br label %endblock
-/// loadbb1:                                          ; preds = %loadbb
-///  %12 = bitcast i32* %buffer2 to i8*
-///  %13 = bitcast i32* %buffer1 to i8*
-///  %14 = bitcast i8* %13 to i32*
-///  %15 = bitcast i8* %12 to i32*
-///  %16 = getelementptr i32, i32* %14, i32 2
-///  %17 = getelementptr i32, i32* %15, i32 2
-///  %18 = load i32, i32* %16
-///  %19 = load i32, i32* %17
-///  %20 = call i32 @llvm.bswap.i32(i32 %18)
-///  %21 = call i32 @llvm.bswap.i32(i32 %19)
-///  %22 = zext i32 %20 to i64
-///  %23 = zext i32 %21 to i64
-///  %24 = sub i64 %22, %23
-///  %25 = icmp ne i64 %24, 0
-///  br i1 %25, label %res_block, label %loadbb2
-/// loadbb2:                                          ; preds = %loadbb1
-///  %26 = bitcast i32* %buffer2 to i8*
-///  %27 = bitcast i32* %buffer1 to i8*
-///  %28 = bitcast i8* %27 to i16*
-///  %29 = bitcast i8* %26 to i16*
-///  %30 = getelementptr i16, i16* %28, i16 6
-///  %31 = getelementptr i16, i16* %29, i16 6
-///  %32 = load i16, i16* %30
-///  %33 = load i16, i16* %31
-///  %34 = call i16 @llvm.bswap.i16(i16 %32)
-///  %35 = call i16 @llvm.bswap.i16(i16 %33)
-///  %36 = zext i16 %34 to i64
-///  %37 = zext i16 %35 to i64
-///  %38 = sub i64 %36, %37
-///  %39 = icmp ne i64 %38, 0
-///  br i1 %39, label %res_block, label %loadbb3
-/// loadbb3:                                          ; preds = %loadbb2
-///  %40 = bitcast i32* %buffer2 to i8*
-///  %41 = bitcast i32* %buffer1 to i8*
-///  %42 = getelementptr i8, i8* %41, i8 14
-///  %43 = getelementptr i8, i8* %40, i8 14
-///  %44 = load i8, i8* %42
-///  %45 = load i8, i8* %43
-///  %46 = zext i8 %44 to i32
-///  %47 = zext i8 %45 to i32
-///  %48 = sub i32 %46, %47
-///  br label %endblock
-/// endblock:                                         ; preds = %res_block,
-/// %loadbb3
-///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
-///  ret i32 %phi.res
-static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL) {
-  NumMemCmpCalls++;
-
-  // Early exit from expansion if -Oz.
-  if (CI->getFunction()->optForMinSize())
-    return false;
-
-  // Early exit from expansion if size is not a constant.
-  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-  if (!SizeCast) {
-    NumMemCmpNotConstant++;
-    return false;
-  }
-  const uint64_t SizeVal = SizeCast->getZExtValue();
-
-  if (SizeVal == 0) {
-    return false;
-  }
-
-  // TTI call to check if target would like to expand memcmp. Also, get the
-  // available load sizes.
-  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
-  if (!Options) return false;
-
-  const unsigned MaxNumLoads =
-      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
-
-  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
-                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
-
-  // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() == 0) {
-    NumMemCmpGreaterThanMax++;
-    return false;
-  }
-
-  NumMemCmpInlined++;
-
-  Value *Res = Expansion.getMemCmpExpansion();
-
-  // Replace call with result of expansion and erase call.
-  CI->replaceAllUsesWith(Res);
-  CI->eraseFromParent();
-
-  return true;
-}
-
-
-
-class ExpandMemCmpPass : public FunctionPass {
-public:
-  static char ID;
-
-  ExpandMemCmpPass() : FunctionPass(ID) {
-    initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override {
-    if (skipFunction(F)) return false;
-
-    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-    if (!TPC) {
-      return false;
-    }
-    const TargetLowering* TL =
-        TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
-
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-    const TargetTransformInfo *TTI =
-        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto PA = runImpl(F, TLI, TTI, TL);
-    return !PA.areAllPreserved();
-  }
-
-private:
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addRequired<TargetTransformInfoWrapperPass>();
-    FunctionPass::getAnalysisUsage(AU);
-  }
-
-  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
-                            const TargetTransformInfo *TTI,
-                            const TargetLowering* TL);
-  // Returns true if a change was made.
-  bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
-                  const TargetTransformInfo *TTI, const TargetLowering* TL,
-                  const DataLayout& DL);
-};
-
-bool ExpandMemCmpPass::runOnBlock(
-    BasicBlock &BB, const TargetLibraryInfo *TLI,
-    const TargetTransformInfo *TTI, const TargetLowering* TL,
-    const DataLayout& DL) {
-  for (Instruction& I : BB) {
-    CallInst *CI = dyn_cast<CallInst>(&I);
-    if (!CI) {
-      continue;
-    }
-    LibFunc Func;
-    if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
-        Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-
-PreservedAnalyses ExpandMemCmpPass::runImpl(
-    Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
-    const TargetLowering* TL) {
-  const DataLayout& DL = F.getParent()->getDataLayout();
-  bool MadeChanges = false;
-  for (auto BBIt = F.begin(); BBIt != F.end();) {
-    if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
-      MadeChanges = true;
-      // If changes were made, restart the function from the beginning, since
-      // the structure of the function was changed.
-      BBIt = F.begin();
-    } else {
-      ++BBIt;
-    }
-  }
-  return MadeChanges ?
PreservedAnalyses::none() : PreservedAnalyses::all(); -} - -} // namespace - -char ExpandMemCmpPass::ID = 0; -INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp", - "Expand memcmp() to load/stores", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp", - "Expand memcmp() to load/stores", false, false) - -Pass *llvm::createExpandMemCmpPass() { - return new ExpandMemCmpPass(); -} diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 4b694cecea6f5..c1034ace20685 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,7 +48,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNewGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); - initializeExpandMemCmpPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll index 9056e2cab49db..85b69c37aa01e 100644 --- a/test/CodeGen/Generic/llc-start-stop.ll +++ b/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -machine-branch-prob -expandmemcmp +; START-AFTER: -machine-branch-prob -gc-lowering ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Expand memcmp() to load/stores +; START-AFTER-NEXT: Lower Garbage Collection Instructions ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Expand memcmp() to load/stores +; START-BEFORE-NEXT: Lower Garbage Collection Instructions ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 3f5eeba7055cd..77d9fa69182b8 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: cmpw (%eax), %dx -; X86-NEXT: jne .LBB5_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movb 2(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 2(%eax), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl ; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax ; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: 
testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_2: # %res_block +; X64-NEXT: .LBB5_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB10_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movb 4(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 4(%eax), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl ; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax ; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_2: # %res_block +; X64-NEXT: .LBB10_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: +; X86: # BB#0: # %loadbb ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_2 -; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx @@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_2: # %res_block +; X86-NEXT: .LBB11_1: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al @@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB12_2 -; X86-NEXT: # BB#1: # %loadbb1 
-; X86-NEXT: movl 4(%ecx), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_2: # %res_block -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: incl %ecx +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_2 -; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_2: # %res_block +; X86-NEXT: .LBB13_1: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: incl %eax ; X86-NEXT: .LBB13_3: # %endblock @@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_2: # %res_block +; X64-NEXT: .LBB14_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -505,27 +505,28 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB15_3 -; X64-NEXT: .LBB15_2: # %res_block +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -545,27 +546,28 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx 
-; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_2: # %res_block +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -699,19 +701,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB20_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB20_3 -; X64-SSE2-NEXT: .LBB20_2: # %res_block +; X64-SSE2-NEXT: .LBB20_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB20_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -719,18 +721,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB20_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_2: # %res_block +; X64-AVX2-NEXT: .LBB20_1: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB20_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -755,18 +757,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB21_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_2: # %res_block +; X64-SSE2-NEXT: .LBB21_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB21_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -774,18 +776,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB21_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; 
X64-AVX2-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX2-NEXT: je .LBB21_3 -; X64-AVX2-NEXT: .LBB21_2: # %res_block +; X64-AVX2-NEXT: .LBB21_1: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB21_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -831,7 +833,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: +; X86-SSE2: # BB#0: # %loadbb ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -839,8 +841,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB23_2 -; X86-SSE2-NEXT: # BB#1: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB23_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -848,7 +850,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB23_3 -; X86-SSE2-NEXT: .LBB23_2: # %res_block +; X86-SSE2-NEXT: .LBB23_1: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB23_3: # %endblock @@ -857,14 +859,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -872,7 +874,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_2: # %res_block +; X64-SSE2-NEXT: .LBB23_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -907,21 +909,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: +; X86-SSE2: # BB#0: # %loadbb ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB24_2 -; X86-SSE2-NEXT: # BB#1: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB24_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB24_3 -; X86-SSE2-NEXT: .LBB24_2: # %res_block +; X86-SSE2-NEXT: .LBB24_1: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB24_3: # %endblock @@ -930,20 +932,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # 
%loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB24_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB24_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB24_3 -; X64-SSE2-NEXT: .LBB24_2: # %res_block +; X64-SSE2-NEXT: .LBB24_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB24_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1007,20 +1009,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB26_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB26_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB26_3 -; X64-AVX2-NEXT: .LBB26_2: # %res_block +; X64-AVX2-NEXT: .LBB26_1: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB26_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1057,20 +1059,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB27_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB27_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB27_3 -; X64-AVX2-NEXT: .LBB27_2: # %res_block +; X64-AVX2-NEXT: .LBB27_1: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB27_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 84fd45b0a08cb..393e4c42d8b94 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: cmpw (%eax), %dx -; X86-NEXT: jne .LBB7_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movb 2(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 2(%eax), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB7_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl ; X86-NEXT: je .LBB7_3 -; X86-NEXT: .LBB7_2: # %res_block -; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB7_1: # 
%res_block +; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB7_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB7_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB7_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB7_3 -; X64-NEXT: .LBB7_2: # %res_block +; X64-NEXT: .LBB7_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB7_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB12_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movb 4(%ecx), %dl -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpb 4(%eax), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_2: # %res_block -; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB12_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB12_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB12_3 -; X64-NEXT: .LBB12_2: # %res_block +; X64-NEXT: .LBB12_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB12_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: +; X86: # BB#0: # %loadbb ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -394,21 +394,23 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_2 -; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_2: # %res_block +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB13_1: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al ; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -429,21 +431,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86: # BB#0: # %loadbb ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: cmpl (%eax), %edx -; X86-NEXT: jne .LBB14_2 -; X86-NEXT: # BB#1: # %loadbb1 -; X86-NEXT: movl 4(%ecx), %edx -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx ; X86-NEXT: je .LBB14_3 -; X86-NEXT: .LBB14_2: # %res_block -; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB14_1: # %res_block +; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB14_3: # %endblock -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -460,15 +462,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: +; X86: # BB#0: # %loadbb ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB15_2 -; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#2: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB15_3 -; X86-NEXT: .LBB15_2: # %res_block +; X86-NEXT: .LBB15_1: # %res_block ; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB15_3: # %endblock ; X86-NEXT: testl %eax, %eax @@ -500,16 +502,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB16_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_2: # %res_block +; X64-NEXT: .LBB16_1: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -532,27 +534,28 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB17_3 -; X64-NEXT: .LBB17_2: # %res_block +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB17_1: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB17_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -572,27 +575,28 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: +; X64: # BB#0: # %loadbb ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_2 -; X64-NEXT: # BB#1: # %loadbb1 +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx 
; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB18_3 -; X64-NEXT: .LBB18_2: # %res_block +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB18_1: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB18_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -750,19 +754,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB22_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB22_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB22_3 -; X64-SSE2-NEXT: .LBB22_2: # %res_block +; X64-SSE2-NEXT: .LBB22_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB22_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -770,18 +774,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq: -; X64-AVX: # BB#0: +; X64-AVX: # BB#0: # %loadbb ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB22_2 -; X64-AVX-NEXT: # BB#1: # %loadbb1 +; X64-AVX-NEXT: jne .LBB22_1 +; X64-AVX-NEXT: # BB#2: # %loadbb1 ; X64-AVX-NEXT: movq 16(%rdi), %rcx ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX-NEXT: je .LBB22_3 -; X64-AVX-NEXT: .LBB22_2: # %res_block +; X64-AVX-NEXT: .LBB22_1: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB22_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -806,18 +810,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_2: # %res_block +; X64-SSE2-NEXT: .LBB23_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -825,18 +829,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: -; X64-AVX: # BB#0: +; X64-AVX: # BB#0: # %loadbb ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB23_2 -; X64-AVX-NEXT: # BB#1: # %loadbb1 +; X64-AVX-NEXT: jne .LBB23_1 +; X64-AVX-NEXT: # BB#2: # %loadbb1 ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; 
X64-AVX-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX-NEXT: je .LBB23_3 -; X64-AVX-NEXT: .LBB23_2: # %res_block +; X64-AVX-NEXT: .LBB23_1: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB23_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -894,7 +898,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: +; X86-SSE2: # BB#0: # %loadbb ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -902,8 +906,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB25_2 -; X86-SSE2-NEXT: # BB#1: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB25_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -911,7 +915,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB25_3 -; X86-SSE2-NEXT: .LBB25_2: # %res_block +; X86-SSE2-NEXT: .LBB25_1: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB25_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -919,14 +923,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB25_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB25_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -934,7 +938,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB25_3 -; X64-SSE2-NEXT: .LBB25_2: # %res_block +; X64-SSE2-NEXT: .LBB25_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB25_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -942,20 +946,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: -; X64-AVX1: # BB#0: +; X64-AVX1: # BB#0: # %loadbb ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB25_2 -; X64-AVX1-NEXT: # BB#1: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB25_1 +; X64-AVX1-NEXT: # BB#2: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB25_3 -; X64-AVX1-NEXT: .LBB25_2: # %res_block +; X64-AVX1-NEXT: .LBB25_1: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB25_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1002,21 +1006,21 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: +; X86-SSE2: # BB#0: # %loadbb ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB26_2 -; X86-SSE2-NEXT: # BB#1: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB26_1 +; X86-SSE2-NEXT: # BB#2: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB26_3 -; X86-SSE2-NEXT: .LBB26_2: # %res_block +; X86-SSE2-NEXT: .LBB26_1: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB26_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -1024,20 +1028,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: +; X64-SSE2: # BB#0: # %loadbb ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB26_2 -; X64-SSE2-NEXT: # BB#1: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB26_1 +; X64-SSE2-NEXT: # BB#2: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB26_3 -; X64-SSE2-NEXT: .LBB26_2: # %res_block +; X64-SSE2-NEXT: .LBB26_1: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB26_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1045,20 +1049,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: -; X64-AVX1: # BB#0: +; X64-AVX1: # BB#0: # %loadbb ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB26_2 -; X64-AVX1-NEXT: # BB#1: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB26_1 +; X64-AVX1-NEXT: # BB#2: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB26_3 -; X64-AVX1-NEXT: .LBB26_2: # %res_block +; X64-AVX1-NEXT: .LBB26_1: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB26_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1132,20 +1136,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB28_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB28_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB28_3 -; X64-AVX2-NEXT: .LBB28_2: # %res_block +; X64-AVX2-NEXT: .LBB28_1: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB28_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1193,20 +1197,20 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; 
X64-AVX2: # BB#0: +; X64-AVX2: # BB#0: # %loadbb ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB29_2 -; X64-AVX2-NEXT: # BB#1: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB29_1 +; X64-AVX2-NEXT: # BB#2: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB29_3 -; X64-AVX2-NEXT: .LBB29_2: # %res_block +; X64-AVX2-NEXT: .LBB29_1: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB29_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll similarity index 56% rename from test/Transforms/ExpandMemCmp/X86/memcmp.ll rename to test/Transforms/CodeGenPrepare/X86/memcmp.ll index 1abfb20f36961..a4f635c956df9 100644 --- a/test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 +; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 +; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) @@ -23,33 +23,30 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp3( -; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]] +; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ] -; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]] -; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]] -; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) -; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) -; ALL-NEXT: [[TMP9:%.*]] = icmp eq 
i16 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 -; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] ; ALL-NEXT: br label [[ENDBLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) @@ -77,33 +74,30 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp5( -; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ] -; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] -; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] -; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) -; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) -; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 -; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] ; ALL-NEXT: br label [[ENDBLOCK]] ; ALL: endblock: -; 
ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) @@ -112,37 +106,36 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp6( -; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] -; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] -; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) -; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) -; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 ; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 -; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] ; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) ; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 ; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 -; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 -; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] -; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; 
ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) @@ -160,35 +153,34 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: br label [[LOADBB:%.*]] +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X32: res_block: -; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] -; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb: -; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] -; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] -; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) -; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) -; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] -; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 ; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 -; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] ; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) ; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) -; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] -; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( @@ -215,33 +207,30 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; 
X64-LABEL: @cmp9( -; X64-NEXT: br label [[LOADBB:%.*]] +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ] -; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] -; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) -; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) -; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] ; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 ; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 -; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] ; X64-NEXT: br label [[ENDBLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) @@ -254,37 +243,36 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp10( -; X64-NEXT: br label [[LOADBB:%.*]] +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP2:%.*]] = 
select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] -; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) -; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) -; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 ; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] ; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) ; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 ; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 -; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 -; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] -; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) @@ -306,37 +294,36 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp12( -; X64-NEXT: br label [[LOADBB:%.*]] +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; 
X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] -; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) -; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) -; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 ; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] ; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) ; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 ; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 -; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64 -; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] -; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) @@ -376,35 +363,34 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp16( -; X64-NEXT: br label [[LOADBB:%.*]] +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 
[[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] -; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) -; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) -; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 ; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 -; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] ; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] +; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) -; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) -; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] -; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] +; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) @@ -431,23 +417,22 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] ; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: 
[[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -480,23 +465,22 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] ; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -511,25 +495,24 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: br label [[LOADBB:%.*]] +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 ; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; ALL-NEXT: [[TMP9:%.*]] = 
getelementptr i16, i16* [[TMP7]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] ; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] -; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] -; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -557,25 +540,24 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: br label [[LOADBB:%.*]] +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X32: res_block: ; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb: -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] -; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 ; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 -; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] ; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] -; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X32: endblock: ; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -607,23 +589,22 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq9( -; X64-NEXT: br label [[LOADBB:%.*]] +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 
[[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] ; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] -; X64-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -644,25 +625,24 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq10( -; X64-NEXT: br label [[LOADBB:%.*]] +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] -; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] -; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 ; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] ; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] -; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] -; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -696,25 +676,24 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq12( -; X64-NEXT: br label [[LOADBB:%.*]] +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] -; X64: loadbb: -; 
X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
-; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64: loadbb1:
-; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
-; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
+; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
+; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2
 ; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2
-; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2
+; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]]
 ; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
-; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
-; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
-; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
+; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64: endblock:
 ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
diff --git a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
deleted file mode 100644
index e71f3cc4c41e7..0000000000000
--- a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'X86' in config.root.targets:
-    config.unsupported = True
-

From 2880b72d32a55c01c386ebf5eca64df58ec32dae Mon Sep 17 00:00:00 2001
From: Anna Thomas
Date: Thu, 2 Nov 2017 16:23:31 +0000
Subject: [PATCH 005/238] [RS4GC] Strip off invariant.start because memory locations arent invariant

Summary:
Invariant.start on memory locations has the property that the memory
location is unchanging. However, this is not true in the face of
rewriting statepoints for GC. Teach RS4GC about removing
invariant.start so that optimizations after RS4GC do not incorrectly
sink a load from the memory location past a statepoint.

Added a test showcasing the issue.

Reviewers: reames, apilipenko, dneilson

Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D39388

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317215 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Scalar/RewriteStatepointsForGC.cpp | 48 +++++++++++++----
 .../drop-invalid-metadata.ll | 53 +++++++++++++++++++
 2 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1ca77cfec3292..9a064829deeea 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass {
 Changed |= runOnFunction(F);

 if (Changed) {
- // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
 // returns true for at least one function in the module. Since at least
 // one function changed, we know that the precondition is satisfied.
- stripNonValidAttributesAndMetadata(M);
+ stripNonValidData(M);
 }

 return Changed;
@@ -146,15 +146,17 @@ struct RewriteStatepointsForGC : public ModulePass {
 /// metadata implying dereferenceability that are no longer valid/correct after
 /// RewriteStatepointsForGC has run. This is because semantically, after
 /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+ /// heap. stripNonValidData (conservatively) restores
 /// correctness by erasing all attributes in the module that externally imply
 /// dereferenceability. Similar reasoning also applies to the noalias
 /// attributes and metadata. gc.statepoint can touch the entire heap including
 /// noalias objects.
- void stripNonValidAttributesAndMetadata(Module &M);
+ /// Apart from attributes and metadata, we also remove instructions that imply
+ /// constant physical memory: llvm.invariant.start.
+ void stripNonValidData(Module &M);

- // Helpers for stripNonValidAttributesAndMetadata
- void stripNonValidAttributesAndMetadataFromBody(Function &F);
+ // Helpers for stripNonValidData
+ void stripNonValidDataFromBody(Function &F);
 void stripNonValidAttributesFromPrototype(Function &F);

 // Certain metadata on instructions are invalid after running RS4GC.
@@ -2385,14 +2387,30 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I
 I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
 }

-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) {
 if (F.empty())
 return;

 LLVMContext &Ctx = F.getContext();
 MDBuilder Builder(Ctx);

+ // Set of invariant.start instructions that we need to remove.
+ // Use this to avoid invalidating the instruction iterator.
+ SmallVector<IntrinsicInst *, 12> InvariantStartInstructions;
+
 for (Instruction &I : instructions(F)) {
+ // invariant.start on memory location implies that the referenced memory
+ // location is constant and unchanging. This is no longer true after
+ // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+ // which frees the entire heap and the presence of invariant.start allows
+ // the optimizer to sink the load of a memory location past a statepoint,
+ // which is incorrect.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ InvariantStartInstructions.push_back(II);
+ continue;
+ }
+
 if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
 assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
 bool IsImmutableTBAA =
@@ -2422,6 +2440,18 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio
 RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
 }
 }
+
+ // Delete the invariant.start instructions and any corresponding uses that
+ // don't have further uses, for example invariant.end.
+ for (auto *II : InvariantStartInstructions) {
+ for (auto *U : II->users())
+ if (auto *I = dyn_cast<Instruction>(U))
+ if (U->hasNUses(0))
+ I->eraseFromParent();
+ // We cannot just delete the remaining uses of II, so we RAUW undef.
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->eraseFromParent();
+ }
 }

 /// Returns true if this function should be rewritten by this pass.
The main @@ -2438,7 +2468,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { +void RewriteStatepointsForGC::stripNonValidData(Module &M) { #ifndef NDEBUG assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2447,7 +2477,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesAndMetadataFromBody(F); + stripNonValidDataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll index 105afa9def5c1..4f3ab6a4bebf1 100644 --- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll +++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll @@ -75,6 +75,59 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3 ret void } +; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is +; incorrect. remove the invariant.start and RAUW undef. +define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +; CHECK-LABEL: taken: +; CHECK-NOT: llvm.invariant.end +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +; CHECK-LABEL: untaken: +; CHECK: gc.statepoint +untaken: + %foo = call i32 @escaping.invariant.start({}* %invst) + call void @dummy(i32 %foo) + ret void +} + +; invariant.start and end is removed. No other uses. +define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start2 +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +; CHECK-LABEL: taken: +; CHECK-NOT: llvm.invariant.end +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +; CHECK-LABEL: untaken: +untaken: + ret void +} +declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly +declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind +declare i32 @escaping.invariant.start({}*) nounwind +declare void @dummy(i32) declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) ; Function Attrs: nounwind readonly From 685fd434908418296567408861b455e61af41ae7 Mon Sep 17 00:00:00 2001 From: Anna Thomas Date: Thu, 2 Nov 2017 16:45:51 +0000 Subject: [PATCH 006/238] Revert "[RS4GC] Strip off invariant.start because memory locations arent invariant" This reverts commit r317215, investigating the test failure. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317217 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Scalar/RewriteStatepointsForGC.cpp | 48 ++++-------------
 .../drop-invalid-metadata.ll | 53 -------------------
 2 files changed, 9 insertions(+), 92 deletions(-)

diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 9a064829deeea..1ca77cfec3292 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass {
 Changed |= runOnFunction(F);

 if (Changed) {
- // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
 // returns true for at least one function in the module. Since at least
 // one function changed, we know that the precondition is satisfied.
- stripNonValidData(M);
+ stripNonValidAttributesAndMetadata(M);
 }

 return Changed;
@@ -146,17 +146,15 @@ struct RewriteStatepointsForGC : public ModulePass {
 /// metadata implying dereferenceability that are no longer valid/correct after
 /// RewriteStatepointsForGC has run. This is because semantically, after
 /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidData (conservatively) restores
+ /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
 /// correctness by erasing all attributes in the module that externally imply
 /// dereferenceability. Similar reasoning also applies to the noalias
 /// attributes and metadata. gc.statepoint can touch the entire heap including
 /// noalias objects.
- /// Apart from attributes and metadata, we also remove instructions that imply
- /// constant physical memory: llvm.invariant.start.
- void stripNonValidData(Module &M);
+ void stripNonValidAttributesAndMetadata(Module &M);

- // Helpers for stripNonValidData
- void stripNonValidDataFromBody(Function &F);
+ // Helpers for stripNonValidAttributesAndMetadata
+ void stripNonValidAttributesAndMetadataFromBody(Function &F);
 void stripNonValidAttributesFromPrototype(Function &F);

 // Certain metadata on instructions are invalid after running RS4GC.
@@ -2387,30 +2385,14 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I
 I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
 }

-void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
 if (F.empty())
 return;

 LLVMContext &Ctx = F.getContext();
 MDBuilder Builder(Ctx);

- // Set of invariant.start instructions that we need to remove.
- // Use this to avoid invalidating the instruction iterator.
- SmallVector<IntrinsicInst *, 12> InvariantStartInstructions;
-
 for (Instruction &I : instructions(F)) {
- // invariant.start on memory location implies that the referenced memory
- // location is constant and unchanging. This is no longer true after
- // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
- // which frees the entire heap and the presence of invariant.start allows
- // the optimizer to sink the load of a memory location past a statepoint,
- // which is incorrect.
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::invariant_start) {
- InvariantStartInstructions.push_back(II);
- continue;
- }
-
 if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
 assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
 bool IsImmutableTBAA =
@@ -2440,18 +2422,6 @@ void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) {
 RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
 }
 }
-
- // Delete the invariant.start instructions and any corresponding uses that
- // don't have further uses, for example invariant.end.
- for (auto *II : InvariantStartInstructions) {
- for (auto *U : II->users())
- if (auto *I = dyn_cast<Instruction>(U))
- if (U->hasNUses(0))
- I->eraseFromParent();
- // We cannot just delete the remaining uses of II, so we RAUW undef.
- II->replaceAllUsesWith(UndefValue::get(II->getType()));
- II->eraseFromParent();
- }
 }

 /// Returns true if this function should be rewritten by this pass. The main
@@ -2468,7 +2438,7 @@ static bool shouldRewriteStatepointsIn(Function &F) {
 return false;
 }

-void RewriteStatepointsForGC::stripNonValidData(Module &M) {
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
 #ifndef NDEBUG
 assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
 #endif
@@ -2477,7 +2447,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
 stripNonValidAttributesFromPrototype(F);

 for (Function &F : M)
- stripNonValidDataFromBody(F);
+ stripNonValidAttributesAndMetadataFromBody(F);
 }

 bool RewriteStatepointsForGC::runOnFunction(Function &F) {
diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
index 4f3ab6a4bebf1..105afa9def5c1 100644
--- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
+++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
@@ -75,59 +75,6 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3
 ret void
 }

-; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is
-; incorrect. remove the invariant.start and RAUW undef.
-define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
-; CHECK-LABEL: test_inv_start
-; CHECK-NOT: invariant.start
-; CHECK: gc.statepoint
- %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
- %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
- %v2 = load i32, i32 addrspace(1)* %v1
- call void @baz(i32 %x)
- br i1 %cond, label %taken, label %untaken
-
-; CHECK-LABEL: taken:
-; CHECK-NOT: llvm.invariant.end
-taken:
- store i32 %v2, i32 addrspace(1)* %q, align 16
- call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
- ret void
-
-; CHECK-LABEL: untaken:
-; CHECK: gc.statepoint
-untaken:
- %foo = call i32 @escaping.invariant.start({}* %invst)
- call void @dummy(i32 %foo)
- ret void
-}
-
-; invariant.start and end is removed. No other uses.
-define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { -; CHECK-LABEL: test_inv_start2 -; CHECK-NOT: invariant.start -; CHECK: gc.statepoint - %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p - %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) - %v2 = load i32, i32 addrspace(1)* %v1 - call void @baz(i32 %x) - br i1 %cond, label %taken, label %untaken - -; CHECK-LABEL: taken: -; CHECK-NOT: llvm.invariant.end -taken: - store i32 %v2, i32 addrspace(1)* %q, align 16 - call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) - ret void - -; CHECK-LABEL: untaken: -untaken: - ret void -} -declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly -declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind -declare i32 @escaping.invariant.start({}*) nounwind -declare void @dummy(i32) declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) ; Function Attrs: nounwind readonly From 9cca1f183a2e5d6b131cc2fabd246f8908021902 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 2 Nov 2017 17:12:34 +0000 Subject: [PATCH 007/238] [dsymutil] Add a manpage for dsymutil git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317221 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/CMakeLists.txt | 11 ++-- docs/CommandGuide/index.rst | 1 + docs/CommandGuide/llvm-dsymutil.rst | 86 ++++++++++++++++++++++++++++ docs/CommandGuide/llvm-dwarfdump.rst | 2 +- 4 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 docs/CommandGuide/llvm-dsymutil.rst diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index f1f93c7a228b0..6e430459e5dd2 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -3,7 +3,7 @@ if (DOXYGEN_FOUND) if (LLVM_ENABLE_DOXYGEN) set(abs_top_srcdir ${CMAKE_CURRENT_SOURCE_DIR}) set(abs_top_builddir ${CMAKE_CURRENT_BINARY_DIR}) - + if (HAVE_DOT) set(DOT ${LLVM_PATH_DOT}) endif() @@ -21,20 +21,20 @@ if (LLVM_ENABLE_DOXYGEN) set(enable_external_search "NO") set(extra_search_mappings "") endif() - + # If asked, configure doxygen for the creation of a Qt Compressed Help file. option(LLVM_ENABLE_DOXYGEN_QT_HELP "Generate a Qt Compressed Help file." OFF) if (LLVM_ENABLE_DOXYGEN_QT_HELP) set(LLVM_DOXYGEN_QCH_FILENAME "org.llvm.qch" CACHE STRING "Filename of the Qt Compressed help file") - set(LLVM_DOXYGEN_QHP_NAMESPACE "org.llvm" CACHE STRING + set(LLVM_DOXYGEN_QHP_NAMESPACE "org.llvm" CACHE STRING "Namespace under which the intermediate Qt Help Project file lives") set(LLVM_DOXYGEN_QHP_CUST_FILTER_NAME "${PACKAGE_STRING}" CACHE STRING "See http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters") set(LLVM_DOXYGEN_QHP_CUST_FILTER_ATTRS "${PACKAGE_NAME},${PACKAGE_VERSION}" CACHE STRING "See http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes") - find_program(LLVM_DOXYGEN_QHELPGENERATOR_PATH qhelpgenerator + find_program(LLVM_DOXYGEN_QHELPGENERATOR_PATH qhelpgenerator DOC "Path to the qhelpgenerator binary") if (NOT LLVM_DOXYGEN_QHELPGENERATOR_PATH) message(FATAL_ERROR "Failed to find qhelpgenerator binary") @@ -55,7 +55,7 @@ if (LLVM_ENABLE_DOXYGEN) set(llvm_doxygen_qhp_cust_filter_name "") set(llvm_doxygen_qhp_cust_filter_attrs "") endif() - + option(LLVM_DOXYGEN_SVG "Use svg instead of png files for doxygen graphs." 
OFF)

 if (LLVM_DOXYGEN_SVG)
@@ -113,6 +113,7 @@ if (LLVM_ENABLE_SPHINX)
 if (${SPHINX_OUTPUT_MAN})
 add_sphinx_target(man llvm)
 add_sphinx_target(man llvm-dwarfdump)
+ add_sphinx_target(man llvm-dsymutil)
 endif()
 endif()

diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst
index 5a0a98ceb1f98..a706ba1d675dc 100644
--- a/docs/CommandGuide/index.rst
+++ b/docs/CommandGuide/index.rst
@@ -30,6 +30,7 @@ Basic Commands
 llvm-stress
 llvm-symbolizer
 llvm-dwarfdump
+ llvm-dsymutil

 Debugging Tools
 ~~~~~~~~~~~~~~~
diff --git a/docs/CommandGuide/llvm-dsymutil.rst b/docs/CommandGuide/llvm-dsymutil.rst
new file mode 100644
index 0000000000000..19340e194b82b
--- /dev/null
+++ b/docs/CommandGuide/llvm-dsymutil.rst
@@ -0,0 +1,86 @@
+llvm-dsymutil - manipulate archived DWARF debug symbol files
+============================================================
+
+SYNOPSIS
+--------
+
+:program:`llvm-dsymutil` [*options*] [*filename*]
+
+DESCRIPTION
+-----------
+
+:program:`llvm-dsymutil` links the DWARF debug information found in the object
+files for the executable input file by using debug symbols information
+contained in its symbol table.
+
+OPTIONS
+-------
+.. option:: -arch=<arch>
+
+ Link DWARF debug information only for specified CPU architecture
+ types. This option can be specified multiple times, once for each
+ desired architecture. All CPU architectures will be linked by
+ default.
+
+.. option:: -dump-debug-map
+
+ Parse and dump the debug map to standard output. No DWARF link
+ will take place.
+
+.. option:: -f, -flat
+
+ Produce a flat dSYM file (not a bundle).
+
+.. option:: -no-odr
+
+ Do not use ODR (One Definition Rule) for type uniquing.
+
+.. option:: -no-output
+
+ Do the link in memory, but do not emit the result file.
+
+.. option:: -no-swiftmodule-timestamp
+
+ Don't check timestamp for swiftmodule files.
+
+.. option:: -j <n>, -num-threads=<n>
+
+ Specifies the maximum number (n) of simultaneous threads to use
+ when linking multiple architectures.
+
+.. option:: -o=<filename>
+
+ Specify the output file. default: <input file>.dwarf
+
+.. option:: -oso-prepend-path=<path>
+
+ Specify a directory to prepend to the paths of object files.
+
+.. option:: -s, -symtab
+
+ Dumps the symbol table found in executable or object file(s) and
+ exits.
+
+.. option:: -v, -verbose
+
+ Verbosity level
+
+.. option:: --version
+
+ Display the version of the tool.
+
+.. option:: -y
+
+ Treat the input file as a YAML debug map rather than a binary.
+
+
+EXIT STATUS
+-----------
+
+:program:`llvm-dsymutil` returns 0 if the DWARF debug information was linked
+successfully. Otherwise, it returns 1.
+
+SEE ALSO
+--------
+
+:manpage:`llvm-dwarfdump(1)`
diff --git a/docs/CommandGuide/llvm-dwarfdump.rst b/docs/CommandGuide/llvm-dwarfdump.rst
index a3b62664cbe54..4e7791573e65c 100644
--- a/docs/CommandGuide/llvm-dwarfdump.rst
+++ b/docs/CommandGuide/llvm-dwarfdump.rst
@@ -139,4 +139,4 @@ successfully. Otherwise, it returns 1.
 SEE ALSO
 --------

-:manpage:`dsymutil(1)`
+:manpage:`llvm-dsymutil(1)`

From 0416327f19718d0834f85aa644a2572f67a94acb Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Thu, 2 Nov 2017 17:52:27 +0000
Subject: [PATCH 008/238] [TargetParser][AArch64] Reorder enum to preserve 5.0.0 libLLVM ABI.

This is required for backporting r311659 to the 5.0.1 release.
PR35060

Differential Revision: https://reviews.llvm.org/D39558

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317222 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/TargetParser.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 6b56a635ff056..b3f91433bd937 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -167,10 +167,10 @@ enum ArchExtKind : unsigned {
 AEK_PROFILE = 1 << 6,
 AEK_RAS = 1 << 7,
 AEK_LSE = 1 << 8,
- AEK_RDM = 1 << 9,
- AEK_SVE = 1 << 10,
- AEK_DOTPROD = 1 << 11,
- AEK_RCPC = 1 << 12
+ AEK_SVE = 1 << 9,
+ AEK_DOTPROD = 1 << 10,
+ AEK_RCPC = 1 << 11,
+ AEK_RDM = 1 << 12
 };

 StringRef getCanonicalArchName(StringRef Arch);

From 4c88213d82fbff2542c5aaa8ffb7b0d93c66b6cb Mon Sep 17 00:00:00 2001
From: Mitch Phillips
Date: Thu, 2 Nov 2017 18:04:44 +0000
Subject: [PATCH 009/238] Fixed line length style issue.

Reviewers: zturner

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D39395

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317223 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/MemoryBuffer.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 73f0251a6b6e3..59c93f15d7b83 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -136,7 +136,8 @@ class MemoryBuffer {

 /// Map a subrange of the specified file as a MemoryBuffer.
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
- getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset, bool IsVolatile = false);
+ getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset,
+ bool IsVolatile = false);

 //===--------------------------------------------------------------------===//
 // Provided for performance analysis.

From 0c059eff813ce99b6882cc6812a2770a2f45dff4 Mon Sep 17 00:00:00 2001
From: Anna Thomas
Date: Thu, 2 Nov 2017 18:24:04 +0000
Subject: [PATCH 010/238] Strip off invariant.start because memory locations arent invariant

The original change was reverted in rL317217 because of the failure in
the RS4GC testcase. I couldn't reproduce the failure on my local
machine (macbook) but could reproduce it on a linux box.

The failure was around removing the uses of invariant.start. The fix
here is to just RAUW undef (which was the first implementation in
D39388). This is perfectly valid IR as discussed in the review.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317225 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Scalar/RewriteStatepointsForGC.cpp | 42 ++++++++++++----
 .../drop-invalid-metadata.ll | 48 +++++++++++++++++++
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1ca77cfec3292..44acfc8857971 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass {
 Changed |= runOnFunction(F);

 if (Changed) {
- // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
 // returns true for at least one function in the module. Since at least
 // one function changed, we know that the precondition is satisfied.
- stripNonValidAttributesAndMetadata(M);
+ stripNonValidData(M);
 }

 return Changed;
@@ -146,15 +146,17 @@ struct RewriteStatepointsForGC : public ModulePass {
 /// metadata implying dereferenceability that are no longer valid/correct after
 /// RewriteStatepointsForGC has run. This is because semantically, after
 /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+ /// heap. stripNonValidData (conservatively) restores
 /// correctness by erasing all attributes in the module that externally imply
 /// dereferenceability. Similar reasoning also applies to the noalias
 /// attributes and metadata. gc.statepoint can touch the entire heap including
 /// noalias objects.
- void stripNonValidAttributesAndMetadata(Module &M);
+ /// Apart from attributes and metadata, we also remove instructions that imply
+ /// constant physical memory: llvm.invariant.start.
+ void stripNonValidData(Module &M);

- // Helpers for stripNonValidAttributesAndMetadata
- void stripNonValidAttributesAndMetadataFromBody(Function &F);
+ // Helpers for stripNonValidData
+ void stripNonValidDataFromBody(Function &F);
 void stripNonValidAttributesFromPrototype(Function &F);

 // Certain metadata on instructions are invalid after running RS4GC.
@@ -2385,14 +2387,30 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I
 I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
 }

-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) {
 if (F.empty())
 return;

 LLVMContext &Ctx = F.getContext();
 MDBuilder Builder(Ctx);

+ // Set of invariant.start instructions that we need to remove.
+ // Use this to avoid invalidating the instruction iterator.
+ SmallVector<IntrinsicInst *, 12> InvariantStartInstructions;
+
 for (Instruction &I : instructions(F)) {
+ // invariant.start on memory location implies that the referenced memory
+ // location is constant and unchanging. This is no longer true after
+ // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+ // which frees the entire heap and the presence of invariant.start allows
+ // the optimizer to sink the load of a memory location past a statepoint,
+ // which is incorrect.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ InvariantStartInstructions.push_back(II);
+ continue;
+ }
+
 if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
 assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
 bool IsImmutableTBAA =
@@ -2422,6 +2440,12 @@ void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) {
 RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
 }
 }
+
+ // Delete the invariant.start instructions and RAUW undef.
+ for (auto *II : InvariantStartInstructions) {
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->eraseFromParent();
+ }
 }

 /// Returns true if this function should be rewritten by this pass.
The main @@ -2438,7 +2462,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { +void RewriteStatepointsForGC::stripNonValidData(Module &M) { #ifndef NDEBUG assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2447,7 +2471,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesAndMetadataFromBody(F); + stripNonValidDataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll index 105afa9def5c1..ebc15865a67da 100644 --- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll +++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll @@ -75,6 +75,54 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3 ret void } +; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is +; incorrect. remove the invariant.start and RAUW undef. +define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +; CHECK-LABEL: untaken: +; CHECK: gc.statepoint +untaken: + %foo = call i32 @escaping.invariant.start({}* %invst) + call void @dummy(i32 %foo) + ret void +} + +; invariant.start is removed and the uses are undef'ed. +define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" { +; CHECK-LABEL: test_inv_start2 +; CHECK-NOT: invariant.start +; CHECK: gc.statepoint + %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p + %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1) + %v2 = load i32, i32 addrspace(1)* %v1 + call void @baz(i32 %x) + br i1 %cond, label %taken, label %untaken + +taken: + store i32 %v2, i32 addrspace(1)* %q, align 16 + call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1) + ret void + +untaken: + ret void +} +declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)* nocapture) nounwind readonly +declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind +declare i32 @escaping.invariant.start({}*) nounwind +declare void @dummy(i32) declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) ; Function Attrs: nounwind readonly From 2f759d471a7e20388901944f6b64d6c74c8a00ae Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 2 Nov 2017 18:44:54 +0000 Subject: [PATCH 011/238] [dsymutil][doc] Improve wording in manpage and rename file. 
- Improve wording
- Rename llvm-dsymutil to dsymutil
- Name the -arch= argument

Differential Revision: https://reviews.llvm.org/D39561

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317226 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CMakeLists.txt                  |  2 +-
 docs/CommandGuide/dsymutil.rst       | 89 ++++++++++++++++++++++++++++
 docs/CommandGuide/index.rst          |  2 +-
 docs/CommandGuide/llvm-dsymutil.rst  | 86 ---------------------------
 docs/CommandGuide/llvm-dwarfdump.rst |  2 +-
 test/tools/dsymutil/cmdline.test     |  2 +-
 tools/dsymutil/dsymutil.cpp          |  4 +-
 7 files changed, 95 insertions(+), 92 deletions(-)
 create mode 100644 docs/CommandGuide/dsymutil.rst
 delete mode 100644 docs/CommandGuide/llvm-dsymutil.rst

diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 6e430459e5dd2..0f2681e0cd86b 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -113,7 +113,7 @@ if (LLVM_ENABLE_SPHINX)
   if (${SPHINX_OUTPUT_MAN})
     add_sphinx_target(man llvm)
     add_sphinx_target(man llvm-dwarfdump)
-    add_sphinx_target(man llvm-dsymutil)
+    add_sphinx_target(man dsymutil)
   endif()
 endif()

diff --git a/docs/CommandGuide/dsymutil.rst b/docs/CommandGuide/dsymutil.rst
new file mode 100644
index 0000000000000..3cbbcb078942e
--- /dev/null
+++ b/docs/CommandGuide/dsymutil.rst
@@ -0,0 +1,89 @@
+dsymutil - manipulate archived DWARF debug symbol files
+=======================================================
+
+SYNOPSIS
+--------
+
+| :program:`dsymutil` [*options*] *executable*
+
+DESCRIPTION
+-----------
+
+:program:`dsymutil` links the DWARF debug information found in the object files
+for an executable *executable* by using debug symbol information contained in
+its symbol table. By default, the linked debug information is placed in a
+``.dSYM`` bundle with the same name as the executable.
+
+OPTIONS
+-------
+.. option:: -arch=<arch>
+
+  Link DWARF debug information only for specified CPU architecture types.
+  Architectures may be specified by name. When using this option, an error will
+  be returned if any architectures cannot be properly linked. This option can
+  be specified multiple times, once for each desired architecture. All CPU
+  architectures will be linked by default, and any architectures that can't be
+  properly linked will cause :program:`dsymutil` to return an error.
+
+.. option:: -dump-debug-map
+
+  Dump the executable's debug map (the list of the object files containing the
+  debug information) in YAML format and exit. No DWARF link will take place.
+
+.. option:: -f, -flat
+
+  Produce a flat dSYM file. A ``.dwarf`` extension will be appended to the
+  executable name unless the output file is specified using the -o option.
+
+.. option:: -no-odr
+
+  Do not use ODR (One Definition Rule) for uniquing C++ types.
+
+.. option:: -no-output
+
+  Do the link in memory, but do not emit the result file.
+
+.. option:: -no-swiftmodule-timestamp
+
+  Don't check the timestamp for swiftmodule files.
+
+.. option:: -j <n>, -num-threads=<n>
+
+  Specifies the maximum number (``n``) of simultaneous threads to use when
+  linking multiple architectures.
+
+.. option:: -o <path>
+
+  Specifies an alternate ``path`` to place the dSYM bundle. The default dSYM
+  bundle path is created by appending ``.dSYM`` to the executable name.
+
+.. option:: -oso-prepend-path=<path>
+
+  Specifies a ``path`` to prepend to all debug symbol object file paths.
+
+.. option:: -s, -symtab
+
+  Dumps the symbol table found in executable or object file(s) and exits.
+
+.. option:: -v, -verbose
+
+  Display verbose information when linking.
+
+.. option:: --version
+
+  Display the version of the tool.
+
+.. option:: -y
+
+  Treat *executable* as a YAML debug-map rather than an executable.
+
+EXIT STATUS
+-----------
+
+:program:`dsymutil` returns 0 if the DWARF debug information was linked
+successfully. Otherwise, it returns 1.
+
+SEE ALSO
+--------
+
+:manpage:`llvm-dwarfdump(1)`
diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst
index a706ba1d675dc..805df00c1738c 100644
--- a/docs/CommandGuide/index.rst
+++ b/docs/CommandGuide/index.rst
@@ -30,7 +30,7 @@ Basic Commands
    llvm-stress
    llvm-symbolizer
    llvm-dwarfdump
-   llvm-dsymutil
+   dsymutil

 Debugging Tools
 ~~~~~~~~~~~~~~~
diff --git a/docs/CommandGuide/llvm-dsymutil.rst b/docs/CommandGuide/llvm-dsymutil.rst
deleted file mode 100644
index 19340e194b82b..0000000000000
--- a/docs/CommandGuide/llvm-dsymutil.rst
+++ /dev/null
@@ -1,86 +0,0 @@
-llvm-dsymutil - manipulate archived DWARF debug symbol files
-============================================================
-
-SYNOPSIS
---------
-
-:program:`llvm-dsymutil` [*options*] [*filename*]
-
-DESCRIPTION
------------
-
-:program:`llvm-dsymutil` links the DWARF debug information found in the object
-files for the executable input file by using debug symbols information
-contained in its symbol table.
-
-OPTIONS
--------
-.. option:: -arch=<arch>
-
-   Link DWARF debug information only for specified CPU architecture
-   types. This option can be specified multiple times, once for each
-   desired architecture. All cpu architectures will be linked by
-   default.
-
-.. option:: -dump-debug-map
-
-   Parse and dump the debug map to standard output. Not DWARF link
-   will take place.
-
-.. option:: -f, -flat
-
-   Produce a flat dSYM file (not a bundle).
-
-.. option:: -no-odr
-
-   Do not use ODR (One Definition Rule) for type uniquing.
-
-.. option:: -no-output
-
-   Do the link in memory, but do not emit the result file.
-
-.. option:: -no-swiftmodule-timestamp
-
-   Don't check timestamp for swiftmodule files.
-
-.. option:: -j <n>, -num-threads=<n>
-
-   Specifies the maximum number (n) of simultaneous threads to use
-   when linking multiple architectures.
-
-.. option:: -o=<filename>
-
-   Specify the output file. default: <input file>.dwarf
-
-.. option:: -oso-prepend-path=<path>
-
-   Specify a directory to prepend to the paths of object files.
-
-.. option:: -s, -symtab
-
-   Dumps the symbol table found in executable or object file(s) and
-   exits.
-
-.. option:: -v, -verbose
-
-   Verbosity level
-
-.. option:: --version
-
-   Display the version of the tool.
-
-.. option:: -y
-
-   Treat the input file is a YAML debug map rather than a binary.
-
-
-EXIT STATUS
------------
-
-:program:`llvm-dsymutil` returns 0 if the DWARF debug information was linked
-successfully. Otherwise, it returns 1.
-
-SEE ALSO
---------
-
-:manpage:`llvm-dwarfdump(1)`
diff --git a/docs/CommandGuide/llvm-dwarfdump.rst b/docs/CommandGuide/llvm-dwarfdump.rst
index 4e7791573e65c..a3b62664cbe54 100644
--- a/docs/CommandGuide/llvm-dwarfdump.rst
+++ b/docs/CommandGuide/llvm-dwarfdump.rst
@@ -139,4 +139,4 @@ successfully. Otherwise, it returns 1.
 SEE ALSO
 --------

-:manpage:`llvm-dsymutil(1)`
+:manpage:`dsymutil(1)`
diff --git a/test/tools/dsymutil/cmdline.test b/test/tools/dsymutil/cmdline.test
index dea28cf3d9089..f66858e9ae5da 100644
--- a/test/tools/dsymutil/cmdline.test
+++ b/test/tools/dsymutil/cmdline.test
@@ -3,7 +3,7 @@
 HELP: OVERVIEW: manipulate archived DWARF debug symbol files.
 HELP: USAGE: llvm-dsymutil{{[^ ]*}} [options] <input files>
 HELP-NOT: -reverse-iterate
 HELP: Specific Options:
-HELP: -arch=<string>
+HELP: -arch=<arch>
 HELP: -dump-debug-map
 HELP: -flat
 HELP: -no-odr
diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp
index b6d6c909abcf1..769668c8a9f28 100644
--- a/tools/dsymutil/dsymutil.cpp
+++ b/tools/dsymutil/dsymutil.cpp
@@ -93,8 +93,8 @@ static list<std::string> ArchFlags(
     "arch",
     desc("Link DWARF debug information only for specified CPU architecture\n"
          "types. This option can be specified multiple times, once for each\n"
-         "desired architecture. All cpu architectures will be linked by\n"
-         "default."),
+         "desired architecture. All CPU architectures will be linked by\n"
+         "default."),
     value_desc("arch"), ZeroOrMore, cat(DsymCategory));
 static opt

From 161385fddde9e408c5ec777889c24c435e811332 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 2 Nov 2017 19:13:32 +0000
Subject: [PATCH 012/238] [X86] Change getHostCPUName fallback code to not
 select 'x86-64' for unknown CPUs in family 6 that have 64-bit support but not
 any newer SSE features. Use 'core2' instead.

We know that's the earliest CPU with 64-bit support. x86-64 has taken on a
role of representing a more modern 64-bit CPU, so we probably shouldn't be
using that when we can't identify things.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317229 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Host.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index d8fb3e1dc1d6f..7fbe9ad6a5ce8 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -794,8 +794,13 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
       break;
     }
     if (Features2 & (1 << (FEATURE_EM64T - 32))) {
-      *Type = INTEL_X86_64;
-      break; // x86-64
+      *Type = INTEL_CORE2; // "core2"
+      *Subtype = INTEL_CORE2_65;
+      break;
+    }
+    if (Features & (1 << FEATURE_SSE3)) {
+      *Type = INTEL_CORE_DUO;
+      break;
     }
     if (Features & (1 << FEATURE_SSE2)) {
       *Type = INTEL_PENTIUM_M;

From 2bbdf002305de458a85a418024608a57183284fe Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 2 Nov 2017 19:13:34 +0000
Subject: [PATCH 013/238] [X86] Simplify the pentium4 code in getHostCPUName to
 be based on feature flags. Don't use 'x86-64' ever.

'x86-64' has started to reflect a sort of generic tuning flag for more modern
64-bit CPUs. We probably shouldn't be using it as the name of an
unidentifiable pentium4. So use nocona for all 64-bit pentium4s instead.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317230 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/Host.cpp | 40 ++++++----------------------------------
 1 file changed, 6 insertions(+), 34 deletions(-)

diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 7fbe9ad6a5ce8..c167df5a4449d 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -351,7 +351,6 @@ enum ProcessorTypes {
   INTEL_PENTIUM_IV,
   INTEL_PENTIUM_M,
   INTEL_CORE_DUO,
-  INTEL_X86_64,
   INTEL_NOCONA,
   INTEL_PRESCOTT,
   AMD_i486,
@@ -819,40 +818,15 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     }
     break;
   case 15: {
-    switch (Model) {
-    case 0: // Pentium 4 processor, Intel Xeon processor. All processors are
-            // model 00h and manufactured using the 0.18 micron process.
-    case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon
-            // processor MP, and Intel Celeron processor. All processors are
-            // model 01h and manufactured using the 0.18 micron process.
- case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M, - // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron - // processor, and Mobile Intel Celeron processor. All processors - // are model 02h and manufactured using the 0.13 micron process. - *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64 - : INTEL_PENTIUM_IV); - break; - - case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D - // processor. All processors are model 03h and manufactured using - // the 90 nm process. - case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition, - // Pentium D processor, Intel Xeon processor, Intel Xeon - // processor MP, Intel Celeron D processor. All processors are - // model 04h and manufactured using the 90 nm process. - case 6: // Pentium 4 processor, Pentium D processor, Pentium processor - // Extreme Edition, Intel Xeon processor, Intel Xeon processor - // MP, Intel Celeron D processor. All processors are model 06h - // and manufactured using the 65 nm process. - *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_NOCONA - : INTEL_PRESCOTT); + if (Features2 & (1 << (FEATURE_EM64T - 32))) { + *Type = INTEL_NOCONA; break; - - default: - *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64 - : INTEL_PENTIUM_IV); + } + if (Features & (1 << FEATURE_SSE3)) { + *Type = INTEL_PRESCOTT; break; } + *Type = INTEL_PENTIUM_IV; break; } default: @@ -1150,8 +1124,6 @@ StringRef sys::getHostCPUName() { return "knl"; case INTEL_KNM: return "knm"; - case INTEL_X86_64: - return "x86-64"; case INTEL_NOCONA: return "nocona"; case INTEL_PRESCOTT: From ce68f2c6292da52ec79f7318092f4b3f8bc02dd7 Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Thu, 2 Nov 2017 20:05:20 +0000 Subject: [PATCH 014/238] [test] Move llvm-lib tests into tools/llvm-lib. NFC. Similarly to SVN r317189 for llvm-dlltool, these are probably easier to find in a tools subdirectory with a name identical to the tool, than in a toplevel directory with a different name. This matches the move of LibDriver itself in SVN r302995. 
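For anyone running these locally, the relocated tests can still be invoked on
their own; a sketch of the invocation (assuming an already-configured build
tree, which this patch does not set up):

    # From the build directory, run only the moved llvm-lib tests.
    bin/llvm-lit -sv ../llvm/test/tools/llvm-lib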
Differential Revision: https://reviews.llvm.org/D39531 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317262 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/{LibDriver => tools/llvm-lib}/Inputs/a.s | 0 test/{LibDriver => tools/llvm-lib}/Inputs/b.s | 0 test/{LibDriver => tools/llvm-lib}/Inputs/cl-gl.obj | Bin .../llvm-lib}/Inputs/resource.res | Bin .../llvm-lib}/infer-output-path.test | 0 test/{LibDriver => tools/llvm-lib}/invalid.test | 0 test/{LibDriver => tools/llvm-lib}/libpath.test | 0 test/{LibDriver => tools/llvm-lib}/lit.local.cfg | 0 test/{LibDriver => tools/llvm-lib}/no-inputs.test | 0 test/{LibDriver => tools/llvm-lib}/resource.test | 0 test/{LibDriver => tools/llvm-lib}/thin.test | 0 test/{LibDriver => tools/llvm-lib}/use-paths.test | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename test/{LibDriver => tools/llvm-lib}/Inputs/a.s (100%) rename test/{LibDriver => tools/llvm-lib}/Inputs/b.s (100%) rename test/{LibDriver => tools/llvm-lib}/Inputs/cl-gl.obj (100%) rename test/{LibDriver => tools/llvm-lib}/Inputs/resource.res (100%) rename test/{LibDriver => tools/llvm-lib}/infer-output-path.test (100%) rename test/{LibDriver => tools/llvm-lib}/invalid.test (100%) rename test/{LibDriver => tools/llvm-lib}/libpath.test (100%) rename test/{LibDriver => tools/llvm-lib}/lit.local.cfg (100%) rename test/{LibDriver => tools/llvm-lib}/no-inputs.test (100%) rename test/{LibDriver => tools/llvm-lib}/resource.test (100%) rename test/{LibDriver => tools/llvm-lib}/thin.test (100%) rename test/{LibDriver => tools/llvm-lib}/use-paths.test (100%) diff --git a/test/LibDriver/Inputs/a.s b/test/tools/llvm-lib/Inputs/a.s similarity index 100% rename from test/LibDriver/Inputs/a.s rename to test/tools/llvm-lib/Inputs/a.s diff --git a/test/LibDriver/Inputs/b.s b/test/tools/llvm-lib/Inputs/b.s similarity index 100% rename from test/LibDriver/Inputs/b.s rename to test/tools/llvm-lib/Inputs/b.s diff --git a/test/LibDriver/Inputs/cl-gl.obj b/test/tools/llvm-lib/Inputs/cl-gl.obj similarity index 100% rename from test/LibDriver/Inputs/cl-gl.obj rename to test/tools/llvm-lib/Inputs/cl-gl.obj diff --git a/test/LibDriver/Inputs/resource.res b/test/tools/llvm-lib/Inputs/resource.res similarity index 100% rename from test/LibDriver/Inputs/resource.res rename to test/tools/llvm-lib/Inputs/resource.res diff --git a/test/LibDriver/infer-output-path.test b/test/tools/llvm-lib/infer-output-path.test similarity index 100% rename from test/LibDriver/infer-output-path.test rename to test/tools/llvm-lib/infer-output-path.test diff --git a/test/LibDriver/invalid.test b/test/tools/llvm-lib/invalid.test similarity index 100% rename from test/LibDriver/invalid.test rename to test/tools/llvm-lib/invalid.test diff --git a/test/LibDriver/libpath.test b/test/tools/llvm-lib/libpath.test similarity index 100% rename from test/LibDriver/libpath.test rename to test/tools/llvm-lib/libpath.test diff --git a/test/LibDriver/lit.local.cfg b/test/tools/llvm-lib/lit.local.cfg similarity index 100% rename from test/LibDriver/lit.local.cfg rename to test/tools/llvm-lib/lit.local.cfg diff --git a/test/LibDriver/no-inputs.test b/test/tools/llvm-lib/no-inputs.test similarity index 100% rename from test/LibDriver/no-inputs.test rename to test/tools/llvm-lib/no-inputs.test diff --git a/test/LibDriver/resource.test b/test/tools/llvm-lib/resource.test similarity index 100% rename from test/LibDriver/resource.test rename to test/tools/llvm-lib/resource.test diff --git a/test/LibDriver/thin.test b/test/tools/llvm-lib/thin.test similarity 
index 100%
rename from test/LibDriver/thin.test
rename to test/tools/llvm-lib/thin.test
diff --git a/test/LibDriver/use-paths.test b/test/tools/llvm-lib/use-paths.test
similarity index 100%
rename from test/LibDriver/use-paths.test
rename to test/tools/llvm-lib/use-paths.test

From 15f5deb8cb6fb5c575a3c7cda87a5a723b5ada2b Mon Sep 17 00:00:00 2001
From: Hans Wennborg
Date: Thu, 2 Nov 2017 20:22:03 +0000
Subject: [PATCH 015/238] Fix llvm-dsymutil test in -DLLVM_ENABLE_THREADS=OFF
 mode

After r316999, tools/dsymutil/X86/alias.test started failing in builds that
have threading disabled.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317263 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/dsymutil/dsymutil.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp
index 769668c8a9f28..9d9a24183798f 100644
--- a/tools/dsymutil/dsymutil.cpp
+++ b/tools/dsymutil/dsymutil.cpp
@@ -338,7 +338,6 @@ int main(int argc, char **argv) {
     NumThreads = 1;
   NumThreads = std::min<unsigned>(NumThreads, DebugMapPtrsOrErr->size());
-  llvm::ThreadPool Threads(NumThreads);

   // If there is more than one link to execute, we need to generate
   // temporary files.
@@ -366,17 +365,19 @@ int main(int argc, char **argv) {
     // FIXME: The DwarfLinker can have some very deep recursion that can max
     // out the (significantly smaller) stack when using threads. We don't
     // want this limitation when we only have a single thread.
-    if (NumThreads == 1)
+    if (NumThreads == 1) {
       LinkLambda();
-    else
+    } else {
+      llvm::ThreadPool Threads(NumThreads);
       Threads.async(LinkLambda);
+      Threads.wait();
+    }

     if (NeedsTempFiles)
       TempFiles.emplace_back(Map->getTriple().getArchName().str(), OutputFile);
   }

-  Threads.wait();

   if (NeedsTempFiles &&
       !MachOUtils::generateUniversalBinary(

From c626458f76209b25c24dcbeb4545534f8bc120ba Mon Sep 17 00:00:00 2001
From: Shoaib Meenai
Date: Thu, 2 Nov 2017 20:33:36 +0000
Subject: [PATCH 016/238] [cmake] Remove policy conditionals

LLVM now requires a minimum of cmake 3.4.3, and all the policies currently
being set are present in that cmake version, so the conditionals will always
be true and are therefore unnecessary.

The motivation is that the conditionals can give the false impression that the
policy settings are optional, whereas for example it's necessary to set
CMP0056 in order for `check_linker_flags` to operate correctly after r316972.

Inline the project version and language setting in the process.

Differential Revision: https://reviews.llvm.org/D39442

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317264 91177308-0d34-0410-b5e6-96231b3b80d8
---
 CMakeLists.txt | 40 +++++++++++++---------------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0456503831100..e27562dc8b5d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,26 +2,20 @@

 cmake_minimum_required(VERSION 3.4.3)

-if(POLICY CMP0022)
-  cmake_policy(SET CMP0022 NEW) # automatic when 2.8.12 is required
-endif()
+cmake_policy(SET CMP0022 NEW)

-if (POLICY CMP0051)
-  # CMake 3.1 and higher include generator expressions of the form
-  # $<TARGET_OBJECTS:objlib> in the SOURCES property.  These need to be
-  # stripped everywhere that access the SOURCES property, so we just
-  # defer to the OLD behavior of not including generator expressions
-  # in the output for now.
-  cmake_policy(SET CMP0051 OLD)
-endif()
+cmake_policy(SET CMP0048 NEW)

-if(POLICY CMP0056)
-  cmake_policy(SET CMP0056 NEW)
-endif()
+# CMake 3.1 and higher include generator expressions of the form
+# $<TARGET_OBJECTS:objlib> in the SOURCES property.  These need to be
+# stripped everywhere that access the SOURCES property, so we just
+# defer to the OLD behavior of not including generator expressions
+# in the output for now.
+cmake_policy(SET CMP0051 OLD)

-if(POLICY CMP0057)
-  cmake_policy(SET CMP0057 NEW)
-endif()
+cmake_policy(SET CMP0056 NEW)
+
+cmake_policy(SET CMP0057 NEW)

 if(NOT DEFINED LLVM_VERSION_MAJOR)
   set(LLVM_VERSION_MAJOR 6)
@@ -36,13 +30,6 @@ if(NOT DEFINED LLVM_VERSION_SUFFIX)
   set(LLVM_VERSION_SUFFIX svn)
 endif()

-if (POLICY CMP0048)
-  cmake_policy(SET CMP0048 NEW)
-  set(cmake_3_0_PROJ_VERSION
-      VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH})
-  set(cmake_3_0_LANGUAGES LANGUAGES)
-endif()
-
 if (NOT PACKAGE_VERSION)
   set(PACKAGE_VERSION
     "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}")
@@ -56,9 +43,8 @@ if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQ
 endif()

 project(LLVM
-  ${cmake_3_0_PROJ_VERSION}
-  ${cmake_3_0_LANGUAGES}
-  C CXX ASM)
+  VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}
+  LANGUAGES C CXX ASM)

 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   message(STATUS "No build type selected, default to Debug")

From 37bbee84d83c14043e07ea9d76bb7789c697eb6d Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov
Date: Thu, 2 Nov 2017 20:48:06 +0000
Subject: [PATCH 017/238] AMDGPU: Remove outdated fixme (it was already fixed)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317266 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIMachineFunctionInfo.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ade909cc84e3a..5f5636e119a9c 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -87,9 +87,6 @@ class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
-  // FIXME: This should be removed and getPreloadedValue moved here.
-  friend class SIRegisterInfo;
-
   unsigned TIDReg = AMDGPU::NoRegister;

   // Registers that may be reserved for spilling purposes. These may be the same

From 2e63034efd79807891a4d201daeb434c2d26c609 Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Thu, 2 Nov 2017 20:58:58 +0000
Subject: [PATCH 018/238] Add missing header guards.
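Without a guard, a translation unit that pulls in the header twice, directly
or transitively, sees its types defined a second time. A minimal illustration
of the failure mode (hypothetical files, not from this patch):

    /* widget.h -- no include guard */
    struct Widget { int id; };

    /* user.c */
    #include "widget.h"
    #include "widget.h"   /* error: redefinition of 'struct Widget' */

The added #ifndef/#define/#endif lines below close exactly this hole for
llvm-c/DebugInfo.h.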
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317267 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm-c/DebugInfo.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/llvm-c/DebugInfo.h b/include/llvm-c/DebugInfo.h
index 15f6b57d88315..2c2fdbdf173b3 100644
--- a/include/llvm-c/DebugInfo.h
+++ b/include/llvm-c/DebugInfo.h
@@ -14,6 +14,9 @@
 ///
 //===----------------------------------------------------------------------===//

+#ifndef LLVM_C_DEBUGINFO_H
+#define LLVM_C_DEBUGINFO_H
+
 #include "llvm-c/Core.h"

 #ifdef __cplusplus
@@ -200,3 +203,5 @@ LLVMDIBuilderCreateDebugLocation(LLVMContextRef Ctx, unsigned Line,
 #ifdef __cplusplus
 } // end extern "C"
 #endif
+
+#endif

From b69a2a9ae35ca7d19399c8d23287f10a8bdf0f45 Mon Sep 17 00:00:00 2001
From: Anna Thomas
Date: Thu, 2 Nov 2017 21:21:02 +0000
Subject: [PATCH 019/238] [LoopPredication] Enable predication when
 latchCheckIV is wider than rangeCheck

Summary:
This patch allows us to predicate range checks that have a type narrower than
the latch check type. We leverage SCEV analysis to identify a truncate for the
latchLimit and latchStart.

There are also safety checks in place which require the start and limit to be
known at compile time. We require this to make sure that the SCEV truncate
expr for the IV corresponding to the latch does not cause us to lose
information about the IV range.

Added tests show loop predication over range checks that are of various types
and are narrower than the latch type. This enhancement has been in our
downstream tree for a while.

Reviewers: apilipenko, sanjoy, mkazantsev

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D39500

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317269 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopPredication.cpp  | 106 ++++++++++++++--
 test/Transforms/LoopPredication/widened.ll | 138 +++++++++++++++++++++
 2 files changed, 234 insertions(+), 10 deletions(-)
 create mode 100644 test/Transforms/LoopPredication/widened.ll

diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 9a623be234fe8..e680fbed1138f 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -174,6 +174,9 @@

 using namespace llvm;

+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+                                        cl::Hidden, cl::init(true));
+
 namespace {
 class LoopPredication {
   /// Represents an induction variable check:
@@ -212,6 +215,22 @@ class LoopPredication {
                      IRBuilder<> &Builder);
   bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);

+  // When the IV type is wider than the range operand type, we can still do loop
+  // predication, by generating SCEVs for the range and latch that are of the
+  // same type. We achieve this by generating a SCEV truncate expression for the
+  // latch IV. This is done iff truncation of the IV is a safe operation,
+  // without loss of information.
+  // Another way to achieve this is by generating a wider type SCEV for the
+  // range check operand, however, this needs a more involved check that
+  // operands do not overflow. This can lead to loss of information when the
+  // range operand is of the form: add i32 %offset, %iv. We need to prove that
+  // sext(x + y) is same as sext(x) + sext(y).
+  // This function returns true if we can safely represent the IV type in
+  // the RangeCheckType without loss of information.
+  bool isSafeToTruncateWideIVType(Type *RangeCheckType);
+  // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
+  // so.
+  Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
+
 public:
   LoopPredication(ScalarEvolution *SE) : SE(SE){};
   bool runOnLoop(Loop *L);
@@ -301,6 +320,34 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander,
   return Builder.CreateICmp(Pred, LHSV, RHSV);
 }

+Optional<LoopPredication::LoopICmp>
+LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
+
+  auto *LatchType = LatchCheck.IV->getType();
+  if (RangeCheckType == LatchType)
+    return LatchCheck;
+  // For now, bail out if latch type is narrower than range type.
+  if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType))
+    return None;
+  if (!isSafeToTruncateWideIVType(RangeCheckType))
+    return None;
+  // We can now safely identify the truncated version of the IV and limit for
+  // RangeCheckType.
+  LoopICmp NewLatchCheck;
+  NewLatchCheck.Pred = LatchCheck.Pred;
+  NewLatchCheck.IV = dyn_cast<const SCEVAddRecExpr>(
+      SE->getTruncateExpr(LatchCheck.IV, RangeCheckType));
+  if (!NewLatchCheck.IV)
+    return None;
+  NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+  DEBUG(dbgs() << "IV of type: " << *LatchType
+               << "can be represented as range check type:" << *RangeCheckType
+               << "\n");
+  DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+  DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+  return NewLatchCheck;
+}
+
 /// If ICI can be widened to a loop invariant condition emits the loop
 /// invariant condition in the loop preheader and return it, otherwise
 /// returns None.
@@ -325,22 +372,31 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
     return None;
   }
   auto *RangeCheckIV = RangeCheck->IV;
-  auto *Ty = RangeCheckIV->getType();
-  if (Ty != LatchCheck.IV->getType()) {
-    DEBUG(dbgs() << "Type mismatch between range check and latch IVs!\n");
-    return None;
-  }
   if (!RangeCheckIV->isAffine()) {
     DEBUG(dbgs() << "Range check IV is not affine!\n");
     return None;
   }
   auto *Step = RangeCheckIV->getStepRecurrence(*SE);
-  if (Step != LatchCheck.IV->getStepRecurrence(*SE)) {
+  // We cannot just compare with latch IV step because the latch and range IVs
+  // may have different types.
+  if (!Step->isOne()) {
     DEBUG(dbgs() << "Range check and latch IVs have different steps!\n");
     return None;
   }
-  assert(Step->isOne() && "must be one");
+  auto *Ty = RangeCheckIV->getType();
+  auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
+  if (!CurrLatchCheckOpt) {
+    DEBUG(dbgs() << "Failed to generate a loop latch check "
+                    "corresponding to range type: "
+                 << *Ty << "\n");
+    return None;
+  }
+  LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+  // At this point the range check step and latch step should have the same
+  // value and type.
+  assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) &&
+         "Range and latch should have same step recurrence!");
+
   // Generate the widened condition:
   //   guardStart u< guardLimit &&
   //   latchLimit <pred> guardLimit - 1 - guardStart + latchStart
   // where <pred> depends on the latch condition predicate. See the
   // header comment for the reasoning.
   const SCEV *GuardStart = RangeCheckIV->getStart();
   const SCEV *GuardLimit = RangeCheck->Limit;
-  const SCEV *LatchStart = LatchCheck.IV->getStart();
-  const SCEV *LatchLimit = LatchCheck.Limit;
+  const SCEV *LatchStart = CurrLatchCheck.IV->getStart();
+  const SCEV *LatchLimit = CurrLatchCheck.Limit;

   // guardLimit - guardStart + latchStart - 1
   const SCEV *RHS =
       SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
                      SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));

   ICmpInst::Predicate LimitCheckPred;
-  switch (LatchCheck.Pred) {
+  switch (CurrLatchCheck.Pred) {
   case ICmpInst::ICMP_ULT:
     LimitCheckPred = ICmpInst::ICMP_ULE;
     break;
@@ -510,6 +566,36 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
   return Result;
 }

+// Returns true if it's safe to truncate the IV to RangeCheckType.
+bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
+  if (!EnableIVTruncation)
+    return false;
+  assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) >
+             DL->getTypeSizeInBits(RangeCheckType) &&
+         "Expected latch check IV type to be larger than range check operand "
+         "type!");
+  // The start and end values of the IV should be known. This is to guarantee
+  // that truncating the wide type will not lose information.
+  auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+  auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+  if (!Limit || !Start)
+    return false;
+  // This check makes sure that the IV does not change sign during loop
+  // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+  // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+  // IV wraps around, and the truncation of the IV would lose the range of
+  // iterations between 2^32 and 2^64.
+  bool Increasing;
+  if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing))
+    return false;
+  // The active bits should be less than the bits in the RangeCheckType. This
+  // guarantees that truncating the latch check to RangeCheckType is a safe
+  // operation.
+  auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType);
+  return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+         Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
+
 bool LoopPredication::runOnLoop(Loop *Loop) {
   L = Loop;

diff --git a/test/Transforms/LoopPredication/widened.ll b/test/Transforms/LoopPredication/widened.ll
new file mode 100644
index 0000000000000..33c4e27061333
--- /dev/null
+++ b/test/Transforms/LoopPredication/widened.ll
@@ -0,0 +1,138 @@
+; RUN: opt -S -loop-predication -loop-predication-enable-iv-truncation=true < %s 2>&1 | FileCheck %s
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @length(i8*)
+
+declare i16 @short_length(i8*)
+; Consider range checks of type i16 and i32, while the IV is of type i64.
+; We can loop predicate this because the IV range is within i16 and within i32.
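+;
+; A concrete sketch of why the truncation is safe here (commentary added for
+; exposition, not part of the original test): the latch below exits once
+; %iv.next reaches 16, so the i64 IV only takes the values 0..15. Both the
+; limit (16) and the IV start (0) have fewer active bits than i16, which is
+; exactly what isSafeToTruncateWideIVType requires before the latch check is
+; re-expressed in the narrower range-check types.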
+define i64 @iv_wider_type_rc_two_narrow_types(i32 %offA, i16 %offB, i8* %arrA, i8* %arrB) { +; CHECK-LABEL: iv_wider_type_rc_two_narrow_types +entry: +; CHECK-LABEL: entry: +; CHECK: [[idxB:[^ ]+]] = sub i16 %lengthB, %offB +; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i16 16, [[idxB]] +; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i16 %offB, %lengthB +; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]] +; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 %lengthA, %offA +; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 16, [[idxA]] +; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA +; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]] + %lengthA = call i32 @length(i8* %arrA) + %lengthB = call i16 @short_length(i8* %arrB) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: [[invariant_check:[^ ]+]] = and i1 [[WideChkB]], [[WideChkA]] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[invariant_check]], i32 9) + %iv = phi i64 [0, %entry ], [ %iv.next, %loop ] + %iv.trunc.32 = trunc i64 %iv to i32 + %iv.trunc.16 = trunc i64 %iv to i16 + %indexA = add i32 %iv.trunc.32, %offA + %indexB = add i16 %iv.trunc.16, %offB + %rcA = icmp ult i32 %indexA, %lengthA + %rcB = icmp ult i16 %indexB, %lengthB + %wide.chk = and i1 %rcA, %rcB + call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %indexB.ext = zext i16 %indexB to i64 + %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext + store i8 %eltA, i8* %addrB + %iv.next = add nuw nsw i64 %iv, 1 + %latch.check = icmp ult i64 %iv.next, 16 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} + + +; Consider an IV of type long and an array access into int array. +; IV is of type i64 while the range check operands are of type i32 and i64. +define i64 @iv_rc_different_types(i32 %offA, i32 %offB, i8* %arrA, i8* %arrB, i64 %max) +{ +; CHECK-LABEL: iv_rc_different_types +entry: +; CHECK-LABEL: entry: +; CHECK: [[lenB:[^ ]+]] = add i32 %lengthB, -1 +; CHECK-NEXT: [[idxB:[^ ]+]] = sub i32 [[lenB]], %offB +; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i32 15, [[idxB]] +; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i32 %offB, %lengthB +; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]] +; CHECK-NEXT: [[maxMinusOne:[^ ]+]] = add i64 %max, -1 +; CHECK-NEXT: [[limit_checkMax:[^ ]+]] = icmp ule i64 15, [[maxMinusOne]] +; CHECK-NEXT: [[first_iteration_checkMax:[^ ]+]] = icmp ult i64 0, %max +; CHECK-NEXT: [[WideChkMax:[^ ]+]] = and i1 [[first_iteration_checkMax]], [[limit_checkMax]] +; CHECK-NEXT: [[lenA:[^ ]+]] = add i32 %lengthA, -1 +; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 [[lenA]], %offA +; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 15, [[idxA]] +; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA +; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]] + %lengthA = call i32 @length(i8* %arrA) + %lengthB = call i32 @length(i8* %arrB) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: [[BandMax:[^ ]+]] = and i1 [[WideChkB]], [[WideChkMax]] +; CHECK: [[ABandMax:[^ ]+]] = and i1 [[BandMax]], [[WideChkA]] +; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 [[ABandMax]], i32 9) + %iv = phi i64 [0, %entry ], [ %iv.next, %loop ] + %iv.trunc = trunc i64 %iv to i32 + %indexA = add i32 %iv.trunc, %offA + %indexB = add i32 %iv.trunc, %offB + %rcA = icmp ult i32 %indexA, %lengthA + %rcIV = icmp ult i64 %iv, %max + %wide.chk = and i1 %rcA, %rcIV + %rcB = icmp ult i32 %indexB, %lengthB + %wide.chk.final = and i1 %wide.chk, %rcB + call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk.final, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %indexB.ext = zext i32 %indexB to i64 + %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext + %eltB = load i8, i8* %addrB + %result = xor i8 %eltA, %eltB + store i8 %result, i8* %addrA + %iv.next = add nuw nsw i64 %iv, 1 + %latch.check = icmp ult i64 %iv, 15 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} + +; cannot narrow the IV to the range type, because we lose information. +; for (i64 i= 5; i>= 2; i++) +; this loop wraps around after reaching 2^64. +define i64 @iv_rc_different_type(i32 %offA, i8* %arrA) { +; CHECK-LABEL: iv_rc_different_type +entry: + %lengthA = call i32 @length(i8* %arrA) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: %rcA = icmp ult i32 %indexA, %lengthA +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) + %iv = phi i64 [ 5, %entry ], [ %iv.next, %loop ] + %iv.trunc.32 = trunc i64 %iv to i32 + %indexA = add i32 %iv.trunc.32, %offA + %rcA = icmp ult i32 %indexA, %lengthA + call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %res = add i8 %eltA, 2 + store i8 %eltA, i8* %addrA + %iv.next = add i64 %iv, 1 + %latch.check = icmp sge i64 %iv.next, 2 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} From dc666ea9df629f7b5ec1506993f15d406a52acc6 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 2 Nov 2017 21:35:37 +0000 Subject: [PATCH 020/238] Clean up comments in include/llvm-c/DebugInfo.h Patch by Harlan Haskins! Differential Revision: https://reviews.llvm.org/D39568 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317271 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm-c/DebugInfo.h | 143 ++++++++++++++++++++++--------------- 1 file changed, 84 insertions(+), 59 deletions(-) diff --git a/include/llvm-c/DebugInfo.h b/include/llvm-c/DebugInfo.h index 2c2fdbdf173b3..a27b351577a94 100644 --- a/include/llvm-c/DebugInfo.h +++ b/include/llvm-c/DebugInfo.h @@ -23,7 +23,9 @@ extern "C" { #endif -/// Debug info flags. +/** + * Debug info flags. + */ typedef enum { LLVMDIFlagZero = 0, LLVMDIFlagPrivate = 1, @@ -58,7 +60,9 @@ typedef enum { LLVMDIFlagVirtualInheritance } LLVMDIFlags; -/// Source languages known by DWARF. +/** + * Source languages known by DWARF. + */ typedef enum { LLVMDWARFSourceLanguageC89, LLVMDWARFSourceLanguageC, @@ -106,68 +110,85 @@ typedef enum { LLVMDWARFSourceLanguageBORLAND_Delphi } LLVMDWARFSourceLanguage; -/// The amount of debug information to emit. +/** + * The amount of debug information to emit. + */ typedef enum { LLVMDWARFEmissionNone = 0, LLVMDWARFEmissionFull, LLVMDWARFEmissionLineTablesOnly } LLVMDWARFEmissionKind; -/// The current debug metadata version number. +/** + * The current debug metadata version number. 
+ */ unsigned LLVMDebugMetadataVersion(void); -/// The version of debug metadata that's present in the provided \c Module. +/** + * The version of debug metadata that's present in the provided \c Module. + */ unsigned LLVMGetModuleDebugMetadataVersion(LLVMModuleRef Module); -/// Strip debug info in the module if it exists. -/// -/// To do this, we remove all calls to the debugger intrinsics and any named -/// metadata for debugging. We also remove debug locations for instructions. -/// Return true if module is modified. +/** + * Strip debug info in the module if it exists. + * To do this, we remove all calls to the debugger intrinsics and any named + * metadata for debugging. We also remove debug locations for instructions. + * Return true if module is modified. + */ LLVMBool LLVMStripModuleDebugInfo(LLVMModuleRef Module); -/// Construct a builder for a module, and do not allow for unresolved nodes -/// attached to the module. +/** + * Construct a builder for a module, and do not allow for unresolved nodes + * attached to the module. + */ LLVMDIBuilderRef LLVMCreateDIBuilderDisallowUnresolved(LLVMModuleRef M); -/// Construct a builder for a module and collect unresolved nodes attached -/// to the module in order to resolve cycles during a call to -/// \c LLVMDIBuilderFinalize. +/** + * Construct a builder for a module and collect unresolved nodes attached + * to the module in order to resolve cycles during a call to + * \c LLVMDIBuilderFinalize. + */ LLVMDIBuilderRef LLVMCreateDIBuilder(LLVMModuleRef M); -/// Deallocates the DIBuilder and everything it owns. -/// @note You must call \c LLVMDIBuilderFinalize before this +/** + * Deallocates the DIBuilder and everything it owns. + * @note You must call \c LLVMDIBuilderFinalize before this + */ void LLVMDisposeDIBuilder(LLVMDIBuilderRef Builder); -/// Construct any deferred debug info descriptors. +/** + * Construct any deferred debug info descriptors. + */ void LLVMDIBuilderFinalize(LLVMDIBuilderRef Builder); -/// A CompileUnit provides an anchor for all debugging -/// information generated during this instance of compilation. -/// \param Lang Source programming language, eg. -/// \c LLVMDWARFSourceLanguageC99 -/// \param FileRef File info. -/// \param Producer Identify the producer of debugging information -/// and code. Usually this is a compiler -/// version string. -/// \param ProducerLen The length of the C string passed to \c Producer. -/// \param isOptimized A boolean flag which indicates whether optimization -/// is enabled or not. -/// \param Flags This string lists command line options. This -/// string is directly embedded in debug info -/// output which may be used by a tool -/// analyzing generated debugging information. -/// \param FlagsLen The length of the C string passed to \c Flags. -/// \param RuntimeVer This indicates runtime version for languages like -/// Objective-C. -/// \param SplitName The name of the file that we'll split debug info -/// out into. -/// \param SplitNameLen The length of the C string passed to \c SplitName. -/// \param Kind The kind of debug information to generate. -/// \param DWOId The DWOId if this is a split skeleton compile unit. -/// \param SplitDebugInlining Whether to emit inline debug info. -/// \param DebugInfoForProfiling Whether to emit extra debug info for -/// profile collection. +/** + * A CompileUnit provides an anchor for all debugging + * information generated during this instance of compilation. + * \param Lang Source programming language, eg. 
+ * \c LLVMDWARFSourceLanguageC99 + * \param FileRef File info. + * \param Producer Identify the producer of debugging information + * and code. Usually this is a compiler + * version string. + * \param ProducerLen The length of the C string passed to \c Producer. + * \param isOptimized A boolean flag which indicates whether optimization + * is enabled or not. + * \param Flags This string lists command line options. This + * string is directly embedded in debug info + * output which may be used by a tool + * analyzing generated debugging information. + * \param FlagsLen The length of the C string passed to \c Flags. + * \param RuntimeVer This indicates runtime version for languages like + * Objective-C. + * \param SplitName The name of the file that we'll split debug info + * out into. + * \param SplitNameLen The length of the C string passed to \c SplitName. + * \param Kind The kind of debug information to generate. + * \param DWOId The DWOId if this is a split skeleton compile unit. + * \param SplitDebugInlining Whether to emit inline debug info. + * \param DebugInfoForProfiling Whether to emit extra debug info for + * profile collection. + */ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit( LLVMDIBuilderRef Builder, LLVMDWARFSourceLanguage Lang, LLVMMetadataRef FileRef, const char *Producer, size_t ProducerLen, @@ -176,32 +197,36 @@ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit( LLVMDWARFEmissionKind Kind, unsigned DWOId, LLVMBool SplitDebugInlining, LLVMBool DebugInfoForProfiling); -/// Create a file descriptor to hold debugging information for a file. -/// \param Builder The DIBuilder. -/// \param Filename File name. -/// \param FilenameLen The length of the C string passed to \c Filename. -/// \param Directory Directory. -/// \param DirectoryLen The length of the C string passed to \c Directory. +/** + * Create a file descriptor to hold debugging information for a file. + * \param Builder The DIBuilder. + * \param Filename File name. + * \param FilenameLen The length of the C string passed to \c Filename. + * \param Directory Directory. + * \param DirectoryLen The length of the C string passed to \c Directory. + */ LLVMMetadataRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Builder, const char *Filename, size_t FilenameLen, const char *Directory, size_t DirectoryLen); -/// Creates a new DebugLocation that describes a source location. -/// \param Line The line in the source file. -/// \param Column The column in the source file. -/// \param Scope The scope in which the location resides. -/// \param InlinedAt The scope where this location was inlined, if at all. -/// (optional). -/// \note If the item to which this location is attached cannot be -/// attributed to a source line, pass 0 for the line and column. +/** + * Creates a new DebugLocation that describes a source location. + * \param Line The line in the source file. + * \param Column The column in the source file. + * \param Scope The scope in which the location resides. + * \param InlinedAt The scope where this location was inlined, if at all. + * (optional). + * \note If the item to which this location is attached cannot be + * attributed to a source line, pass 0 for the line and column. 
+ */ LLVMMetadataRef LLVMDIBuilderCreateDebugLocation(LLVMContextRef Ctx, unsigned Line, unsigned Column, LLVMMetadataRef Scope, LLVMMetadataRef InlinedAt); #ifdef __cplusplus -} // end extern "C" +} /* end extern "C" */ #endif #endif From fbb50d9079f4281847a47d0aba5c29455237da63 Mon Sep 17 00:00:00 2001 From: Shoaib Meenai Date: Thu, 2 Nov 2017 21:43:32 +0000 Subject: [PATCH 021/238] [tools] Add option to install binutils symlinks The LLVM tools can be used as a replacement for binutils, in which case it's convenient to create symlinks with the binutils names. Add support for these symlinks in the build system. As with any other llvm tool symlinks, the user can limit the installed symlinks by only adding the desired ones to `LLVM_TOOLCHAIN_TOOLS`. Differential Revision: https://reviews.llvm.org/D39530 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317272 91177308-0d34-0410-b5e6-96231b3b80d8 --- CMakeLists.txt | 3 +++ docs/CMake.rst | 4 ++++ tools/llvm-ar/CMakeLists.txt | 6 ++++++ tools/llvm-cxxfilt/CMakeLists.txt | 4 ++++ tools/llvm-dwp/CMakeLists.txt | 4 ++++ tools/llvm-nm/CMakeLists.txt | 4 ++++ tools/llvm-objcopy/CMakeLists.txt | 4 ++++ tools/llvm-objdump/CMakeLists.txt | 4 ++++ tools/llvm-readobj/CMakeLists.txt | 4 ++++ tools/llvm-size/CMakeLists.txt | 4 ++++ tools/llvm-strings/CMakeLists.txt | 3 +++ tools/llvm-symbolizer/CMakeLists.txt | 4 ++++ 12 files changed, 48 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e27562dc8b5d5..6328f1e18c0b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,6 +179,9 @@ set(CMAKE_MODULE_PATH # for use by clang_complete, YouCompleteMe, etc. set(CMAKE_EXPORT_COMPILE_COMMANDS 1) +option(LLVM_INSTALL_BINUTILS_SYMLINKS + "Install symlinks from the binutils tool names to the corresponding LLVM tools." OFF) + option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OFF) option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF) diff --git a/docs/CMake.rst b/docs/CMake.rst index 473672b5f736f..05edec64da332 100644 --- a/docs/CMake.rst +++ b/docs/CMake.rst @@ -224,6 +224,10 @@ LLVM-specific variables Generate build targets for the LLVM tools. Defaults to ON. You can use this option to disable the generation of build targets for the LLVM tools. +**LLVM_INSTALL_BINUTILS_SYMLINKS**:BOOL + Install symlinks from the binutils tool names to the corresponding LLVM tools. + For example, ar will be symlinked to llvm-ar. + **LLVM_BUILD_EXAMPLES**:BOOL Build LLVM examples. Defaults to OFF. Targets for building each example are generated in any case. 
See documentation for *LLVM_BUILD_TOOLS* above for more diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt index 731bcbd8ac9d7..2970a59beee22 100644 --- a/tools/llvm-ar/CMakeLists.txt +++ b/tools/llvm-ar/CMakeLists.txt @@ -17,3 +17,9 @@ add_llvm_tool(llvm-ar add_llvm_tool_symlink(llvm-ranlib llvm-ar) add_llvm_tool_symlink(llvm-lib llvm-ar) add_llvm_tool_symlink(llvm-dlltool llvm-ar) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(ar llvm-ar) + add_llvm_tool_symlink(dlltool llvm-ar) + add_llvm_tool_symlink(ranlib llvm-ar) +endif() diff --git a/tools/llvm-cxxfilt/CMakeLists.txt b/tools/llvm-cxxfilt/CMakeLists.txt index 488064d08dab2..2a78acad80a81 100644 --- a/tools/llvm-cxxfilt/CMakeLists.txt +++ b/tools/llvm-cxxfilt/CMakeLists.txt @@ -6,3 +6,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-cxxfilt llvm-cxxfilt.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(c++filt llvm-cxxfilt) +endif() diff --git a/tools/llvm-dwp/CMakeLists.txt b/tools/llvm-dwp/CMakeLists.txt index 98d67e04fe6a0..1b5fbddc1f750 100644 --- a/tools/llvm-dwp/CMakeLists.txt +++ b/tools/llvm-dwp/CMakeLists.txt @@ -15,3 +15,7 @@ add_llvm_tool(llvm-dwp DEPENDS intrinsics_gen ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(dwp llvm-dwp) +endif() diff --git a/tools/llvm-nm/CMakeLists.txt b/tools/llvm-nm/CMakeLists.txt index 08bcd5f308980..f093cc4328ae8 100644 --- a/tools/llvm-nm/CMakeLists.txt +++ b/tools/llvm-nm/CMakeLists.txt @@ -14,3 +14,7 @@ add_llvm_tool(llvm-nm DEPENDS intrinsics_gen ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(nm llvm-nm) +endif() diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt index 18cc2075345d7..05aa727ab9d83 100644 --- a/tools/llvm-objcopy/CMakeLists.txt +++ b/tools/llvm-objcopy/CMakeLists.txt @@ -7,3 +7,7 @@ add_llvm_tool(llvm-objcopy llvm-objcopy.cpp Object.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(objcopy llvm-objcopy) +endif() diff --git a/tools/llvm-objdump/CMakeLists.txt b/tools/llvm-objdump/CMakeLists.txt index 27e6145dfc139..043a181d6392e 100644 --- a/tools/llvm-objdump/CMakeLists.txt +++ b/tools/llvm-objdump/CMakeLists.txt @@ -25,3 +25,7 @@ add_llvm_tool(llvm-objdump if(HAVE_LIBXAR) target_link_libraries(llvm-objdump ${XAR_LIB}) endif() + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(objdump llvm-objdump) +endif() diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt index 5447167417345..dafc9e10cfa12 100644 --- a/tools/llvm-readobj/CMakeLists.txt +++ b/tools/llvm-readobj/CMakeLists.txt @@ -23,3 +23,7 @@ add_llvm_tool(llvm-readobj ) add_llvm_tool_symlink(llvm-readelf llvm-readobj) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(readelf llvm-readobj) +endif() diff --git a/tools/llvm-size/CMakeLists.txt b/tools/llvm-size/CMakeLists.txt index 60345739c35a8..7ef4f1769b840 100644 --- a/tools/llvm-size/CMakeLists.txt +++ b/tools/llvm-size/CMakeLists.txt @@ -6,3 +6,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-size llvm-size.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(size llvm-size) +endif() diff --git a/tools/llvm-strings/CMakeLists.txt b/tools/llvm-strings/CMakeLists.txt index 9339892a49972..390f117513978 100644 --- a/tools/llvm-strings/CMakeLists.txt +++ b/tools/llvm-strings/CMakeLists.txt @@ -8,3 +8,6 @@ add_llvm_tool(llvm-strings llvm-strings.cpp ) +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(strings llvm-strings) +endif() diff --git 
a/tools/llvm-symbolizer/CMakeLists.txt b/tools/llvm-symbolizer/CMakeLists.txt index b04c45ff74421..d9b05208afd8d 100644 --- a/tools/llvm-symbolizer/CMakeLists.txt +++ b/tools/llvm-symbolizer/CMakeLists.txt @@ -14,3 +14,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-symbolizer llvm-symbolizer.cpp ) + +if(LLVM_INSTALL_BINUTILS_SYMLINKS) + add_llvm_tool_symlink(addr2line llvm-symbolizer) +endif() From da35e5e8bec2e0110f896b4ef677445187c7ab42 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 2 Nov 2017 21:56:59 +0000 Subject: [PATCH 022/238] [Hexagon] Prefer L2_loadrub_io over L4_loadrub_rr If the offset is an immediate, avoid putting it in a register to get Rs+Rt<<#0. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317275 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonPatterns.td | 134 ++++++++++++++++---------- test/CodeGen/Hexagon/isel-prefer.ll | 10 ++ 2 files changed, 92 insertions(+), 52 deletions(-) diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index d432bfef7ae96..05865c43f2d0e 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -1706,28 +1706,27 @@ multiclass Loadxim_pat; } -// Patterns to select load reg reg-indexed: Rs + Rt< { - let AddedComplexity = 40 in - def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), - (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>; - - let AddedComplexity = 20 in - def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))), - (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>; -} - -// Patterns to select load reg reg-indexed: Rs + Rt< { - let AddedComplexity = 40 in - def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), - (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>; +// Pattern to select load reg reg-indexed: Rs + Rt< + : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), + (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>; + +// Pattern to select load reg reg-indexed: Rs + Rt<<0. +class Loadxr_add_pat + : Pat<(VT (Load (add I32:$Rs, I32:$Rt))), + (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>; + +// Pattern to select load reg reg-indexed: Rs + Rt< + : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))), + (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>; - let AddedComplexity = 20 in - def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))), - (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>; -} +// Pattern to select load reg reg-indexed: Rs + Rt<<0 with value modifier. 
+class Loadxrm_add_pat + : Pat<(VT (Load (add I32:$Rs, I32:$Rt))), + (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>; // Pattern to select load long-offset reg-indexed: Addr + Rt<; } -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; -defm: Loadxim_pat; +let AddedComplexity = 30 in { + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; + defm: Loadxim_pat; +} let AddedComplexity = 60 in { def: Loadxu_pat; @@ -1818,26 +1819,55 @@ let AddedComplexity = 60 in { def: Loadxum_pat; } -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; -defm: Loadxr_pat; - -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; -defm: Loadxrm_pat; +let AddedComplexity = 40 in { + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; + def: Loadxr_shl_pat; +} + +let AddedComplexity = 20 in { + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; + def: Loadxr_add_pat; +} + +let AddedComplexity = 40 in { + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; + def: Loadxrm_shl_pat; +} + +let AddedComplexity = 20 in { + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; + def: Loadxrm_add_pat; +} // Absolute address diff --git a/test/CodeGen/Hexagon/isel-prefer.ll b/test/CodeGen/Hexagon/isel-prefer.ll index 062b0b3a0ea32..7094544f54b74 100644 --- a/test/CodeGen/Hexagon/isel-prefer.ll +++ b/test/CodeGen/Hexagon/isel-prefer.ll @@ -54,4 +54,14 @@ b2: ret i32 %v6 } +; CHECK-LABEL: Prefer_L2_loadrub_io: +; CHECK: memub(r0+#65) +define i64 @Prefer_L2_loadrub_io(i8* %a0) #0 { +b1: + %v2 = getelementptr i8, i8* %a0, i32 65 + %v3 = load i8, i8* %v2 + %v4 = zext i8 %v3 to i64 + ret i64 %v4 +} + attributes #0 = { nounwind readnone } From dd33e177dd838793692d7a291dc5552e30642842 Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi Date: Thu, 2 Nov 2017 22:26:51 +0000 Subject: [PATCH 023/238] Irreducible loop metadata for more accurate block frequency under PGO. Summary: Currently the block frequency analysis is an approximation for irreducible loops. The new irreducible loop metadata is used to annotate the irreducible loop headers with their header weights based on the PGO profile (currently this is approximated to be evenly weighted) and to help improve the accuracy of the block frequency analysis for irreducible loops. This patch is a basic support for this. 
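As an illustration of the new annotation (hand-written IR, not taken from this
patch; in practice the weights are derived from profile counts), a two-header
irreducible loop could be tagged like so:

    define void @f(i1 %c, i1 %d) {
    entry:
      br i1 %c, label %header0, label %header1

    header0:                                      ; first irreducible header
      br i1 %d, label %header1, label %exit, !irr_loop !0

    header1:                                      ; second irreducible header
      br i1 %d, label %header0, label %exit, !irr_loop !1

    exit:
      ret void
    }

    !0 = !{!"loop_header_weight", i64 100}
    !1 = !{!"loop_header_weight", i64 10}

With these weights, the loop's entry mass is seeded 100:10 between %header0
and %header1 rather than being split evenly.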
Reviewers: davidxl

Reviewed By: davidxl

Subscribers: mehdi_amini, llvm-commits, eraman

Differential Revision: https://reviews.llvm.org/D39028

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317278 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/LangRef.rst                              |  23 ++
 include/llvm/Analysis/BlockFrequencyInfo.h    |   4 +
 .../llvm/Analysis/BlockFrequencyInfoImpl.h    |  49 ++++-
 include/llvm/CodeGen/MachineBasicBlock.h      |  10 +
 .../llvm/CodeGen/MachineBlockFrequencyInfo.h  |   2 +
 include/llvm/IR/BasicBlock.h                  |   2 +
 include/llvm/IR/LLVMContext.h                 |   1 +
 include/llvm/IR/MDBuilder.h                   |   3 +
 include/llvm/Transforms/PGOInstrumentation.h  |   2 +
 lib/Analysis/BlockFrequencyInfo.cpp           |   5 +
 lib/Analysis/BlockFrequencyInfoImpl.cpp       |  21 ++
 lib/CodeGen/MachineBasicBlock.cpp             |   8 +
 lib/CodeGen/MachineBlockFrequencyInfo.cpp     |   6 +
 lib/IR/BasicBlock.cpp                         |  13 ++
 lib/IR/LLVMContext.cpp                        |   1 +
 lib/IR/MDBuilder.cpp                          |   7 +
 .../Instrumentation/PGOInstrumentation.cpp    |  28 ++-
 .../BlockFrequencyInfo/irreducible_pgo.ll     | 208 ++++++++++++++++++
 test/ThinLTO/X86/lazyload_metadata.ll         |   4 +-
 .../PGOProfile/Inputs/irreducible.proftext    |  29 +++
 test/Transforms/PGOProfile/irreducible.ll     | 184 ++++++++++++++++
 21 files changed, 600 insertions(+), 10 deletions(-)
 create mode 100644 test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
 create mode 100644 test/Transforms/PGOProfile/Inputs/irreducible.proftext
 create mode 100644 test/Transforms/PGOProfile/irreducible.ll

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 9d910568bd5d8..6823fe5fcd771 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -5194,6 +5194,29 @@ the loop identifier metadata node directly:
    !1 = !{!1} ; an identifier for the inner loop
    !2 = !{!2} ; an identifier for the outer loop
 
+'``irr_loop``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``irr_loop`` metadata may be attached to the terminator instruction of a basic
+block that is an irreducible loop header (note that an irreducible loop has
+more than one header basic block). If ``irr_loop`` metadata is attached to the
+terminator instruction of a basic block that is not actually an irreducible
+loop header, the behavior is undefined. The intent of this metadata is to
+improve the accuracy of the block frequency propagation. For example, in the
+code below, the block ``header0`` may have a loop header weight (relative to
+the other headers of the irreducible loop) of 100:
+
+.. code-block:: llvm
+
+    header0:
+      ...
+      br i1 %cmp, label %t1, label %t2, !irr_loop !0
+
+      ...
+      !0 = !{"loop_header_weight", i64 100}
+
+Irreducible loop header weights are typically based on profile data.
+
 '``invariant.group``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/include/llvm/Analysis/BlockFrequencyInfo.h b/include/llvm/Analysis/BlockFrequencyInfo.h
index d663b09d5cfeb..89370cbeeea1c 100644
--- a/include/llvm/Analysis/BlockFrequencyInfo.h
+++ b/include/llvm/Analysis/BlockFrequencyInfo.h
@@ -75,6 +75,10 @@ class BlockFrequencyInfo {
   /// the enclosing function's count (if available) and returns the value.
   Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
 
+  /// \brief Returns true if \p BB is an irreducible loop header
+  /// block. Otherwise false.
+  bool isIrrLoopHeader(const BasicBlock *BB);
+
   // Set the frequency of the given basic block.
void setBlockFreq(const BasicBlock *BB, uint64_t Freq); diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 7f166f4a6465f..7b916e3653b8e 100644 --- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -20,6 +20,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" @@ -414,6 +415,10 @@ class BlockFrequencyInfoImplBase { /// \brief Data about each block. This is used downstream. std::vector Freqs; + /// \brief Whether each block is an irreducible loop header. + /// This is used downstream. + SparseBitVector<> IsIrrLoopHeader; + /// \brief Loop data: see initializeLoops(). std::vector Working; @@ -492,6 +497,8 @@ class BlockFrequencyInfoImplBase { /// the backedges going into each of the loop headers. void adjustLoopHeaderMass(LoopData &Loop); + void distributeIrrLoopHeaderMass(Distribution &Dist); + /// \brief Package up a loop. void packageLoop(LoopData &Loop); @@ -520,6 +527,7 @@ class BlockFrequencyInfoImplBase { const BlockNode &Node) const; Optional getProfileCountFromFreq(const Function &F, uint64_t Freq) const; + bool isIrrLoopHeader(const BlockNode &Node); void setBlockFreq(const BlockNode &Node, uint64_t Freq); @@ -973,6 +981,10 @@ template class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase { return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq); } + bool isIrrLoopHeader(const BlockT *BB) { + return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB)); + } + void setBlockFreq(const BlockT *BB, uint64_t Freq); Scaled64 getFloatingBlockFreq(const BlockT *BB) const { @@ -1140,17 +1152,39 @@ bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n"); if (Loop.isIrreducible()) { - BlockMass Remaining = BlockMass::getFull(); + DEBUG(dbgs() << "isIrreducible = true\n"); + Distribution Dist; + unsigned NumHeadersWithWeight = 0; for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &Mass = Working[Loop.Nodes[H].Index].getMass(); - Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H); - Remaining -= Mass; + auto &HeaderNode = Loop.Nodes[H]; + const BlockT *Block = getBlock(HeaderNode); + IsIrrLoopHeader.set(Loop.Nodes[H].Index); + Optional HeaderWeight = Block->getIrrLoopHeaderWeight(); + if (!HeaderWeight) + continue; + DEBUG(dbgs() << getBlockName(HeaderNode) + << " has irr loop header weight " << HeaderWeight.getValue() + << "\n"); + NumHeadersWithWeight++; + uint64_t HeaderWeightValue = HeaderWeight.getValue(); + if (HeaderWeightValue) + Dist.addLocal(HeaderNode, HeaderWeightValue); } + if (NumHeadersWithWeight != Loop.NumHeaders) { + // Not all headers have a weight metadata. Distribute weight evenly. + Dist = Distribution(); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + Dist.addLocal(HeaderNode, 1); + } + } + distributeIrrLoopHeaderMass(Dist); for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); - - adjustLoopHeaderMass(Loop); + if (NumHeadersWithWeight != Loop.NumHeaders) + // Not all headers have a weight metadata. Adjust header mass. 
+ adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) @@ -1285,6 +1319,9 @@ raw_ostream &BlockFrequencyInfoImpl::print(raw_ostream &OS) const { BlockFrequencyInfoImplBase::getBlockProfileCount( *F->getFunction(), getNode(&BB))) OS << ", count = " << ProfileCount.getValue(); + if (Optional IrrLoopHeaderWeight = + BB.getIrrLoopHeaderWeight()) + OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue(); OS << "\n"; } diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 51a0d96deda56..0f5b04d904598 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -97,6 +97,8 @@ class MachineBasicBlock using const_probability_iterator = std::vector::const_iterator; + Optional IrrLoopHeaderWeight; + /// Keep track of the physical registers that are livein of the basicblock. using LiveInVector = std::vector; LiveInVector LiveIns; @@ -729,6 +731,14 @@ class MachineBasicBlock /// Return the MCSymbol for this basic block. MCSymbol *getSymbol() const; + Optional getIrrLoopHeaderWeight() const { + return IrrLoopHeaderWeight; + } + + void setIrrLoopHeaderWeight(uint64_t Weight) { + IrrLoopHeaderWeight = Weight; + } + private: /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index cba79c818a761..5b4b99ca0a5d8 100644 --- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -62,6 +62,8 @@ class MachineBlockFrequencyInfo : public MachineFunctionPass { Optional getBlockProfileCount(const MachineBasicBlock *MBB) const; Optional getProfileCountFromFreq(uint64_t Freq) const; + bool isIrrLoopHeader(const MachineBasicBlock *MBB); + const MachineFunction *getFunction() const; const MachineBranchProbabilityInfo *getMBPI() const; void view(const Twine &Name, bool isSimple = true) const; diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 6714f2c97473f..77cfc9776df0e 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -398,6 +398,8 @@ class BasicBlock final : public Value, // Basic blocks are data objects also /// \brief Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() const; + Optional getIrrLoopHeaderWeight() const; + private: /// \brief Increment the internal refcount of the number of BlockAddresses /// referencing this BasicBlock by \p Amt. diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index 9e935823c775c..a95634d32c21f 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -101,6 +101,7 @@ class LLVMContext { MD_absolute_symbol = 21, // "absolute_symbol" MD_associated = 22, // "associated" MD_callees = 23, // "callees" + MD_irr_loop = 24, // "irr_loop" }; /// Known operand bundle tag IDs, which always have the same value. All diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h index d679cef95b68b..15c1b9cb60ef6 100644 --- a/include/llvm/IR/MDBuilder.h +++ b/include/llvm/IR/MDBuilder.h @@ -173,6 +173,9 @@ class MDBuilder { /// base type, access type and offset relative to the base type. 
MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, uint64_t Offset, bool IsConstant = false); + + /// \brief Return metadata containing an irreducible loop header weight. + MDNode *createIrrLoopHeaderWeight(uint64_t Weight); }; } // end namespace llvm diff --git a/include/llvm/Transforms/PGOInstrumentation.h b/include/llvm/Transforms/PGOInstrumentation.h index fa7a68624ec82..c2cc76c422dae 100644 --- a/include/llvm/Transforms/PGOInstrumentation.h +++ b/include/llvm/Transforms/PGOInstrumentation.h @@ -68,6 +68,8 @@ class PGOMemOPSizeOpt : public PassInfoMixin { void setProfMetadata(Module *M, Instruction *TI, ArrayRef EdgeCounts, uint64_t MaxCount); +void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count); + } // end namespace llvm #endif // LLVM_TRANSFORMS_PGOINSTRUMENTATION_H diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp index 5d2170dcf1551..41c2958952139 100644 --- a/lib/Analysis/BlockFrequencyInfo.cpp +++ b/lib/Analysis/BlockFrequencyInfo.cpp @@ -218,6 +218,11 @@ BlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const { return BFI->getProfileCountFromFreq(*getFunction(), Freq); } +bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) { + assert(BFI && "Expected analysis to be available"); + return BFI->isIrrLoopHeader(BB); +} + void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) { assert(BFI && "Expected analysis to be available"); BFI->setBlockFreq(BB, Freq); diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp index 1030407b766de..7e323022d9ce9 100644 --- a/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -271,6 +271,7 @@ void BlockFrequencyInfoImplBase::clear() { // Swap with a default-constructed std::vector, since std::vector<>::clear() // does not actually clear heap storage. std::vector().swap(Freqs); + IsIrrLoopHeader.clear(); std::vector().swap(Working); Loops.clear(); } @@ -280,8 +281,10 @@ void BlockFrequencyInfoImplBase::clear() { /// Releases all memory not used downstream. In particular, saves Freqs. 
static void cleanup(BlockFrequencyInfoImplBase &BFI) { std::vector SavedFreqs(std::move(BFI.Freqs)); + SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); BFI.clear(); BFI.Freqs = std::move(SavedFreqs); + BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); } bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, @@ -572,6 +575,13 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, return BlockCount.getLimitedValue(); } +bool +BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { + if (!Node.isValid()) + return false; + return IsIrrLoopHeader.test(Node.Index); +} + Scaled64 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { if (!Node.isValid()) @@ -819,3 +829,14 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); } } + +void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { + BlockMass LoopMass = BlockMass::getFull(); + DitheringDistributer D(Dist, LoopMass); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + } +} diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index d5758da0464c4..d65916f4966b3 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -42,6 +42,8 @@ using namespace llvm; MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B) : BB(B), Number(-1), xParent(&MF) { Insts.Parent = this; + if (B) + IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight(); } MachineBasicBlock::~MachineBasicBlock() { @@ -338,6 +340,12 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, } OS << '\n'; } + if (IrrLoopHeaderWeight) { + if (Indexes) OS << '\t'; + OS << " Irreducible loop header weight: " + << IrrLoopHeaderWeight.getValue(); + OS << '\n'; + } } void MachineBasicBlock::printAsOperand(raw_ostream &OS, diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 14cd91206d804..2c336e450569a 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -234,6 +234,12 @@ MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const { return MBFI ? MBFI->getProfileCountFromFreq(*F, Freq) : None; } +bool +MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) { + assert(MBFI && "Expected analysis to be available"); + return MBFI->isIrrLoopHeader(MBB); +} + const MachineFunction *MachineBlockFrequencyInfo::getFunction() const { return MBFI ? 
MBFI->getFunction() : nullptr; } diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 2b780adf6c69c..22513924a96dc 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -447,3 +447,16 @@ bool BasicBlock::isLandingPad() const { const LandingPadInst *BasicBlock::getLandingPadInst() const { return dyn_cast(getFirstNonPHI()); } + +Optional BasicBlock::getIrrLoopHeaderWeight() const { + const TerminatorInst *TI = getTerminator(); + if (MDNode *MDIrrLoopHeader = + TI->getMetadata(LLVMContext::MD_irr_loop)) { + MDString *MDName = cast(MDIrrLoopHeader->getOperand(0)); + if (MDName->getString().equals("loop_header_weight")) { + auto *CI = mdconst::extract(MDIrrLoopHeader->getOperand(1)); + return Optional(CI->getValue().getZExtValue()); + } + } + return Optional(); +} diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index a94da5452b87c..c8b7c10a9a41b 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -60,6 +60,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { {MD_absolute_symbol, "absolute_symbol"}, {MD_associated, "associated"}, {MD_callees, "callees"}, + {MD_irr_loop, "irr_loop"}, }; for (auto &MDKind : MDKinds) { diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp index 54783e884e990..d8e64db7c5d83 100644 --- a/lib/IR/MDBuilder.cpp +++ b/lib/IR/MDBuilder.cpp @@ -197,3 +197,10 @@ MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, } return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)}); } + +MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) { + SmallVector Vals(2); + Vals[0] = createString("loop_header_weight"); + Vals[1] = createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight)); + return MDNode::get(Context, Vals); +} diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 11a43e803a99e..c92d48396c847 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -844,8 +844,9 @@ class PGOUseFunc { PGOUseFunc(Function &Func, Module *Modu, std::unordered_multimap &ComdatMembers, BranchProbabilityInfo *BPI = nullptr, - BlockFrequencyInfo *BFI = nullptr) - : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI), + BlockFrequencyInfo *BFIin = nullptr) + : F(Func), M(Modu), BFI(BFIin), + FuncInfo(Func, ComdatMembers, false, BPI, BFIin), FreqAttr(FFA_Normal) {} // Read counts for the instrumented BB from profile. @@ -863,6 +864,9 @@ class PGOUseFunc { // Annotate the value profile call sites for one value kind. void annotateValueSites(uint32_t Kind); + // Annotate the irreducible loop header weights. + void annotateIrrLoopHeaderWeights(); + // The hotness of the function from the profile count. enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot }; @@ -894,6 +898,7 @@ class PGOUseFunc { private: Function &F; Module *M; + BlockFrequencyInfo *BFI; // This member stores the shared information with class PGOGenFunc. 
FuncPGOInstrumentation FuncInfo; @@ -1183,6 +1188,18 @@ void PGOUseFunc::setBranchWeights() { } } +void PGOUseFunc::annotateIrrLoopHeaderWeights() { + DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n"); + // Find irr loop headers + for (auto &BB : F) { + if (BFI->isIrrLoopHeader(&BB)) { + TerminatorInst *TI = BB.getTerminator(); + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue); + } + } +} + void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { Module *M = F.getParent(); IRBuilder<> Builder(&SI); @@ -1441,6 +1458,7 @@ static bool annotateAllFunctions( Func.populateCounters(); Func.setBranchWeights(); Func.annotateValueSites(); + Func.annotateIrrLoopHeaderWeights(); PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr(); if (FreqAttr == PGOUseFunc::FFA_Cold) ColdFunctions.push_back(&F); @@ -1582,6 +1600,12 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, namespace llvm { +void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) { + MDBuilder MDB(M->getContext()); + TI->setMetadata(llvm::LLVMContext::MD_irr_loop, + MDB.createIrrLoopHeaderWeight(Count)); +} + template <> struct GraphTraits { using NodeRef = const BasicBlock *; using ChildIteratorType = succ_const_iterator; diff --git a/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll new file mode 100644 index 0000000000000..0a580276d952d --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll @@ -0,0 +1,208 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; Function Attrs: noinline norecurse nounwind readnone uwtable +define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr !prof !27 { +entry: + %cmp24 = icmp sgt i32 %iter_outer, 0 + br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge, !prof !28 + +entry.for.cond.cleanup_crit_edge: ; preds = %entry + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.end, %entry.for.cond.cleanup_crit_edge + %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %for.end, %entry + %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ] + %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ] + %rem23 = and i32 %k.026, 1 + %cmp1 = icmp eq i32 %rem23, 0 + br i1 %cmp1, label %entry8, label %for.cond2, !prof !29 + +for.cond2: ; preds = %if.end9, %for.body + %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ] + %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ] + %cmp3 = icmp slt i32 %i.0, %iter_inner + br i1 %cmp3, label %for.body4, label %for.end, !prof !30, !irr_loop !31 + +for.body4: ; preds = %for.cond2 + %rem5 = srem i32 %k.026, 3 + %cmp6 = icmp eq i32 %rem5, 0 + br i1 %cmp6, label %entry8, label %if.end9, !prof !32 + +entry8: ; preds = %for.body4, %for.body + %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ] + %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ] + %add = add nsw i32 %sum.2, 4 + br label %if.end9, !irr_loop !33 + +if.end9: ; preds = %entry8, %for.body4 + %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ] + %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ] + %add10 = add nsw i32 %sum.3, 1 + %inc = add nsw i32 %i.2, 1 + br label %for.cond2, !irr_loop !34 + +for.end: ; preds = %for.cond2 + %inc12 = add nuw nsw i32 %k.026, 1 + %exitcond = icmp eq i32 %inc12, 
%iter_outer + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !35 +} + +!27 = !{!"function_entry_count", i64 1} +!28 = !{!"branch_weights", i32 1, i32 0} +!29 = !{!"branch_weights", i32 50, i32 50} +!30 = !{!"branch_weights", i32 950, i32 100} +!31 = !{!"loop_header_weight", i64 1050} +!32 = !{!"branch_weights", i32 323, i32 627} +!33 = !{!"loop_header_weight", i64 373} +!34 = !{!"loop_header_weight", i64 1000} +!35 = !{!"branch_weights", i32 1, i32 99} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreducibleii': +; CHECK-NEXT: block-frequency-info: _Z11irreducibleii +; CHECK-NEXT: - entry: {{.*}} count = 1 +; CHECK-NEXT: - entry.for.cond.cleanup_crit_edge: {{.*}} count = 0 +; CHECK-NEXT: - for.cond.cleanup: {{.*}} count = 1 +; CHECK-NEXT: - for.body: {{.*}} count = 100 +; CHECK-NEXT: - for.cond2: {{.*}} count = 1050, irr_loop_header_weight = 1050 +; CHECK-NEXT: - for.body4: {{.*}} count = 950 +; CHECK-NEXT: - entry8: {{.*}} count = 373, irr_loop_header_weight = 373 +; CHECK-NEXT: - if.end9: {{.*}} count = 1000, irr_loop_header_weight = 1000 +; CHECK-NEXT: - for.end: {{.*}} count = 100 + +@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16 +@tracing = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) !prof !27 { +entry: + store <2 x i8*> , <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16 + store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16 + %0 = load i32, i32* @tracing, align 4 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ] + %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1 + %1 = load i8, i8* %p.addr.0, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2 + %2 = load i8, i8* %incdec.ptr, align 1 + %conv3 = zext i8 %2 to i32 + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 + %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ] + %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ] + %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ] + %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ] + switch i8 %op.0, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ], !prof !36 + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ] + %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ] + %add.neg = sub i32 -5, %oparg.1 + %sub = add i32 %add.neg, %sum.2 + br label %exit + +TARGET_1: ; preds = %indirectgoto + %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %3 = load i8, i8* %p.addr.5, align 1 + %conv5 = zext i8 %3 to i32 + br label %sw.bb6 + +sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge + %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ] + %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ] + %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ] + %mul = mul nsw i32 %oparg.2, 7 + %add7 = add nsw i32 %sum.3, %mul + %rem46 = and 
i32 %add7, 1 + %cmp8 = icmp eq i32 %rem46, 0 + br i1 %cmp8, label %dispatch_op, label %if.then, !prof !37, !irr_loop !38 + +if.then: ; preds = %sw.bb6 + %mul9 = mul nsw i32 %add7, 9 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %4 = load i8, i8* %p.addr.5, align 1 + %conv14 = zext i8 %4 to i32 + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ] + %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ] + %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ] + %add16 = add nsw i32 %oparg.3, 3 + %add17 = add nsw i32 %add16, %sum.4 + br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40 + +if.then18: ; preds = %sw.bb15 + %idx.ext = sext i32 %oparg.3 to i64 + %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext + %mul19 = mul nsw i32 %add17, 17 + br label %indirectgoto + +unknown_op: ; preds = %indirectgoto + %sub24 = add nsw i32 %sum.7, -4 + br label %sw.default + +sw.default: ; preds = %unknown_op, %dispatch_op + %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ] + %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ] + %add25 = add nsw i32 %sum.5, 11 + br label %for.cond1 + +exit: ; preds = %sw.bb15, %sw.bb + %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ] + ret i32 %sum.6 + +indirectgoto: ; preds = %if.then18, %if.then + %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ] + %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ] + %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1 + %5 = load i8, i8* %add.ptr.pn, align 1 + %idxprom21 = zext i8 %5 to i64 + %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21 + %6 = load i8*, i8** %arrayidx22, align 8 + indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42 +} + +!36 = !{!"branch_weights", i32 0, i32 0, i32 201, i32 1} +!37 = !{!"branch_weights", i32 201, i32 300} +!38 = !{!"loop_header_weight", i64 501} +!39 = !{!"branch_weights", i32 100, i32 0} +!40 = !{!"loop_header_weight", i64 100} +!41 = !{!"branch_weights", i32 0, i32 1, i32 300, i32 99} +!42 = !{!"loop_header_weight", i64 400} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh': +; CHECK-NEXT: block-frequency-info: _Z11irreduciblePh +; CHECK-NEXT: - entry: {{.*}} count = 1 +; CHECK-NEXT: - for.cond1: {{.*}} count = 1 +; CHECK-NEXT: - dispatch_op: {{.*}} count = 201 +; CHECK-NEXT: - dispatch_op.sw.bb6_crit_edge: {{.*}} count = 200 +; CHECK-NEXT: - sw.bb: {{.*}} count = 0 +; CHECK-NEXT: - TARGET_1: {{.*}} count = 299 +; CHECK-NEXT: - sw.bb6: {{.*}} count = 500, irr_loop_header_weight = 501 +; CHECK-NEXT: - if.then: {{.*}} count = 299 +; CHECK-NEXT: - TARGET_2: {{.*}} count = 98 +; CHECK-NEXT: - sw.bb15: {{.*}} count = 99, irr_loop_header_weight = 100 +; CHECK-NEXT: - if.then18: {{.*}} count = 99 +; CHECK-NEXT: - unknown_op: {{.*}} count = 0 +; CHECK-NEXT: - sw.default: {{.*}} count = 0 +; CHECK-NEXT: - exit: {{.*}} count = 1 +; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400 diff --git a/test/ThinLTO/X86/lazyload_metadata.ll b/test/ThinLTO/X86/lazyload_metadata.ll index a6d46e5586a27..4680e46245856 100644 --- a/test/ThinLTO/X86/lazyload_metadata.ll +++ b/test/ThinLTO/X86/lazyload_metadata.ll @@ -10,13 +10,13 @@ ; RUN: 
llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ ; RUN: -o /dev/null -stats \ ; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY -; LAZY: 53 bitcode-reader - Number of Metadata records loaded +; LAZY: 55 bitcode-reader - Number of Metadata records loaded ; LAZY: 2 bitcode-reader - Number of MDStrings loaded ; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ ; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \ ; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY -; NOTLAZY: 62 bitcode-reader - Number of Metadata records loaded +; NOTLAZY: 64 bitcode-reader - Number of Metadata records loaded ; NOTLAZY: 7 bitcode-reader - Number of MDStrings loaded diff --git a/test/Transforms/PGOProfile/Inputs/irreducible.proftext b/test/Transforms/PGOProfile/Inputs/irreducible.proftext new file mode 100644 index 0000000000000..9b0210d9a309f --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/irreducible.proftext @@ -0,0 +1,29 @@ +:ir +_Z11irreducibleii +# Func Hash: +64451410787 +# Num Counters: +6 +# Counter Values: +1000 +950 +100 +373 +1 +0 + +_Z11irreduciblePh +# Func Hash: +104649601521 +# Num Counters: +9 +# Counter Values: +100 +300 +99 +300 +201 +1 +1 +0 +0 diff --git a/test/Transforms/PGOProfile/irreducible.ll b/test/Transforms/PGOProfile/irreducible.ll new file mode 100644 index 0000000000000..37f6e206ee927 --- /dev/null +++ b/test/Transforms/PGOProfile/irreducible.ll @@ -0,0 +1,184 @@ +; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE + +; GEN: $__llvm_profile_raw_version = comdat any + +; Function Attrs: noinline norecurse nounwind readnone uwtable +define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 { +entry: + %cmp24 = icmp sgt i32 %iter_outer, 0 + br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge + +entry.for.cond.cleanup_crit_edge: ; preds = %entry + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %entry.for.cond.cleanup_crit_edge, %for.end + %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %entry, %for.end + %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ] + %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ] + %rem23 = and i32 %k.026, 1 + %cmp1 = icmp eq i32 %rem23, 0 + br i1 %cmp1, label %entry8, label %for.cond2 + +for.cond2: ; preds = %for.body, %if.end9 + %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ] + %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ] + %cmp3 = icmp slt i32 %i.0, %iter_inner + br i1 %cmp3, label %for.body4, label %for.end +; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]] + +for.body4: ; preds = %for.cond2 + %rem5 = srem i32 %k.026, 3 + %cmp6 = icmp eq i32 %rem5, 0 + br i1 %cmp6, label %entry8, label %if.end9 + +entry8: ; preds = %for.body4, %for.body + %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ] + %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ] + %add = add nsw i32 %sum.2, 4 + br label %if.end9 +; USE: br label %if.end9, +; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]] + +if.end9: ; preds = %entry8, %for.body4 + %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ] + %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ] + 
%add10 = add nsw i32 %sum.3, 1 + %inc = add nsw i32 %i.2, 1 + br label %for.cond2 +; USE: br label %for.cond2, +; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]] + +for.end: ; preds = %for.cond2 + %inc12 = add nuw nsw i32 %k.026, 1 + %exitcond = icmp eq i32 %inc12, %iter_outer + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + + + +@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16 +@tracing = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) { +entry: + store <2 x i8*> , <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16 + store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16 + %0 = load i32, i32* @tracing, align 4 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ] + %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1 + %1 = load i8, i8* %p.addr.0, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2 + %2 = load i8, i8* %incdec.ptr, align 1 + %conv3 = zext i8 %2 to i32 + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 + %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ] + %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ] + %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ] + %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ] + switch i8 %op.0, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ] + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ] + %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ] + %add.neg = sub i32 -5, %oparg.1 + %sub = add i32 %add.neg, %sum.2 + br label %exit + +TARGET_1: ; preds = %indirectgoto + %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %3 = load i8, i8* %p.addr.5, align 1 + %conv5 = zext i8 %3 to i32 + br label %sw.bb6 + +sw.bb6: ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1 + %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ] + %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ] + %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ] + %mul = mul nsw i32 %oparg.2, 7 + %add7 = add nsw i32 %sum.3, %mul + %rem46 = and i32 %add7, 1 + %cmp8 = icmp eq i32 %rem46, 0 + br i1 %cmp8, label %dispatch_op, label %if.then +; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]] + +if.then: ; preds = %sw.bb6 + %mul9 = mul nsw i32 %add7, 9 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %4 = load i8, i8* %p.addr.5, align 1 + %conv14 = zext i8 %4 to i32 + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ] + %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ] + %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ] + %add16 = add nsw i32 %oparg.3, 3 + %add17 = 
add nsw i32 %add16, %sum.4
+  br i1 %tobool, label %if.then18, label %exit
+; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]]
+
+if.then18:                                        ; preds = %sw.bb15
+  %idx.ext = sext i32 %oparg.3 to i64
+  %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+  %mul19 = mul nsw i32 %add17, 17
+  br label %indirectgoto
+
+unknown_op:                                       ; preds = %indirectgoto
+  %sub24 = add nsw i32 %sum.7, -4
+  br label %sw.default
+
+sw.default:                                       ; preds = %unknown_op, %dispatch_op
+  %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+  %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+  %add25 = add nsw i32 %sum.5, 11
+  br label %for.cond1
+
+exit:                                             ; preds = %sw.bb15, %sw.bb
+  %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+  ret i32 %sum.6
+
+indirectgoto:                                     ; preds = %if.then18, %if.then
+  %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+  %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+  %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+  %5 = load i8, i8* %add.ptr.pn, align 1
+  %idxprom21 = zext i8 %5 to i64
+  %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+  %6 = load i8*, i8** %arrayidx22, align 8
+  indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2]
+; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]]
+}
+
+; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050}
+; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373}
+; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000}
+; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501}
+; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100}
+; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400}

From f79fab6f98f7bbecb85bd1e7bad70088f9f76e6b Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov
Date: Thu, 2 Nov 2017 22:35:22 +0000
Subject: [PATCH 024/238] AMDGPU: Fix warning discovered by r317266
 [-Wunused-private-field]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317280 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIMachineFunctionInfo.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 5f5636e119a9c..fed31fbf42b24 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -140,7 +140,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
 private:
   unsigned LDSWaveSpillSize = 0;
-  unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
 
From 56898c124500c5871061bcdae65eb2033743438b Mon Sep 17 00:00:00 2001
From: Jake Ehrlich
Date: Thu, 2 Nov 2017 23:14:55 +0000
Subject: [PATCH 025/238] Add feature to determine if host architecture is
 64-bit in llvm-lit

I have a test that I'd like to add to llvm that demands more than 32 bits'
worth of address space. This test can't be run on 32-bit systems because
they don't have enough address space. The host triple should be used to
determine this instead of config.host_arch because on Debian systems
config.host_arch is not correct.

This change adds the "llvm-64-bits" feature to allow tests to restrict
themselves to the 64-bit case.
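A test can then restrict itself to 64-bit hosts by adding a
"REQUIRES: llvm-64-bits" line, matching the feature string registered in
lit.cfg.py below.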
Differential Revision: https://reviews.llvm.org/D39465 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317281 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/lit.cfg.py | 3 +++ test/lit.site.cfg.py.in | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 6a5cf69b9876a..57dc1f0704920 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -168,6 +168,9 @@ def get_asan_rtlib(): config.available_features.add(arch.lower() + '-registered-target') # Features +known_arches = ["x86_64", "mips64", "ppc64", "aarch64"] +if any(config.llvm_host_triple.startswith(x) for x in known_arches): + config.available_features.add("llvm-64-bits") # Others/can-execute.txt if sys.platform not in ['win32']: diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in index 19e5cd0d3c2d2..efdd016e45d99 100644 --- a/test/lit.site.cfg.py.in +++ b/test/lit.site.cfg.py.in @@ -29,7 +29,6 @@ config.targets_to_build = "@TARGETS_TO_BUILD@" config.native_target = "@LLVM_NATIVE_ARCH@" config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') config.host_os = "@HOST_OS@" -config.host_arch = "@HOST_ARCH@" config.host_cc = "@HOST_CC@" config.host_cxx = "@HOST_CXX@" config.host_ldflags = "@HOST_LDFLAGS@" @@ -42,6 +41,7 @@ config.enable_ffi = @LLVM_ENABLE_FFI@ config.build_shared_libs = @BUILD_SHARED_LIBS@ config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@ config.llvm_libxml2_enabled = "@LLVM_LIBXML2_ENABLED@" +config.llvm_host_triple = '@LLVM_HOST_TRIPLE@' # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. From a555cf06835827701a43bb0528d74bfc195fdeb8 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Thu, 2 Nov 2017 23:17:06 +0000 Subject: [PATCH 026/238] IndVarSimplify: preserve debug information attached to widened PHI nodes. This fixes PR35015. https://bugs.llvm.org/show_bug.cgi?id=35015 Differential Revision: https://reviews.llvm.org/D39345 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317282 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/IndVarSimplify.cpp | 10 +++ .../IndVarSimplify/scev-phi-debug-info.ll | 71 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 test/Transforms/IndVarSimplify/scev-phi-debug-info.ll diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 9ce42a0682568..abb50f27f1cca 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -48,6 +48,7 @@ #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" @@ -1624,6 +1625,15 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } + + // Attach any debug information to the new PHI. Since OrigPhi and WidePHI + // evaluate the same recurrence, we can just copy the debug info over. 
+ SmallVector DbgValues; + llvm::findDbgValues(DbgValues, OrigPhi); + auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), + ValueAsMetadata::get(WidePhi)); + for (auto &DbgValue : DbgValues) + DbgValue->setOperand(0, MDPhi); return WidePhi; } diff --git a/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll new file mode 100644 index 0000000000000..dc6aae8d8aa6a --- /dev/null +++ b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll @@ -0,0 +1,71 @@ +; RUN: opt %s -indvars -S -o - | FileCheck %s +source_filename = "/Data/llvm/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.status = type { i32, i8* } + +@status = internal unnamed_addr global [32 x %struct.status] zeroinitializer, align 16, !dbg !0 + +define void @f0() local_unnamed_addr !dbg !20 { +entry: + tail call void @llvm.dbg.value(metadata i32 0, metadata !23, metadata !DIExpression()), !dbg !24 + br label %for.cond, !dbg !24 + +for.cond: ; preds = %for.body, %entry + ; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + ; CHECK: call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !23, metadata !DIExpression()), !dbg !24 + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.dbg.value(metadata i32 %i.0, metadata !23, metadata !DIExpression()), !dbg !24 + %cmp = icmp slt i32 %i.0, 32, !dbg !24 + br i1 %cmp, label %for.body, label %for.end, !dbg !24 + +for.body: ; preds = %for.cond + %idxprom = sext i32 %i.0 to i64, !dbg !24 + %value = getelementptr inbounds [32 x %struct.status], [32 x %struct.status]* @status, i64 0, i64 %idxprom, i32 0, !dbg !24 + store i32 42, i32* %value, align 16, !dbg !24 + tail call void @use(i32 %i.0), !dbg !24 + %inc = add nsw i32 %i.0, 1, !dbg !24 + tail call void @llvm.dbg.value(metadata i32 %inc, metadata !23, metadata !DIExpression()), !dbg !24 + br label %for.cond, !dbg !24 + +for.end: ; preds = %for.cond + ret void, !dbg !24 +} + +declare void @use(i32) + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #0 + +attributes #0 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!16, !17, !18} +!llvm.ident = !{!19} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "status", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5) +!3 = !DIFile(filename: "x.c", directory: "/home/davide/work/llvm/build-release/bin") +!4 = !{} +!5 = !{!0} +!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 4096, elements: !14) +!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "status", file: !3, line: 2, size: 128, elements: !8) +!8 = !{!9, !11} +!9 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !7, file: !3, line: 3, baseType: !10, size: 32) +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !7, file: !3, line: 4, baseType: !12, size: 64, offset: 64) +!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64) +!13 = !DIBasicType(name: "unsigned char", size: 8, encoding: 
DW_ATE_unsigned_char) +!14 = !{!15} +!15 = !DISubrange(count: 32) +!16 = !{i32 2, !"Dwarf Version", i32 4} +!17 = !{i32 2, !"Debug Info Version", i32 3} +!18 = !{i32 1, !"wchar_size", i32 4} +!19 = !{!"clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)"} +!20 = distinct !DISubprogram(name: "f0", scope: !3, file: !3, line: 6, type: !21, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !22) +!21 = !DISubroutineType(types: !4) +!22 = !{!23} +!23 = !DILocalVariable(name: "i", scope: !20, file: !3, line: 8, type: !10) +!24 = !DILocation(line: 9, scope: !20) From 6d06c893037035a00e081b6740d977dcce8653f5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 2 Nov 2017 23:23:37 +0000 Subject: [PATCH 027/238] [X86] Give AVX512VL instructions priority over their AVX equivalents. I thought we had gotten all these priority bugs worked out, but I guess not. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317283 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrSSE.td | 4 ++-- test/CodeGen/X86/avx-intrinsics-x86.ll | 26 +++++++++++++++++-------- test/CodeGen/X86/sse-intrinsics-x86.ll | 13 +++++++++---- test/CodeGen/X86/sse2-intrinsics-x86.ll | 13 +++++++++---- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 451303054f56a..d4676b57455d2 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3186,7 +3186,7 @@ let Predicates = prds in { /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def V#NAME#PDr : PDI opc, string OpcodeStr, SDNode OpNode, // Square root. 
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 44eb14160ee19..b0cf4e3b29f69 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -635,10 +635,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_pd_256: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_pd_256: +; AVX: # BB#0: +; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -646,10 +651,15 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_ps_256: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_ps_256: +; AVX: # BB#0: +; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll index f178e18a25965..5ba9f9a2645e3 100644 --- a/test/CodeGen/X86/sse-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse-intrinsics-x86.ll @@ -475,10 +475,15 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) { ; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_sqrt_ps: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse_sqrt_ps: +; AVX2: ## BB#0: +; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse_sqrt_ps: +; SKX: ## BB#0: +; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index d4047faad9bbc..dcccdbfc2e674 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1592,10 +1592,15 @@ define <2 x double> @test_x86_sse2_sqrt_pd(<2 x 
double> %a0) {
 ; SSE-NEXT:    sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; VCHECK-LABEL: test_x86_sse2_sqrt_pd:
-; VCHECK:       ## BB#0:
-; VCHECK-NEXT:    vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; VCHECK-NEXT:    retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_pd:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_pd:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
+; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
 
From 89fd072604faffaaf81741519ecb99ecaaa3a55e Mon Sep 17 00:00:00 2001
From: Jake Ehrlich
Date: Thu, 2 Nov 2017 23:24:04 +0000
Subject: [PATCH 028/238] [llvm-objcopy] Fix bug in how segment alignment was
 being handled

Just aligning segment offsets to the segment alignment is incorrect and
also wastes more space than is needed. The requirement is that
p_offset == p_vaddr modulo p_align, *not* that p_offset == 0 modulo
p_align. Generally speaking we have been relying on p_vaddr == 0 modulo
p_align. In fact yaml2obj can't even produce an input that makes
llvm-objcopy misbehave, because it derives a program header's alignment
and offset from the sections the header covers.

This change fixes this bad behavior in llvm-objcopy.

Differential Revision: https://reviews.llvm.org/D39132

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317284 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../check-addr-offset-align-binary.test       | 40 +++++++++++
 .../llvm-objcopy/check-addr-offset-align.test | 67 +++++++++++++++++++
 tools/llvm-objcopy/Object.cpp                 | 20 +++++-
 3 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 test/tools/llvm-objcopy/check-addr-offset-align-binary.test
 create mode 100644 test/tools/llvm-objcopy/check-addr-offset-align.test

diff --git a/test/tools/llvm-objcopy/check-addr-offset-align-binary.test b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test
new file mode 100644
index 0000000000000..755acceeda2ce
--- /dev/null
+++ b/test/tools/llvm-objcopy/check-addr-offset-align-binary.test
@@ -0,0 +1,40 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy -O binary %t %t2
+# RUN: od -t x1 %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address:         0x1000
+    AddressAlign:    0x0000000000001000
+    Content:         "c3c3c3c3"
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    Address:         0x1008
+    AddressAlign:    0x0000000000000008
+    Content:         "3232"
+ProgramHeaders:
+  - Type: PT_LOAD
+    Flags: [ PF_X, PF_R ]
+    VAddr: 0x1000
+    PAddr: 0x1000
+    Align: 0x1000
+    Sections:
+      - Section: .text
+  - Type: PT_LOAD
+    Flags: [ PF_R, PF_W ]
+    VAddr: 0x1008
+    PAddr: 0x1008
+    Align: 0x1000
+    Sections:
+      - Section: .data
+
+# CHECK: 0000000 c3 c3 c3 c3 00 00 00 00 32 32
diff --git a/test/tools/llvm-objcopy/check-addr-offset-align.test b/test/tools/llvm-objcopy/check-addr-offset-align.test
new file mode 100644
index 0000000000000..ca2367ba43414
--- /dev/null
+++ b/test/tools/llvm-objcopy/check-addr-offset-align.test
@@ -0,0 +1,67 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objcopy %t %t2
+# RUN: llvm-readobj -program-headers %t2 | FileCheck %s
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:
ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x0000000000001000 + Content: "c3c3c3c3" + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x1008 + AddressAlign: 0x0000000000000008 + Content: "3232" +ProgramHeaders: + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + VAddr: 0x1000 + PAddr: 0x1000 + Align: 0x1000 + Sections: + - Section: .text + - Type: PT_LOAD + Flags: [ PF_R, PF_W ] + VAddr: 0x1008 + PAddr: 0x1008 + Align: 0x1000 + Sections: + - Section: .data + +#CHECK: ProgramHeaders [ +#CHECK-NEXT: ProgramHeader { +#CHECK-NEXT: Type: PT_LOAD +#CHECK-NEXT: Offset: 0x1000 +#CHECK-NEXT: VirtualAddress: 0x1000 +#CHECK-NEXT: PhysicalAddress: 0x1000 +#CHECK-NEXT: FileSize: 4 +#CHECK-NEXT: MemSize: 4 +#CHECK-NEXT: Flags [ +#CHECK-NEXT: PF_R +#CHECK-NEXT: PF_X +#CHECK-NEXT: ] +#CHECK-NEXT: Alignment: 4096 +#CHECK-NEXT: } +#CHECK-NEXT: ProgramHeader { +#CHECK-NEXT: Type: PT_LOAD +#CHECK-NEXT: Offset: 0x1008 +#CHECK-NEXT: VirtualAddress: 0x1008 +#CHECK-NEXT: PhysicalAddress: 0x1008 +#CHECK-NEXT: FileSize: 2 +#CHECK-NEXT: MemSize: 2 +#CHECK-NEXT: Flags [ +#CHECK-NEXT: PF_R +#CHECK-NEXT: PF_W +#CHECK-NEXT: ] +#CHECK-NEXT: Alignment: 4096 +#CHECK-NEXT: } +#CHECK-NEXT:] diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/Object.cpp index 22ae47f1cace7..5f9864d9cc047 100644 --- a/tools/llvm-objcopy/Object.cpp +++ b/tools/llvm-objcopy/Object.cpp @@ -685,6 +685,19 @@ template void ELFObject::sortSections() { CompareSections); } +static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) { + // Calculate Diff such that (Offset + Diff) & -Align == Addr & -Align. + if (Align == 0) + Align = 1; + auto Diff = + static_cast(Addr % Align) - static_cast(Offset % Align); + // We only want to add to Offset, however, so if Diff < 0 we can add Align and + // (Offset + Diff) & -Align == Addr & -Align will still hold. + if (Diff < 0) + Diff += Align; + return Offset + Diff; +} + template void ELFObject::assignOffsets() { // We need a temporary list of segments that has a special order to it // so that we know that anytime ->ParentSegment is set that segment has @@ -728,7 +741,7 @@ template void ELFObject::assignOffsets() { Segment->Offset = Parent->Offset + Segment->OriginalOffset - Parent->OriginalOffset; } else { - Offset = alignTo(Offset, Segment->Align == 0 ? 1 : Segment->Align); + Offset = alignToAddr(Offset, Segment->VAddr, Segment->Align); Segment->Offset = Offset; } Offset = std::max(Offset, Segment->Offset + Segment->FileSize); @@ -829,8 +842,9 @@ template void BinaryObject::finalize() { uint64_t Offset = 0; for (auto &Segment : this->Segments) { - if (Segment->Type == PT_LOAD && Segment->firstSection() != nullptr) { - Offset = alignTo(Offset, Segment->Align); + if (Segment->Type == llvm::ELF::PT_LOAD && + Segment->firstSection() != nullptr) { + Offset = alignToAddr(Offset, Segment->VAddr, Segment->Align); Segment->Offset = Offset; Offset += Segment->FileSize; } From 0ae3f32f5642942bbc7ebd2f40e1b218eee51fef Mon Sep 17 00:00:00 2001 From: Puyan Lotfi Date: Thu, 2 Nov 2017 23:37:32 +0000 Subject: [PATCH 029/238] mir-canon: First commit. mir-canon (MIRCanonicalizerPass) is a pass designed to reorder instructions and rename operands so that two similar programs will diff more cleanly after being run through mir-canon than they would otherwise. 
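As a usage sketch (the input file name is hypothetical), the pass runs
standalone under -run-pass, and the cl::opt flags defined in the new file
can narrow it to the Nth function or basic block when bisecting a noisy
diff:

  llc -o - -run-pass mir-canonicalizer example.mir
  llc -o - -run-pass mir-canonicalizer -canon-nth-function=2 example.mir
  llc -o - -run-pass mir-canonicalizer -canon-nth-basicblock=3 example.mir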
This project is still a work in progress and there are ideas still being
discussed for improving diff quality.

M    include/llvm/InitializePasses.h
M    lib/CodeGen/CMakeLists.txt
M    lib/CodeGen/CodeGen.cpp
A    lib/CodeGen/MIRCanonicalizerPass.cpp

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317285 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/InitializePasses.h      |   1 +
 lib/CodeGen/CMakeLists.txt           |   1 +
 lib/CodeGen/CodeGen.cpp              |   1 +
 lib/CodeGen/MIRCanonicalizerPass.cpp | 626 +++++++++++++++++++++++++++
 4 files changed, 629 insertions(+)
 create mode 100644 lib/CodeGen/MIRCanonicalizerPass.cpp

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index c3ad8fe41af80..8c63ab0284dfa 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -377,6 +377,7 @@ void initializeWinEHPreparePass(PassRegistry&);
 void initializeWriteBitcodePassPass(PassRegistry&);
 void initializeWriteThinLTOBitcodePass(PassRegistry&);
 void initializeXRayInstrumentationPass(PassRegistry&);
+void initializeMIRCanonicalizerPass(PassRegistry &);
 
 } // end namespace llvm
 
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 7ec7fda4e445e..2e364cd4794d5 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -113,6 +113,7 @@ add_llvm_library(LLVMCodeGen
   RegisterPressure.cpp
   RegisterScavenging.cpp
   RenameIndependentSubregs.cpp
+  MIRCanonicalizerPass.cpp
   RegisterUsageInfo.cpp
   RegUsageInfoCollector.cpp
   RegUsageInfoPropagate.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index f4ccb4889d357..bfab865687e7c 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -99,6 +99,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeVirtRegRewriterPass(Registry);
   initializeWinEHPreparePass(Registry);
   initializeXRayInstrumentationPass(Registry);
+  initializeMIRCanonicalizerPass(Registry);
 }
 
 void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp
new file mode 100644
index 0000000000000..61f9f7e2c5d8b
--- /dev/null
+++ b/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -0,0 +1,626 @@
+//===-------------- MIRCanonicalizer.cpp - MIR Canonicalizer --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to employ a canonical code transformation so
+// that code compiled with slightly different IR passes can be diffed more
+// effectively than otherwise. This is done by renaming vregs in a given
+// LiveRange in a canonical way. This pass also does a pseudo-scheduling to
+// move defs closer to their uses in order to reduce diffs caused by slightly
+// different schedules.
+//
+// Basic Usage:
+//
+// llc -o - -run-pass mir-canonicalizer example.mir
+//
+// Reorders instructions canonically.
+// Renames virtual register operands canonically.
+// Strips certain MIR artifacts (optionally).
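+//
+// A typical diffing workflow (file names here are illustrative) runs both
+// inputs through the pass before comparing them:
+//
+//   llc -o a.canon.mir -run-pass mir-canonicalizer a.mir
+//   llc -o b.canon.mir -run-pass mir-canonicalizer b.mir
+//   diff a.canon.mir b.canon.mir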
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include <queue>
+
+using namespace llvm;
+
+namespace llvm {
+extern char &MIRCanonicalizerID;
+} // namespace llvm
+
+#define DEBUG_TYPE "mir-canonicalizer"
+
+static cl::opt<unsigned>
+CanonicalizeFunctionNumber("canon-nth-function", cl::Hidden, cl::init(~0u),
+                           cl::value_desc("N"),
+                           cl::desc("Function number to canonicalize."));
+
+static cl::opt<unsigned>
+CanonicalizeBasicBlockNumber("canon-nth-basicblock", cl::Hidden, cl::init(~0u),
+                             cl::value_desc("N"),
+                             cl::desc("BasicBlock number to canonicalize."));
+
+namespace {
+
+class MIRCanonicalizer : public MachineFunctionPass {
+public:
+  static char ID;
+  MIRCanonicalizer() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "Rename register operands in a canonical ordering.";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate };
+class TypedVReg {
+  VRType type;
+  unsigned reg;
+
+public:
+  TypedVReg(unsigned reg) : type(RSE_Reg), reg(reg) {}
+  TypedVReg(VRType type) : type(type), reg(~0U) {
+    assert(type != RSE_Reg && "Expected a non-register type.");
+  }
+
+  bool isReg() const { return type == RSE_Reg; }
+  bool isFrameIndex() const { return type == RSE_FrameIndex; }
+  bool isCandidate() const { return type == RSE_NewCandidate; }
+
+  VRType getType() const { return type; }
+  unsigned getReg() const {
+    assert(this->isReg() && "Expected a virtual or physical register.");
+    return reg;
+  }
+};
+
+char MIRCanonicalizer::ID;
+
+char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID;
+
+INITIALIZE_PASS_BEGIN(MIRCanonicalizer, "mir-canonicalizer",
+                      "Rename Register Operands Canonically", false, false);
+
+INITIALIZE_PASS_END(MIRCanonicalizer, "mir-canonicalizer",
+                    "Rename Register Operands Canonically", false, false);
+
+static std::vector<MachineBasicBlock *> GetRPOList(MachineFunction &MF) {
+  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+  std::vector<MachineBasicBlock *> RPOList;
+  for (auto MBB : RPOT) {
+    RPOList.push_back(MBB);
+  }
+
+  return RPOList;
+}
+
+// Set a dummy vreg. We use this vreg's register class to generate throw-away
+// vregs that are used to skip vreg numbers so that vreg numbers line up.
+static unsigned GetDummyVReg(const MachineFunction &MF) {
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (auto &MO : MI.operands()) {
+        if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+          continue;
+        return MO.getReg();
+      }
+    }
+  }
+
+  return ~0U;
+}
+
+static bool rescheduleCanonically(MachineBasicBlock *MBB) {
+
+  bool Changed = false;
+
+  // Calculates the distance of MI from the beginning of its parent BB.
+  auto getInstrIdx = [](const MachineInstr &MI) {
+    unsigned i = 0;
+    for (auto &CurMI : *MI.getParent()) {
+      if (&CurMI == &MI)
+        return i;
+      i++;
+    }
+    return ~0U;
+  };
+
+  // Pre-populate vector of instructions to reschedule so that we don't
+  // clobber the iterator.
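+  // (Splicing instructions around while iterating over the block would
+  // invalidate the iteration, hence the snapshot taken below.)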
+  std::vector<MachineInstr *> Instructions;
+  for (auto &MI : *MBB) {
+    Instructions.push_back(&MI);
+  }
+
+  for (auto *II : Instructions) {
+    if (II->getNumOperands() == 0)
+      continue;
+
+    MachineOperand &MO = II->getOperand(0);
+    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+
+    DEBUG(dbgs() << "Operand " << 0 << " of "; II->dump(); MO.dump(););
+
+    MachineInstr *Def = II;
+    unsigned Distance = ~0U;
+    MachineInstr *UseToBringDefCloserTo = nullptr;
+    MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+    for (auto &UO : MRI->use_nodbg_operands(MO.getReg())) {
+      MachineInstr *UseInst = UO.getParent();
+
+      const unsigned DefLoc = getInstrIdx(*Def);
+      const unsigned UseLoc = getInstrIdx(*UseInst);
+      const unsigned Delta = (UseLoc - DefLoc);
+
+      if (UseInst->getParent() != Def->getParent())
+        continue;
+      if (DefLoc >= UseLoc)
+        continue;
+
+      if (Delta < Distance) {
+        Distance = Delta;
+        UseToBringDefCloserTo = UseInst;
+      }
+    }
+
+    const auto BBE = MBB->instr_end();
+    MachineBasicBlock::iterator DefI = BBE;
+    MachineBasicBlock::iterator UseI = BBE;
+
+    for (auto BBI = MBB->instr_begin(); BBI != BBE; ++BBI) {
+
+      if (DefI != BBE && UseI != BBE)
+        break;
+
+      if ((&*BBI != Def) && (&*BBI != UseToBringDefCloserTo))
+        continue;
+
+      if (&*BBI == Def) {
+        DefI = BBI;
+        continue;
+      }
+
+      if (&*BBI == UseToBringDefCloserTo) {
+        UseI = BBI;
+        continue;
+      }
+    }
+
+    if (DefI == BBE || UseI == BBE)
+      continue;
+
+    DEBUG({
+      dbgs() << "Splicing ";
+      DefI->dump();
+      dbgs() << " right before: ";
+      UseI->dump();
+    });
+
+    Changed = true;
+    MBB->splice(UseI, MBB, DefI);
+  }
+
+  return Changed;
+}
+
+/// Here we find our candidates. What makes an interesting candidate?
+/// A candidate for a canonicalization tree root is normally any kind of
+/// instruction that causes side effects such as a store to memory or a copy
+/// to a physical register or a return instruction. We use these as expression
+/// tree roots that we walk in order to build a canonical walk, which should
+/// result in canonical vreg renaming.
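+///
+/// For example (illustrative only): in a block that ends in a store, the
+/// store is the candidate root, and the walk then visits the vregs feeding
+/// its operands, so equivalent programs enumerate their vregs in the same
+/// order.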
+static std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
+  std::vector<MachineInstr *> Candidates;
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) {
+    MachineInstr *MI = &*II;
+
+    bool DoesMISideEffect = false;
+
+    if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) {
+      const unsigned Dst = MI->getOperand(0).getReg();
+      DoesMISideEffect |= !TargetRegisterInfo::isVirtualRegister(Dst);
+
+      for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
+        if (DoesMISideEffect) break;
+        DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent());
+      }
+    }
+
+    if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect)
+      continue;
+
+    DEBUG(dbgs() << "Found Candidate: "; MI->dump(););
+    Candidates.push_back(MI);
+  }
+
+  return Candidates;
+}
+
+void doCandidateWalk(std::vector<TypedVReg> &VRegs,
+                     std::queue<TypedVReg> &RegQueue,
+                     std::vector<MachineInstr *> &VisitedMIs,
+                     const MachineBasicBlock *MBB) {
+
+  const MachineFunction &MF = *MBB->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  while (!RegQueue.empty()) {
+
+    auto TReg = RegQueue.front();
+    RegQueue.pop();
+
+    if (TReg.isFrameIndex()) {
+      DEBUG(dbgs() << "Popping frame index.\n";);
+      VRegs.push_back(TypedVReg(RSE_FrameIndex));
+      continue;
+    }
+
+    assert(TReg.isReg() && "Expected vreg or physreg.");
+    unsigned Reg = TReg.getReg();
+
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      DEBUG({
+        dbgs() << "Popping vreg ";
+        MRI.def_begin(Reg)->dump();
+        dbgs() << "\n";
+      });
+
+      if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) {
+            return TR.isReg() && TR.getReg() == Reg;
+          })) {
+        VRegs.push_back(TypedVReg(Reg));
+      }
+    } else {
+      DEBUG(dbgs() << "Popping physreg.\n";);
+      VRegs.push_back(TypedVReg(Reg));
+      continue;
+    }
+
+    for (auto RI = MRI.def_begin(Reg), RE = MRI.def_end(); RI != RE; ++RI) {
+      MachineInstr *Def = RI->getParent();
+
+      if (Def->getParent() != MBB)
+        continue;
+
+      if (llvm::any_of(VisitedMIs,
+                       [&](const MachineInstr *VMI) { return Def == VMI; })) {
+        break;
+      }
+
+      DEBUG({
+        dbgs() << "\n========================\n";
+        dbgs() << "Visited MI: ";
+        Def->dump();
+        dbgs() << "BB Name: " << Def->getParent()->getName() << "\n";
+        dbgs() << "\n========================\n";
+      });
+      VisitedMIs.push_back(Def);
+      for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+
+        MachineOperand &MO = Def->getOperand(I);
+        if (MO.isFI()) {
+          DEBUG(dbgs() << "Pushing frame index.\n";);
+          RegQueue.push(TypedVReg(RSE_FrameIndex));
+        }
+
+        if (!MO.isReg())
+          continue;
+        RegQueue.push(TypedVReg(MO.getReg()));
+      }
+    }
+  }
+}
+
+// TODO: Work to remove this in the future. One day when we have named vregs
+// we should be able to form the canonical name based on some characteristic
+// we see at that point of the expression tree (like if we were to name based
+// on some sort of value numbering scheme).
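+// (With the gap scheme below, each processed block skips ahead to a fresh
+// multiple of its own VR_GAP (VRegGapIndex * 1000), so the same block in two
+// canonicalized files tends to be renamed with matching vreg numbers.)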
+static void SkipVRegs(unsigned &VRegGapIndex, MachineRegisterInfo &MRI,
+                      const TargetRegisterClass *RC) {
+  const unsigned VR_GAP = (++VRegGapIndex * 1000);
+
+  DEBUG({
+    dbgs() << "Adjusting per-BB VR_GAP for BB" << VRegGapIndex << " to "
+           << VR_GAP << "\n";
+  });
+
+  unsigned I = MRI.createVirtualRegister(RC);
+  const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
+  while (I != E) {
+    I = MRI.createVirtualRegister(RC);
+  }
+}
+
+static std::map<unsigned, unsigned>
+GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
+                 const std::vector<unsigned> &renamedInOtherBB,
+                 MachineRegisterInfo &MRI,
+                 const TargetRegisterClass *RC) {
+  std::map<unsigned, unsigned> VRegRenameMap;
+  unsigned LastRenameReg = MRI.createVirtualRegister(RC);
+  bool FirstCandidate = true;
+
+  for (auto &vreg : VRegs) {
+    if (vreg.isFrameIndex()) {
+      // We skip one vreg for any frame index because there is a good chance
+      // (especially when comparing SelectionDAG to GlobalISel generated MIR)
+      // that in the other file we are just getting an incoming vreg that comes
+      // from a copy from a frame index. So it's safe to skip by one.
+      LastRenameReg = MRI.createVirtualRegister(RC);
+      DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
+      continue;
+    } else if (vreg.isCandidate()) {
+
+      // After the first candidate, for every subsequent candidate, we skip mod
+      // 10 registers so that the candidates are more likely to start at the
+      // same vreg number, making it more likely that the canonical walks from
+      // the candidate instructions will line up. We don't need to skip from
+      // the first candidate of the BasicBlock because we already skip ahead
+      // several vregs for each BB.
+      while (LastRenameReg % 10) {
+        if (!FirstCandidate) break;
+        LastRenameReg = MRI.createVirtualRegister(RC);
+
+        DEBUG({
+          dbgs() << "Skipping rename for new candidate " << LastRenameReg
+                 << "\n";
+        });
+      }
+      FirstCandidate = false;
+      continue;
+    } else if (!TargetRegisterInfo::isVirtualRegister(vreg.getReg())) {
+      LastRenameReg = MRI.createVirtualRegister(RC);
+      DEBUG({
+        dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n";
+      });
+      continue;
+    }
+
+    auto Reg = vreg.getReg();
+    if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) {
+      DEBUG(dbgs() << "Vreg " << Reg << " already renamed in other BB.\n";);
+      continue;
+    }
+
+    auto Rename = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+    LastRenameReg = Rename;
+
+    if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) {
+      DEBUG(dbgs() << "Mapping vreg ";);
+      if (MRI.reg_begin(Reg) != MRI.reg_end()) {
+        DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
+      } else {
+        DEBUG(dbgs() << Reg;);
+      }
+      DEBUG(dbgs() << " to ";);
+      if (MRI.reg_begin(Rename) != MRI.reg_end()) {
+        DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
+      } else {
+        DEBUG(dbgs() << Rename;);
+      }
+      DEBUG(dbgs() << "\n";);
+
+      VRegRenameMap.insert(std::pair<unsigned, unsigned>(Reg, Rename));
+    }
+  }
+
+  return VRegRenameMap;
+}
+
+static bool doVRegRenaming(std::vector<unsigned> &RenamedInOtherBB,
+                           const std::map<unsigned, unsigned> &VRegRenameMap,
+                           MachineRegisterInfo &MRI) {
+  bool Changed = false;
+  for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) {
+
+    auto VReg = I->first;
+    auto Rename = I->second;
+
+    RenamedInOtherBB.push_back(Rename);
+
+    std::vector<MachineOperand *> RenameMOs;
+    for (auto &MO : MRI.reg_operands(VReg)) {
+      RenameMOs.push_back(&MO);
+    }
+
+    for (auto *MO : RenameMOs) {
+      Changed = true;
+      MO->setReg(Rename);
+
+      if (!MO->isDef())
+        MO->setIsKill(false);
+    }
+  }
+
+  return Changed;
+}
+
+static bool doDefKillClear(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
+  for (auto &MI : *MBB) {
+    for (auto &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      if (!MO.isDef() && MO.isKill()) {
+        Changed = true;
+        MO.setIsKill(false);
+      }
+
+      if (MO.isDef() && MO.isDead()) {
+        Changed = true;
+        MO.setIsDead(false);
+      }
+    }
+  }
+
+  return Changed;
+}
+
+static bool runOnBasicBlock(MachineBasicBlock *MBB,
+                            std::vector<StringRef> &bbNames,
+                            std::vector<unsigned> &renamedInOtherBB,
+                            unsigned &basicBlockNum, unsigned &VRegGapIndex) {
+
+  if (CanonicalizeBasicBlockNumber != ~0U) {
+    if (CanonicalizeBasicBlockNumber != basicBlockNum++)
+      return false;
+    DEBUG(dbgs() << "\n Canonicalizing BasicBlock " << MBB->getName() << "\n";);
+  }
+
+  if (llvm::find(bbNames, MBB->getName()) != bbNames.end()) {
+    DEBUG({
+      dbgs() << "Found potentially duplicate BasicBlocks: " << MBB->getName()
+             << "\n";
+    });
+    return false;
+  }
+
+  DEBUG({
+    dbgs() << "\n\n  NEW BASIC BLOCK: " << MBB->getName() << "  \n\n";
+    dbgs() << "\n\n================================================\n\n";
+  });
+
+  bool Changed = false;
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  const unsigned DummyVReg = GetDummyVReg(MF);
+  const TargetRegisterClass *DummyRC =
+    (DummyVReg == ~0U) ? nullptr : MRI.getRegClass(DummyVReg);
+  if (!DummyRC) return false;
+
+  bbNames.push_back(MBB->getName());
+  DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";);
+
+  DEBUG(dbgs() << "MBB Before Scheduling:\n"; MBB->dump(););
+  Changed |= rescheduleCanonically(MBB);
+  DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
+
+  std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
+  std::vector<MachineInstr *> VisitedMIs;
+  std::copy(Candidates.begin(), Candidates.end(),
+            std::back_inserter(VisitedMIs));
+
+  std::vector<TypedVReg> VRegs;
+  for (auto candidate : Candidates) {
+    VRegs.push_back(TypedVReg(RSE_NewCandidate));
+
+    std::queue<TypedVReg> RegQueue;
+
+    // Here we walk the vreg operands of a non-root node along our walk.
+    // The root nodes are the original candidates (stores normally).
+    // These are normally not the root nodes (except for the case of copies to
+    // physical registers).
+    for (unsigned i = 1; i < candidate->getNumOperands(); i++) {
+      if (candidate->mayStore() || candidate->isBranch())
+        break;
+
+      MachineOperand &MO = candidate->getOperand(i);
+      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+        continue;
+
+      DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
+      RegQueue.push(TypedVReg(MO.getReg()));
+    }
+
+    // Here we walk the root candidates. We start from the 0th operand because
+    // the root is normally a store to a vreg.
+    for (unsigned i = 0; i < candidate->getNumOperands(); i++) {
+
+      if (!candidate->mayStore() && !candidate->isBranch())
+        break;
+
+      MachineOperand &MO = candidate->getOperand(i);
+
+      // TODO: Do we want to only add vregs here?
+      if (!MO.isReg() && !MO.isFI())
+        continue;
+
+      DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
+
+      RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg()) :
+                                 TypedVReg(RSE_FrameIndex));
+    }
+
+    doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB);
+  }
+
+  // If we have populated no vregs to rename then bail.
+  // The rest of this function does the vreg remapping.
+  if (VRegs.size() == 0)
+    return Changed;
+
+  // Skip some vregs, so we can reckon where we'll land next.
+  SkipVRegs(VRegGapIndex, MRI, DummyRC);
+
+  auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, DummyRC);
+  Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI);
+  Changed |= doDefKillClear(MBB);
+
+  DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump(); dbgs() << "\n";);
+  DEBUG(dbgs() << "\n\n================================================\n\n");
+  return Changed;
+}
+
+bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
+
+  static unsigned functionNum = 0;
+  if (CanonicalizeFunctionNumber != ~0U) {
+    if (CanonicalizeFunctionNumber != functionNum++)
+      return false;
+    DEBUG(dbgs() << "\n Canonicalizing Function " << MF.getName() << "\n";);
+  }
+
+  // We need a valid vreg to create a vreg type for skipping all those
+  // stray vreg numbers, to reach alignment/canonical vreg values.
+  std::vector<MachineBasicBlock *> RPOList = GetRPOList(MF);
+
+  DEBUG(
+    dbgs() << "\n\n  NEW MACHINE FUNCTION: " << MF.getName() << "  \n\n";
+    dbgs() << "\n\n================================================\n\n";
+    dbgs() << "Total Basic Blocks: " << RPOList.size() << "\n";
+    for (auto MBB : RPOList) {
+      dbgs() << MBB->getName() << "\n";
+    }
+    dbgs() << "\n\n================================================\n\n";
+  );
+
+  std::vector<StringRef> BBNames;
+  std::vector<unsigned> RenamedInOtherBB;
+
+  unsigned GapIdx = 0;
+  unsigned BBNum = 0;
+
+  bool Changed = false;
+
+  for (auto MBB : RPOList)
+    Changed |= runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx);
+
+  return Changed;
+}
+

From 87cdca2231ed8908e603a904131c2f49c247303c Mon Sep 17 00:00:00 2001
From: Quentin Colombet
Date: Thu, 2 Nov 2017 23:38:13 +0000
Subject: [PATCH 030/238] [AArch64][RegisterBankInfo] Add FPR16 support in
 value mapping. NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317286 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AArch64/AArch64GenRegisterBankInfo.def    | 63 +++++++++++--------
 .../AArch64/AArch64RegisterBankInfo.cpp       |  8 ++-
 lib/Target/AArch64/AArch64RegisterBankInfo.h  | 12 ++--
 3 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index 7d2cfbeff38af..8f17ae4534c2c 100644
--- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -14,19 +14,21 @@ namespace llvm {
 RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
     /* StartIdx, Length, RegBank */
-    // 0: FPR 32-bit value.
+    // 0: FPR 16-bit value.
+    {0, 16, AArch64::FPRRegBank},
+    // 1: FPR 32-bit value.
     {0, 32, AArch64::FPRRegBank},
-    // 1: FPR 64-bit value.
+    // 2: FPR 64-bit value.
     {0, 64, AArch64::FPRRegBank},
-    // 2: FPR 128-bit value.
+    // 3: FPR 128-bit value.
     {0, 128, AArch64::FPRRegBank},
-    // 3: FPR 256-bit value.
+    // 4: FPR 256-bit value.
     {0, 256, AArch64::FPRRegBank},
-    // 4: FPR 512-bit value.
+    // 5: FPR 512-bit value.
     {0, 512, AArch64::FPRRegBank},
-    // 5: GPR 32-bit value.
+    // 6: GPR 32-bit value.
     {0, 32, AArch64::GPRRegBank},
-    // 6: GPR 64-bit value.
+    // 7: GPR 64-bit value.
     {0, 64, AArch64::GPRRegBank},
 };
 
@@ -37,55 +39,62 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
     {nullptr, 0},
     // 3-operands instructions (all binary operations should end up with one of
    // those mapping).
-    // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
+    // 1: FPR 16-bit value. <-- This must match First3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 4: FPR 32-bit value. <-- This must match First3OpsIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 4: FPR 64-bit value. + // 7: FPR 64-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, - // 7: FPR 128-bit value. + // 10: FPR 128-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, - // 10: FPR 256-bit value. + // 13: FPR 256-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, - // 13: FPR 512-bit value. + // 16: FPR 512-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, - // 16: GPR 32-bit value. + // 19: GPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 19: GPR 64-bit value. <-- This must match Last3OpsIdx. + // 22: GPR 64-bit value. <-- This must match Last3OpsIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, // Cross register bank copies. - // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match + // 25: FPR 16-bit value to GPR 16-bit (invalid). <-- This must match // FirstCrossRegCpyIdx. + {nullptr, 1}, + {nullptr, 1}, + // 27: FPR 32-bit value to GPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 24: FPR 64-bit value to GPR 64-bit value. + // 29: FPR 64-bit value to GPR 64-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, - // 26: FPR 128-bit value to GPR 128-bit value (invalid) + // 31: FPR 128-bit value to GPR 128-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 28: FPR 256-bit value to GPR 256-bit value (invalid) + // 33: FPR 256-bit value to GPR 256-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 30: FPR 512-bit value to GPR 512-bit value (invalid) + // 35: FPR 512-bit value to GPR 512-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 32: GPR 32-bit value to FPR 32-bit value. + // 37: GPR 32-bit value to FPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match + // 39: GPR 64-bit value to FPR 64-bit value. 
<-- This must match // LastCrossRegCpyIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, @@ -145,16 +154,18 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, return -1; } if (RBIdx == PMI_FirstFPR) { - if (Size <= 32) + if (Size <= 16) return 0; - if (Size <= 64) + if (Size <= 32) return 1; - if (Size <= 128) + if (Size <= 64) return 2; - if (Size <= 256) + if (Size <= 128) return 3; - if (Size <= 512) + if (Size <= 256) return 4; + if (Size <= 512) + return 5; return -1; } return -1; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 391e8ed633d7f..6e246a798c56e 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -87,9 +87,9 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, {PMI_GPR32, PMI_GPR64}) && "PartialMappingIdx's are incorrectly ordered"); - assert(checkPartialMappingIdx( - PMI_FirstFPR, PMI_LastFPR, - {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) && + assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR, + {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128, + PMI_FPR256, PMI_FPR512}) && "PartialMappingIdx's are incorrectly ordered"); // Now, the content. // Check partial mapping. @@ -102,6 +102,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); + CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); @@ -121,6 +122,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP(GPR, 32); CHECK_VALUEMAP(GPR, 64); + CHECK_VALUEMAP(FPR, 16); CHECK_VALUEMAP(FPR, 32); CHECK_VALUEMAP(FPR, 64); CHECK_VALUEMAP(FPR, 128); diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h index 6d74a47095a97..384b97729279b 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -25,10 +25,10 @@ class TargetRegisterInfo; class AArch64GenRegisterBankInfo : public RegisterBankInfo { protected: - enum PartialMappingIdx { PMI_None = -1, - PMI_FPR32 = 1, + PMI_FPR16 = 1, + PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, @@ -37,7 +37,7 @@ class AArch64GenRegisterBankInfo : public RegisterBankInfo { PMI_GPR64, PMI_FirstGPR = PMI_GPR32, PMI_LastGPR = PMI_GPR64, - PMI_FirstFPR = PMI_FPR32, + PMI_FirstFPR = PMI_FPR16, PMI_LastFPR = PMI_FPR512, PMI_Min = PMI_FirstFPR, }; @@ -49,10 +49,10 @@ class AArch64GenRegisterBankInfo : public RegisterBankInfo { enum ValueMappingIdx { InvalidIdx = 0, First3OpsIdx = 1, - Last3OpsIdx = 19, + Last3OpsIdx = 22, DistanceBetweenRegBanks = 3, - FirstCrossRegCpyIdx = 22, - LastCrossRegCpyIdx = 34, + FirstCrossRegCpyIdx = 25, + LastCrossRegCpyIdx = 39, DistanceBetweenCrossRegCpy = 2 }; From d8375d73687a9b88018cf808abbf8e639e2ad962 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Thu, 2 Nov 2017 23:38:19 +0000 Subject: [PATCH 031/238] [AArch64][RegisterBankInfo] Add mapping for G_FPEXT. This fixes http://llvm.org/PR32560. We were missing a description for half floating point type and as a result were using the FPR 32 mapping. 
Because of the size mismatch the generic code was complaining that the default mapping is not appropriate. Fix the mapping description so that the default mapping can be properly applied. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317287 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/AArch64GenRegisterBankInfo.def | 43 ++++++++ .../AArch64/AArch64RegisterBankInfo.cpp | 32 ++++++ lib/Target/AArch64/AArch64RegisterBankInfo.h | 15 ++- .../GlobalISel/arm64-regbankselect.mir | 104 ++++++++++++++++++ 4 files changed, 193 insertions(+), 1 deletion(-) diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index 8f17ae4534c2c..39f50ade747c2 100644 --- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -98,6 +98,18 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{ // LastCrossRegCpyIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 41: FPExt: 16 to 32. <-- This must match FPExt16To32Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 43: FPExt: 16 to 32. <-- This must match FPExt16To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 45: FPExt: 32 to 64. <-- This must match FPExt32To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, }; bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx, @@ -217,4 +229,35 @@ AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID, ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound"); return &ValMappings[ValMappingIdx]; } + +const RegisterBankInfo::ValueMapping * +AArch64GenRegisterBankInfo::getFPExtMapping(unsigned DstSize, + unsigned SrcSize) { + // We support: + // - For Scalar: + // - 16 to 32. + // - 16 to 64. + // - 32 to 64. + // => FPR 16 to FPR 32|64 + // => FPR 32 to FPR 64 + // - For vectors: + // - v4f16 to v4f32 + // - v2f32 to v2f64 + // => FPR 64 to FPR 128 + + // Check that we have been asked sensible sizes. + if (SrcSize == 16) { + assert((DstSize == 32 || DstSize == 64) && "Unexpected half extension"); + if (DstSize == 32) + return &ValMappings[FPExt16To32Idx]; + return &ValMappings[FPExt16To64Idx]; + } + + if (SrcSize == 32) { + assert(DstSize == 64 && "Unexpected float extension"); + return &ValMappings[FPExt32To64Idx]; + } + assert((SrcSize == 64 || DstSize == 128) && "Unexpected vector extension"); + return &ValMappings[FPExt64To128Idx]; +} } // End llvm namespace. 
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 6e246a798c56e..83bf493c9f05d 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -175,6 +175,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); +#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \ + do { \ + unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ + Map[0].NumBreakDowns == 1 && "FPR" #DstSize \ + " Dst is incorrectly initialized"); \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ + Map[1].NumBreakDowns == 1 && "FPR" #SrcSize \ + " Src is incorrectly initialized"); \ + \ + } while (false) + + CHECK_VALUEMAP_FPEXT(32, 16); + CHECK_VALUEMAP_FPEXT(64, 16); + CHECK_VALUEMAP_FPEXT(64, 32); + CHECK_VALUEMAP_FPEXT(128, 64); + assert(verify(TRI) && "Invalid register bank information"); } @@ -455,6 +479,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: return getSameKindOfOperandsMapping(MI); + case TargetOpcode::G_FPEXT: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()), + /*NumOperands*/ 2); + } case TargetOpcode::COPY: { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h index 384b97729279b..008221dbef58a 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -53,7 +53,11 @@ class AArch64GenRegisterBankInfo : public RegisterBankInfo { DistanceBetweenRegBanks = 3, FirstCrossRegCpyIdx = 25, LastCrossRegCpyIdx = 39, - DistanceBetweenCrossRegCpy = 2 + DistanceBetweenCrossRegCpy = 2, + FPExt16To32Idx = 41, + FPExt16To64Idx = 43, + FPExt32To64Idx = 45, + FPExt64To128Idx = 47, }; static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, @@ -82,6 +86,15 @@ class AArch64GenRegisterBankInfo : public RegisterBankInfo { static const RegisterBankInfo::ValueMapping * getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + /// Get the instruction mapping for G_FPEXT. + /// + /// \pre (DstSize, SrcSize) pair is one of the following: + /// (32, 16), (64, 16), (64, 32), (128, 64) + /// + /// \return An InstructionMapping with statically allocated OperandsMapping. 
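+  ///
+  /// For example, getFPExtMapping(64, 16) yields the two-entry mapping at
+  /// FPExt16To64Idx: an FPR64 partial mapping for the destination followed
+  /// by an FPR16 partial mapping for the source.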
+ static const RegisterBankInfo::ValueMapping * + getFPExtMapping(unsigned DstSize, unsigned SrcSize); + #define GET_TARGET_REGBANK_CLASS #include "AArch64GenRegisterBank.inc" }; diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir index 4042047dfc243..cc158a29c3e1d 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir @@ -92,6 +92,10 @@ store double %vres, double* %addr ret void } + + define void @fp16Ext32() { ret void } + define void @fp16Ext64() { ret void } + define void @fp32Ext64() { ret void } ... --- @@ -742,3 +746,103 @@ body: | RET_ReallyLR ... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp16Ext32 +name: fp16Ext32 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %1:gpr(s32) = COPY %w0 +# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3:fpr(s16) = COPY %0 +# CHECK-NEXT: %2:fpr(s32) = G_FPEXT %3 +# CHECK-NEXT: %s0 = COPY %2 +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.1: + liveins: %w0 + + %1(s32) = COPY %w0 + %0(s16) = G_TRUNC %1(s32) + %2(s32) = G_FPEXT %0(s16) + %s0 = COPY %2(s32) + RET_ReallyLR implicit %s0 + +... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp16Ext64 +name: fp16Ext64 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %1:gpr(s32) = COPY %w0 +# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3:fpr(s16) = COPY %0 +# CHECK-NEXT: %2:fpr(s64) = G_FPEXT %3 +# CHECK-NEXT: %d0 = COPY %2 +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.1: + liveins: %w0 + + %1(s32) = COPY %w0 + %0(s16) = G_TRUNC %1(s32) + %2(s64) = G_FPEXT %0(s16) + %d0 = COPY %2(s64) + RET_ReallyLR implicit %d0 + +... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp32Ext64 +name: fp32Ext64 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +# CHECK: %0:gpr(s32) = COPY %w0 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %2:fpr(s32) = COPY %0 +# CHECK-NEXT: %1:fpr(s64) = G_FPEXT %2 +# CHECK-NEXT: %d0 = COPY %1 +# CHECK-NEXT: RET_ReallyLR +body: | + bb.1: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_FPEXT %0(s32) + %d0 = COPY %1(s64) + RET_ReallyLR implicit %d0 + +... 
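For reference, the (DstSize, SrcSize) pairs accepted by getFPExtMapping
select ValMappings slots as follows; this is a sketch mirroring the logic
above, not part of the patch itself:

    // Illustrative recap of the ValueMappingIdx layout for G_FPEXT.
    unsigned fpExtIdx(unsigned DstSize, unsigned SrcSize) {
      if (SrcSize == 16)
        return DstSize == 32 ? 41 /*FPExt16To32Idx*/ : 43 /*FPExt16To64Idx*/;
      if (SrcSize == 32)
        return 45; // FPExt32To64Idx
      return 47;   // FPExt64To128Idx, the vector case
    }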
From b57c6f4150d6525d1c1390fdd84f8ca4151eb570 Mon Sep 17 00:00:00 2001
From: Vedant Kumar
Date: Thu, 2 Nov 2017 23:44:20 +0000
Subject: [PATCH 032/238] [Verifier] Remove the -verify-debug-info cl::opt

This cl::opt has been dead for a while. It's no longer possible to run
the verifier without also verifying debug info.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317288 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/Verifier.cpp                         | 2 --
 test/CodeGen/NVPTX/generic-to-nvvm-ir.ll    | 2 +-
 test/DebugInfo/Generic/location-verifier.ll | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index c528f7167e7a3..5bb1f84d2e5c7 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -115,8 +115,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool> VerifyDebugInfo("verify-debug-info", cl::init(true));
-
 namespace llvm {
 
 struct VerifierSupport {
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
index f874148c0e830..5df5183dc2fba 100644
--- a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
+++ b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll
@@ -1,6 +1,6 @@
 ; Verify functionality of NVPTXGenericToNVVM.cpp pass.
 ;
-; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm -verify-debug-info | FileCheck %s
+; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm | FileCheck %s
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
diff --git a/test/DebugInfo/Generic/location-verifier.ll b/test/DebugInfo/Generic/location-verifier.ll
index b1e0805428c6f..3c6bb425a6679 100644
--- a/test/DebugInfo/Generic/location-verifier.ll
+++ b/test/DebugInfo/Generic/location-verifier.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as -disable-output -verify-debug-info -o - < %s 2>&1 | FileCheck %s
+; RUN: llvm-as -disable-output -o - < %s 2>&1 | FileCheck %s
 ; ModuleID = 'test.c'
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"

From c7ddffcd3298d393e914d823a7c34d0915588bb7 Mon Sep 17 00:00:00 2001
From: Jake Ehrlich
Date: Thu, 2 Nov 2017 23:45:51 +0000
Subject: [PATCH 033/238] Reland "Add feature to determine if host architecture
 is 64-bit in llvm-lit"

A member of config was removed in this patch which resulted in errors I
didn't expect. Removing config.host_arch will take more work, so I'm
re-adding that field.

Differential Revision: https://reviews.llvm.org/D39465

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317289 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/lit.site.cfg.py.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in
index efdd016e45d99..dff46dcff32ea 100644
--- a/test/lit.site.cfg.py.in
+++ b/test/lit.site.cfg.py.in
@@ -42,6 +42,7 @@ config.build_shared_libs = @BUILD_SHARED_LIBS@
 config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@
 config.llvm_libxml2_enabled = "@LLVM_LIBXML2_ENABLED@"
 config.llvm_host_triple = '@LLVM_HOST_TRIPLE@'
+config.host_arch = "@HOST_ARCH@"
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.

From 1bd292583c0be67468743c6ae20046b6019f4da3 Mon Sep 17 00:00:00 2001
From: Sriraman Tallam
Date: Fri, 3 Nov 2017 00:10:19 +0000
Subject: [PATCH 034/238] Avoid PLT for external calls when attribute
 nonlazybind is used.
Differential Revision: https://reviews.llvm.org/D39065 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317292 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86Subtarget.cpp | 11 +++++++++-- test/CodeGen/X86/no-plt.ll | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/X86/no-plt.ll diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index b0ce1335bd37d..9e060f97df343 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -144,6 +144,15 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const { unsigned char X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const { + const Function *F = dyn_cast_or_null(GV); + + // Do not use the PLT when explicitly told to do so for ELF 64-bit + // target. + if (isTargetELF() && is64Bit() && F && + F->hasFnAttribute(Attribute::NonLazyBind) && + GV->isDeclarationForLinker()) + return X86II::MO_GOTPCREL; + if (TM.shouldAssumeDSOLocal(M, GV)) return X86II::MO_NO_FLAG; @@ -153,8 +162,6 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, return X86II::MO_DLLIMPORT; } - const Function *F = dyn_cast_or_null(GV); - if (isTargetELF()) { if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv())) // According to psABI, PLT stub clobbers XMM8-XMM15. diff --git a/test/CodeGen/X86/no-plt.ll b/test/CodeGen/X86/no-plt.ll new file mode 100644 index 0000000000000..77ef686cc851f --- /dev/null +++ b/test/CodeGen/X86/no-plt.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \ +; RUN: | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu \ +; RUN: | FileCheck -check-prefix=X64 %s + +define i32 @main() #0 { +; X64: callq *_Z3foov@GOTPCREL(%rip) +; X64: callq _Z3barv + +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %call1 = call i32 @_Z3foov() + %call2 = call i32 @_Z3barv() + ret i32 0 +} + +; Function Attrs: nonlazybind +declare i32 @_Z3foov() #1 + +declare i32 @_Z3barv() #2 + +attributes #1 = { nonlazybind } From 931b3020257e13988c2498855ffa76e16afa7712 Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Fri, 3 Nov 2017 01:01:28 +0000 Subject: [PATCH 035/238] [LSR] Clarify a comment. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317295 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index bbb179d3790c4..7f03f2379e78f 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1037,7 +1037,7 @@ struct LSRFixup { Value *OperandValToReplace = nullptr; /// If this user is to use the post-incremented value of an induction - /// variable, this variable is non-null and holds the loop associated with the + /// variable, this set is non-empty and holds the loops associated with the /// induction variable. PostIncLoopSet PostIncLoops; From 06d5ebdc63193e9cc621d9d85d57b4205981f692 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 05:19:34 +0000 Subject: [PATCH 036/238] [TableGen] Add an extra blank line to DAGISel output file to separate functions. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317298 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/TableGen/DAGISelMatcherEmitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp index 76370cdad6782..672f9f8620fca 100644 --- a/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -974,7 +974,7 @@ void llvm::EmitMatcherTable(const Matcher *TheMatcher, OS << " #undef TARGET_VAL\n"; OS << " SelectCodeCommon(N, MatcherTable,sizeof(MatcherTable));\n"; - OS << "}\n"; + OS << "}\n\n"; // Next up, emit the function for node and pattern predicates: MatcherEmitter.EmitPredicateFunctions(OS); From c43a693efb02155f32e2f61310262082d27f91f3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 06:48:02 +0000 Subject: [PATCH 037/238] [X86] Remove PALIGNR/VALIGN handling from combineBitcastForMaskedOp and move to isel patterns instead. Prefer 128-bit VALIGND/VALIGNQ over PALIGNR during lowering when possible. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317299 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 36 +++--- lib/Target/X86/X86InstrAVX512.td | 117 ++++++++++++++++++ .../X86/avx512vl-intrinsics-upgrade.ll | 4 +- 3 files changed, 133 insertions(+), 24 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b178ad6c13e75..d64cc411391d8 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -10716,10 +10716,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget.hasSSSE3()) + if (Subtarget.hasSSSE3()) { + if (Subtarget.hasVLX()) + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. @@ -11016,10 +11022,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget.hasSSSE3()) + if (Subtarget.hasSSSE3()) { + if (Subtarget.hasVLX()) + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). @@ -30674,26 +30686,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, unsigned Opcode = Op.getOpcode(); switch (Opcode) { - case X86ISD::PALIGNR: - // PALIGNR can be converted to VALIGND/Q for 128-bit vectors. 
-      if (!VT.is128BitVector())
-        return false;
-      Opcode = X86ISD::VALIGN;
-      LLVM_FALLTHROUGH;
-    case X86ISD::VALIGN: {
-      if (EltVT != MVT::i32 && EltVT != MVT::i64)
-        return false;
-      uint64_t Imm = Op.getConstantOperandVal(2);
-      MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
-      unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
-      unsigned EltSize = EltVT.getSizeInBits();
-      // Make sure we can represent the same shift with the new VT.
-      if ((ShiftAmt % EltSize) != 0)
-        return false;
-      Imm = ShiftAmt / EltSize;
-      return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
-                                      DAG.getConstant(Imm, DL, MVT::i8));
-    }
     case X86ISD::SHUF128: {
       if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
         return false;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index a73ee19423d3b..17b5e10c6a49e 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -8911,6 +8911,123 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr",
                                          avx512vl_i8_info, avx512vl_i8_info>,
                 EVEX_CD8<8, CD8VF>;
 
+// Fragments to help convert valignq into masked valignd. Or valignq/valignd
+// into vpalignr.
+def ValignqImm32XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
+}]>;
+def ValignqImm8XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
+}]>;
+def ValigndImm8XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
+}]>;
+
+multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
+                                        X86VectorVTInfo From,
+                                        X86VectorVTInfo To,
+                                        SDNodeXForm ImmXForm> {
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+                                              imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
+                                                  To.RC:$src1, To.RC:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+                                              imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
+                                                   To.RC:$src1, To.RC:$src2,
+                                                   (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert (To.LdFrag addr:$src2)),
+                                              imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
+                                                  To.RC:$src1, addr:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert (To.LdFrag addr:$src2)),
+                                              imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
+                                                   To.RC:$src1, addr:$src2,
+                                                   (ImmXForm imm:$src3))>;
+}
+
+multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
+                                           X86VectorVTInfo From,
+                                           X86VectorVTInfo To,
+                                           SDNodeXForm ImmXForm> :
+    avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
+  def : Pat<(From.VT (OpNode From.RC:$src1,
+                             (bitconvert (To.VT (X86VBroadcast
+                                                 (To.ScalarLdFrag addr:$src2)))),
+                             imm:$src3)),
+            (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert
+                                               (To.VT (X86VBroadcast
+                                                       (To.ScalarLdFrag addr:$src2)))),
+                                              imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
+                                                   To.RC:$src1, addr:$src2,
+                                                   (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                              (bitconvert
+                                               (To.VT (X86VBroadcast
+                                                       (To.ScalarLdFrag addr:$src2)))),
+                                              imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
+                                                    To.RC:$src1, addr:$src2,
+                                                    (ImmXForm imm:$src3))>;
+}
+
+let Predicates = [HasAVX512] in {
+  // For 512-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
+                                         v16i32_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX] in {
+  // For 128-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
+                                         v4i32x_info, ValignqImm32XForm>;
+  // For 256-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
+                                         v8i32x_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+  // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR.
+  defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
+                                      v16i8x_info, ValignqImm8XForm>;
+  defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
+                                      v16i8x_info, ValigndImm8XForm>;
+}
+
 defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",
                                           avx512vl_i16_info, avx512vl_i8_info>,
                 EVEX_CD8<8, CD8VF>;
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index b6723ee50b059..6c6fad794c856 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -4712,8 +4712,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
 define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06]
-; CHECK-NEXT:    ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5]
+; CHECK-NEXT:    valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]
+; CHECK-NEXT:    ## ymm3 = ymm1[3],ymm0[0,1,2]
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06]
 ; CHECK-NEXT:    ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5]

From 37104fff45a7783a64ab121f9aee29da922678dd Mon Sep 17 00:00:00 2001
From: Martin Storsjo
Date: Fri, 3 Nov 2017 07:18:14 +0000
Subject: [PATCH 038/238] [llvm-nm] Print 'I' for import table data in COFF

The character gets uppercased into 'I' when it's a global symbol.

In GNU binutils, nm prints 'I' for symbols classified by
bfd_is_ind_section - which probably isn't exactly/only import tables.

When building for win32, some incarnations of libtool have rules that
try to inspect linked libraries, and in order to be sure that they are
linking to a DLL import library as opposed to a static library, they
expect to find the string " I " in the output of $NM when run on such
an import library.

GNU binutils nm also flags all of the .idata$X chunks as 'i' (while
this patch only makes it set on .idata$2 and .idata$6) and also flags
__imp__function as 'I'.
Differential Revision: https://reviews.llvm.org/D39540 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317300 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-nm/X86/importlibrary.test | 2 ++ tools/llvm-nm/llvm-nm.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/test/tools/llvm-nm/X86/importlibrary.test b/test/tools/llvm-nm/X86/importlibrary.test index 9111694c2c6f5..107628d09efbf 100644 --- a/test/tools/llvm-nm/X86/importlibrary.test +++ b/test/tools/llvm-nm/X86/importlibrary.test @@ -1,5 +1,7 @@ # RUN: llvm-nm -B %S/Inputs/example.lib | FileCheck --match-full-lines %s +CHECK: 00000000 I __IMPORT_DESCRIPTOR_example +CHECK: 00000000 I __NULL_IMPORT_DESCRIPTOR CHECK: 00000000 R __imp__constant CHECK: 00000000 R _constant CHECK: 00000000 D __imp__data diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 4ad0d95d67f66..1b093f501d516 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -946,6 +946,10 @@ static char getSymbolNMTypeChar(COFFObjectFile &Obj, symbol_iterator I) { section_iterator SecI = *SecIOrErr; const coff_section *Section = Obj.getCOFFSection(*SecI); Characteristics = Section->Characteristics; + StringRef SectionName; + Obj.getSectionName(Section, SectionName); + if (SectionName.startswith(".idata")) + return 'i'; } switch (Symb.getSectionNumber()) { From f30757f3b0769ec6504e75254f95b66f6dd0f50c Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Fri, 3 Nov 2017 07:18:21 +0000 Subject: [PATCH 039/238] [llvm-nm] Don't error out on multiple occurrances of the -g/--external-only flag GNU binutils nm doesn't error out on this, and some projects' build systems can end up doing that in some cases. Allowing that seems like a better target than trying to avoid user projects passing multiple -g parameters to $NM. 
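For reference, the occurrence policy is what makes the difference here:
with the default cl::Optional policy, a repeated option makes command-line
parsing fail, while cl::ZeroOrMore accepts any number of occurrences. A
reduced sketch (illustrative, assuming llvm/Support/CommandLine.h and the
llvm namespace; not the llvm-nm code itself):

    // With cl::ZeroOrMore, "-g -g" parses fine; with the default policy the
    // second occurrence is rejected.
    static cl::opt<bool> ExternalOnly("g",
                                      cl::desc("Show only external symbols"),
                                      cl::ZeroOrMore);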
Differential Revision: https://reviews.llvm.org/D39539

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317301 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-nm/X86/externalonly.test | 1 +
 tools/llvm-nm/llvm-nm.cpp                | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/tools/llvm-nm/X86/externalonly.test b/test/tools/llvm-nm/X86/externalonly.test
index c374129878654..2a1853b426fd6 100644
--- a/test/tools/llvm-nm/X86/externalonly.test
+++ b/test/tools/llvm-nm/X86/externalonly.test
@@ -1,4 +1,5 @@
 # RUN: llvm-nm -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+# RUN: llvm-nm -g -g %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
 
 # CHECK-NOT: EH_frame0
 # CHECK: _main
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 1b093f501d516..852043002846a 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -85,9 +85,11 @@ cl::alias DefinedOnly2("U", cl::desc("Alias for --defined-only"),
                        cl::aliasopt(DefinedOnly), cl::Grouping);
 
 cl::opt<bool> ExternalOnly("extern-only",
-                           cl::desc("Show only external symbols"));
+                           cl::desc("Show only external symbols"),
+                           cl::ZeroOrMore);
 cl::alias ExternalOnly2("g", cl::desc("Alias for --extern-only"),
-                        cl::aliasopt(ExternalOnly), cl::Grouping);
+                        cl::aliasopt(ExternalOnly), cl::Grouping,
+                        cl::ZeroOrMore);
 
 cl::opt<bool> BSDFormat("B", cl::desc("Alias for --format=bsd"),
                         cl::Grouping);

From 691ff5f85039e136ceeca85db6c2b368e2729fba Mon Sep 17 00:00:00 2001
From: Max Kazantsev
Date: Fri, 3 Nov 2017 07:30:45 +0000
Subject: [PATCH 040/238] [NFC] Get rid of hard-coded value ID in test

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317303 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Transforms/IRCE/add-metadata-pre-post-loops.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
index 488d4b479bab7..0225af903ef44 100644
--- a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
+++ b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
@@ -38,7 +38,7 @@ exit:                                             ; preds = %in.bounds, %entry
 define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 %offset) {
 ; CHECK-LABEL: @single_access_with_preloop(
 ; CHECK-LABEL: in.bounds.preloop
-; CHECK: br i1 %14, label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
+; CHECK: br i1 [[COND:%[^ ]+]], label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
 ; CHECK-LABEL: in.bounds.postloop
 ; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit.loopexit, !llvm.loop !9, !irce.loop.clone !7
 entry:

From 19a3ba35df240471e549b2c4d0c7da390ec0064d Mon Sep 17 00:00:00 2001
From: Martin Storsjo
Date: Fri, 3 Nov 2017 07:33:20 +0000
Subject: [PATCH 041/238] [AArch64] Use dwarf exception handling on MinGW

Ideally we should probably produce WinEH here as well, but until then,
we can use dwarf exceptions, without any further changes required in
clang, libunwind or libcxxabi.
Differential Revision: https://reviews.llvm.org/D39535 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317304 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 4 ++- .../MCTargetDesc/AArch64WinCOFFStreamer.cpp | 8 +++++ test/CodeGen/AArch64/dwarf-cfi.ll | 36 +++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/AArch64/dwarf-cfi.ll diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 7fba48494384d..c5da457c38fff 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -106,13 +106,15 @@ AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() { PrivateLabelPrefix = ".L"; AlignmentIsInBytes = false; SupportsDebugInformation = true; - ExceptionsType = ExceptionHandling::WinEH; + CodePointerSize = 8; } AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() { CommentString = ";"; + ExceptionsType = ExceptionHandling::WinEH; } AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { CommentString = "//"; + ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index 9d0f39e5f6ad9..c88363d2c250f 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -23,7 +23,15 @@ class AArch64WinCOFFStreamer : public MCWinCOFFStreamer { std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {} + + void FinishImpl() override; }; + +void AArch64WinCOFFStreamer::FinishImpl() { + EmitFrames(nullptr); + + MCWinCOFFStreamer::FinishImpl(); +} } // end anonymous namespace namespace llvm { diff --git a/test/CodeGen/AArch64/dwarf-cfi.ll b/test/CodeGen/AArch64/dwarf-cfi.ll new file mode 100644 index 0000000000000..a75bcd19c69c8 --- /dev/null +++ b/test/CodeGen/AArch64/dwarf-cfi.ll @@ -0,0 +1,36 @@ +; RUN: llc -mtriple aarch64-windows-gnu -filetype=asm -o - %s | FileCheck %s + +define void @_Z1gv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + invoke void @_Z1fv() + to label %try.cont unwind label %lpad + +lpad: + %0 = landingpad { i8*, i32 } + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2 + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + ret void +} + +declare void @_Z1fv() + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +; CHECK-LABEL: _Z1gv: ; CHECK: .cfi_startproc ; CHECK: .cfi_personality 0, __gxx_personality_v0 ; CHECK: .cfi_lsda 0, .Lexception0 ; CHECK: str x30, [sp, #-16]! ; CHECK: .cfi_def_cfa_offset 16 ; CHECK: .cfi_offset w30, -16 ; CHECK: ldr x30, [sp], #16 ; CHECK: .cfi_endproc From ba9125e489dab87e4706bef3408a943967e86415 Mon Sep 17 00:00:00 2001 From: Francis Visoiu Mistrih Date: Fri, 3 Nov 2017 09:46:36 +0000 Subject: [PATCH 042/238] [PEI] Simplify handling of targets with no phys regs. NFC Make doSpillCalleeSavedRegs a member function, instead of passing most of the members of PEI as arguments.
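The shape of the refactoring, reduced to a sketch with toy types (not the real pass):

    #include <cassert>

    // Before: a file-static helper received MinCSFrameIndex, MaxCSFrameIndex,
    // SaveBlocks and RestoreBlocks, all of which are already members of PEI.
    // After: a member function reads that state from the pass object directly.
    struct Pass {
      unsigned MinCSFrameIndex = 0, MaxCSFrameIndex = 0;
      void spillCalleeSavedRegs(); // was: static void spill(Pass &, unsigned &, ...)
    };

    void Pass::spillCalleeSavedRegs() {
      MinCSFrameIndex = 1; // uses the members directly, no parameter threading
      MaxCSFrameIndex = 2;
    }

    int main() {
      Pass P;
      P.spillCalleeSavedRegs();
      assert(P.MaxCSFrameIndex == 2);
    }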
Differential Revision: https://reviews.llvm.org/D35642 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317309 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/PrologEpilogInserter.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index d9e9b3360a053..d611c9b45c51a 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -76,12 +76,6 @@ using namespace llvm; using MBBVector = SmallVector<MachineBasicBlock *, 4>; -static void spillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, - unsigned &MinCSFrameIndex, - unsigned &MaxCXFrameIndex, - const MBBVector &SaveBlocks, - const MBBVector &RestoreBlocks); - namespace { class PEI : public MachineFunctionPass { @@ -125,6 +119,7 @@ class PEI : public MachineFunctionPass { void calculateCallFrameInfo(MachineFunction &Fn); void calculateSaveRestoreBlocks(MachineFunction &Fn); + void spillCalleeSavedRegs(MachineFunction &MF); void calculateFrameObjectOffsets(MachineFunction &Fn); void replaceFrameIndices(MachineFunction &Fn); @@ -197,8 +192,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { // Handle CSR spilling and restoring, for targets that need it. if (Fn.getTarget().usesPhysRegsForPEI()) - spillCalleeSavedRegs(Fn, RS, MinCSFrameIndex, MaxCSFrameIndex, SaveBlocks, - RestoreBlocks); + spillCalleeSavedRegs(Fn); // Allow the target machine to make final modifications to the function // before the frame layout is finalized. @@ -505,11 +499,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, } } -static void spillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex, - const MBBVector &SaveBlocks, - const MBBVector &RestoreBlocks) { +void PEI::spillCalleeSavedRegs(MachineFunction &Fn) { // We can't list this requirement in getRequiredProperties because some // targets (WebAssembly) use virtual registers past this point, and the pass // pipeline is set up without giving the passes a chance to look at the From 74ecc3ab6b5c584d09919664232287258270e39b Mon Sep 17 00:00:00 2001 From: "Ivan A. Kosarev" Date: Fri, 3 Nov 2017 10:26:25 +0000 Subject: [PATCH 043/238] [Analysis] Refine matching and merging of TBAA tags This patch combines the code that matches and merges TBAA access tags. The aim is to simplify future changes and to make sure that these operations produce consistent results. Differential Revision: https://reviews.llvm.org/D39463 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317311 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/TypeBasedAliasAnalysis.cpp | 173 +++++++++++++----------- 1 file changed, 95 insertions(+), 78 deletions(-) diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index 3a3a7ad39554f..8812ca207ba31 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -314,17 +314,8 @@ AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA, if (!EnableTBAA) return AAResultBase::alias(LocA, LocB); - // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must - // be conservative. - const MDNode *AM = LocA.AATags.TBAA; - if (!AM) - return AAResultBase::alias(LocA, LocB); - const MDNode *BM = LocB.AATags.TBAA; - if (!BM) - return AAResultBase::alias(LocA, LocB); - - // If they may alias, chain to the next AliasAnalysis.
- if (Aliases(AM, BM)) + // If accesses may alias, chain to the next AliasAnalysis. + if (Aliases(LocA.AATags.TBAA, LocB.AATags.TBAA)) return AAResultBase::alias(LocA, LocB); // Otherwise return a definitive result. @@ -424,25 +415,24 @@ bool MDNode::isTBAAVtableAccess() const { return false; } +static bool matchAccessTags(const MDNode *A, const MDNode *B, + const MDNode **GenericTag = nullptr); + MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { + const MDNode *GenericTag; + matchAccessTags(A, B, &GenericTag); + return const_cast<MDNode *>(GenericTag); +} + +static const MDNode *getLeastCommonType(const MDNode *A, const MDNode *B) { if (!A || !B) return nullptr; if (A == B) return A; - // For struct-path aware TBAA, we use the access type of the tag. - assert(isStructPathTBAA(A) && isStructPathTBAA(B) && - "Auto upgrade should have taken care of this!"); - A = cast_or_null<MDNode>(MutableTBAAStructTagNode(A).getAccessType()); - if (!A) - return nullptr; - B = cast_or_null<MDNode>(MutableTBAAStructTagNode(B).getAccessType()); - if (!B) - return nullptr; - - SmallSetVector<MDNode *, 4> PathA; - MutableTBAANode TA(A); + SmallSetVector<const MDNode *, 4> PathA; + TBAANode TA(A); while (TA.getNode()) { if (PathA.count(TA.getNode())) report_fatal_error("Cycle found in TBAA metadata."); @@ -450,8 +440,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { TA = TA.getParent(); } - SmallSetVector<MDNode *, 4> PathB; - MutableTBAANode TB(B); + SmallSetVector<const MDNode *, 4> PathB; + TBAANode TB(B); while (TB.getNode()) { if (PathB.count(TB.getNode())) report_fatal_error("Cycle found in TBAA metadata."); @@ -462,7 +452,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { int IA = PathA.size() - 1; int IB = PathB.size() - 1; - MDNode *Ret = nullptr; + const MDNode *Ret = nullptr; while (IA >= 0 && IB >= 0) { if (PathA[IA] == PathB[IB]) Ret = PathA[IA]; @@ -472,17 +462,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { --IB; } - // We either did not find a match, or the only common base "type" is - // the root node. In either case, we don't have any useful TBAA - // metadata to attach. - if (!Ret || Ret->getNumOperands() < 2) - return nullptr; - - // We need to convert from a type node to a tag node. - Type *Int64 = IntegerType::get(A->getContext(), 64); - Metadata *Ops[3] = {Ret, Ret, - ConstantAsMetadata::get(ConstantInt::get(Int64, 0))}; - return MDNode::get(A->getContext(), Ops); + return Ret; } void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const { @@ -505,70 +485,107 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const { N.NoAlias = getMetadata(LLVMContext::MD_noalias); } -/// Aliases - Test whether the type represented by A may alias the -/// type represented by B. -bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const { +static bool findAccessType(TBAAStructTagNode BaseTag, + const MDNode *AccessTypeNode, + uint64_t &OffsetInBase) { + // Start from the base type, follow the edge with the correct offset in + // the type DAG and adjust the offset until we reach the access type or + // until we reach a root node. + TBAAStructTypeNode BaseType(BaseTag.getBaseType()); + OffsetInBase = BaseTag.getOffset(); + + while (const MDNode *BaseTypeNode = BaseType.getNode()) { + if (BaseTypeNode == AccessTypeNode) + return true; + + // Follow the edge with the correct offset, Offset will be adjusted to + // be relative to the field type.
+ BaseType = BaseType.getParent(OffsetInBase); + } + return false; +} + +static const MDNode *createAccessTag(const MDNode *AccessType) { + Type *Int64 = IntegerType::get(AccessType->getContext(), 64); + auto *ImmutabilityFlag = ConstantAsMetadata::get(ConstantInt::get(Int64, 0)); + Metadata *Ops[] = {const_cast<MDNode *>(AccessType), + const_cast<MDNode *>(AccessType), ImmutabilityFlag}; + return MDNode::get(AccessType->getContext(), Ops); +} + +/// matchTags - Return true if the given couple of accesses are allowed to +/// overlap. If \arg GenericTag is not null, then on return it points to the +/// most generic access descriptor for the given two. +static bool matchAccessTags(const MDNode *A, const MDNode *B, + const MDNode **GenericTag) { + if (A == B) { + if (GenericTag) + *GenericTag = A; + return true; + } + + // Accesses with no TBAA information may alias with any other accesses. + if (!A || !B) { + if (GenericTag) + *GenericTag = nullptr; + return true; + } + // Verify that both input nodes are struct-path aware. Auto-upgrade should // have taken care of this. - assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware."); - assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware."); + assert(isStructPathTBAA(A) && "Access A is not struct-path aware!"); + assert(isStructPathTBAA(B) && "Access B is not struct-path aware!"); - // Keep track of the root node for A and B. - TBAAStructTypeNode RootA, RootB; TBAAStructTagNode TagA(A), TagB(B); // TODO: We need to check if AccessType of TagA encloses AccessType of // TagB to support aggregate AccessType. If yes, return true. - // Start from the base type of A, follow the edge with the correct offset in - // the type DAG and adjust the offset until we reach the base type of B or - // until we reach the Root node. - // Compare the adjusted offset once we have the same base. - - // Climb the type DAG from base type of A to see if we reach base type of B. const MDNode *BaseA = TagA.getBaseType(); const MDNode *BaseB = TagB.getBaseType(); - uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset(); - for (TBAAStructTypeNode T(BaseA);;) { - if (T.getNode() == BaseB) - // Base type of A encloses base type of B, check if the offsets match. - return OffsetA == OffsetB; - - RootA = T; - // Follow the edge with the correct offset, OffsetA will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetA); - if (!T.getNode()) - break; - } - // Reset OffsetA and climb the type DAG from base type of B to see if we reach - // base type of A. - OffsetA = TagA.getOffset(); - for (TBAAStructTypeNode T(BaseB);;) { - if (T.getNode() == BaseA) - // Base type of B encloses base type of A, check if the offsets match. - return OffsetA == OffsetB; + // Climb the type DAG from base type of A to see if we reach base type of B. + uint64_t OffsetA; + if (findAccessType(TagA, BaseB, OffsetA)) { + if (GenericTag) + *GenericTag = createAccessTag(TagB.getAccessType()); + return OffsetA == TagB.getOffset(); + } - RootB = T; - // Follow the edge with the correct offset, OffsetB will be adjusted to - // be relative to the field type. - T = T.getParent(OffsetB); - if (!T.getNode()) - break; + // Climb the type DAG from base type of B to see if we reach base type of A. + uint64_t OffsetB; + if (findAccessType(TagB, BaseA, OffsetB)) { + if (GenericTag) + *GenericTag = createAccessTag(TagA.getAccessType()); + return OffsetB == TagA.getOffset(); } - // Neither node is an ancestor of the other.
+ // If neither node is an ancestor of the other, then try to find the type + // that is common to both the final access types. + const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(), + TagB.getAccessType()); + + // If there is no common type or the only common type is the root node, then + // we don't have any useful generic access tag to return. + if (GenericTag) + *GenericTag = !CommonType || CommonType->getNumOperands() < 2 ? + nullptr : createAccessTag(CommonType); // If they have different roots, they're part of different potentially // unrelated type systems, so we must be conservative. - if (RootA.getNode() != RootB.getNode()) + if (!CommonType) return true; // If they have the same root, then we've proved there's no alias. return false; } +/// Aliases - Test whether the access represented by tag A may alias the +/// access represented by tag B. +bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const { + return matchAccessTags(A, B); +} + AnalysisKey TypeBasedAA::Key; TypeBasedAAResult TypeBasedAA::run(Function &F, FunctionAnalysisManager &AM) { From 5281112161326303f9a4571f3c7492fc2f2be6e6 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 3 Nov 2017 10:30:12 +0000 Subject: [PATCH 044/238] [ARM GlobalISel] Move the check for Thumb higher up We're currently bailing out for Thumb targets while lowering formal parameters, but there used to be some other checks before it, which could've caused some functions (e.g. those without formal parameters) to sneak through unnoticed. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317312 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMCallLowering.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index e1323cd9427ee..9c10a1c79a422 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -417,6 +417,12 @@ struct FormalArgHandler : public IncomingValueHandler { bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const { + auto &TLI = *getTLI<ARMTargetLowering>(); + auto Subtarget = TLI.getSubtarget(); + + if (Subtarget->isThumb()) + return false; + // Quick exit if there aren't any args if (F.arg_empty()) return true; @@ -427,12 +433,6 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, auto &MF = MIRBuilder.getMF(); auto &MBB = MIRBuilder.getMBB(); auto DL = MF.getDataLayout(); - auto &TLI = *getTLI<ARMTargetLowering>(); - - auto Subtarget = TLI.getSubtarget(); - - if (Subtarget->isThumb()) - return false; for (auto &Arg : F.args()) if (!isSupportedType(DL, TLI, Arg.getType())) From a7372f15c92ca8a556487877cb9df960ad68f4e3 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 3 Nov 2017 10:30:19 +0000 Subject: [PATCH 045/238] [globalisel][tablegen] Skip src child predicates The GlobalISel TableGen backend didn't check for predicates on the source children. This caused it to generate code for ARM patterns such as SMLABB or similar, but without properly checking for the sext_16_node part of the operands. This in turn meant that we would select SMLABB instead of MLA for simple sequences such as s32 + s32 * s32, which is wrong (we want a MLA on the full operands, not just their bottom 16 bits). This patch forces TableGen to skip patterns with predicates on the src children, so it doesn't generate code for SMLABB and other similar ARM instructions at all anymore. AArch64 and X86 are not affected.
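At the source level the affected pattern is just a multiply-accumulate; a reduced example (hypothetical function, compiled for ARM with GlobalISel):

    // Selecting SMLABB here would multiply only the bottom 16 bits of b and
    // c, which is wrong for plain 32-bit operands; SMLABB is only correct
    // when both multiplicands are sign-extended from 16 bits (sext_16_node).
    // After this patch, GlobalISel selects a full 32x32 MLA instead.
    int mla(int a, int b, int c) { return a + b * c; }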
Differential Revision: https://reviews.llvm.org/D39554 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317313 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../arm-instruction-select-combos.mir | 35 +++++++++++++++++++ utils/TableGen/GlobalISelEmitter.cpp | 3 ++ 2 files changed, 38 insertions(+) diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir index d96463f00c7bb..939c851584cf2 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir @@ -1,6 +1,7 @@ # RUN: llc -O0 -mtriple arm-- -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- | define void @test_mla() #0 { ret void } + define void @test_mla_commutative() #0 { ret void } define void @test_mla_v5() #1 { ret void } define void @test_mls() #2 { ret void } @@ -45,6 +46,40 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_mla_commutative +# CHECK-LABEL: name: test_mla_commutative +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: %r0, %r1, %r2 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY %r0 + ; CHECK: [[VREGY:%[0-9]+]]:gprnopc = COPY %r1 + ; CHECK: [[VREGZ:%[0-9]+]]:gprnopc = COPY %r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGR:%[0-9]+]]:gprnopc = MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, _, _ + + %r0 = COPY %4(s32) + ; CHECK: %r0 = COPY [[VREGR]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... --- name: test_mla_v5 # CHECK-LABEL: name: test_mla_v5 legalized: true diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp index fed8ae5a80b0f..08649d7f9b5a6 100644 --- a/utils/TableGen/GlobalISelEmitter.cpp +++ b/utils/TableGen/GlobalISelEmitter.cpp @@ -2629,6 +2629,9 @@ Error GlobalISelEmitter::importChildMatcher(RuleMatcher &Rule, return Error::success(); } + if (SrcChild->hasAnyPredicate()) + return failedImport("Src pattern child has unsupported predicate"); + // Check for constant immediates. if (auto *ChildInt = dyn_cast<IntInit>(SrcChild->getLeafValue())) { OM.addPredicate(ChildInt->getValue()); From 6cd2a99eb6756292d2b78115f00d0fe9e1e35e23 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 3 Nov 2017 11:29:00 +0000 Subject: [PATCH 046/238] [PartialInliner] Skip call sites where inlining fails. Summary: InlineFunction can fail, for example when trying to inline vararg functions. In those cases, we do not want to bump partial inlining counters or set AnyInlined to true, because this could leave an unused function hanging around.
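The gist of the fix as a reduced sketch (toy stand-ins, not the pass's real API):

    #include <cstdio>

    static bool inlineCallSite(bool CanInline) { return CanInline; } // stand-in for InlineFunction

    int main() {
      int NumPartialInlined = 0;
      for (bool CanInline : {true, false}) {
        if (!inlineCallSite(CanInline))
          continue; // e.g. a vararg callee: no remark, no counter bump
        ++NumPartialInlined;
        std::puts("remark: partially inlined");
      }
      return NumPartialInlined == 1 ? 0 : 1; // only the successful site counted
    }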
Reviewers: davidxl, davide, gyiu Reviewed By: davide Subscribers: llvm-commits, eraman Differential Revision: https://reviews.llvm.org/D39581 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317314 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/IPO/PartialInlining.cpp | 16 ++++--- .../CodeExtractor/PartialInlineNoInline.ll | 45 +++++++++++++++++++ 2 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 test/Transforms/CodeExtractor/PartialInlineNoInline.ll diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index b5267f75e417f..c47d8b78df30a 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -931,15 +931,17 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE)) continue; - ORE.emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", - CS.getInstruction()) - << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " - << ore::NV("Caller", CS.getCaller()); - }); + // Construct remark before doing the inlining, as after successful inlining + // the callsite is removed. + OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()); + OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " + << ore::NV("Caller", CS.getCaller()); InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); - InlineFunction(CS, IFI); + if (!InlineFunction(CS, IFI)) + continue; + + ORE.emit(OR); // Now update the entry count: if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { diff --git a/test/Transforms/CodeExtractor/PartialInlineNoInline.ll b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll new file mode 100644 index 0000000000000..6c0b83298d23e --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll @@ -0,0 +1,45 @@ +; RUN: opt < %s -partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s + +@stat = external global i32, align 4 + +define i32 @inline_fail(i32 %count, ...) { +entry: + %vargs = alloca i8*, align 8 + %vargs1 = bitcast i8** %vargs to i8* + call void @llvm.va_start(i8* %vargs1) + %stat1 = load i32, i32* @stat, align 4 + %cmp = icmp slt i32 %stat1, 0 + br i1 %cmp, label %bb2, label %bb1 + +bb1: ; preds = %entry + %vg1 = add nsw i32 %stat1, 1 + store i32 %vg1, i32* @stat, align 4 + %va1 = va_arg i8** %vargs, i32 + call void @foo(i32 %count, i32 %va1) #2 + br label %bb2 + +bb2: ; preds = %bb1, %entry + %res = phi i32 [ 1, %bb1 ], [ 0, %entry ] + call void @llvm.va_end(i8* %vargs1) + ret i32 %res +} + +define i32 @caller(i32 %arg) { +bb: + %res = tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg) + ret i32 %res +} + +declare void @foo(i32, i32) +declare void @llvm.va_start(i8*) +declare void @llvm.va_end(i8*) + +; Check that no remarks have been emitted, inline_fail has not been partial +; inlined, no code has been extracted and the partial-inlining counter +; has not been incremented. + +; CHECK-NOT: remark +; CHECK: tail call i32 (i32, ...) 
@inline_fail(i32 %arg, i32 %arg) +; CHECK-NOT: inline_fail.1_bb1 +; CHECK-NOT: partial-inlining From eb7c044ce99bf0576ab8017c0b63eb0f2d7e6c5b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 3 Nov 2017 11:33:48 +0000 Subject: [PATCH 047/238] [X86][SSE] Add PACKUS support to combineVectorTruncation Similar to the existing code to lower to PACKSS, we can use PACKUS if the input vector's leading zero bits extend all the way to the packed/truncated value. We have to account for pre-SSE41 targets not supporting PACKUSDW git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317315 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 22 +++- test/CodeGen/X86/avg.ll | 185 ++++++++++++----------------- test/CodeGen/X86/combine-srl.ll | 2 +- test/CodeGen/X86/vector-trunc.ll | 73 +++++++----- 4 files changed, 141 insertions(+), 141 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d64cc411391d8..d65a65e365c42 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -34433,8 +34433,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// This function transforms vector truncation of 'extended sign-bits' values. -/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations. +/// This function transforms vector truncation of 'extended sign-bits' or +/// 'extended zero-bits' values. +/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -34467,10 +34468,19 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. unsigned NumSignBits = DAG.ComputeNumSignBits(In); unsigned NumPackedBits = std::min(SVT.getSizeInBits(), 16); - if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits)) - return SDValue(); + if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits)) + return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); + + // Use PACKUS if the input has zero-bits that extend all the way to the + // packed/truncated value. e.g. masks, zext_in_reg, etc. + KnownBits Known; + DAG.computeKnownBits(In, Known); + unsigned NumLeadingZeroBits = Known.countMinLeadingZeros(); + NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8; + if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits)) + return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget); - return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); + return SDValue(); } static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, @@ -34499,7 +34509,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); } - // Try to truncate extended sign bits with PACKSS. + // Try to truncate extended sign/zero bits with PACKSS/PACKUS. 
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) return V; diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 508f10e98894d..14494779f10af 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -2209,62 +2209,53 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind { define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v32i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4] +; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8] +; SSE2-NEXT: paddd %xmm4, %xmm8 ; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm6 ; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm7 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: packuswb %xmm7, %xmm1 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm7 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: packuswb %xmm6, %xmm4 -; SSE2-NEXT: packuswb %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: packuswb %xmm8, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: movdqu %xmm4, (%rax) +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8_const: @@ -2277,9 +2268,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4] ; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8] ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 @@ -2287,30 +2278,21 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1 -; AVX1-NEXT: vpsrld 
$1, %xmm1, %xmm8 +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2567,49 +2549,40 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9 +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10 +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-NEXT: vextracti128 
$1, %ymm8, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3 +; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 9f7f8a97dc208..c5f03dbd5a31f 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -175,7 +175,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; SSE: # BB#0: ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: packusdw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_trunc_lshr0: diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index dc08d88074d2b..ac1083ad4478d 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -813,13 +813,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i16_lshr: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -947,28 +944,52 @@ entry: } define void @trunc16i32_16i8_lshr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i8_lshr: -; SSE: # BB#0: # %entry -; SSE-NEXT: psrld $24, %xmm1 -; SSE-NEXT: psrld $24, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: psrld $24, %xmm3 -; SSE-NEXT: psrld $24, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: trunc16i32_16i8_lshr: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: psrld $24, %xmm1 +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $24, %xmm3 +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8_lshr: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: psrld $24, %xmm1 +; SSSE3-NEXT: psrld $24, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; 
SSSE3-NEXT: psrld $24, %xmm3 +; SSSE3-NEXT: psrld $24, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8_lshr: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: psrld $24, %xmm1 +; SSE41-NEXT: psrld $24, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $24, %xmm3 +; SSE41-NEXT: psrld $24, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i8_lshr: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -976,16 +997,12 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i8_lshr: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq From 3d456013b6bbf241696e8bf1570502412e62a63c Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Fri, 3 Nov 2017 12:12:27 +0000 Subject: [PATCH 048/238] re-land [ExpandMemCmp] Split ExpandMemCmp from CodeGen into its own pass." Fix undefined references: ExpandMemCmp belongs to CodeGen/, not Scalar/. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317318 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/Passes.h | 3 + include/llvm/InitializePasses.h | 1 + include/llvm/LinkAllPasses.h | 1 + include/llvm/Transforms/Scalar.h | 2 +- lib/CodeGen/CMakeLists.txt | 1 + lib/CodeGen/CodeGen.cpp | 1 + lib/CodeGen/CodeGenPrepare.cpp | 710 --------------- lib/CodeGen/ExpandMemCmp.cpp | 828 ++++++++++++++++++ lib/CodeGen/TargetPassConfig.cpp | 10 +- test/CodeGen/Generic/llc-start-stop.ll | 6 +- test/CodeGen/X86/memcmp-optsize.ll | 224 +++-- test/CodeGen/X86/memcmp.ll | 240 +++-- .../Transforms/ExpandMemCmp/X86/lit.local.cfg | 3 + .../X86/memcmp.ll | 519 +++++------ tools/opt/opt.cpp | 1 + 15 files changed, 1350 insertions(+), 1200 deletions(-) create mode 100644 lib/CodeGen/ExpandMemCmp.cpp create mode 100644 test/Transforms/ExpandMemCmp/X86/lit.local.cfg rename test/Transforms/{CodeGenPrepare => ExpandMemCmp}/X86/memcmp.ll (56%) diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 8e6b1570e4a37..c106ff6cdfef9 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -417,6 +417,9 @@ namespace llvm { /// shuffles. 
FunctionPass *createExpandReductionsPass(); + // This pass expands memcmp() to load/stores. + FunctionPass *createExpandMemCmpPass(); + } // End llvm namespace #endif diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 8c63ab0284dfa..b8183d1c8e2f3 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -128,6 +128,7 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); +void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 765e63926daec..ce70f53ccb043 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -180,6 +180,7 @@ namespace { (void) llvm::createReversePostOrderFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); (void) llvm::createMergeICmpsPass(); + (void) llvm::createExpandMemCmpPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 8ef65774a93ef..a78c897683fcd 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -422,7 +422,7 @@ Pass *createLowerGuardIntrinsicPass(); //===----------------------------------------------------------------------===// // -// MergeICmps - Merge integer comparison chains +// MergeICmps - Merge integer comparison chains into a memcmp // Pass *createMergeICmpsPass(); diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 2e364cd4794d5..df04cf85049fc 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -21,6 +21,7 @@ add_llvm_library(LLVMCodeGen EdgeBundles.cpp ExecutionDepsFix.cpp ExpandISelPseudos.cpp + ExpandMemCmp.cpp ExpandPostRAPseudos.cpp ExpandReductions.cpp FaultMaps.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index bfab865687e7c..2f119554a1e22 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -30,6 +30,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeDwarfEHPreparePass(Registry); initializeEarlyIfConverterPass(Registry); initializeExpandISelPseudosPass(Registry); + initializeExpandMemCmpPassPass(Registry); initializeExpandPostRAPass(Registry); initializeFEntryInserterPass(Registry); initializeFinalizeMachineBundlesPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 51f2a320b299f..973ddebd987cf 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -123,12 +123,6 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); -STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); -STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); -STATISTIC(NumMemCmpGreaterThanMax, - "Number of memcmp calls with size greater than max size"); -STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); - static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations
in CodeGenPrepare")); @@ -189,11 +183,6 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, cl::desc("Enable merging of redundant sexts when one is dominating" " the other."), cl::init(true)); -static cl::opt MemCmpNumLoadsPerBlock( - "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), - cl::desc("The number of loads per basic block for inline expansion of " - "memcmp that is only being compared against zero.")); - namespace { using SetOfInstrs = SmallPtrSet; @@ -1697,699 +1686,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return true; } -namespace { - -// This class provides helper functions to expand a memcmp library call into an -// inline expansion. -class MemCmpExpansion { - struct ResultBlock { - BasicBlock *BB = nullptr; - PHINode *PhiSrc1 = nullptr; - PHINode *PhiSrc2 = nullptr; - - ResultBlock() = default; - }; - - CallInst *const CI; - ResultBlock ResBlock; - const uint64_t Size; - unsigned MaxLoadSize; - uint64_t NumLoadsNonOneByte; - const uint64_t NumLoadsPerBlock; - std::vector LoadCmpBlocks; - BasicBlock *EndBlock; - PHINode *PhiRes; - const bool IsUsedForZeroCmp; - const DataLayout &DL; - IRBuilder<> Builder; - // Represents the decomposition in blocks of the expansion. For example, - // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and - // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}. - // TODO(courbet): Involve the target more in this computation. On X86, 7 - // bytes can be done more efficiently with two overlaping 4-byte loads than - // covering the interval with [{4, 0},{2, 4},{1, 6}}. - struct LoadEntry { - LoadEntry(unsigned LoadSize, uint64_t Offset) - : LoadSize(LoadSize), Offset(Offset) { - assert(Offset % LoadSize == 0 && "invalid load entry"); - } - - uint64_t getGEPIndex() const { return Offset / LoadSize; } - - // The size of the load for this block, in bytes. - const unsigned LoadSize; - // The offset of this load WRT the base pointer, in bytes. - const uint64_t Offset; - }; - SmallVector LoadSequence; - - void createLoadCmpBlocks(); - void createResultBlock(); - void setupResultBlockPHINodes(); - void setupEndBlockPHINodes(); - Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex); - void emitLoadCompareBlock(unsigned BlockIndex); - void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, - unsigned &LoadIndex); - void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex); - void emitMemCmpResultBlock(); - Value *getMemCmpExpansionZeroCase(); - Value *getMemCmpEqZeroOneBlock(); - Value *getMemCmpOneBlock(); - - public: - MemCmpExpansion(CallInst *CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - unsigned NumLoadsPerBlock, const DataLayout &DL); - - unsigned getNumBlocks(); - uint64_t getNumLoads() const { return LoadSequence.size(); } - - Value *getMemCmpExpansion(); -}; - -} // end anonymous namespace - -// Initialize the basic block structure required for expansion of memcmp call -// with given maximum load size and memcmp size parameter. -// This structure includes: -// 1. A list of load compare blocks - LoadCmpBlocks. -// 2. An EndBlock, split from original instruction point, which is the block to -// return from. -// 3. ResultBlock, block to branch to for early exit when a -// LoadCmpBlock finds a difference. 
-MemCmpExpansion::MemCmpExpansion( - CallInst *const CI, uint64_t Size, - const TargetTransformInfo::MemCmpExpansionOptions &Options, - const unsigned MaxNumLoads, const bool IsUsedForZeroCmp, - const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout) - : CI(CI), - Size(Size), - MaxLoadSize(0), - NumLoadsNonOneByte(0), - NumLoadsPerBlock(NumLoadsPerBlock), - IsUsedForZeroCmp(IsUsedForZeroCmp), - DL(TheDataLayout), - Builder(CI) { - assert(Size > 0 && "zero blocks"); - // Scale the max size down if the target can load more bytes than we need. - size_t LoadSizeIndex = 0; - while (LoadSizeIndex < Options.LoadSizes.size() && - Options.LoadSizes[LoadSizeIndex] > Size) { - ++LoadSizeIndex; - } - this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex]; - // Compute the decomposition. - uint64_t CurSize = Size; - uint64_t Offset = 0; - while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) { - const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex]; - assert(LoadSize > 0 && "zero load size"); - const uint64_t NumLoadsForThisSize = CurSize / LoadSize; - if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) { - // Do not expand if the total number of loads is larger than what the - // target allows. Note that it's important that we exit before completing - // the expansion to avoid using a ton of memory to store the expansion for - // large sizes. - LoadSequence.clear(); - return; - } - if (NumLoadsForThisSize > 0) { - for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) { - LoadSequence.push_back({LoadSize, Offset}); - Offset += LoadSize; - } - if (LoadSize > 1) { - ++NumLoadsNonOneByte; - } - CurSize = CurSize % LoadSize; - } - ++LoadSizeIndex; - } - assert(LoadSequence.size() <= MaxNumLoads && "broken invariant"); -} - -unsigned MemCmpExpansion::getNumBlocks() { - if (IsUsedForZeroCmp) - return getNumLoads() / NumLoadsPerBlock + - (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0); - return getNumLoads(); -} - -void MemCmpExpansion::createLoadCmpBlocks() { - for (unsigned i = 0; i < getNumBlocks(); i++) { - BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", - EndBlock->getParent(), EndBlock); - LoadCmpBlocks.push_back(BB); - } -} - -void MemCmpExpansion::createResultBlock() { - ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", - EndBlock->getParent(), EndBlock); -} - -// This function creates the IR instructions for loading and comparing 1 byte. -// It loads 1 byte from each source of the memcmp parameters with the given -// GEPIndex. It then subtracts the two loaded values and adds this result to the -// final phi node for selecting the memcmp result. -void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, - unsigned GEPIndex) { - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using the GEPIndex. 
- if (GEPIndex != 0) { - Source1 = Builder.CreateGEP(LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, GEPIndex)); - Source2 = Builder.CreateGEP(LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, GEPIndex)); - } - - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); - Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); - - PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]); - - if (BlockIndex < (LoadCmpBlocks.size() - 1)) { - // Early exit branch if difference found to EndBlock. Otherwise, continue to - // next LoadCmpBlock, - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, - ConstantInt::get(Diff->getType(), 0)); - BranchInst *CmpBr = - BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp); - Builder.Insert(CmpBr); - } else { - // The last block has an unconditional branch to EndBlock. - BranchInst *CmpBr = BranchInst::Create(EndBlock); - Builder.Insert(CmpBr); - } -} - -/// Generate an equality comparison for one or more pairs of loaded values. -/// This is used in the case where the memcmp() call is compared equal or not -/// equal to zero. -Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, - unsigned &LoadIndex) { - assert(LoadIndex < getNumLoads() && - "getCompareLoadPairs() called with no remaining loads"); - std::vector<Value *> XorList, OrList; - Value *Diff; - - const unsigned NumLoads = - std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock); - - // For a single-block expansion, start inserting before the memcmp call. - if (LoadCmpBlocks.empty()) - Builder.SetInsertPoint(CI); - else - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - - Value *Cmp = nullptr; - // If we have multiple loads per block, we need to generate a composite - // comparison using xor+or. The type for the combinations is the largest load - // type. - IntegerType *const MaxLoadType = - NumLoads == 1 ? nullptr - : IntegerType::get(CI->getContext(), MaxLoadSize * 8); - for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) { - const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex]; - - IntegerType *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Get a constant or load a value for each source address.
-    Value *LoadSrc1 = nullptr;
-    if (auto *Source1C = dyn_cast<Constant>(Source1))
-      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
-    if (!LoadSrc1)
-      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-
-    Value *LoadSrc2 = nullptr;
-    if (auto *Source2C = dyn_cast<Constant>(Source2))
-      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
-    if (!LoadSrc2)
-      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-    if (NumLoads != 1) {
-      if (LoadSizeType != MaxLoadType) {
-        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
-        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
-      }
-      // If we have multiple loads per block, we need to generate a composite
-      // comparison using xor+or.
-      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
-      Diff = Builder.CreateZExt(Diff, MaxLoadType);
-      XorList.push_back(Diff);
-    } else {
-      // If there's only one load per block, we just compare the loaded values.
-      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
-    }
-  }
-
-  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
-    std::vector<Value *> OutList;
-    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
-      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
-      OutList.push_back(Or);
-    }
-    if (InList.size() % 2 != 0)
-      OutList.push_back(InList.back());
-    return OutList;
-  };
-
-  if (!Cmp) {
-    // Pairwise OR the XOR results.
-    OrList = pairWiseOr(XorList);
-
-    // Pairwise OR the OR results until one result left.
-    while (OrList.size() != 1) {
-      OrList = pairWiseOr(OrList);
-    }
-    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
-  }
-
-  return Cmp;
-}
-
-void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
-                                                        unsigned &LoadIndex) {
-  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
-
-  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
-                           ? EndBlock
-                           : LoadCmpBlocks[BlockIndex + 1];
-  // Early exit branch if difference found to ResultBlock. Otherwise,
-  // continue to next LoadCmpBlock or EndBlock.
-  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
-  Builder.Insert(CmpBr);
-
-  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
-  // since early exit to ResultBlock was not taken (no difference was found in
-  // any of the bytes).
-  if (BlockIndex == LoadCmpBlocks.size() - 1) {
-    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
-    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
-  }
-}
-
-// This function creates the IR instructions for loading and comparing using
-// the given LoadSize. It loads the number of bytes specified by LoadSize from
-// each source of the memcmp parameters. It then does a subtract to see if
-// there was a difference in the loaded values. If a difference is found, it
-// branches with an early exit to the ResultBlock for calculating which source
-// was larger. Otherwise, it falls through to either the next LoadCmpBlock or
-// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled
-// with a special case through emitLoadCompareByteBlock. The special handling
-// can simply subtract the loaded values and add it to the result phi node.
-void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
-  // There is one load per block in this case, BlockIndex == LoadIndex.
- const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex]; - - if (CurLoadEntry.LoadSize == 1) { - MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, - CurLoadEntry.getGEPIndex()); - return; - } - - Type *LoadSizeType = - IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); - - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]); - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Get the base address using a GEP. - if (CurLoadEntry.Offset != 0) { - Source1 = Builder.CreateGEP( - LoadSizeType, Source1, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - Source2 = Builder.CreateGEP( - LoadSizeType, Source2, - ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex())); - } - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian()) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (LoadSizeType != MaxLoadType) { - LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); - LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); - } - - // Add the loaded values to the phi nodes for calculating memcmp result only - // if result is not used in a zero equality. - if (!IsUsedForZeroCmp) { - ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); - ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); - } - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); - BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) - ? EndBlock - : LoadCmpBlocks[BlockIndex + 1]; - // Early exit branch if difference found to ResultBlock. Otherwise, continue - // to next LoadCmpBlock or EndBlock. - BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); - Builder.Insert(CmpBr); - - // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 - // since early exit to ResultBlock was not taken (no difference was found in - // any of the bytes). - if (BlockIndex == LoadCmpBlocks.size() - 1) { - Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); - PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); - } -} - -// This function populates the ResultBlock with a sequence to calculate the -// memcmp result. It compares the two loaded source values and returns -1 if -// src1 < src2 and 1 if src1 > src2. -void MemCmpExpansion::emitMemCmpResultBlock() { - // Special case: if memcmp result is used in a zero equality, result does not - // need to be calculated and can simply return 1. 
- if (IsUsedForZeroCmp) { - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); - PhiRes->addIncoming(Res, ResBlock.BB); - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - return; - } - BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); - Builder.SetInsertPoint(ResBlock.BB, InsertPt); - - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, - ResBlock.PhiSrc2); - - Value *Res = - Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), - ConstantInt::get(Builder.getInt32Ty(), 1)); - - BranchInst *NewBr = BranchInst::Create(EndBlock); - Builder.Insert(NewBr); - PhiRes->addIncoming(Res, ResBlock.BB); -} - -void MemCmpExpansion::setupResultBlockPHINodes() { - Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); - Builder.SetInsertPoint(ResBlock.BB); - // Note: this assumes one load per block. - ResBlock.PhiSrc1 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); - ResBlock.PhiSrc2 = - Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); -} - -void MemCmpExpansion::setupEndBlockPHINodes() { - Builder.SetInsertPoint(&EndBlock->front()); - PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); -} - -Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { - unsigned LoadIndex = 0; - // This loop populates each of the LoadCmpBlocks with the IR sequence to - // handle multiple loads per block. - for (unsigned I = 0; I < getNumBlocks(); ++I) { - emitLoadCompareBlockMultipleLoads(I, LoadIndex); - } - - emitMemCmpResultBlock(); - return PhiRes; -} - -/// A memcmp expansion that compares equality with 0 and only has one block of -/// load and compare can bypass the compare, branch, and phi IR that is required -/// in the general case. -Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { - unsigned LoadIndex = 0; - Value *Cmp = getCompareLoadPairs(0, LoadIndex); - assert(LoadIndex == getNumLoads() && "some entries were not consumed"); - return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); -} - -/// A memcmp expansion that only has one block of load and compare can bypass -/// the compare, branch, and phi IR that is required in the general case. -Value *MemCmpExpansion::getMemCmpOneBlock() { - assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block"); - - Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); - Value *Source1 = CI->getArgOperand(0); - Value *Source2 = CI->getArgOperand(1); - - // Cast source to LoadSizeType*. - if (Source1->getType() != LoadSizeType) - Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); - if (Source2->getType() != LoadSizeType) - Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); - - // Load LoadSizeType from the base address. - Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); - Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); - - if (DL.isLittleEndian() && Size != 1) { - Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::bswap, LoadSizeType); - LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); - LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); - } - - if (Size < 4) { - // The i8 and i16 cases don't need compares. We zext the loaded values and - // subtract them to get the suitable negative, zero, or positive i32 result. 
-    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
-    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
-    return Builder.CreateSub(LoadSrc1, LoadSrc2);
-  }
-
-  // The result of memcmp is negative, zero, or positive, so produce that by
-  // subtracting 2 extended compare bits: sub (ugt, ult).
-  // If a target prefers to use selects to get -1/0/1, they should be able
-  // to transform this later. The inverse transform (going from selects to math)
-  // may not be possible in the DAG because the selects got converted into
-  // branches before we got there.
-  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
-  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
-  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
-  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
-  return Builder.CreateSub(ZextUGT, ZextULT);
-}
-
-// This function expands the memcmp call into an inline expansion and returns
-// the memcmp result.
-Value *MemCmpExpansion::getMemCmpExpansion() {
-  // A memcmp with zero-comparison with only one block of load and compare does
-  // not need to set up any extra blocks. This case could be handled in the DAG,
-  // but since we have all of the machinery to flexibly expand any memcmp here,
-  // we choose to handle this case too to avoid fragmented lowering.
-  if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
-    BasicBlock *StartBlock = CI->getParent();
-    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
-    setupEndBlockPHINodes();
-    createResultBlock();
-
-    // If return value of memcmp is not used in a zero equality, we need to
-    // calculate which source was larger. The calculation requires the
-    // two loaded source values of each load compare block.
-    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
-    if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
-
-    // Create the number of required load compare basic blocks.
-    createLoadCmpBlocks();
-
-    // Update the terminator added by splitBasicBlock to branch to the first
-    // LoadCmpBlock.
-    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
-  }
-
-  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
-  if (IsUsedForZeroCmp)
-    return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
-                               : getMemCmpExpansionZeroCase();
-
-  // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
-  if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
-
-  for (unsigned I = 0; I < getNumBlocks(); ++I) {
-    emitLoadCompareBlock(I);
-  }
-
-  emitMemCmpResultBlock();
-  return PhiRes;
-}
-
-// This function checks to see if an expansion of memcmp can be generated.
-// It checks for constant compare size that is less than the max inline size.
-// If an expansion cannot occur, returns false to leave as a library call.
-// Otherwise, the library call is replaced with a new IR instruction sequence.
-/// We want to transform:
-///  %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
-/// To:
-/// loadbb:
-///  %0 = bitcast i32* %buffer2 to i8*
-///  %1 = bitcast i32* %buffer1 to i8*
-///  %2 = bitcast i8* %1 to i64*
-///  %3 = bitcast i8* %0 to i64*
-///  %4 = load i64, i64* %2
-///  %5 = load i64, i64* %3
-///  %6 = call i64 @llvm.bswap.i64(i64 %4)
-///  %7 = call i64 @llvm.bswap.i64(i64 %5)
-///  %8 = sub i64 %6, %7
-///  %9 = icmp ne i64 %8, 0
-///  br i1 %9, label %res_block, label %loadbb1
-/// res_block:                                        ; preds = %loadbb2,
-///                                                     %loadbb1, %loadbb
-///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
-///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
-///  %10 = icmp ult i64 %phi.src1, %phi.src2
-///  %11 = select i1 %10, i32 -1, i32 1
-///  br label %endblock
-/// loadbb1:                                          ; preds = %loadbb
-///  %12 = bitcast i32* %buffer2 to i8*
-///  %13 = bitcast i32* %buffer1 to i8*
-///  %14 = bitcast i8* %13 to i32*
-///  %15 = bitcast i8* %12 to i32*
-///  %16 = getelementptr i32, i32* %14, i32 2
-///  %17 = getelementptr i32, i32* %15, i32 2
-///  %18 = load i32, i32* %16
-///  %19 = load i32, i32* %17
-///  %20 = call i32 @llvm.bswap.i32(i32 %18)
-///  %21 = call i32 @llvm.bswap.i32(i32 %19)
-///  %22 = zext i32 %20 to i64
-///  %23 = zext i32 %21 to i64
-///  %24 = sub i64 %22, %23
-///  %25 = icmp ne i64 %24, 0
-///  br i1 %25, label %res_block, label %loadbb2
-/// loadbb2:                                          ; preds = %loadbb1
-///  %26 = bitcast i32* %buffer2 to i8*
-///  %27 = bitcast i32* %buffer1 to i8*
-///  %28 = bitcast i8* %27 to i16*
-///  %29 = bitcast i8* %26 to i16*
-///  %30 = getelementptr i16, i16* %28, i16 6
-///  %31 = getelementptr i16, i16* %29, i16 6
-///  %32 = load i16, i16* %30
-///  %33 = load i16, i16* %31
-///  %34 = call i16 @llvm.bswap.i16(i16 %32)
-///  %35 = call i16 @llvm.bswap.i16(i16 %33)
-///  %36 = zext i16 %34 to i64
-///  %37 = zext i16 %35 to i64
-///  %38 = sub i64 %36, %37
-///  %39 = icmp ne i64 %38, 0
-///  br i1 %39, label %res_block, label %loadbb3
-/// loadbb3:                                          ; preds = %loadbb2
-///  %40 = bitcast i32* %buffer2 to i8*
-///  %41 = bitcast i32* %buffer1 to i8*
-///  %42 = getelementptr i8, i8* %41, i8 14
-///  %43 = getelementptr i8, i8* %40, i8 14
-///  %44 = load i8, i8* %42
-///  %45 = load i8, i8* %43
-///  %46 = zext i8 %44 to i32
-///  %47 = zext i8 %45 to i32
-///  %48 = sub i32 %46, %47
-///  br label %endblock
-/// endblock:                                         ; preds = %res_block,
-///                                                     %loadbb3
-///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
-///  ret i32 %phi.res
-static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL) {
-  NumMemCmpCalls++;
-
-  // Early exit from expansion if -Oz.
-  if (CI->getFunction()->optForMinSize())
-    return false;
-
-  // Early exit from expansion if size is not a constant.
-  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-  if (!SizeCast) {
-    NumMemCmpNotConstant++;
-    return false;
-  }
-  const uint64_t SizeVal = SizeCast->getZExtValue();
-
-  if (SizeVal == 0) {
-    return false;
-  }
-
-  // TTI call to check if target would like to expand memcmp. Also, get the
-  // available load sizes.
-  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
-  if (!Options) return false;
-
-  const unsigned MaxNumLoads =
-      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
-
-  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
-                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
-
-  // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() == 0) {
-    NumMemCmpGreaterThanMax++;
-    return false;
-  }
-
-  NumMemCmpInlined++;
-
-  Value *Res = Expansion.getMemCmpExpansion();
-
-  // Replace call with result of expansion and erase call.
-  CI->replaceAllUsesWith(Res);
-  CI->eraseFromParent();
-
-  return true;
-}
-
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -2542,12 +1838,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
     return true;
   }
 
-  LibFunc Func;
-  if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
-      Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
-    ModifiedDT = true;
-    return true;
-  }
   return false;
 }
 
diff --git a/lib/CodeGen/ExpandMemCmp.cpp b/lib/CodeGen/ExpandMemCmp.cpp
new file mode 100644
index 0000000000000..c5910c18d89bd
--- /dev/null
+++ b/lib/CodeGen/ExpandMemCmp.cpp
@@ -0,0 +1,828 @@
+//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to expand memcmp() calls into optimally-sized loads and
+// compares for the target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "expandmemcmp"
+
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
+namespace {
+
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
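Before the implementation class, it may help to see the shape of the code the pass aims to produce. For memcmp(x, y, 16) on a 64-bit little-endian target, the expansion is morally equivalent to the following freestanding C++ sketch (an illustration only; memcmp16 and bswap64 are made-up names, and this is not code from the patch):

#include <cstdint>
#include <cstring>

// Portable stand-in for the llvm.bswap.i64 intrinsic used by the expansion.
static uint64_t bswap64(uint64_t V) {
  uint64_t R = 0;
  for (int I = 0; I < 8; ++I)
    R = (R << 8) | ((V >> (8 * I)) & 0xff);
  return R;
}

// Two 8-byte loads with an early exit on the first difference, mirroring the
// loadbb/res_block/endblock structure shown in the IR example further down.
static int memcmp16(const void *X, const void *Y) {
  const char *P = static_cast<const char *>(X);
  const char *Q = static_cast<const char *>(Y);
  for (unsigned Off = 0; Off != 16; Off += 8) { // loadbb, loadbb1
    uint64_t A, B;
    std::memcpy(&A, P + Off, 8);
    std::memcpy(&B, Q + Off, 8);
    A = bswap64(A); // byte-swap so unsigned order matches memcmp's
    B = bswap64(B); // lexicographic order on a little-endian target
    if (A != B)     // res_block: decide which source is larger
      return A < B ? -1 : 1;
  }
  return 0;         // endblock: no difference found
}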
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB = nullptr;
+    PHINode *PhiSrc1 = nullptr;
+    PHINode *PhiSrc2 = nullptr;
+
+    ResultBlock() = default;
+  };
+
+  CallInst *const CI;
+  ResultBlock ResBlock;
+  const uint64_t Size;
+  unsigned MaxLoadSize;
+  uint64_t NumLoadsNonOneByte;
+  const uint64_t NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  const bool IsUsedForZeroCmp;
+  const DataLayout &DL;
+  IRBuilder<> Builder;
+  // Represents the decomposition in blocks of the expansion. For example,
+  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
+  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
+  // TODO(courbet): Involve the target more in this computation. On X86, 7
+  // bytes can be done more efficiently with two overlapping 4-byte loads than
+  // covering the interval with [{4, 0}, {2, 4}, {1, 6}].
+  struct LoadEntry {
+    LoadEntry(unsigned LoadSize, uint64_t Offset)
+        : LoadSize(LoadSize), Offset(Offset) {
+      assert(Offset % LoadSize == 0 && "invalid load entry");
+    }
+
+    uint64_t getGEPIndex() const { return Offset / LoadSize; }
+
+    // The size of the load for this block, in bytes.
+    const unsigned LoadSize;
+    // The offset of this load WRT the base pointer, in bytes.
+    const uint64_t Offset;
+  };
+  SmallVector<LoadEntry, 8> LoadSequence;
+
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+  void emitLoadCompareBlock(unsigned BlockIndex);
+  void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                         unsigned &LoadIndex);
+  void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
+  void emitMemCmpResultBlock();
+  Value *getMemCmpExpansionZeroCase();
+  Value *getMemCmpEqZeroOneBlock();
+  Value *getMemCmpOneBlock();
+
+ public:
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+                  unsigned NumLoadsPerBlock, const DataLayout &DL);
+
+  unsigned getNumBlocks();
+  uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+  Value *getMemCmpExpansion();
+};
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(
+    CallInst *const CI, uint64_t Size,
+    const TargetTransformInfo::MemCmpExpansionOptions &Options,
+    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+    : CI(CI),
+      Size(Size),
+      MaxLoadSize(0),
+      NumLoadsNonOneByte(0),
+      NumLoadsPerBlock(NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
+      DL(TheDataLayout),
+      Builder(CI) {
+  assert(Size > 0 && "zero blocks");
+  // Scale the max size down if the target can load more bytes than we need.
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
+  }
+  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
+  // Compute the decomposition.
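To make the decomposition concrete before the loop that follows: a freestanding sketch of the same greedy, largest-first algorithm (decompose and Load are made-up names for illustration; this is not part of the patch):

#include <cstdint>
#include <cstdio>
#include <vector>

struct Load { unsigned Size; uint64_t Offset; };

// Greedy, largest-first decomposition of a compare of Size bytes into loads,
// bailing out (empty result) when the target's load budget would be exceeded.
static std::vector<Load> decompose(uint64_t Size,
                                   const std::vector<unsigned> &LoadSizes,
                                   unsigned MaxNumLoads) {
  std::vector<Load> Seq;
  size_t Idx = 0;
  // Scale down: skip load sizes wider than the whole compare.
  while (Idx < LoadSizes.size() && LoadSizes[Idx] > Size)
    ++Idx;
  uint64_t CurSize = Size, Offset = 0;
  for (; CurSize && Idx < LoadSizes.size(); ++Idx) {
    const unsigned LS = LoadSizes[Idx];
    const uint64_t N = CurSize / LS;
    if (Seq.size() + N > MaxNumLoads)
      return {}; // too many loads: leave the call to the library
    for (uint64_t I = 0; I < N; ++I, Offset += LS)
      Seq.push_back({LS, Offset});
    CurSize %= LS;
  }
  return Seq;
}

int main() {
  // With LoadSizes = {8, 4, 2, 1}: Size 15 -> {8,0} {4,8} {2,12} {1,14},
  // and Size 7 -> {4,0} {2,4} {1,6}, matching the LoadEntry comment above.
  for (const Load &L : decompose(15, {8, 4, 2, 1}, 8))
    std::printf("{%u,%llu} ", L.Size, (unsigned long long)L.Offset);
  std::printf("\n");
}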
+  uint64_t CurSize = Size;
+  uint64_t Offset = 0;
+  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
+    assert(LoadSize > 0 && "zero load size");
+    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+      // Do not expand if the total number of loads is larger than what the
+      // target allows. Note that it's important that we exit before completing
+      // the expansion to avoid using a ton of memory to store the expansion for
+      // large sizes.
+      LoadSequence.clear();
+      return;
+    }
+    if (NumLoadsForThisSize > 0) {
+      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+        LoadSequence.push_back({LoadSize, Offset});
+        Offset += LoadSize;
+      }
+      if (LoadSize > 1) {
+        ++NumLoadsNonOneByte;
+      }
+      CurSize = CurSize % LoadSize;
+    }
+    ++LoadSizeIndex;
+  }
+  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+}
+
+unsigned MemCmpExpansion::getNumBlocks() {
+  if (IsUsedForZeroCmp)
+    return getNumLoads() / NumLoadsPerBlock +
+           (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
+  return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < getNumBlocks(); i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+                                               unsigned GEPIndex) {
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex.
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
+
+  if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock. Otherwise, continue to
+    // next LoadCmpBlock.
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock.
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+                                            unsigned &LoadIndex) {
+  assert(LoadIndex < getNumLoads() &&
+         "getCompareLoadPairs() called with no remaining loads");
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  const unsigned NumLoads =
+      std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+  Value *Cmp = nullptr;
+  // If we have multiple loads per block, we need to generate a composite
+  // comparison using xor+or. The type for the combinations is the largest load
+  // type.
+  IntegerType *const MaxLoadType =
+      NumLoads == 1 ? nullptr
+                    : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+    const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+
+    IntegerType *LoadSizeType =
+        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*.
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using a GEP.
+    if (CurLoadEntry.Offset != 0) {
+      Source1 = Builder.CreateGEP(
+          LoadSizeType, Source1,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+      Source2 = Builder.CreateGEP(
+          LoadSizeType, Source2,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    }
+
+    // Get a constant or load a value for each source address.
+    Value *LoadSrc1 = nullptr;
+    if (auto *Source1C = dyn_cast<Constant>(Source1))
+      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+    if (!LoadSrc1)
+      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+    Value *LoadSrc2 = nullptr;
+    if (auto *Source2C = dyn_cast<Constant>(Source2))
+      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+    if (!LoadSrc2)
+      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+    if (NumLoads != 1) {
+      if (LoadSizeType != MaxLoadType) {
+        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+      }
+      // If we have multiple loads per block, we need to generate a composite
+      // comparison using xor+or.
+      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+      Diff = Builder.CreateZExt(Diff, MaxLoadType);
+      XorList.push_back(Diff);
+    } else {
+      // If there's only one load per block, we just compare the loaded values.
+      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+    }
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  if (!Cmp) {
+    // Pairwise OR the XOR results.
+    OrList = pairWiseOr(XorList);
+
+    // Pairwise OR the OR results until one result left.
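The xor+or scheme built above has a simple scalar analogue: a group of load pairs is all-equal exactly when the OR of the pairwise XORs is zero, so only one compare against zero is needed per block. A freestanding sketch of the same pairwise reduction (allEqual is a made-up name; it assumes A and B have the same length):

#include <cstdint>
#include <vector>

static bool allEqual(const std::vector<uint64_t> &A,
                     const std::vector<uint64_t> &B) {
  std::vector<uint64_t> Xors;
  for (size_t I = 0; I < A.size(); ++I)
    Xors.push_back(A[I] ^ B[I]); // nonzero iff this pair differs
  // Pairwise OR as a tree (shorter dependency chain than a linear chain).
  while (Xors.size() > 1) {
    std::vector<uint64_t> Next;
    for (size_t I = 0; I + 1 < Xors.size(); I += 2)
      Next.push_back(Xors[I] | Xors[I + 1]);
    if (Xors.size() % 2 != 0)
      Next.push_back(Xors.back());
    Xors = Next;
  }
  return Xors.empty() || Xors[0] == 0; // single compare against zero
}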
+    while (OrList.size() != 1) {
+      OrList = pairWiseOr(OrList);
+    }
+    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+  }
+
+  return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                                        unsigned &LoadIndex) {
+  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise,
+  // continue to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function creates the IR instructions for loading and comparing using
+// the given LoadSize. It loads the number of bytes specified by LoadSize from
+// each source of the memcmp parameters. It then does a subtract to see if
+// there was a difference in the loaded values. If a difference is found, it
+// branches with an early exit to the ResultBlock for calculating which source
+// was larger. Otherwise, it falls through to either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled
+// with a special case through emitLoadCompareByteBlock. The special handling
+// can simply subtract the loaded values and add it to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+  // There is one load per block in this case, BlockIndex == LoadIndex.
+  const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+  if (CurLoadEntry.LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
+                                              CurLoadEntry.getGEPIndex());
+    return;
+  }
+
+  Type *LoadSizeType =
+      IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using a GEP.
+  if (CurLoadEntry.Offset != 0) {
+    Source1 = Builder.CreateGEP(
+        LoadSizeType, Source1,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    Source2 = Builder.CreateGEP(
+        LoadSizeType, Source2,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+  }
+
+  // Load LoadSizeType from the base address.
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (DL.isLittleEndian()) { + Function *Bswap = Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating memcmp result only + // if result is not used in a zero equality. + if (!IsUsedForZeroCmp) { + ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]); + ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2); + BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[BlockIndex + 1]; + // Early exit branch if difference found to ResultBlock. Otherwise, continue + // to next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes). + if (BlockIndex == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock() { + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. + if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + // Note: this assumes one load per block. + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { + unsigned LoadIndex = 0; + // This loop populates each of the LoadCmpBlocks with the IR sequence to + // handle multiple loads per block. 
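A note on the bswap calls in emitLoadCompareBlock above: memcmp order is lexicographic (the first differing byte decides), while a raw integer compare on a little-endian target weights the last loaded byte the most, so the loaded words are byte-swapped before comparing. A small sketch of the effect, assuming a little-endian host (firstIsSmaller and bswap16 are made-up names):

#include <cstdint>
#include <cstring>

static uint16_t bswap16(uint16_t V) {
  return static_cast<uint16_t>((V >> 8) | (V << 8));
}

// For X = {0x01, 0x02} and Y = {0x02, 0x01}, little-endian loads give
// A = 0x0201 and B = 0x0102, so a raw compare orders them backwards;
// byte-swapping restores memcmp's lexicographic order.
static bool firstIsSmaller(const unsigned char *X, const unsigned char *Y) {
  uint16_t A, B;
  std::memcpy(&A, X, 2);
  std::memcpy(&B, Y, 2);
  return bswap16(A) < bswap16(B); // matches memcmp(X, Y, 2) < 0
}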
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlockMultipleLoads(I, LoadIndex);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
+  unsigned LoadIndex = 0;
+  Value *Cmp = getCompareLoadPairs(0, LoadIndex);
+  assert(LoadIndex == getNumLoads() && "some entries were not consumed");
+  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
+/// A memcmp expansion that only has one block of load and compare can bypass
+/// the compare, branch, and phi IR that is required in the general case.
+Value *MemCmpExpansion::getMemCmpOneBlock() {
+  assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
+
+  Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian() && Size != 1) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (Size < 4) {
+    // The i8 and i16 cases don't need compares. We zext the loaded values and
+    // subtract them to get the suitable negative, zero, or positive i32 result.
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
+    return Builder.CreateSub(LoadSrc1, LoadSrc2);
+  }
+
+  // The result of memcmp is negative, zero, or positive, so produce that by
+  // subtracting 2 extended compare bits: sub (ugt, ult).
+  // If a target prefers to use selects to get -1/0/1, they should be able
+  // to transform this later. The inverse transform (going from selects to math)
+  // may not be possible in the DAG because the selects got converted into
+  // branches before we got there.
+  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+  return Builder.CreateSub(ZextUGT, ZextULT);
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion() {
+  // A memcmp with zero-comparison with only one block of load and compare does
+  // not need to set up any extra blocks. This case could be handled in the DAG,
+  // but since we have all of the machinery to flexibly expand any memcmp here,
+  // we choose to handle this case too to avoid fragmented lowering.
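The sub (ugt, ult) pattern in getMemCmpOneBlock above is the classic branchless sign computation; a tiny C++ sketch (signOfCompare is a made-up name):

#include <cstdint>

// (A > B) and (A < B) are each 0 or 1, so the difference is 1, -1, or 0 --
// exactly the negative/zero/positive result memcmp must return.
static int signOfCompare(uint64_t A, uint64_t B) {
  return static_cast<int>(A > B) - static_cast<int>(A < B);
}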
+ if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) { + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); + + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); + } + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + if (IsUsedForZeroCmp) + return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock() + : getMemCmpExpansionZeroCase(); + + // TODO: Handle more than one load pair per block in getMemCmpOneBlock(). + if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock(); + + for (unsigned I = 0; I < getNumBlocks(); ++I) { + emitLoadCompareBlock(I); + } + + emitMemCmpResultBlock(); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced with a new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = call i64 @llvm.bswap.i64(i64 %4) +/// %7 = call i64 @llvm.bswap.i64(i64 %5) +/// %8 = sub i64 %6, %7 +/// %9 = icmp ne i64 %8, 0 +/// br i1 %9, label %res_block, label %loadbb1 +/// res_block: ; preds = %loadbb2, +/// %loadbb1, %loadbb +/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] +/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] +/// %10 = icmp ult i64 %phi.src1, %phi.src2 +/// %11 = select i1 %10, i32 -1, i32 1 +/// br label %endblock +/// loadbb1: ; preds = %loadbb +/// %12 = bitcast i32* %buffer2 to i8* +/// %13 = bitcast i32* %buffer1 to i8* +/// %14 = bitcast i8* %13 to i32* +/// %15 = bitcast i8* %12 to i32* +/// %16 = getelementptr i32, i32* %14, i32 2 +/// %17 = getelementptr i32, i32* %15, i32 2 +/// %18 = load i32, i32* %16 +/// %19 = load i32, i32* %17 +/// %20 = call i32 @llvm.bswap.i32(i32 %18) +/// %21 = call i32 @llvm.bswap.i32(i32 %19) +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = sub i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb2 +/// loadbb2: ; preds = %loadbb1 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = bitcast i8* %27 to i16* +/// %29 = bitcast i8* %26 to i16* +/// %30 = getelementptr i16, i16* %28, i16 6 +/// %31 = getelementptr i16, i16* %29, i16 6 +/// %32 = load i16, i16* %30 +/// %33 = load i16, i16* %31 +/// %34 = call i16 @llvm.bswap.i16(i16 %32) +/// %35 = call i16 @llvm.bswap.i16(i16 %33) +/// %36 = zext i16 %34 to i64 +/// %37 = zext i16 %35 to i64 +/// %38 = sub i64 %36, %37 +/// %39 = icmp ne i64 %38, 
0
+///  br i1 %39, label %res_block, label %loadbb3
+/// loadbb3:                                          ; preds = %loadbb2
+///  %40 = bitcast i32* %buffer2 to i8*
+///  %41 = bitcast i32* %buffer1 to i8*
+///  %42 = getelementptr i8, i8* %41, i8 14
+///  %43 = getelementptr i8, i8* %40, i8 14
+///  %44 = load i8, i8* %42
+///  %45 = load i8, i8* %43
+///  %46 = zext i8 %44 to i32
+///  %47 = zext i8 %45 to i32
+///  %48 = sub i32 %46, %47
+///  br label %endblock
+/// endblock:                                         ; preds = %res_block,
+///                                                     %loadbb3
+///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+///  ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+
+  // Early exit from expansion if -Oz.
+  if (CI->getFunction()->optForMinSize())
+    return false;
+
+  // Early exit from expansion if size is not a constant.
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+  const uint64_t SizeVal = SizeCast->getZExtValue();
+
+  if (SizeVal == 0) {
+    return false;
+  }
+
+  // TTI call to check if target would like to expand memcmp. Also, get the
+  // available load sizes.
+  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
+  if (!Options) return false;
+
+  const unsigned MaxNumLoads =
+      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
+
+  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
+                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
+
+  // Don't expand if this will require more loads than desired by the target.
+  if (Expansion.getNumLoads() == 0) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  Value *Res = Expansion.getMemCmpExpansion();
+
+  // Replace call with result of expansion and erase call.
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+
+  return true;
+}
+
+
+
+class ExpandMemCmpPass : public FunctionPass {
+public:
+  static char ID;
+
+  ExpandMemCmpPass() : FunctionPass(ID) {
+    initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F)) return false;
+
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC) {
+      return false;
+    }
+    const TargetLowering* TL =
+        TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
+
+    const TargetLibraryInfo *TLI =
+        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto PA = runImpl(F, TLI, TTI, TL);
+    return !PA.areAllPreserved();
+  }
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+                            const TargetTransformInfo *TTI,
+                            const TargetLowering* TL);
+  // Returns true if a change was made.
+  bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
+                  const TargetTransformInfo *TTI, const TargetLowering* TL,
+                  const DataLayout& DL);
+};
+
+bool ExpandMemCmpPass::runOnBlock(
+    BasicBlock &BB, const TargetLibraryInfo *TLI,
+    const TargetTransformInfo *TTI, const TargetLowering* TL,
+    const DataLayout& DL) {
+  for (Instruction& I : BB) {
+    CallInst *CI = dyn_cast<CallInst>(&I);
+    if (!CI) {
+      continue;
+    }
+    LibFunc Func;
+    if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
+        Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+
+PreservedAnalyses ExpandMemCmpPass::runImpl(
+    Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+    const TargetLowering* TL) {
+  const DataLayout& DL = F.getParent()->getDataLayout();
+  bool MadeChanges = false;
+  for (auto BBIt = F.begin(); BBIt != F.end();) {
+    if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
+      MadeChanges = true;
+      // If changes were made, restart the function from the beginning, since
+      // the structure of the function was changed.
+      BBIt = F.begin();
+    } else {
+      ++BBIt;
+    }
+  }
+  return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+} // namespace
+
+char ExpandMemCmpPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
+                      "Expand memcmp() to load/stores", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
+                    "Expand memcmp() to load/stores", false, false)
+
+FunctionPass *llvm::createExpandMemCmpPass() {
+  return new ExpandMemCmpPass();
+}
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index c5101b1ecfc22..59e88ba3bdae4 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -600,8 +600,14 @@ void TargetPassConfig::addIRPasses() {
     addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
-  if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
-    addPass(createMergeICmpsPass());
+  if (getOptLevel() != CodeGenOpt::None) {
+    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
+    // loads and compares. ExpandMemCmpPass then tries to expand those calls
+    // into optimally-sized loads and compares. The transforms are enabled by a
+    // target lowering hook.
+ if (EnableMergeICmps) + addPass(createMergeICmpsPass()); + addPass(createExpandMemCmpPass()); } // Run GC lowering passes for builtin collectors diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll index 85b69c37aa01e..9056e2cab49db 100644 --- a/test/CodeGen/Generic/llc-start-stop.ll +++ b/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -machine-branch-prob -gc-lowering +; START-AFTER: -machine-branch-prob -expandmemcmp ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Lower Garbage Collection Instructions +; START-AFTER-NEXT: Expand memcmp() to load/stores ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Lower Garbage Collection Instructions +; START-BEFORE-NEXT: Expand memcmp() to load/stores ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 77d9fa69182b8..3f5eeba7055cd 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB5_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB5_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB5_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: .LBB5_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: 
jne .LBB10_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB10_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB10_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: .LBB10_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx @@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: .LBB11_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al @@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, 
(%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: incl %eax ; X86-NEXT: .LBB13_3: # %endblock @@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB14_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: .LBB14_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB20_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 
16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB20_3 -; X64-SSE2-NEXT: .LBB20_1: # %res_block +; X64-SSE2-NEXT: .LBB20_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB20_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB20_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_1: # %res_block +; X64-AVX2-NEXT: .LBB20_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB20_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB21_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_1: # %res_block +; X64-SSE2-NEXT: .LBB21_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB21_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB21_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX2-NEXT: je .LBB21_3 -; X64-AVX2-NEXT: .LBB21_1: # %res_block +; X64-AVX2-NEXT: .LBB21_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB21_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB23_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB23_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; 
X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB23_3 -; X86-SSE2-NEXT: .LBB23_1: # %res_block +; X86-SSE2-NEXT: .LBB23_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB23_3: # %endblock @@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB24_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB24_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB24_3 -; X86-SSE2-NEXT: .LBB24_1: # %res_block +; X86-SSE2-NEXT: .LBB24_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB24_3: # %endblock @@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB24_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB24_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB24_3 -; X64-SSE2-NEXT: .LBB24_1: # %res_block +; X64-SSE2-NEXT: .LBB24_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB24_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: 
vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB26_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB26_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB26_3 -; X64-AVX2-NEXT: .LBB26_1: # %res_block +; X64-AVX2-NEXT: .LBB26_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB26_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB27_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB27_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB27_3 -; X64-AVX2-NEXT: .LBB27_1: # %res_block +; X64-AVX2-NEXT: .LBB27_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB27_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 393e4c42d8b94..84fd45b0a08cb 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB7_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB7_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB7_3 -; X86-NEXT: .LBB7_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB7_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB7_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB7_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB7_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB7_3 -; X64-NEXT: .LBB7_1: # %res_block +; X64-NEXT: .LBB7_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB7_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
 ; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: movl $1, %ecx
 ; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
 ; X86-NEXT: setne %al
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB12_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB12_2
+; X64-NEXT: # BB#1: # %loadbb1
 ; X64-NEXT: movb 4(%rdi), %cl
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpb 4(%rsi), %cl
 ; X64-NEXT: je .LBB12_3
-; X64-NEXT: .LBB12_1: # %res_block
+; X64-NEXT: .LBB12_2: # %res_block
 ; X64-NEXT: movl $1, %eax
 ; X64-NEXT: .LBB12_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
@@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
 define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 ; X86-NEXT: bswapl %ecx
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # BB#1: # %loadbb1
 ; X86-NEXT: movl 4(%esi), %ecx
 ; X86-NEXT: movl 4(%eax), %edx
 ; X86-NEXT: bswapl %ecx
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#3: # %endblock
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: je .LBB13_3
+; X86-NEXT: .LBB13_2: # %res_block
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: cmpl %edx, %ecx
 ; X86-NEXT: setae %al
 ; X86-NEXT: leal -1(%eax,%eax), %eax
+; X86-NEXT: .LBB13_3: # %endblock
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
@@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
 define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB14_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB14_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
 ; X86-NEXT: je .LBB14_3
-; X86-NEXT: .LBB14_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB14_2: # %res_block
+; X86-NEXT: movl $1, %ecx
 ; X86-NEXT: .LBB14_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 ;
@@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
 define i1 @length8_eq_const(i8* %X) nounwind {
 ; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB15_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB15_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB15_3 -; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: .LBB15_2: # %res_block ; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB15_3: # %endblock ; X86-NEXT: testl %eax, %eax @@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB17_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB17_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB18_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB18_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB22_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB22_2 +; 
X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB22_3 -; X64-SSE2-NEXT: .LBB22_1: # %res_block +; X64-SSE2-NEXT: .LBB22_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB22_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB22_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB22_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: movq 16(%rdi), %rcx ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX-NEXT: je .LBB22_3 -; X64-AVX-NEXT: .LBB22_1: # %res_block +; X64-AVX-NEXT: .LBB22_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB22_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB23_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB23_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX-NEXT: je .LBB23_3 -; X64-AVX-NEXT: .LBB23_1: # %res_block +; X64-AVX-NEXT: .LBB23_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB23_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB25_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB25_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), 
%xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB25_3 -; X86-SSE2-NEXT: .LBB25_1: # %res_block +; X86-SSE2-NEXT: .LBB25_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB25_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB25_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB25_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB25_3 -; X64-SSE2-NEXT: .LBB25_1: # %res_block +; X64-SSE2-NEXT: .LBB25_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB25_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB25_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB25_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB25_3 -; X64-AVX1-NEXT: .LBB25_1: # %res_block +; X64-AVX1-NEXT: .LBB25_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB25_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB26_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB26_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB26_3 -; X86-SSE2-NEXT: .LBB26_1: # %res_block +; X86-SSE2-NEXT: .LBB26_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB26_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), 
%xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB26_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB26_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB26_3 -; X64-SSE2-NEXT: .LBB26_1: # %res_block +; X64-SSE2-NEXT: .LBB26_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB26_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB26_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB26_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB26_3 -; X64-AVX1-NEXT: .LBB26_1: # %res_block +; X64-AVX1-NEXT: .LBB26_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB26_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB28_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB28_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB28_3 -; X64-AVX2-NEXT: .LBB28_1: # %res_block +; X64-AVX2-NEXT: .LBB28_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB28_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB29_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB29_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB29_3 -; X64-AVX2-NEXT: .LBB29_1: # %res_block +; X64-AVX2-NEXT: .LBB29_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB29_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg new file mode 100644 index 0000000000000..e71f3cc4c41e7 --- /dev/null +++ 
b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
similarity index 56%
rename from test/Transforms/CodeGenPrepare/X86/memcmp.ll
rename to test/Transforms/ExpandMemCmp/X86/memcmp.ll
index a4f635c956df9..1abfb20f36961 100644
--- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
 
 declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
 
@@ -23,30 +23,33 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 ; ALL-LABEL: @cmp3(
-; ALL-NEXT: loadbb:
-; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]]
-; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
-; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; ALL-NEXT: [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT: br label [[LOADBB:%.*]]
 ; ALL: res_block:
-; ALL-NEXT: [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; ALL-NEXT: br label [[ENDBLOCK:%.*]]
+; ALL: loadbb:
+; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]]
+; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]]
+; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
+; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
+; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; ALL: loadbb1:
-; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2
 ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]]
 ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; ALL-NEXT: br label [[ENDBLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) @@ -74,30 +77,33 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4 ; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; ALL-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] ; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; ALL-NEXT: br label [[ENDBLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) @@ -106,36 +112,37 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = 
load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; ALL-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; ALL-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* ; ALL-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 -; ALL-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 ; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; ALL-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; ALL-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) ; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 -; ALL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] -; ALL-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 +; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] +; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; ALL-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) @@ -153,34 +160,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] 
= call i32 @llvm.bswap.i32(i32 [[TMP2]]) -; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] -; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32-NEXT: br label [[LOADBB:%.*]] ; X32: res_block: -; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* ; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 -; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1 ; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] -; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( @@ -207,30 +215,33 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 
[[TMP4]], [[TMP5]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8 ; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] -; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] ; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] ; X64-NEXT: br label [[ENDBLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) @@ -243,36 +254,37 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp10( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* 
[[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i16* ; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 -; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4 ; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) ; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) @@ -294,36 +306,37 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; 
X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i32* ; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 -; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2 ; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] ; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) ; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 -; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] -; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP19]] = zext i32 [[TMP17]] to i64 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) @@ -363,34 +376,35 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp16( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) -; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] -; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: -; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] -; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] -; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] ; X64: 
loadbb1: -; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* -; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* -; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[Y]] to i64* ; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 -; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1 ; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) -; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] -; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] ; X64: endblock: -; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) @@ -417,22 +431,23 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2 ; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -465,22 +480,23 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = 
load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 -; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 -; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4 ; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; ALL-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -495,24 +511,25 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: loadbb: -; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL-NEXT: br label [[LOADBB:%.*]] ; ALL: res_block: ; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb: +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; ALL: loadbb1: -; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* -; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* ; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 -; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2 ; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; ALL-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; ALL-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; ALL: endblock: ; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -540,24 +557,25 
@@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: loadbb: -; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] -; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] -; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32-NEXT: br label [[LOADBB:%.*]] ; X32: res_block: ; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X32-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X32: loadbb1: -; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* -; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i32* ; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 -; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1 ; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X32-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X32: endblock: ; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -589,22 +607,23 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq9( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 -; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 -; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8 ; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] -; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]] 
+; X64-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] +; X64-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -625,24 +644,25 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq10( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP7:%.*]] = bitcast i8* [[Y]] to i16* ; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 -; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4 ; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -676,24 +696,25 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-NEXT: ret i32 [[CONV]] ; ; X64-LABEL: @cmp_eq12( -; X64-NEXT: loadbb: -; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] -; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] -; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] -; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: ; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] ; X64: loadbb1: -; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* -; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* -; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP7:%.*]] = bitcast 
i8* [[Y]] to i32* ; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 -; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2 ; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] -; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]] +; X64-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]] ; X64: endblock: ; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index e2fdfe82b8c64..0371cd0372f49 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -391,6 +391,7 @@ int main(int argc, char **argv) { initializeTarget(Registry); // For codegen passes, only passes that do IR to IR transformation are // supported. + initializeExpandMemCmpPassPass(Registry); initializeScalarizeMaskedMemIntrinPass(Registry); initializeCodeGenPreparePass(Registry); initializeAtomicExpandPass(Registry); From 9e5188ca177ad10813f233bd693a57d73a90b86b Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Fri, 3 Nov 2017 14:15:08 +0000 Subject: [PATCH 049/238] [ADCE] Use MapVector for BlockInfo to make iteration order deterministic Summary: Also added a reserve() method to MapVector since we want to use that from ADCE. DenseMap does not provide deterministic iteration order so with that we will handle the members of BlockInfo in random order, eventually leading to random order of the blocks in the predecessor lists. Without this change, I get the same predecessor order in about 90% of the time when I compile a certain reproducer and in 10% I get a different one. No idea how to make a proper test case for this. Reviewers: kuhar, david2050 Reviewed By: kuhar Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39593 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317323 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/MapVector.h | 7 +++++++ lib/Transforms/Scalar/ADCE.cpp | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h index 26a555ee1d3bd..3d78f4b203c87 100644 --- a/include/llvm/ADT/MapVector.h +++ b/include/llvm/ADT/MapVector.h @@ -56,6 +56,13 @@ class MapVector { size_type size() const { return Vector.size(); } + /// Grow the MapVector so that it can contain at least \p NumEntries items + /// before resizing again. + void reserve(size_type NumEntries) { + Map.reserve(NumEntries); + Vector.reserve(NumEntries); + } + iterator begin() { return Vector.begin(); } const_iterator begin() const { return Vector.begin(); } iterator end() { return Vector.end(); } diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index f04d0f05ffc7e..1e683db50206c 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -118,7 +119,8 @@ class AggressiveDeadCodeElimination { PostDominatorTree &PDT; /// Mapping of blocks to associated information, an element in BlockInfoVec. 
-  DenseMap<BasicBlock *, BlockInfoType> BlockInfo;
+  /// Use MapVector to get deterministic iteration order.
+  MapVector<BasicBlock *, BlockInfoType> BlockInfo;
   bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
 
   /// Mapping of instructions to associated information.

From c9ed638d21437f805b89f805252ec78a59b22f96 Mon Sep 17 00:00:00 2001
From: Anna Thomas
Date: Fri, 3 Nov 2017 14:25:39 +0000
Subject: [PATCH 050/238] [LoopPredication] NFC: Refactored code to separate
 out functions being reused

Summary:
Refactored the code to separate out common functions that are being
reused. This is to reduce the diff for upcoming changes to loop
predication with reverse loops.

This refactoring is what we have in our downstream code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317324 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopPredication.cpp | 154 +++++++++++++---------
 1 file changed, 92 insertions(+), 62 deletions(-)

diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index e680fbed1138f..52dea3254e79e 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -189,6 +189,10 @@ class LoopPredication {
              const SCEV *Limit)
         : Pred(Pred), IV(IV), Limit(Limit) {}
     LoopICmp() {}
+    void dump() {
+      dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+             << ", Limit = " << *Limit << "\n";
+    }
   };

   ScalarEvolution *SE;
@@ -198,6 +202,7 @@ class LoopPredication {
   BasicBlock *Preheader;
   LoopICmp LatchCheck;

+  bool isSupportedStep(const SCEV* Step);
   Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) {
     return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0),
                          ICI->getOperand(1));
@@ -207,12 +212,18 @@ class LoopPredication {

   Optional<LoopICmp> parseLoopLatchICmp();

+  bool CanExpand(const SCEV* S);
   Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
                      ICmpInst::Predicate Pred, const SCEV *LHS,
                      const SCEV *RHS, Instruction *InsertAt);

   Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
                                         IRBuilder<> &Builder);
+  Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+                                                        LoopICmp RangeCheck,
+                                                        SCEVExpander &Expander,
+                                                        IRBuilder<> &Builder);
+
   bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);

   // When the IV type is wider than the range operand type, we can still do loop
@@ -348,6 +359,67 @@ LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {

   return NewLatchCheck;
 }

+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+  return Step->isOne();
+}
+
+bool LoopPredication::CanExpand(const SCEV* S) {
+  return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+    LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+    SCEVExpander &Expander, IRBuilder<> &Builder) {
+  auto *Ty = RangeCheck.IV->getType();
+  // Generate the widened condition for the forward loop:
+  //   guardStart u< guardLimit &&
+  //   latchLimit <pred> guardLimit - 1 - guardStart + latchStart
+  // where <pred> depends on the latch condition predicate. See the file
+  // header comment for the reasoning.
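+  // For example, with guardStart == latchStart == 0 and a ULT latch
+  // condition, the widened check below simplifies to
+  //   guardStart u< guardLimit && latchLimit u<= guardLimit - 1.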
+  const SCEV *GuardStart = RangeCheck.IV->getStart();
+  const SCEV *GuardLimit = RangeCheck.Limit;
+  const SCEV *LatchStart = LatchCheck.IV->getStart();
+  const SCEV *LatchLimit = LatchCheck.Limit;
+
+  // guardLimit - guardStart + latchStart - 1
+  const SCEV *RHS =
+      SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
+                     SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
+  if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+      !CanExpand(LatchLimit) || !CanExpand(RHS)) {
+    DEBUG(dbgs() << "Can't expand limit check!\n");
+    return None;
+  }
+  ICmpInst::Predicate LimitCheckPred;
+  switch (LatchCheck.Pred) {
+  case ICmpInst::ICMP_ULT:
+    LimitCheckPred = ICmpInst::ICMP_ULE;
+    break;
+  case ICmpInst::ICMP_ULE:
+    LimitCheckPred = ICmpInst::ICMP_ULT;
+    break;
+  case ICmpInst::ICMP_SLT:
+    LimitCheckPred = ICmpInst::ICMP_SLE;
+    break;
+  case ICmpInst::ICMP_SLE:
+    LimitCheckPred = ICmpInst::ICMP_SLT;
+    break;
+  default:
+    llvm_unreachable("Unsupported loop latch!");
+  }
+
+  DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+  DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+  DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+
+  Instruction *InsertAt = Preheader->getTerminator();
+  auto *LimitCheck =
+      expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
+  auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred,
+                                          GuardStart, GuardLimit, InsertAt);
+  return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
 /// If ICI can be widened to a loop invariant condition emits the loop
 /// invariant condition in the loop preheader and return it, otherwise
 /// returns None.
@@ -366,6 +438,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
     DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
     return None;
   }
+  DEBUG(dbgs() << "Guard check:\n");
+  DEBUG(RangeCheck->dump());
   if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
     DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
                  << ")!\n");
@@ -379,7 +453,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
   auto *Step = RangeCheckIV->getStepRecurrence(*SE);
   // We cannot just compare with latch IV step because the latch and range IVs
   // may have different types.
-  if (!Step->isOne()) {
+  if (!isSupportedStep(Step)) {
     DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
     return None;
   }
@@ -397,58 +471,9 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
   // value and type.
   assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) &&
          "Range and latch should have same step recurrence!");
-  // Generate the widened condition:
-  //   guardStart u< guardLimit &&
-  //   latchLimit <pred> guardLimit - 1 - guardStart + latchStart
-  // where <pred> depends on the latch condition predicate. See the file
-  // header comment for the reasoning.
-  const SCEV *GuardStart = RangeCheckIV->getStart();
-  const SCEV *GuardLimit = RangeCheck->Limit;
-  const SCEV *LatchStart = CurrLatchCheck.IV->getStart();
-  const SCEV *LatchLimit = CurrLatchCheck.Limit;
-
-  // guardLimit - guardStart + latchStart - 1
-  const SCEV *RHS =
-      SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
-                     SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
-
-  ICmpInst::Predicate LimitCheckPred;
-  switch (CurrLatchCheck.Pred) {
-  case ICmpInst::ICMP_ULT:
-    LimitCheckPred = ICmpInst::ICMP_ULE;
-    break;
-  case ICmpInst::ICMP_ULE:
-    LimitCheckPred = ICmpInst::ICMP_ULT;
-    break;
-  case ICmpInst::ICMP_SLT:
-    LimitCheckPred = ICmpInst::ICMP_SLE;
-    break;
-  case ICmpInst::ICMP_SLE:
-    LimitCheckPred = ICmpInst::ICMP_SLT;
-    break;
-  default:
-    llvm_unreachable("Unsupported loop latch!");
-  }
-  DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
-  DEBUG(dbgs() << "RHS: " << *RHS << "\n");
-  DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
-
-  auto CanExpand = [this](const SCEV *S) {
-    return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
-  };
-  if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
-      !CanExpand(LatchLimit) || !CanExpand(RHS)) {
-    DEBUG(dbgs() << "Can't expand limit check!\n");
-    return None;
-  }
-
-  Instruction *InsertAt = Preheader->getTerminator();
-  auto *LimitCheck =
-      expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
-  auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck->Pred,
-                                          GuardStart, GuardLimit, InsertAt);
-  return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+  return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+                                             Expander, Builder);
 }

 bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
@@ -541,15 +566,6 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
     return None;
   }

-  if (Result->Pred != ICmpInst::ICMP_ULT &&
-      Result->Pred != ICmpInst::ICMP_SLT &&
-      Result->Pred != ICmpInst::ICMP_ULE &&
-      Result->Pred != ICmpInst::ICMP_SLE) {
-    DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
-                 << ")!\n");
-    return None;
-  }
-
   // Check affine first, so if it's not we don't try to compute the step
   // recurrence.
   if (!Result->IV->isAffine()) {
     DEBUG(dbgs() << "The induction variable is not affine!\n");
     return None;
   }

   auto *Step = Result->IV->getStepRecurrence(*SE);
-  if (!Step->isOne()) {
+  if (!isSupportedStep(Step)) {
     DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
     return None;
   }

+  auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+    assert(Step->isOne() && "expected Step to be one!");
+    return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+           Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+  };
+
+  if (IsUnsupportedPredicate(Step, Result->Pred)) {
+    DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+                 << ")!\n");
+    return None;
+  }

   return Result;
 }

@@ -621,6 +648,9 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
     return false;
   LatchCheck = *LatchCheckOpt;

+  DEBUG(dbgs() << "Latch check:\n");
+  DEBUG(LatchCheck.dump());
+
   // Collect all the guards into a vector and process later, so as not
   // to invalidate the instruction iterator.
   SmallVector<IntrinsicInst *, 4> Guards;

From d1f487bc595728ae9c4dc3aa461a41470e19cf12 Mon Sep 17 00:00:00 2001
From: "Andrew V. Tischenko"
Date: Fri, 3 Nov 2017 15:25:13 +0000
Subject: [PATCH 051/238] Fix for Bug 34475 - LOCK/REP/REPNE prefixes emitted
 as instruction on their own.
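
The printers now emit a tab after the prefix instead of a newline, so a
prefixed instruction is rendered on a single line. For instance, the
disassembler output

	rep
	movsb	(%rsi), %es:(%rdi)

becomes

	rep	movsb	(%rsi), %es:(%rdi)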
Differential Revision: https://reviews.llvm.org/D39546 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317330 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/InstPrinter/X86ATTInstPrinter.cpp | 6 +- .../X86/InstPrinter/X86IntelInstPrinter.cpp | 6 +- test/CodeGen/X86/inline-asm-A-constraint.ll | 3 +- .../AddressSanitizer/X86/asm_rep_movs.ll | 6 +- test/MC/Disassembler/X86/prefixes-i386.txt | 78 +++++++------------ test/MC/Disassembler/X86/prefixes-x86_64.txt | 24 ++---- test/MC/Disassembler/X86/prefixes.txt | 66 ++++++---------- test/MC/Disassembler/X86/simple-tests.txt | 9 +-- 8 files changed, 68 insertions(+), 130 deletions(-) diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 6ff1136cd85a8..0c99dbbe328b1 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -54,12 +54,12 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, if (TSFlags & X86II::LOCK) OS << "\tlock\t"; if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK) - OS << "\tlock\n"; + OS << "\tlock\t"; if (Flags & X86::IP_HAS_REPEAT_NE) - OS << "\trepne\n"; + OS << "\trepne\t"; else if (Flags & X86::IP_HAS_REPEAT) - OS << "\trep\n"; + OS << "\trep\t"; // Output CALLpcrel32 as "callq" in 64-bit mode. // In Intel annotation it's always emitted as "call". diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 464941a1bab6b..1f02600a79827 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -41,13 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, uint64_t TSFlags = Desc.TSFlags; if (TSFlags & X86II::LOCK) - OS << "\tlock\n"; + OS << "\tlock\t"; unsigned Flags = MI->getFlags(); if (Flags & X86::IP_HAS_REPEAT_NE) - OS << "\trepne\n"; + OS << "\trepne\t"; else if (Flags & X86::IP_HAS_REPEAT) - OS << "\trep\n"; + OS << "\trep\t"; printInstruction(MI, OS); diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll index 2ad011e88e0d8..7975b318eff54 100644 --- a/test/CodeGen/X86/inline-asm-A-constraint.ll +++ b/test/CodeGen/X86/inline-asm-A-constraint.ll @@ -19,8 +19,7 @@ entry: %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 ret { i64, i64 } %.fca.1.insert } -; CHECK: lock -; CHECK-NEXT: cmpxchg16b +; CHECK: lock cmpxchg16b attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll index c3c2435fc87bb..1fc20febc947f 100644 --- a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll +++ b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll @@ -39,8 +39,7 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: [[B]]: ; CHECK-NEXT: popfq -; CHECK: rep -; CHECK-NEXT: movsb (%rsi), %es:(%rdi) +; CHECK: rep movsb (%rsi), 
%es:(%rdi) ; Function Attrs: nounwind sanitize_address uwtable define void @rep_movs_1b(i8* %dst, i8* %src, i64 %n) #0 { @@ -73,8 +72,7 @@ entry: ; CHECK: [[Q]]: ; CHECK-NEXT: popfq -; CHECK: rep -; CHECK-NEXT: movsq (%rsi), %es:(%rdi) +; CHECK: rep movsq (%rsi), %es:(%rdi) ; Function Attrs: nounwind sanitize_address uwtable define void @rep_movs_8b(i64* %dst, i64* %src, i64 %n) #0 { diff --git a/test/MC/Disassembler/X86/prefixes-i386.txt b/test/MC/Disassembler/X86/prefixes-i386.txt index ff2fb22387374..3152cc31aad1c 100644 --- a/test/MC/Disassembler/X86/prefixes-i386.txt +++ b/test/MC/Disassembler/X86/prefixes-i386.txt @@ -3,85 +3,59 @@ # CHECK: movl %fs:24, %eax 0x64 0xa1 0x18 0x00 0x00 0x00 # mov eax, dword ptr fs:[18h] -# CHECK: rep -# CHECK-NEXT: insb %dx, %es:(%edi) +# CHECK: rep insb %dx, %es:(%edi) 0xf3 0x6c #rep ins -# CHECK: rep -# CHECK-NEXT: insl %dx, %es:(%edi) +# CHECK: rep insl %dx, %es:(%edi) 0xf3 0x6d #rep ins -# CHECK: rep -# CHECK-NEXT: movsb (%esi), %es:(%edi) +# CHECK: rep movsb (%esi), %es:(%edi) 0xf3 0xa4 #rep movs -# CHECK: rep -# CHECK-NEXT: movsl (%esi), %es:(%edi) +# CHECK: rep movsl (%esi), %es:(%edi) 0xf3 0xa5 #rep movs -# CHECK: rep -# CHECK-NEXT: outsb (%esi), %dx +# CHECK: rep outsb (%esi), %dx 0xf3 0x6e #rep outs -# CHECK: rep -# CHECK-NEXT: outsl (%esi), %dx +# CHECK: rep outsl (%esi), %dx 0xf3 0x6f #rep outs -# CHECK: rep -# CHECK-NEXT: lodsb (%esi), %al +# CHECK: rep lodsb (%esi), %al 0xf3 0xac #rep lods -# CHECK: rep -# CHECK-NEXT: lodsl (%esi), %eax +# CHECK: rep lodsl (%esi), %eax 0xf3 0xad #rep lods -# CHECK: rep -# CHECK-NEXT: stosb %al, %es:(%edi) +# CHECK: rep stosb %al, %es:(%edi) 0xf3 0xaa #rep stos -# CHECK: rep -# CHECK-NEXT: stosl %eax, %es:(%edi) +# CHECK: rep stosl %eax, %es:(%edi) 0xf3 0xab #rep stos -# CHECK: rep -# CHECK-NEXT: cmpsb %es:(%edi), (%esi) +# CHECK: rep cmpsb %es:(%edi), (%esi) 0xf3 0xa6 #rep cmps -# CHECK: rep -# CHECK-NEXT: cmpsl %es:(%edi), (%esi) +# CHECK: rep cmpsl %es:(%edi), (%esi) 0xf3 0xa7 #repe cmps -# CHECK: rep -# CHECK-NEXT: scasb %es:(%edi), %al +# CHECK: rep scasb %es:(%edi), %al 0xf3 0xae #repe scas -# CHECK: rep -# CHECK-NEXT: scasl %es:(%edi), %eax +# CHECK: rep scasl %es:(%edi), %eax 0xf3 0xaf #repe scas -# CHECK: repne -# CHECK-NEXT: cmpsb %es:(%edi), (%esi) +# CHECK: repne cmpsb %es:(%edi), (%esi) 0xf2 0xa6 #repne cmps -# CHECK: repne -# CHECK-NEXT: cmpsl %es:(%edi), (%esi) +# CHECK: repne cmpsl %es:(%edi), (%esi) 0xf2 0xa7 #repne cmps -# CHECK: repne -# CHECK-NEXT: scasb %es:(%edi), %al +# CHECK: repne scasb %es:(%edi), %al 0xf2 0xae #repne scas -# CHECK: repne -# CHECK-NEXT: scasl %es:(%edi), %eax +# CHECK: repne scasl %es:(%edi), %eax 0xf2 0xaf #repne scas -# CHECK: repne -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: repne scasw %es:(%edi), %ax 0xf2 0x66 0xaf -# CHECK: repne -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: repne scasw %es:(%edi), %ax 0x66 0xf2 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: rep scasw %es:(%edi), %ax 0xf3 0x66 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%edi), %ax +# CHECK: rep scasw %es:(%edi), %ax 0x66 0xf3 0xaf -# CHECK: repne -# CHECK: insw %dx, %es:(%edi) +# CHECK: repne insw %dx, %es:(%edi) 0xf2 0x66 0x6d -# CHECK: repne -# CHECK: insw %dx, %es:(%edi) +# CHECK: repne insw %dx, %es:(%edi) 0x66 0xf2 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%edi) +# CHECK: rep insw %dx, %es:(%edi) 0xf3 0x66 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%edi) +# CHECK: rep insw %dx, %es:(%edi) 0x66 0xf3 0x6d diff --git a/test/MC/Disassembler/X86/prefixes-x86_64.txt 
b/test/MC/Disassembler/X86/prefixes-x86_64.txt index 7a9208f7b639d..c9bf512aa7586 100644 --- a/test/MC/Disassembler/X86/prefixes-x86_64.txt +++ b/test/MC/Disassembler/X86/prefixes-x86_64.txt @@ -9,30 +9,22 @@ # CHECK: mulsd %xmm7, %xmm7 0xf2 0x66 0x0f 0x59 0xff -# CHECK: repne -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: repne scasw %es:(%rdi), %ax 0xf2 0x66 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: repne scasw %es:(%rdi), %ax 0x66 0xf2 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: rep scasw %es:(%rdi), %ax 0xf3 0x66 0xaf -# CHECK: rep -# CHECK-NEXT: scasw %es:(%rdi), %ax +# CHECK: rep scasw %es:(%rdi), %ax 0x66 0xf3 0xaf -# CHECK: repne -# CHECK: insw %dx, %es:(%rdi) +# CHECK: repne insw %dx, %es:(%rdi) 0xf2 0x66 0x6d -# CHECK: repne -# CHECK: insw %dx, %es:(%rdi) +# CHECK: repne insw %dx, %es:(%rdi) 0x66 0xf2 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%rdi) +# CHECK: rep insw %dx, %es:(%rdi) 0xf3 0x66 0x6d -# CHECK: rep -# CHECK: insw %dx, %es:(%rdi) +# CHECK: rep insw %dx, %es:(%rdi) 0x66 0xf3 0x6d diff --git a/test/MC/Disassembler/X86/prefixes.txt b/test/MC/Disassembler/X86/prefixes.txt index 983e09670d681..75e11ae93f4cb 100644 --- a/test/MC/Disassembler/X86/prefixes.txt +++ b/test/MC/Disassembler/X86/prefixes.txt @@ -1,73 +1,53 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -# CHECK: rep -# CHECK-NEXT: insb %dx, %es:(%rdi) +# CHECK: rep insb %dx, %es:(%rdi) 0xf3 0x6c #rep ins -# CHECK: rep -# CHECK-NEXT: insl %dx, %es:(%rdi) +# CHECK: rep insl %dx, %es:(%rdi) 0xf3 0x6d #rep ins -# CHECK: rep -# CHECK-NEXT: movsb (%rsi), %es:(%rdi) +# CHECK: rep movsb (%rsi), %es:(%rdi) 0xf3 0xa4 #rep movs -# CHECK: rep -# CHECK-NEXT: movsl (%rsi), %es:(%rdi) +# CHECK: rep movsl (%rsi), %es:(%rdi) 0xf3 0xa5 #rep movs -# CHECK: rep -# CHECK-NEXT: outsb (%rsi), %dx +# CHECK: rep outsb (%rsi), %dx 0xf3 0x6e #rep outs -# CHECK: rep -# CHECK-NEXT: outsl (%rsi), %dx +# CHECK: rep outsl (%rsi), %dx 0xf3 0x6f #rep outs -# CHECK: rep -# CHECK-NEXT: lodsb (%rsi), %al +# CHECK: rep lodsb (%rsi), %al 0xf3 0xac #rep lods -# CHECK: rep -# CHECK-NEXT: lodsl (%rsi), %eax +# CHECK: rep lodsl (%rsi), %eax 0xf3 0xad #rep lods -# CHECK: rep -# CHECK-NEXT: stosb %al, %es:(%rdi) +# CHECK: rep stosb %al, %es:(%rdi) 0xf3 0xaa #rep stos -# CHECK: rep -# CHECK-NEXT: stosl %eax, %es:(%rdi) +# CHECK: rep stosl %eax, %es:(%rdi) 0xf3 0xab #rep stos -# CHECK: rep -# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi) +# CHECK: rep cmpsb %es:(%rdi), (%rsi) 0xf3 0xa6 #rep cmps -# CHECK: rep -# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi) +# CHECK: rep cmpsl %es:(%rdi), (%rsi) 0xf3 0xa7 #repe cmps -# CHECK: rep -# CHECK-NEXT: scasb %es:(%rdi), %al +# CHECK: rep scasb %es:(%rdi), %al 0xf3 0xae #repe scas -# CHECK: rep -# CHECK-NEXT: scasl %es:(%rdi), %eax +# CHECK: rep scasl %es:(%rdi), %eax 0xf3 0xaf #repe scas -# CHECK: repne -# CHECK-NEXT: cmpsb %es:(%rdi), (%rsi) +# CHECK: repne cmpsb %es:(%rdi), (%rsi) 0xf2 0xa6 #repne cmps -# CHECK: repne -# CHECK-NEXT: cmpsl %es:(%rdi), (%rsi) +# CHECK: repne cmpsl %es:(%rdi), (%rsi) 0xf2 0xa7 #repne cmps -# CHECK: repne -# CHECK-NEXT: scasb %es:(%rdi), %al +# CHECK: repne scasb %es:(%rdi), %al 0xf2 0xae #repne scas -# CHECK: repne -# CHECK-NEXT: scasl %es:(%rdi), %eax +# CHECK: repne scasl %es:(%rdi), %eax 0xf2 0xaf #repne scas # CHECK: lock -# CHECK-NEXT: orl $16, %fs:776 +# CHECK-NEXT: orl $16, %fs:776 0xf0 0x64 0x83 0x0c 0x25 0x08 0x03 0x00 0x00 0x10 # CHECK: movq %fs:768, %rdi 0x64 0x48 0x8b 0x3c 0x25 0x00 0x03 0x00 0x00 -# CHECK: rep -# CHECK-NEXT: 
stosq %rax, %es:(%rdi)
+# CHECK: rep stosq %rax, %es:(%rdi)
 0xf3 0x48 0xab
 
-# CHECK: rep
-# CHECK-NEXT: stosq %rax, %es:(%edi)
+# CHECK: rep stosq %rax, %es:(%edi)
 0xf3 0x67 0x48 0xab
 
 # CHECK: movl 32(%rbp), %eax
@@ -104,11 +84,9 @@
 0x66,0x83,0xc0,0xf4
 
 # Test that multiple redundant prefixes work (redundant, but valid x86).
-# CHECK: rep
-# CHECK-NEXT: stosq
+# CHECK: rep stosq
 0xf3 0xf3 0x48 0xab
 
-
 # Test that we can disassembler control registers above CR8
 # CHECK: movq %cr15, %rax
 0x44 0x0f 0x20 0xf8
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index 86d9f92fbbfa4..390749341647a 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -851,14 +851,11 @@
 0xf0 0x48 0x0f 0xc1 0xcb
 
 # rdar://13493622 lldb doesn't print the x86 rep/repne prefix when disassembling
-# CHECK: repne
-# CHECK-NEXT: movsl
+# CHECK: repne movsl
 0xf2 0xa5
 
-# CHECK: repne
-# CHECK-NEXT: movsq
+# CHECK: repne movsq
 0xf2 0x48 0xa5
 
-# CHECK: repne
-# CHECK-NEXT: movb $0, (%rax)
+# CHECK: repne movb $0, (%rax)
 0xf2 0xc6 0x0 0x0
 
 # rdar://11019859 Support 2013 Haswell RTM instructions and HLE prefixes

From 876a9b9b65e9035aaf1b22739a2b9c8d9698e242 Mon Sep 17 00:00:00 2001
From: Simon Dardis
Date: Fri, 3 Nov 2017 15:35:13 +0000
Subject: [PATCH 052/238] [mips] Match 'ins' and its variants with C++ code

Change the ISel matching of 'ins', 'dins[mu]' from tablegen code to
C++ code. This resolves an issue where ISel would select 'dins' instead
of 'dinsm' when the instruction's size and position were individually in
range but their sum was out of range according to the ISA specification.

Reviewers: atanasyan

Differential Revision: https://reviews.llvm.org/D39117

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317331 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MicroMips64r6InstrInfo.td |  7 ++-
 lib/Target/Mips/MicroMipsInstrInfo.td     |  2 +-
 lib/Target/Mips/Mips64InstrInfo.td        |  6 +--
 lib/Target/Mips/MipsInstrInfo.td          |  9 ++--
 lib/Target/Mips/MipsSEISelDAGToDAG.cpp    | 58 +++++++++++++++++++++++
 test/CodeGen/Mips/dins.ll                 | 14 ++++--
 6 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
index e0f4d83339262..4f705feed0aaf 100644
--- a/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -162,12 +162,11 @@ class DCLZ_MM64R6_DESC {
 class DINSU_MM64R6_DESC : InsBase<"dinsu", GPR64Opnd, uimm5_plus32,
                                   uimm5_inssize_plus1, immZExt5Plus32,
-                                  immZExt5Plus1, MipsIns>;
+                                  immZExt5Plus1>;
 class DINSM_MM64R6_DESC : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64,
-                                  immZExt5, immZExtRange2To64, MipsIns>;
+                                  immZExt5, immZExtRange2To64>;
 class DINS_MM64R6_DESC : InsBase<"dins", GPR64Opnd, uimm5_report_uimm6,
-                                 uimm5_inssize_plus1, immZExt5, immZExt5Plus1,
-                                 MipsIns>;
+                                 uimm5_inssize_plus1, immZExt5, immZExt5Plus1>;
 class DMTC0_MM64R6_DESC : MTC0_MMR6_DESC_BASE<"dmtc0", COP0Opnd, GPR64Opnd,
                                               II_DMTC0>;
 class DMTC1_MM64R6_DESC : MTC1_MMR6_DESC_BASE<"dmtc1", FGR64Opnd, GPR64Opnd,
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 1f869db4efee2..90399ddfab577 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -884,7 +884,7 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
                               immZExt5, immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
   def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
-                              immZExt5, immZExt5Plus1, MipsIns>,
+                              immZExt5, immZExt5Plus1>,
                               EXT_FM_MM<0x0c>;
 
   /// Jump Instructions
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 04a050c2ff4e6..dbd47de4dad15 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -341,13 +341,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
   // for dinsm and dinsu like binutils.
   let DecoderMethod = "DecodeDINS" in {
     def DINS : InsBase<"dins", GPR64Opnd, uimm6, uimm5_inssize_plus1,
-                       immZExt5, immZExt5Plus1, MipsIns>, EXT_FM<7>,
+                       immZExt5, immZExt5Plus1>, EXT_FM<7>,
                ISA_MIPS64R2;
     def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32, uimm5_inssize_plus1,
-                        immZExt5Plus32, immZExt5Plus1, MipsIns>,
+                        immZExt5Plus32, immZExt5Plus1>,
                 EXT_FM<6>, ISA_MIPS64R2;
     def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64,
-                        immZExt5, immZExtRange2To64, MipsIns>,
+                        immZExt5, immZExtRange2To64>,
                 EXT_FM<5>, ISA_MIPS64R2;
   }
 }
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index c4c3eb760c57c..ac4980e99a7e4 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -1726,12 +1726,13 @@ class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
          II_EXT, FrmR, opstr>, ISA_MIPS32R2;
 
-class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
-              Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm,
-              SDPatternOperator Op = null_frag>:
+// 'ins' and its 64 bit variants are matched by C++ code.
+class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm>:
   InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size, RO:$src),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
-         [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size, RO:$src))],
+         [(set RO:$rt, (null_frag RO:$rs, PosImm:$pos, SizeImm:$size,
+                                  RO:$src))],
          II_INS, FrmR, opstr>, ISA_MIPS32R2 {
   let Constraints = "$src = $rt";
 }
@@ -2236,7 +2237,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
                EXT_FM<0>;
   def INS : MMRel, StdMMR6Rel,
             InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1, immZExt5,
-                    immZExt5Plus1, MipsIns>,
+                    immZExt5Plus1>,
             EXT_FM<4>;
 }
 /// Move Control Registers From/To CPU Registers
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 283fcaa73a7ac..3c6a7d7a66510 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -905,6 +905,64 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
     break;
   }
 
+  // Manually match MipsISD::Ins nodes to get the correct instruction. It has
+  // to be done in this fashion so that we respect the differences between
+  // dins and dinsm, as the difference is that the size operand has the range
+  // 0 < size <= 32 for dins while dinsm has the range 2 <= size <= 64 which
+  // means SelectionDAGISel would have to test all the operands at once to
+  // match the instruction.
+  case MipsISD::Ins: {
+
+    // Sanity checking for the node operands.
+    if (Node->getValueType(0) != MVT::i32 && Node->getValueType(0) != MVT::i64)
+      return false;
+
+    if (Node->getNumOperands() != 4)
+      return false;
+
+    if (Node->getOperand(1)->getOpcode() != ISD::Constant ||
+        Node->getOperand(2)->getOpcode() != ISD::Constant)
+      return false;
+
+    MVT ResTy = Node->getSimpleValueType(0);
+    uint64_t Pos = Node->getConstantOperandVal(1);
+    uint64_t Size = Node->getConstantOperandVal(2);
+
+    // Size has to be >0 for 'ins', 'dins' and 'dinsu'.
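+    // In short, the selection below rejects size == 0 and pos + size > 64;
+    // i32 results use INS when pos + size <= 32, and i64 results use DINS
+    // when pos + size <= 32, DINSM when pos < 32 with size >= 2, and DINSU
+    // when pos >= 32.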
+ if (!Size) + return false; + + if (Pos + Size > 64) + return false; + + if (ResTy != MVT::i32 && ResTy != MVT::i64) + return false; + + unsigned Opcode = 0; + if (ResTy == MVT::i32) { + if (Pos + Size <= 32) + Opcode = Mips::INS; + } else { + if (Pos + Size <= 32) + Opcode = Mips::DINS; + else if (Pos < 32 && 1 < Size) + Opcode = Mips::DINSM; + else + Opcode = Mips::DINSU; + } + + if (Opcode) { + SDValue Ops[4] = { + Node->getOperand(0), CurDAG->getTargetConstant(Pos, DL, MVT::i32), + CurDAG->getTargetConstant(Size, DL, MVT::i32), Node->getOperand(3)}; + + ReplaceNode(Node, CurDAG->getMachineNode(Opcode, DL, ResTy, Ops)); + return true; + } + + return false; + } + case MipsISD::ThreadPointer: { EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout()); unsigned RdhwrOpc, DestReg; diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll index 8a8b377861aee..2f7138ca4c5db 100644 --- a/test/CodeGen/Mips/dins.ll +++ b/test/CodeGen/Mips/dins.ll @@ -1,7 +1,11 @@ -; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2 -; RUN: llc -O2 -march=mips -mcpu=mips32r2 < %s -o - | FileCheck %s -check-prefix=MIPS32R2 -; RUN: llc -O2 -march=mips -mattr=mips16 < %s -o - | FileCheck %s -check-prefix=MIPS16 -; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32 +; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \ +; RUN: -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2 +; RUN: llc -O2 -verify-machineinstrs -march=mips -mcpu=mips32r2 < %s -o - \ +; RUN: | FileCheck %s -check-prefix=MIPS32R2 +; RUN: llc -O2 -verify-machineinstrs -march=mips -mattr=mips16 < %s -o - \ +; RUN: | FileCheck %s -check-prefix=MIPS16 +; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \ +; RUN: -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32 ; #include ; #include @@ -60,7 +64,7 @@ entry: ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123 ; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4 -; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 +; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: dinsu $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14 ; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50 From d16b502afd11c5c7f2883da31b63460eea106ae7 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 3 Nov 2017 16:17:13 +0000 Subject: [PATCH 053/238] [SLP] Test for PR23510, NFC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317334 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SLPVectorizer/X86/stores_vectorize.ll     | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 test/Transforms/SLPVectorizer/X86/stores_vectorize.ll

diff --git a/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
new file mode 100644
index 0000000000000..79fb782db8f58
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+
+;void Distance(float *p1, int p2, unsigned long p3[], float p4[]) {
+;  long a = p3[0] = 5;
+;  p1 += p2;
+;  p4[3] += p1[a];
+;  p3[0] >>= 5;
+;  p3[1] >>= 5;
+;  p3[2] >>= 5;
+;  p3[3] >>= 5;
+;  p1 += p2;
+;  p4[0] += p1[p3[0] & a];
+;}
+
+define void @_Z8DistanceIlLi5EEvPfiPmS0_(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @_Z8DistanceIlLi5EEvPfiPmS0_(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[P3]], align 8
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i64 [[TMP2]], 5
+; CHECK-NEXT:    store i64 [[SHR]], i64* [[P3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <2 x i64> [[TMP4]], <i64 5, i64 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT:    store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[SHR]], 5
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[P4]], align 4
+; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    store float [[ADD15]], float* [[P4]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i64 5, i64* %p3, align 8
+  %idx.ext = sext i32 %p2 to i64
+  %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+  %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+  %0 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx2, align 4
+  %2 = load i64, i64* %p3, align 8
+  %shr = lshr i64 %2, 5
+  store i64 %shr, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %3 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %3, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %4 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %4, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %5 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %5, 5
+  store i64 %shr9, i64* %arrayidx8, align 8
+  %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+  %and = and i64 %shr, 5
+  %arrayidx13 = getelementptr inbounds float, float* %add.ptr11, i64 %and
+  %6 = load float, float* %arrayidx13, align 4
+  %7 = load float, float* %p4, align 4
+  %add15 = fadd float %6, %7
+  store float %add15, float* %p4, align 4
+  ret void
+}

From 7c2eb4ec8b267bb3887787bf8e2afe800a72828a Mon Sep 17 00:00:00 2001
From: Jun Bum Lim
Date: Fri, 3 Nov 2017 16:24:53 +0000
Subject: [PATCH 054/238] [LICM] sink through non-trivially replicable PHI

Summary:
The current LICM allows sinking an instruction only when it is exposed to
exit blocks through a trivially replacable PHI of which all incoming values
are the same instruction. This change enhances LICM to sink a sinkable
instruction through non-trivially replacable PHIs by splitting predecessors
of loop exits.

Reviewers: hfinkel, majnemer, davidxl, bmakam, mcrosier, danielcdh, efriedma, jtony

Reviewed By: efriedma

Subscribers: nemanjai, dberlin, llvm-commits

Differential Revision: https://reviews.llvm.org/D37163

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317335 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LICM.cpp          | 196 +++++++++++-----
 test/CodeGen/PowerPC/subreg-postra-2.ll |   8 +-
 test/Transforms/LICM/sinking.ll         | 284 +++++++++++++++++++++++-
 3 files changed, 427 insertions(+), 61 deletions(-)

diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6ca8d602302b7..c60ec9f50f7af 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -62,6 +62,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -93,9 +94,8 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
 static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
                   const LoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
@@ -394,8 +394,12 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       //
       if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
          canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
-        ++II;
-        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
+        if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE)) {
+          ++II;
+          CurAST->deleteValue(&I);
+          I.eraseFromParent();
+          Changed = true;
+        }
       }
     }
   }
@@ -717,26 +721,6 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
       if (!BlockColors.empty() &&
           BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
         return false;
-
-      // A PHI node where all of the incoming values are this instruction are
-      // special -- they can just be RAUW'ed with the instruction and thus
-      // don't require a use in the predecessor. This is a particular important
-      // special case because it is the pattern found in LCSSA form.
-      if (isTriviallyReplacablePHI(*PN, I)) {
-        if (CurLoop->contains(PN))
-          return false;
-        else
-          continue;
-      }
-
-      // Otherwise, PHI node uses occur in predecessor blocks if the incoming
-      // values. Check for such a use being inside the loop.
-      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-        if (PN->getIncomingValue(i) == &I)
-          if (CurLoop->contains(PN->getIncomingBlock(i)))
-            return false;
-
-      continue;
     }

     if (CurLoop->contains(UI))
@@ -806,14 +790,96 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }

+static Instruction *sinkThroughTriviallyReplacablePHI(
+    PHINode *TPN, Instruction *I, LoopInfo *LI,
+    SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+    const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+  assert(isTriviallyReplacablePHI(*TPN, *I) &&
+         "Expect only trivially replacable PHI");
+  BasicBlock *ExitBlock = TPN->getParent();
+  Instruction *New;
+  auto It = SunkCopies.find(ExitBlock);
+  if (It != SunkCopies.end())
+    New = It->second;
+  else
+    New = SunkCopies[ExitBlock] =
+        CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+  return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+  if (!BB->canSplitPredecessors())
+    return false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *BBPred = *PI;
+    if (isa<IndirectBrInst>(BBPred->getTerminator()))
+      return false;
+  }
+  return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+                                        LoopInfo *LI, const Loop *CurLoop) {
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+  BasicBlock *ExitBB = PN->getParent();
+  assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+  // exposed to exit blocks through trivially replacable PHIs while keeping the
+  // loop in the canonical form where each predecessor of each exit block
+  // should be contained within the loop. For example, this will convert the
+  // loop below from
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE, %LB1
+  // LE:
+  //   %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+  //
+  // to
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE.split, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE.split2, %LB1
+  // LE.split:
+  //   %p1 = phi [%v1, %LB1] <-- trivially replacable
+  //   br %LE
+  // LE.split2:
+  //   %p2 = phi [%v2, %LB2] <-- trivially replacable
+  //   br %LE
+  // LE:
+  //   %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+  //
+  SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+  while (!PredBBs.empty()) {
+    BasicBlock *PredBB = *PredBBs.begin();
+    assert(CurLoop->contains(PredBB) &&
+           "Expect all predecessors are in the loop");
+    if (PN->getBasicBlockIndex(PredBB) >= 0)
+      SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+    PredBBs.remove(PredBB);
+  }
+}
+
 /// When an instruction is found to only be used outside of the loop, this
 /// function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
 /// position, and may either delete it or move it to outside of the loop.
 ///
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
   ORE->emit([&]() {
@@ -828,57 +894,75 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
   ++NumSunk;
   Changed = true;

-#ifndef NDEBUG
-  SmallVector<BasicBlock *, 32> ExitBlocks;
-  CurLoop->getUniqueExitBlocks(ExitBlocks);
-  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
-                                             ExitBlocks.end());
-#endif
+  // Iterate over users to be ready for actual sinking. Replace users in
+  // unreachable blocks with undef and make all user PHIs trivially replacable.
+  SmallPtrSet<Instruction *, 8> VisitedUsers;
+  for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+    auto *User = cast<Instruction>(*UI);
+    Use &U = UI.getUse();
+    ++UI;

-  // Clones of this instruction. Don't create more than one per exit block!
-  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+    if (VisitedUsers.count(User))
+      continue;

-  // If this instruction is only used outside of the loop, then all users are
-  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
-  // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
-    auto *User = cast<Instruction>(*UI);
     if (!DT->isReachableFromEntry(User->getParent())) {
       User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
       continue;
     }
+
     // The user must be a PHI node.
     PHINode *PN = cast<PHINode>(User);

     // Surprisingly, instructions can be used outside of loops without any
     // exits.  This can only happen in PHI nodes if the incoming block is
     // unreachable.
-    Use &U = UI.getUse();
     BasicBlock *BB = PN->getIncomingBlock(U);
     if (!DT->isReachableFromEntry(BB)) {
       U = UndefValue::get(I.getType());
       continue;
     }

-    BasicBlock *ExitBlock = PN->getParent();
-    assert(ExitBlockSet.count(ExitBlock) &&
-           "The LCSSA PHI is not in an exit block!");
+    VisitedUsers.insert(PN);
+    if (isTriviallyReplacablePHI(*PN, I))
+      continue;

-    Instruction *New;
-    auto It = SunkCopies.find(ExitBlock);
-    if (It != SunkCopies.end())
-      New = It->second;
-    else
-      New = SunkCopies[ExitBlock] =
-          CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+    if (!canSplitPredecessors(PN))
+      return false;
+
+    // Split predecessors of the PHI so that we can make users trivially
+    // replacable.
+    splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop);
+
+    // Should rebuild the iterators, as they may be invalidated by
+    // splitPredecessorsOfLoopExit().
+    UI = I.user_begin();
+    UE = I.user_end();
+  }
+
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+
+  // Clones of this instruction. Don't create more than one per exit block!
+  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+  // If this instruction is only used outside of the loop, then all users are
+  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+  // the instruction.
+  while (!I.use_empty()) {
+    Value::user_iterator UI = I.user_begin();
+    PHINode *PN = cast<PHINode>(*UI);
+    assert(ExitBlockSet.count(PN->getParent()) &&
+           "The LCSSA PHI is not in an exit block!");
+    // The PHI must be trivially replacable.
+    Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
+                                                         SafetyInfo, CurLoop);
     PN->replaceAllUsesWith(New);
     PN->eraseFromParent();
   }
-
-  CurAST->deleteValue(&I);
-  I.eraseFromParent();
   return Changed;
 }

diff --git a/test/CodeGen/PowerPC/subreg-postra-2.ll b/test/CodeGen/PowerPC/subreg-postra-2.ll
index 338000cd8bae6..794c9c190d1c6 100644
--- a/test/CodeGen/PowerPC/subreg-postra-2.ll
+++ b/test/CodeGen/PowerPC/subreg-postra-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gep-opt=0 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false -ppc-gep-opt=0 < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -38,10 +38,10 @@ while.end418:                                     ; preds = %wait_on_buffer.exit
 ; CHECK: stdcx.
 ; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]]
 ; CHECK-NO-ISEL: bc 12, 20, [[TRUE:.LBB[0-9]+]]
-; CHECK-NO-ISEL: ori 4, 7, 0
+; CHECK-NO-ISEL: ori 7, 8, 0
 ; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]]
 ; CHECK-NO-ISEL: [[TRUE]]
-; CHECK-NO-ISEL-NEXT: addi 4, 3, 0
+; CHECK-NO-ISEL: addi 7, 3, 0
 
 if.then420:                                       ; preds = %while.end418
   unreachable
diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll
index 6e9e8d4b7b6f4..b28eea0bc2aa6 100644
--- a/test/Transforms/LICM/sinking.ll
+++ b/test/Transforms/LICM/sinking.ll
@@ -392,6 +392,288 @@ lab60:
   indirectbr i8* undef, [label %lab21, label %lab19]
 }
 
-declare void @f(i32*)
+; Check if LICM can sink a sinkable instruction to the exit blocks through
+; a non-trivially replacable PHI node.
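+; (A PHI is trivially replacable by an instruction I only when every incoming
+; value of the PHI is I itself, as in LCSSA form; the PHIs below mix %sink.mul
+; and %sink.sub, so the predecessors of the exit block must be split first.)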
+; +; CHECK-LABEL: @test14 +; CHECK-LABEL: Loop: +; CHECK-NOT: mul +; CHECK-NOT: sub +; +; CHECK-LABEL: Out12.split.loop.exit: +; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ] +; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]] +; CHECK: br label %Out12 +; +; CHECK-LABEL: Out12.split.loop.exit1: +; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ] +; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]] +; CHECK: %[[SUB:.*]] = sub i32 %[[MUL2]], %N +; CHECK: br label %Out12 +; +; CHECK-LABEL: Out12: +; CHECK: phi i32 [ %[[MUL]], %Out12.split.loop.exit ], [ %[[SUB]], %Out12.split.loop.exit1 ] +define i32 @test14(i32 %N, i32 %N2, i1 %C) { +Entry: + br label %Loop +Loop: + %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ] + %sink.mul = mul i32 %N, %N_addr.0.pn + %sink.sub = sub i32 %sink.mul, %N + %dec = add i32 %N_addr.0.pn, -1 + br i1 %C, label %ContLoop, label %Out12 +ContLoop: + %tmp.1 = icmp ne i32 %N_addr.0.pn, 1 + br i1 %tmp.1, label %Loop, label %Out12 +Out12: + %tmp = phi i32 [%sink.mul, %ContLoop], [%sink.sub, %Loop] + ret i32 %tmp +} + +; In this test, splitting predecessors is not really required because the +; operations of sinkable instructions (sub and mul) are same. In this case, we +; can sink the same sinkable operations and modify the PHI to pass the operands +; to the shared operations. As of now, we split predecessors of non-trivially +; replicalbe PHIs by default in LICM because all incoming edges of a +; non-trivially replacable PHI in LCSSA is critical. +; +; CHECK-LABEL: @test15 +; CHECK-LABEL: Loop: +; CHECK-NOT: mul +; CHECK-NOT: sub +; +; CHECK-LABEL: Out12.split.loop.exit: +; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ] +; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]] +; CHECK: %[[SUB:.*]] = sub i32 %[[MUL]], %N2 +; CHECK: br label %Out12 +; +; CHECK-LABEL: Out12.split.loop.exit1: +; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ] +; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]] +; CHECK: %[[SUB2:.*]] = sub i32 %[[MUL2]], %N +; CHECK: br label %Out12 +; +; CHECK-LABEL: Out12: +; CHECK: phi i32 [ %[[SUB]], %Out12.split.loop.exit ], [ %[[SUB2]], %Out12.split.loop.exit1 ] +define i32 @test15(i32 %N, i32 %N2, i1 %C) { +Entry: + br label %Loop +Loop: + %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ] + %sink.mul = mul i32 %N, %N_addr.0.pn + %sink.sub = sub i32 %sink.mul, %N + %sink.sub2 = sub i32 %sink.mul, %N2 + %dec = add i32 %N_addr.0.pn, -1 + br i1 %C, label %ContLoop, label %Out12 +ContLoop: + %tmp.1 = icmp ne i32 %N_addr.0.pn, 1 + br i1 %tmp.1, label %Loop, label %Out12 +Out12: + %tmp = phi i32 [%sink.sub2, %ContLoop], [%sink.sub, %Loop] + ret i32 %tmp +} + +; Sink through a non-trivially replacable PHI node which use the same sinkable +; instruction multiple times. 
+;
+; CHECK-LABEL: @test16
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL: Out.split.loop.exit:
+; CHECK: %[[PHI:.*]] = phi i32 [ %l2, %ContLoop ]
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out.split.loop.exit1:
+; CHECK: %[[SINKABLE:.*]] = mul i32 %l2.lcssa, %t.le
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out:
+; CHECK: %idx = phi i32 [ %[[PHI]], %Out.split.loop.exit ], [ %[[SINKABLE]], %Out.split.loop.exit1 ]
+define i32 @test16(i1 %c, i8** %P, i32* %P2, i64 %V) {
+entry:
+  br label %loop.ph
+loop.ph:
+  br label %Loop
+Loop:
+  %iv = phi i64 [ 0, %loop.ph ], [ %next, %ContLoop ]
+  %l2 = call i32 @getv()
+  %t = trunc i64 %iv to i32
+  %sinkable = mul i32 %l2, %t
+  switch i32 %l2, label %ContLoop [
+    i32 32, label %Out
+    i32 46, label %Out
+    i32 95, label %Out
+  ]
+ContLoop:
+  %next = add nuw i64 %iv, 1
+  %c1 = call i1 @getc()
+  br i1 %c1, label %Loop, label %Out
+Out:
+  %idx = phi i32 [ %l2, %ContLoop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ]
+  ret i32 %idx
+}
+
+; Sink a sinkable instruction through multiple non-trivially replaceable PHIs
+; in different exit blocks.
+;
+; CHECK-LABEL: @test17
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL: OutA.split.loop.exit{{.*}}:
+; CHECK: %[[OP1:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop1 ]
+; CHECK: %[[SINKABLE:.*]] = mul i32 %N, %[[OP1]]
+; CHECK: br label %OutA
+;
+; CHECK-LABEL: OutA:
+; CHECK: phi i32{{.*}}[ %[[SINKABLE]], %OutA.split.loop.exit{{.*}} ]
+;
+; CHECK-LABEL: OutB.split.loop.exit{{.*}}:
+; CHECK: %[[OP2:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop2 ]
+; CHECK: %[[SINKABLE2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK: br label %OutB
+;
+; CHECK-LABEL: OutB:
+; CHECK: phi i32 {{.*}}[ %[[SINKABLE2]], %OutB.split.loop.exit{{.*}} ]
+define i32 @test17(i32 %N, i32 %N2) {
+Entry:
+  br label %Loop
+Loop:
+  %N_addr.0.pn = phi i32 [ %dec, %ContLoop3 ], [ %N, %Entry ]
+  %sink.mul = mul i32 %N, %N_addr.0.pn
+  %c0 = call i1 @getc()
+  br i1 %c0, label %ContLoop1, label %OutA
+ContLoop1:
+  %c1 = call i1 @getc()
+  br i1 %c1, label %ContLoop2, label %OutA
+
+ContLoop2:
+  %c2 = call i1 @getc()
+  br i1 %c2, label %ContLoop3, label %OutB
+ContLoop3:
+  %c3 = call i1 @getc()
+  %dec = add i32 %N_addr.0.pn, -1
+  br i1 %c3, label %Loop, label %OutB
+OutA:
+  %tmp1 = phi i32 [%sink.mul, %ContLoop1], [%N2, %Loop]
+  br label %Out12
+OutB:
+  %tmp2 = phi i32 [%sink.mul, %ContLoop2], [%dec, %ContLoop3]
+  br label %Out12
+Out12:
+  %tmp = phi i32 [%tmp1, %OutA], [%tmp2, %OutB]
+  ret i32 %tmp
+}
+
+
+; Sink a sinkable instruction through both trivially and non-trivially
+; replaceable PHIs.
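+; Below, %tmp1 is trivially replaceable for %sink.sub (both of its incoming
+; values are %sink.sub), while %tmp2 also merges %dec and is not, so this
+; test exercises both the direct RAUW path and the predecessor-splitting
+; path on the same exit block.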
+;
+; CHECK-LABEL: @test18
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[OP:.*]] = phi i32 [ %iv, %ContLoop ]
+; CHECK: %[[DEC:.*]] = phi i32 [ %dec, %ContLoop ]
+; CHECK: %[[SINKMUL:.*]] = mul i32 %N, %[[OP]]
+; CHECK: %[[SINKSUB:.*]] = sub i32 %[[SINKMUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[OP2:.*]] = phi i32 [ %iv, %Loop ]
+; CHECK: %[[SINKMUL2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK: %[[SINKSUB2:.*]] = sub i32 %[[SINKMUL2]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: %tmp1 = phi i32 [ %[[SINKSUB]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK: %tmp2 = phi i32 [ %[[DEC]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK: %add = add i32 %tmp1, %tmp2
+define i32 @test18(i32 %N, i32 %N2) {
+Entry:
+  br label %Loop
+Loop:
+  %iv = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+  %sink.mul = mul i32 %N, %iv
+  %sink.sub = sub i32 %sink.mul, %N2
+  %c0 = call i1 @getc()
+  br i1 %c0, label %ContLoop, label %Out12
+ContLoop:
+  %dec = add i32 %iv, -1
+  %c1 = call i1 @getc()
+  br i1 %c1, label %Loop, label %Out12
+Out12:
+  %tmp1 = phi i32 [%sink.sub, %ContLoop], [%sink.sub, %Loop]
+  %tmp2 = phi i32 [%dec, %ContLoop], [%sink.sub, %Loop]
+  %add = add i32 %tmp1, %tmp2
+  ret i32 %add
+}
+
+; Do not sink an instruction through a non-trivially replaceable PHI, to
+; avoid an assert while splitting predecessors, if the terminator of the
+; predecessor is an indirectbr.
+; CHECK-LABEL: @test19
+; CHECK-LABEL: L0:
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+
+define i32 @test19(i1 %cond, i1 %cond2, i8* %address, i32 %v1) nounwind {
+entry:
+  br label %L0
+L0:
+  %indirect.goto.dest = select i1 %cond, i8* blockaddress(@test19, %exit), i8* %address
+  %v2 = call i32 @getv()
+  %sinkable = mul i32 %v1, %v2
+  %sinkable2 = add i32 %v1, %v2
+  indirectbr i8* %indirect.goto.dest, [label %L1, label %exit]
+
+L1:
+  %indirect.goto.dest2 = select i1 %cond2, i8* blockaddress(@test19, %exit), i8* %address
+  indirectbr i8* %indirect.goto.dest2, [label %L0, label %exit]
+
+exit:
+  %r = phi i32 [%sinkable, %L0], [%sinkable2, %L1]
+  ret i32 %r
+}
+
+; Do not sink through a non-trivially replaceable PHI if splitting
+; predecessors is not allowed by SplitBlockPredecessors().
+;
+; CHECK-LABEL: @test20
+; CHECK-LABEL: while.cond
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+define void @test20(i32* %s, i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %while.cond
+while.cond:
+  %v = call i32 @getv()
+  %sinkable = mul i32 %v, %v2
+  %sinkable2 = add i32 %v, %v2
+  br i1 %b, label %try.cont, label %while.body
+while.body:
+  invoke void @may_throw()
+          to label %while.body2 unwind label %catch.dispatch
+while.body2:
+  invoke void @may_throw2()
+          to label %while.cond unwind label %catch.dispatch
+catch.dispatch:
+  %.lcssa1 = phi i32 [ %sinkable, %while.body ], [ %sinkable2, %while.body2 ]
+  %cp = cleanuppad within none []
+  store i32 %.lcssa1, i32* %s
+  cleanupret from %cp unwind to caller
+try.cont:
+  ret void
+}
+
+declare void @may_throw()
+declare void @may_throw2()
+declare i32 @__CxxFrameHandler3(...)
+declare i32 @getv() +declare i1 @getc() +declare void @f(i32*) declare void @g() From 604f04f397ea185b505dcc4ea8cd16bce7ccbbea Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 3 Nov 2017 18:00:02 +0000 Subject: [PATCH 055/238] Invoke salvageDebugInfo from CodeGenPrepare's SinkCast() This preserves the debug info for the cast operation in the original location. rdar://problem/33460652 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317340 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 1 + lib/Transforms/Utils/Local.cpp | 2 +- .../CodeGenPrepare/salvage-debug-info.ll | 118 ++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 test/Transforms/CodeGenPrepare/salvage-debug-info.ll diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 973ddebd987cf..73f014704b879 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1171,6 +1171,7 @@ static bool SinkCast(CastInst *CI) { // If we removed all uses, nuke the cast. if (CI->use_empty()) { + salvageDebugInfo(*CI); CI->eraseFromParent(); MadeChange = true; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 8c643c93ec4dc..cb7978f76aa0b 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1366,7 +1366,7 @@ void llvm::salvageDebugInfo(Instruction &I) { return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V)); }; - if (isa(&I)) { + if (isa(&I) || isa(&I)) { findDbgValues(DbgValues, &I); for (auto *DVI : DbgValues) { // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value diff --git a/test/Transforms/CodeGenPrepare/salvage-debug-info.ll b/test/Transforms/CodeGenPrepare/salvage-debug-info.ll new file mode 100644 index 0000000000000..5509b92a5c130 --- /dev/null +++ b/test/Transforms/CodeGenPrepare/salvage-debug-info.ll @@ -0,0 +1,118 @@ +; RUN: opt -codegenprepare -S %s -o - | FileCheck %s +; typedef struct info { +; unsigned long long size; +; } info_t; +; extern unsigned p; +; extern unsigned n; +; void f() { +; unsigned int i; +; if (p) { +; info_t *info = (info_t *)p; +; for (i = 0; i < n; i++) +; use(info[i].size); +; } +; } +source_filename = "debug.i" +target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128" +target triple = "thumbv7k-apple-ios10.0.0" + +%struct.info = type { i64 } + +@p = external local_unnamed_addr global i32, align 4 +@n = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind ssp uwtable +define void @f() local_unnamed_addr #0 !dbg !16 { +entry: + %0 = load i32, i32* @p, align 4, !dbg !25 + %tobool = icmp eq i32 %0, 0, !dbg !25 + br i1 %tobool, label %if.end, label %if.then, !dbg !26 + +if.then: ; preds = %entry + %1 = inttoptr i32 %0 to %struct.info*, !dbg !27 + tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28 + ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression()) + tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29 + %2 = load i32, i32* @n, align 4, !dbg !30 + %cmp5 = icmp eq i32 %2, 0, !dbg !33 + br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34 + +for.body.preheader: ; preds = %if.then + ; CHECK: for.body.preheader: + ; CHECK: %2 = inttoptr i32 %0 to %struct.info* + br label %for.body, !dbg !35 + +for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, 
%for.body ] + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64* + tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29 + %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35 + %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64 %3) #3, !dbg !36 + %inc = add nuw i32 %i.06, 1, !dbg !37 + tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29 + %4 = load i32, i32* @n, align 4, !dbg !30 + %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33 + %cmp = icmp ult i32 %inc, %4, !dbg !33 + br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38 + +if.end.loopexit: ; preds = %for.body + br label %if.end, !dbg !40 + +if.end: ; preds = %if.end.loopexit, %if.then, %entry + ret void, !dbg !40 +} +declare i32 @use(...) local_unnamed_addr #1 + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #2 + +attributes #0 = { nounwind ssp uwtable } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nobuiltin nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!10, !11, !12, !13, !14} +!llvm.ident = !{!15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32) +!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6) +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7) +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64) +!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +!10 = !{i32 2, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"wchar_size", i32 4} +!13 = !{i32 1, !"min_enum_size", i32 4} +!14 = !{i32 7, !"PIC Level", i32 2} +!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"} +!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19) +!17 = !DISubroutineType(types: !18) +!18 = !{null} +!19 = !{!20, !22} +!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21) +!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4) +!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10) +!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7) +!25 = !DILocation(line: 8, column: 7, scope: !24) +!26 = !DILocation(line: 8, column: 7, scope: !16) +!27 = !DILocation(line: 9, column: 20, scope: !23) +!28 = !DILocation(line: 9, column: 13, scope: !23) +!29 = !DILocation(line: 7, column: 16, scope: !16) +!30 = !DILocation(line: 10, column: 21, scope: !31) +!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5) +!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5) +!33 = !DILocation(line: 10, column: 19, scope: !31) +!34 = !DILocation(line: 10, column: 
5, scope: !32) +!35 = !DILocation(line: 11, column: 19, scope: !31) +!36 = !DILocation(line: 11, column: 7, scope: !31) +!37 = !DILocation(line: 10, column: 25, scope: !31) +!38 = distinct !{!38, !34, !39} +!39 = !DILocation(line: 11, column: 23, scope: !32) +!40 = !DILocation(line: 13, column: 1, scope: !16) From 761cb9cc0a2d5422dd22e2a68bbbbc7d374d8247 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 18:02:44 +0000 Subject: [PATCH 056/238] [X86] Initialize Type and Subtype in getHostCPUName to 0. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317341 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Host.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index c167df5a4449d..40ed87bf40df8 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -1057,8 +1057,8 @@ StringRef sys::getHostCPUName() { detectX86FamilyModel(EAX, &Family, &Model); getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2); - unsigned Type; - unsigned Subtype; + unsigned Type = 0; + unsigned Subtype = 0; if (Vendor == SIG_INTEL) { getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features, From aaf1db11f9e3b32446153ce847093dd24fdf8f65 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 18:02:46 +0000 Subject: [PATCH 057/238] [CodeGen] Remove unnecessary semicolons to fix a warning. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317342 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MIRCanonicalizerPass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp index 61f9f7e2c5d8b..09b3a8774cbe6 100644 --- a/lib/CodeGen/MIRCanonicalizerPass.cpp +++ b/lib/CodeGen/MIRCanonicalizerPass.cpp @@ -101,10 +101,10 @@ char MIRCanonicalizer::ID; char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID; INITIALIZE_PASS_BEGIN(MIRCanonicalizer, "mir-canonicalizer", - "Rename Register Operands Canonically", false, false); + "Rename Register Operands Canonically", false, false) INITIALIZE_PASS_END(MIRCanonicalizer, "mir-canonicalizer", - "Rename Register Operands Canonically", false, false); + "Rename Register Operands Canonically", false, false) static std::vector GetRPOList(MachineFunction &MF) { ReversePostOrderTraversal RPOT(&*MF.begin()); From 6a8da4f6feecd43764872f3e52a9db813491d266 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 3 Nov 2017 18:26:36 +0000 Subject: [PATCH 058/238] Revert "Invoke salvageDebugInfo from CodeGenPrepare's SinkCast()" This reverts commit 317342 while investigating bot breakage. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317345 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 1 - lib/Transforms/Utils/Local.cpp | 2 +- .../CodeGenPrepare/salvage-debug-info.ll | 118 ------------------ 3 files changed, 1 insertion(+), 120 deletions(-) delete mode 100644 test/Transforms/CodeGenPrepare/salvage-debug-info.ll diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 73f014704b879..973ddebd987cf 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1171,7 +1171,6 @@ static bool SinkCast(CastInst *CI) { // If we removed all uses, nuke the cast. 
if (CI->use_empty()) { - salvageDebugInfo(*CI); CI->eraseFromParent(); MadeChange = true; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index cb7978f76aa0b..8c643c93ec4dc 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1366,7 +1366,7 @@ void llvm::salvageDebugInfo(Instruction &I) { return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V)); }; - if (isa(&I) || isa(&I)) { + if (isa(&I)) { findDbgValues(DbgValues, &I); for (auto *DVI : DbgValues) { // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value diff --git a/test/Transforms/CodeGenPrepare/salvage-debug-info.ll b/test/Transforms/CodeGenPrepare/salvage-debug-info.ll deleted file mode 100644 index 5509b92a5c130..0000000000000 --- a/test/Transforms/CodeGenPrepare/salvage-debug-info.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: opt -codegenprepare -S %s -o - | FileCheck %s -; typedef struct info { -; unsigned long long size; -; } info_t; -; extern unsigned p; -; extern unsigned n; -; void f() { -; unsigned int i; -; if (p) { -; info_t *info = (info_t *)p; -; for (i = 0; i < n; i++) -; use(info[i].size); -; } -; } -source_filename = "debug.i" -target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128" -target triple = "thumbv7k-apple-ios10.0.0" - -%struct.info = type { i64 } - -@p = external local_unnamed_addr global i32, align 4 -@n = external local_unnamed_addr global i32, align 4 - -; Function Attrs: nounwind ssp uwtable -define void @f() local_unnamed_addr #0 !dbg !16 { -entry: - %0 = load i32, i32* @p, align 4, !dbg !25 - %tobool = icmp eq i32 %0, 0, !dbg !25 - br i1 %tobool, label %if.end, label %if.then, !dbg !26 - -if.then: ; preds = %entry - %1 = inttoptr i32 %0 to %struct.info*, !dbg !27 - tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28 - ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression()) - tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29 - %2 = load i32, i32* @n, align 4, !dbg !30 - %cmp5 = icmp eq i32 %2, 0, !dbg !33 - br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34 - -for.body.preheader: ; preds = %if.then - ; CHECK: for.body.preheader: - ; CHECK: %2 = inttoptr i32 %0 to %struct.info* - br label %for.body, !dbg !35 - -for.body: ; preds = %for.body.preheader, %for.body - %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, %for.body ] - %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64* - tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29 - %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35 - %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64 %3) #3, !dbg !36 - %inc = add nuw i32 %i.06, 1, !dbg !37 - tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29 - %4 = load i32, i32* @n, align 4, !dbg !30 - %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33 - %cmp = icmp ult i32 %inc, %4, !dbg !33 - br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38 - -if.end.loopexit: ; preds = %for.body - br label %if.end, !dbg !40 - -if.end: ; preds = %if.end.loopexit, %if.then, %entry - ret void, !dbg !40 -} -declare i32 @use(...) 
local_unnamed_addr #1 - -; Function Attrs: nounwind readnone speculatable -declare void @llvm.dbg.value(metadata, metadata, metadata) #2 - -attributes #0 = { nounwind ssp uwtable } -attributes #2 = { nounwind readnone speculatable } -attributes #3 = { nobuiltin nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!10, !11, !12, !13, !14} -!llvm.ident = !{!15} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3) -!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562") -!2 = !{} -!3 = !{!4} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32) -!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6) -!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7) -!7 = !{!8} -!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64) -!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) -!10 = !{i32 2, !"Dwarf Version", i32 4} -!11 = !{i32 2, !"Debug Info Version", i32 3} -!12 = !{i32 1, !"wchar_size", i32 4} -!13 = !{i32 1, !"min_enum_size", i32 4} -!14 = !{i32 7, !"PIC Level", i32 2} -!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"} -!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19) -!17 = !DISubroutineType(types: !18) -!18 = !{null} -!19 = !{!20, !22} -!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21) -!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) -!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4) -!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10) -!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7) -!25 = !DILocation(line: 8, column: 7, scope: !24) -!26 = !DILocation(line: 8, column: 7, scope: !16) -!27 = !DILocation(line: 9, column: 20, scope: !23) -!28 = !DILocation(line: 9, column: 13, scope: !23) -!29 = !DILocation(line: 7, column: 16, scope: !16) -!30 = !DILocation(line: 10, column: 21, scope: !31) -!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5) -!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5) -!33 = !DILocation(line: 10, column: 19, scope: !31) -!34 = !DILocation(line: 10, column: 5, scope: !32) -!35 = !DILocation(line: 11, column: 19, scope: !31) -!36 = !DILocation(line: 11, column: 7, scope: !31) -!37 = !DILocation(line: 10, column: 25, scope: !31) -!38 = distinct !{!38, !34, !39} -!39 = !DILocation(line: 11, column: 23, scope: !32) -!40 = !DILocation(line: 13, column: 1, scope: !16) From aba0da108e9400f8cd31655e241d7d6af5f43abe Mon Sep 17 00:00:00 2001 From: Evgeny Stupachenko Date: Fri, 3 Nov 2017 18:50:03 +0000 Subject: [PATCH 059/238] The patch fixes PR35131 Summary: Fix a misprint which led to false CTLZ recognition. 
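To make the misprint concrete: the rejection check was nested under the
dyn_cast's success path, so a shift amount that was not a ConstantInt fell
through as if the check had passed. A minimal sketch of the before/after,
abbreviated from the hunk below:

    // Before: when the dyn_cast returns null, the inner if is skipped
    // entirely, so a non-constant shift amount is never rejected.
    if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
      if (!Shft || !Shft->isOne())
        return false;

    // After: a failed dyn_cast, or a shift amount other than 1, now
    // rejects the CTLZ idiom.
    ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
    if (!Shft || !Shft->isOne())
      return false;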
Reviewers: craig.topper Differential Revision: https://reviews.llvm.org/D39585 From: Evgeny Stupachenko git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317348 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 413fb75d1725d..eb5f3cc47cef8 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1326,9 +1326,9 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, // step 2: detect instructions corresponding to "x.next = x >> 1" if (!DefX || DefX->getOpcode() != Instruction::AShr) return false; - if (ConstantInt *Shft = dyn_cast(DefX->getOperand(1))) - if (!Shft || !Shft->isOne()) - return false; + ConstantInt *Shft = dyn_cast(DefX->getOperand(1)); + if (!Shft || !Shft->isOne()) + return false; VarX = DefX->getOperand(0); // step 3: Check the recurrence of variable X From 8f805056c27cc02d22eb0717d4af9d00e25b9c31 Mon Sep 17 00:00:00 2001 From: Evandro Menezes Date: Fri, 3 Nov 2017 18:56:36 +0000 Subject: [PATCH 060/238] [AArch64] Fix the number of iterations for the Newton series The number of iterations was incorrectly determined for DP FP vector types and the tests were insufficient to flag this issue. Differential revision: https://reviews.llvm.org/D39507 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317349 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- test/CodeGen/AArch64/recp-fastmath.ll | 34 ++++++--- test/CodeGen/AArch64/sqrt-fastmath.ll | 83 +++++++++++++++++----- 3 files changed, 94 insertions(+), 25 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index bec872ae8c099..aabbaf90f68a7 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4981,7 +4981,7 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, // the initial estimate is 2^-8. Thus the number of extra steps to refine // the result for float (23 mantissa bits) is 2 and for double (52 // mantissa bits) is 3. - ExtraSteps = VT == MVT::f64 ? 3 : 2; + ExtraSteps = VT.getScalarType() == MVT::f64 ? 
3 : 2; return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); } diff --git a/test/CodeGen/AArch64/recp-fastmath.ll b/test/CodeGen/AArch64/recp-fastmath.ll index 38e0fb360e492..4776931cf0625 100644 --- a/test/CodeGen/AArch64/recp-fastmath.ll +++ b/test/CodeGen/AArch64/recp-fastmath.ll @@ -18,6 +18,8 @@ define float @frecp1(float %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:s[0-7]]] ; CHECK-NEXT: frecps {{s[0-7](, s[0-7])?}}, [[R]] +; CHECK: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} } define <2 x float> @f2recp0(<2 x float> %x) #0 { @@ -38,6 +40,8 @@ define <2 x float> @f2recp1(<2 x float> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.2s]] ; CHECK-NEXT: frecps {{v[0-7]\.2s(, v[0-7].2s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} } define <4 x float> @f4recp0(<4 x float> %x) #0 { @@ -58,6 +62,8 @@ define <4 x float> @f4recp1(<4 x float> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]] ; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} } define <8 x float> @f8recp0(<8 x float> %x) #0 { @@ -77,10 +83,12 @@ define <8 x float> @f8recp1(<8 x float> %x) #1 { ; CHECK-LABEL: f8recp1: ; CHECK-NEXT: BB#0 -; CHECK-NEXT: frecpe [[RA:v[0-7]\.4s]] -; CHECK-NEXT: frecpe [[RB:v[0-7]\.4s]] -; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RA]] -; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RB]] +; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]] +; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, {{v[0-7]\.4s}} +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} } define double @drecp0(double %x) #0 { @@ -101,6 +109,9 @@ define double @drecp1(double %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:d[0-7]]] ; CHECK-NEXT: frecps {{d[0-7](, d[0-7])?}}, [[R]] +; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} } define <2 x double> @d2recp0(<2 x double> %x) #0 { @@ -121,6 +132,9 @@ define <2 x double> @d2recp1(<2 x double> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]] ; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } define <4 x double> @d4recp0(<4 x double> %x) #0 { @@ -140,10 +154,14 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 { ; CHECK-LABEL: d4recp1: ; CHECK-NEXT: BB#0 -; CHECK-NEXT: frecpe [[RA:v[0-7]\.2d]] -; CHECK-NEXT: frecpe [[RB:v[0-7]\.2d]] -; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RA]] -; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RB]] +; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]] +; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} 
+; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AArch64/sqrt-fastmath.ll b/test/CodeGen/AArch64/sqrt-fastmath.ll index 079562c05819f..4dd0516faf0c6 100644 --- a/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -22,7 +22,9 @@ define float @fsqrt(float %a) #0 { ; CHECK-NEXT: frsqrte [[RA:s[0-7]]] ; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]] -; CHECK: fcmp s0, #0 +; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK: fcmp {{s[0-7]}}, #0 } define <2 x float> @f2sqrt(<2 x float> %a) #0 { @@ -38,7 +40,9 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0 +; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0 } define <4 x float> @f4sqrt(<4 x float> %a) #0 { @@ -54,7 +58,9 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define <8 x float> @f8sqrt(<8 x float> %a) #0 { @@ -69,9 +75,16 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 { ; CHECK-LABEL: f8sqrt: ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] -; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] -; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.4s, v[0-1]\.4s}}, #0 +; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] +; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 +; CHECK: frsqrte [[RC:v[0-7]\.4s]] +; CHECK-NEXT: fmul [[RD:v[0-7]\.4s]], [[RC]], [[RC]] +; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RD]] +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define double @dsqrt(double %a) #0 { @@ -87,7 +100,10 @@ define double @dsqrt(double %a) #0 { ; CHECK-NEXT: frsqrte [[RA:d[0-7]]] ; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]] -; CHECK: fcmp d0, #0 +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: fcmp {{d[0-7]}}, #0 } define <2 x double> @d2sqrt(<2 x double> %a) #0 { @@ -103,7 +119,10 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, 
{{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define <4 x double> @d4sqrt(<4 x double> %a) #0 { @@ -118,9 +137,19 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 { ; CHECK-LABEL: d4sqrt: ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] -; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] -; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2d, v[0-1]\.2d}}, #0 +; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] +; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 +; CHECK: frsqrte [[RC:v[0-7]\.2d]] +; CHECK-NEXT: fmul [[RD:v[0-7]\.2d]], [[RC]], [[RC]] +; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RD]] +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define float @frsqrt(float %a) #0 { @@ -137,6 +166,8 @@ define float @frsqrt(float %a) #0 { ; CHECK-NEXT: frsqrte [[RA:s[0-7]]] ; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]] +; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} ; CHECK-NOT: fcmp {{s[0-7]}}, #0 } @@ -154,7 +185,9 @@ define <2 x float> @f2rsqrt(<2 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0 +; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0 } define <4 x float> @f4rsqrt(<4 x float> %a) #0 { @@ -171,7 +204,9 @@ define <4 x float> @f4rsqrt(<4 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define <8 x float> @f8rsqrt(<8 x float> %a) #0 { @@ -189,7 +224,11 @@ define <8 x float> @f8rsqrt(<8 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define double @drsqrt(double %a) #0 { @@ -206,6 +245,9 @@ define double @drsqrt(double %a) #0 { ; CHECK-NEXT: frsqrte [[RA:d[0-7]]] ; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]] +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frsqrts {{d[0-7]}}, 
{{d[0-7]}}, {{d[0-7]}} ; CHECK-NOT: fcmp d0, #0 } @@ -223,7 +265,10 @@ define <2 x double> @d2rsqrt(<2 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define <4 x double> @d4rsqrt(<4 x double> %a) #0 { @@ -241,7 +286,13 @@ define <4 x double> @d4rsqrt(<4 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } attributes #0 = { nounwind "unsafe-fp-math"="true" } From a8631b87aef95da6cd44dd94508c4f37c26b4867 Mon Sep 17 00:00:00 2001 From: Jake Ehrlich Date: Fri, 3 Nov 2017 18:58:41 +0000 Subject: [PATCH 061/238] [llvm-objcopy] Add support for dwarf fission This change adds support for dwarf fission. Differential Revision: https://reviews.llvm.org/D39207 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317350 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-objcopy/Inputs/dwarf.dwo | Bin 0 -> 3568 bytes test/tools/llvm-objcopy/drawf-fission.test | 43 +++++++++++ tools/llvm-objcopy/Object.h | 1 + tools/llvm-objcopy/llvm-objcopy.cpp | 81 +++++++++++++++++---- 4 files changed, 110 insertions(+), 15 deletions(-) create mode 100644 test/tools/llvm-objcopy/Inputs/dwarf.dwo create mode 100644 test/tools/llvm-objcopy/drawf-fission.test diff --git a/test/tools/llvm-objcopy/Inputs/dwarf.dwo b/test/tools/llvm-objcopy/Inputs/dwarf.dwo new file mode 100644 index 0000000000000000000000000000000000000000..4b6fd5055061e0b2c85d03a5af0a8d85d7b0006c GIT binary patch literal 3568 zcmbVOO>7%g5T5ngZ5pR(ehMiqKbAzX`-Sj$MC-S?EPPysT`|V{40c#d!7H-$3PfUMR zJNR{NVb+?X{AcjXQzDocsqwRGADn(K`S}NB(!(tALH>sC2JO-fFN(WiP&(^WoJwga zaochE>e~9+#n;w0&Q>brRyT0{Q}9BZG`m3+dTuv>Zjh)M58?O>epUQl#P1Y-gIcU9 z$Z-)lpw!ki+p?EXu=9mHuE)Av+pV75t-}JzxeWytN4Twad<0U8jeC$qwmR|s z7F!)m7JgzMIEm|;GwS61d=fp=9UToxTQ5Cn+QPr-kdUZNiO5bJ#5iByNyN-mPoYjk zILDICISJ=ngi}8dG0rh1g{C<5E7PJD=tOM>4K8uBPL|1?Wy;Gk;oOIWo2OGEBa)o| zbPDgo!BNl8hFh;k-VJaf&v%`~+eyG_guR}JufSG6iM1j1-Rjqqn_J#KFmQVWrcMz) zJVPx`B5>9n#Z(HNwb?}6tU$e*8bR-ku+@saBo?r#c7s-E>V7xyM)a^@sy(+s5hDvy zfO_?uAWS^x(krho$BEn69Py%wM}Ss@3+7u*YfIcZIPuM%;!)Ui6PMr>XGQP+|7Vx+ zK(qR2LN|4X4%{H2s{SR+>>+Yrr2&cWU8)W50qaLlFz*B7-|C|1IVPf)kM1S+?-oH^Kl8UV#(Btx=*^VDvzjyGIOe~gfo1j~{*kE|vh*cgV&u;8 z^uuJ5^!*2o5+naFbdX$^^!*e4viAS3-v2#45A8qgz1;t+h{-uA class Object { Object(const object::ELFObjectFile &Obj); virtual ~Object() = default; + const SectionBase *getSectionHeaderStrTab() const { return SectionNames; } void removeSections(std::function ToRemove); virtual size_t totalSize() const = 0; virtual void finalize() = 0; diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp 
index f3e9c7750a64a..52091d3e183e7 100644 --- a/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/tools/llvm-objcopy/llvm-objcopy.cpp @@ -83,12 +83,63 @@ static cl::alias ToRemoveA("R", cl::desc("Alias for remove-section"), cl::aliasopt(ToRemove)); static cl::opt StripSections("strip-sections", cl::desc("Remove all section headers")); +static cl::opt + StripDWO("strip-dwo", cl::desc("remove all DWARF .dwo sections from file")); +static cl::opt ExtractDWO( + "extract-dwo", + cl::desc("remove all sections that are not DWARF .dwo sections from file")); +static cl::opt + SplitDWO("split-dwo", + cl::desc("equivalent to extract-dwo on the input file to " + ", then strip-dwo on the input file"), + cl::value_desc("dwo-file")); using SectionPred = std::function; -void CopyBinary(const ELFObjectFile &ObjFile) { +bool IsDWOSection(const SectionBase &Sec) { + return Sec.Name.endswith(".dwo"); +} + +template +bool OnlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) { + // We can't remove the section header string table. + if (&Sec == Obj.getSectionHeaderStrTab()) + return false; + // Short of keeping the string table we want to keep everything that is a DWO + // section and remove everything else. + return !IsDWOSection(Sec); +} + +template +void WriteObjectFile(const Object &Obj, StringRef File) { std::unique_ptr Buffer; + ErrorOr> BufferOrErr = + FileOutputBuffer::create(File, Obj.totalSize(), + FileOutputBuffer::F_executable); + if (BufferOrErr.getError()) + error("failed to open " + OutputFilename); + else + Buffer = std::move(*BufferOrErr); + Obj.write(*Buffer); + if (auto EC = Buffer->commit()) + reportError(File, EC); +} + +template +void SplitDWOToFile(const ELFObjectFile &ObjFile, StringRef File) { + // Construct a second output file for the DWO sections. 
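+  // The DWO output starts as a full copy of the input object; everything
+  // that is not a .dwo section is then dropped via OnlyKeepDWOPred, which
+  // also preserves the section header string table the writer still needs.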
+ ELFObject DWOFile(ObjFile); + + DWOFile.removeSections([&](const SectionBase &Sec) { + return OnlyKeepDWOPred(DWOFile, Sec); + }); + DWOFile.finalize(); + WriteObjectFile(DWOFile, File); +} + +void CopyBinary(const ELFObjectFile &ObjFile) { std::unique_ptr> Obj; + if (!OutputFormat.empty() && OutputFormat != "binary") error("invalid output format '" + OutputFormat + "'"); if (!OutputFormat.empty() && OutputFormat == "binary") @@ -96,6 +147,9 @@ void CopyBinary(const ELFObjectFile &ObjFile) { else Obj = llvm::make_unique>(ObjFile); + if (!SplitDWO.empty()) + SplitDWOToFile(ObjFile, SplitDWO.getValue()); + SectionPred RemovePred = [](const SectionBase &) { return false; }; if (!ToRemove.empty()) { @@ -105,6 +159,16 @@ void CopyBinary(const ELFObjectFile &ObjFile) { }; } + if (StripDWO || !SplitDWO.empty()) + RemovePred = [RemovePred, &Obj](const SectionBase &Sec) { + return IsDWOSection(Sec) || RemovePred(Sec); + }; + + if (ExtractDWO) + RemovePred = [RemovePred, &Obj](const SectionBase &Sec) { + return OnlyKeepDWOPred(*Obj, Sec) || RemovePred(Sec); + }; + if (StripSections) { RemovePred = [RemovePred](const SectionBase &Sec) { return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0; @@ -113,21 +177,8 @@ void CopyBinary(const ELFObjectFile &ObjFile) { } Obj->removeSections(RemovePred); - Obj->finalize(); - ErrorOr> BufferOrErr = - FileOutputBuffer::create(OutputFilename, Obj->totalSize(), - FileOutputBuffer::F_executable); - if (BufferOrErr.getError()) - error("failed to open " + OutputFilename); - else - Buffer = std::move(*BufferOrErr); - std::error_code EC; - if (EC) - report_fatal_error(EC.message()); - Obj->write(*Buffer); - if (auto EC = Buffer->commit()) - reportError(OutputFilename, EC); + WriteObjectFile(*Obj, OutputFilename.getValue()); } int main(int argc, char **argv) { From 1b91c5e8aad019b3b3649db6c496b74739b4e5d2 Mon Sep 17 00:00:00 2001 From: Jun Bum Lim Date: Fri, 3 Nov 2017 19:01:57 +0000 Subject: [PATCH 062/238] Add CallSiteSplitting pass Summary: This change add a pass which tries to split a call-site to pass more constrained arguments if its argument is predicated in the control flow so that we can expose better context to the later passes (e.g, inliner, jump threading, or IPA-CP based function cloning, etc.). As of now we support two cases : 1) If a call site is dominated by an OR condition and if any of its arguments are predicated on this OR condition, try to split the condition with more constrained arguments. For example, in the code below, we try to split the call site since we can predicate the argument (ptr) based on the OR condition. 
Split from :

    if (!ptr || c)
      callee(ptr);

to :

    if (!ptr)
      callee(null ptr)    // set the known constant value
    else if (c)
      callee(nonnull ptr) // set non-null attribute in the argument

2) We can also split a call-site based on constant incoming values of a PHI.
For example, from :

    BB0:
      %c = icmp eq i32 %i1, %i2
      br i1 %c, label %BB2, label %BB1
    BB1:
      br label %BB2
    BB2:
      %p = phi i32 [ 0, %BB0 ], [ 1, %BB1 ]
      call void @bar(i32 %p)

to :

    BB0:
      %c = icmp eq i32 %i1, %i2
      br i1 %c, label %BB2-split0, label %BB1
    BB1:
      br label %BB2-split1
    BB2-split0:
      call void @bar(i32 0)
      br label %BB2
    BB2-split1:
      call void @bar(i32 1)
      br label %BB2
    BB2:
      %p = phi i32 [ 0, %BB2-split0 ], [ 1, %BB2-split1 ]

Reviewers: davidxl, huntergr, chandlerc, mcrosier, eraman, davide

Reviewed By: davidxl

Subscribers: sdesmalen, ashutosh.nema, fhahn, mssimpso, aemerson, mgorny, mehdi_amini, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D39137

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317351 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/InitializePasses.h               |   1 +
 include/llvm/Transforms/Scalar.h              |   8 +
 .../Transforms/Scalar/CallSiteSplitting.h     |  29 ++
 lib/Passes/PassBuilder.cpp                    |   9 +-
 lib/Passes/PassRegistry.def                   |   1 +
 lib/Transforms/IPO/PassManagerBuilder.cpp     |   6 +
 lib/Transforms/Scalar/CMakeLists.txt          |   1 +
 lib/Transforms/Scalar/CallSiteSplitting.cpp   | 492 ++++++++++++++++++
 lib/Transforms/Scalar/Scalar.cpp              |   1 +
 test/Other/new-pm-defaults.ll                 |   1 +
 test/Other/new-pm-lto-defaults.ll             |   9 +-
 test/Other/new-pm-thinlto-defaults.ll         |   1 +
 .../callsite-split-or-phi.ll                  | 339 ++++++++++++
 .../CallSiteSplitting/callsite-split.ll       | 119 +++++
 14 files changed, 1014 insertions(+), 3 deletions(-)
 create mode 100644 include/llvm/Transforms/Scalar/CallSiteSplitting.h
 create mode 100644 lib/Transforms/Scalar/CallSiteSplitting.cpp
 create mode 100644 test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
 create mode 100644 test/Transforms/CallSiteSplitting/callsite-split.ll

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index b8183d1c8e2f3..9cdb49330ae14 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -80,6 +80,7 @@ void initializeBranchFolderPassPass(PassRegistry&);
 void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
 void initializeBranchRelaxationPass(PassRegistry&);
 void initializeBreakCriticalEdgesPass(PassRegistry&);
+void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
 void initializeCFGPrinterLegacyPassPass(PassRegistry&);
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index a78c897683fcd..0cf1115dc9735 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -73,6 +73,14 @@ FunctionPass *createDeadCodeEliminationPass();
 //
 FunctionPass *createDeadStoreEliminationPass();
+
+//===----------------------------------------------------------------------===//
+//
+// CallSiteSplitting - This pass splits call-sites based on their known
+// argument values.
+FunctionPass *createCallSiteSplittingPass();
+
+
 //===----------------------------------------------------------------------===//
 //
 // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm.
This diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h new file mode 100644 index 0000000000000..5ab951a49f2c8 --- /dev/null +++ b/include/llvm/Transforms/Scalar/CallSiteSplitting.h @@ -0,0 +1,29 @@ +//===- CallSiteSplitting..h - Callsite Splitting ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H +#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H + +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/Compiler.h" +#include + +namespace llvm { + +struct CallSiteSplittingPass : PassInfoMixin { + /// \brief Run the pass over the function. + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 21d95a07125c3..2088ea0cea269 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -89,6 +89,7 @@ #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" #include "llvm/Transforms/Scalar/BDCE.h" +#include "llvm/Transforms/Scalar/CallSiteSplitting.h" #include "llvm/Transforms/Scalar/ConstantHoisting.h" #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" #include "llvm/Transforms/Scalar/DCE.h" @@ -548,6 +549,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EarlyFPM.addPass(SROA()); EarlyFPM.addPass(EarlyCSEPass()); EarlyFPM.addPass(LowerExpectIntrinsicPass()); + if (Level == O3) + EarlyFPM.addPass(CallSiteSplittingPass()); + // In SamplePGO ThinLTO backend, we need instcombine before profile annotation // to convert bitcast to direct calls so that they can be inlined during the // profile annotation prepration step. @@ -920,13 +924,16 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(InferFunctionAttrsPass()); if (Level > 1) { + FunctionPassManager EarlyFPM(DebugLogging); + EarlyFPM.addPass(CallSiteSplittingPass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); + // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. MPM.addPass(PGOIndirectCallPromotion( true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty())); - // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. 
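The pass is also exposed to the new pass manager under the name
"callsite-splitting" (registered in the PassRegistry.def hunk below), so,
assuming a build that contains this patch, it can be exercised in isolation
with an invocation along the lines of:

    opt -passes=callsite-splitting -S input.ll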
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 20d1220ac3301..40b884351fd5b 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -140,6 +140,7 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) +FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 828eb5eee2978..b8ff614f7c8ca 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -467,6 +467,9 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); + if (OptLevel > 2) + MPM.add(createCallSiteSplittingPass()); + MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createCalledValuePropagationPass()); MPM.add(createGlobalOptimizerPass()); // Optimize out global vars @@ -703,6 +706,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInferFunctionAttrsLegacyPass()); if (OptLevel > 1) { + // Split call-site with more constrained arguments. + PM.add(createCallSiteSplittingPass()); + // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d79ae851005d3..6a27fbca8b782 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp + CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp new file mode 100644 index 0000000000000..251e3322359b2 --- /dev/null +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -0,0 +1,492 @@ +//===- CallSiteSplitting.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that tries to split a call-site to pass +// more constrained arguments if its argument is predicated in the control flow +// so that we can expose better context to the later passes (e.g, inliner, jump +// threading, or IPA-CP based function cloning, etc.). +// As of now we support two cases : +// +// 1) If a call site is dominated by an OR condition and if any of its arguments +// are predicated on this OR condition, try to split the condition with more +// constrained arguments. For example, in the code below, we try to split the +// call site since we can predicate the argument(ptr) based on the OR condition. 
+//
+// Split from :
+//     if (!ptr || c)
+//         callee(ptr);
+// to :
+//     if (!ptr)
+//         callee(null)        // set the known constant value
+//     else if (c)
+//         callee(nonnull ptr) // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+//   Header:
+//     %c = icmp eq i32 %i1, %i2
+//     br i1 %c, label %Tail, label %TBB
+//   TBB:
+//     br label %Tail
+//   Tail:
+//     %p = phi i32 [ 0, %Header], [ 1, %TBB]
+//     call void @bar(i32 %p)
+// to
+//   Header:
+//     %c = icmp eq i32 %i1, %i2
+//     br i1 %c, label %Tail-split0, label %TBB
+//   TBB:
+//     br label %Tail-split1
+//   Tail-split0:
+//     call void @bar(i32 0)
+//     br label %Tail
+//   Tail-split1:
+//     call void @bar(i32 1)
+//     br label %Tail
+//   Tail:
+//     %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-sites split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI,
+                                Value *Op) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.addParamAttr(ArgNo, Attribute::NonNull);
+    ++ArgNo;
+  }
+}
+
+static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI,
+                                  Value *Op, Constant *ConstValue) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.setArgument(ArgNo, ConstValue);
+    ++ArgNo;
+  }
+}
+
+static bool createCallSitesOnOrPredicatedArgument(
+    CallSite CS, Instruction *&NewCSTakenFromHeader,
+    Instruction *&NewCSTakenFromNextCond,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *HeaderBB) {
+  assert(BranchInsts.size() <= 2 &&
+         "Unexpected number of blocks in the OR predicated condition");
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *CallSiteBB = Instr->getParent();
+  TerminatorInst *HeaderTI = HeaderBB->getTerminator();
+  bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0);
+
+  for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) {
+    BranchInst *PBI = BranchInsts[I];
+    assert(isa<ICmpInst>(PBI->getCondition()) &&
+           "Unexpected condition in a conditional branch.");
+    ICmpInst *Cmp = cast<ICmpInst>(PBI->getCondition());
+    Value *Arg = Cmp->getOperand(0);
+    assert(isa<Constant>(Cmp->getOperand(1)) &&
+           "Expected op1 to be a constant.");
+    Constant *ConstVal = cast<Constant>(Cmp->getOperand(1));
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+
+    if (PBI->getParent() == HeaderBB) {
+      Instruction *&CallTakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond;
+      Instruction *&CallUntakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader;
+
+      assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+             "Unexpected predicate in an OR condition");
+
+      // Set the constant value for arguments in the call predicated based on
+      // the OR condition.
+      Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ
+static bool createCallSitesOnOrPredicatedArgument(
+    CallSite CS, Instruction *&NewCSTakenFromHeader,
+    Instruction *&NewCSTakenFromNextCond,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *HeaderBB) {
+  assert(BranchInsts.size() <= 2 &&
+         "Unexpected number of blocks in the OR predicated condition");
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *CallSiteBB = Instr->getParent();
+  TerminatorInst *HeaderTI = HeaderBB->getTerminator();
+  bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0);
+
+  for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) {
+    BranchInst *PBI = BranchInsts[I];
+    assert(isa<ICmpInst>(PBI->getCondition()) &&
+           "Unexpected condition in a conditional branch.");
+    ICmpInst *Cmp = cast<ICmpInst>(PBI->getCondition());
+    Value *Arg = Cmp->getOperand(0);
+    assert(isa<Constant>(Cmp->getOperand(1)) &&
+           "Expected op1 to be a constant.");
+    Constant *ConstVal = cast<Constant>(Cmp->getOperand(1));
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+
+    if (PBI->getParent() == HeaderBB) {
+      Instruction *&CallTakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond;
+      Instruction *&CallUntakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader;
+
+      assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+             "Unexpected predicate in an OR condition");
+
+      // Set the constant value for arguments in the call predicated based on
+      // the OR condition.
+      Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ
+                                         ? CallTakenFromHeader
+                                         : CallUntakenFromHeader;
+      setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal);
+
+      // Add the NonNull attribute if compared with the null pointer.
+      if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+        Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ
+                                          ? CallUntakenFromHeader
+                                          : CallTakenFromHeader;
+        addNonNullAttribute(Instr, CallToSetAttr, Arg);
+      }
+      continue;
+    }
+
+    if (Pred == ICmpInst::ICMP_EQ) {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Set the constant value for the call taken from the second block in
+        // the OR condition.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      }
+    } else {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      } else if (Pred == ICmpInst::ICMP_NE) {
+        // Set the constant value for the call in the untaken path from the
+        // header block.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else
+        llvm_unreachable("Unexpected condition");
+    }
+  }
+  return NewCSTakenFromHeader || NewCSTakenFromNextCond;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+  // without too much effort.
+  Instruction *Instr = CS.getInstruction();
+  if (!isa<CallInst>(Instr))
+    return false;
+
+  // Allow splitting a call-site only when there is no instruction before the
+  // call-site in the basic block. Based on this constraint, we only clone the
+  // call instruction, and we do not move a call-site across any other
+  // instruction.
+  BasicBlock *CallSiteBB = Instr->getParent();
+  if (Instr != CallSiteBB->getFirstNonPHI())
+    return false;
+
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  unsigned NumPreds = std::distance(PII, PIE);
+
+  // Allow only one extra call-site. No more than two from one call-site.
+  if (NumPreds != 2)
+    return false;
+
+  // Cannot split an edge from an IndirectBrInst.
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  if (isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+      isa<IndirectBrInst>(Preds[1]->getTerminator()))
+    return false;
+
+  return CallSiteBB->canSplitPredecessors();
+}
+
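+// The structural checks above are deliberately conservative: only a CallInst
+// that heads its block (modulo PHIs) and has exactly two predecessors, neither
+// terminated by an indirectbr, qualifies. In C-like terms (an illustrative
+// caller, not taken from the tests):
+//
+//   if (!p || c)
+//     callee(p);   // candidate: the call starts a block with two predecessors
+//
+//   if (!p || c) {
+//     count++;     // an instruction precedes the call in its block,
+//     callee(p);   // so canSplitCallSite() rejects this call-site
+//   }
+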
+/// Return true if the CS is split into its new predecessors which are directly
+/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
+/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(),
+/// especially for the OR predicated case where PredBB1 will point to the
+/// header, and PredBB2 will point to the second compare block. CallInst1 and
+/// CallInst2 will be the new call-sites placed in the new predecessors split
+/// for PredBB1 and PredBB2, respectively. Therefore, CallInst1 will be the
+/// call-site placed between Header and Tail, and CallInst2 will be the
+/// call-site between TBB and Tail. For example, in the IR below with an OR
+/// condition, the call-site can be split
+///
+/// from :
+///
+///   Header:
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail, %TBB
+///   TBB:
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c2 %Tail, %End
+///   Tail:
+///     %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+///   Header:                          // PredBB1 is Header
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail-split1, %TBB
+///   TBB:                             // PredBB2 is TBB
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c2 %Tail-split2, %End
+///   Tail-split1:
+///     %ca1 = call @callee (i32* null, i32* %b)         // CallInst1
+///     br %Tail
+///   Tail-split2:
+///     %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+///     br %Tail
+///   Tail:
+///     %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+                          Instruction *CallInst1, Instruction *CallInst2) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *TailBB = Instr->getParent();
+  assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site");
+
+  BasicBlock *SplitBlock1 =
+      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+  BasicBlock *SplitBlock2 =
+      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+  if (!CallInst1)
+    CallInst1 = Instr->clone();
+  if (!CallInst2)
+    CallInst2 = Instr->clone();
+
+  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+  CallSite CS1(CallInst1);
+  CallSite CS2(CallInst2);
+
+  // Handle PHIs used as arguments in the call-site.
+  for (auto &PI : *TailBB) {
+    PHINode *PN = dyn_cast<PHINode>(&PI);
+    if (!PN)
+      break;
+    unsigned ArgNo = 0;
+    for (auto &CI : CS.args()) {
+      if (&*CI == PN) {
+        CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+        CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+      }
+      ++ArgNo;
+    }
+  }
+
+  // Replace users of the original call with a PHI merging the split
+  // call-sites.
+  if (Instr->getNumUses()) {
+    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr);
+    PN->addIncoming(CallInst1, SplitBlock1);
+    PN->addIncoming(CallInst2, SplitBlock2);
+    Instr->replaceAllUsesWith(PN);
+  }
+  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
+               << "\n");
+  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
+               << "\n");
+  Instr->eraseFromParent();
+  NumCallSiteSplit++;
+}
+
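+// isCondRelevantToAnyCallArgument below is the filter that ties a compare to
+// the call: a branch condition only counts if the compared value is literally
+// one of the call's arguments and that argument is not already a constant or
+// known non-null. For instance (hypothetical source):
+//
+//   if (x == 42 || c)       // x is never passed to callee: irrelevant
+//     callee(p, q);
+//   if (p == nullptr || c)  // p is argument 0 of callee: relevant
+//     callee(p, q);
+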
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+  assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+  Value *Op0 = Cmp->getOperand(0);
+  unsigned ArgNo = 0;
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+       ++I, ++ArgNo) {
+    // Don't consider constant arguments or arguments that are already known
+    // to be non-null.
+    if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+      continue;
+
+    if (*I == Op0)
+      return true;
+  }
+  return false;
+}
+
+static void findOrCondRelevantToCallArgument(
+    CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *&HeaderBB) {
+  auto *PBI = dyn_cast<BranchInst>(PredBB->getTerminator());
+  if (!PBI || !PBI->isConditional())
+    return;
+
+  if (PBI->getSuccessor(0) == OtherPredBB ||
+      PBI->getSuccessor(1) == OtherPredBB)
+    if (PredBB == OtherPredBB->getSinglePredecessor()) {
+      assert(!HeaderBB && "Expect to find only a single header block");
+      HeaderBB = PredBB;
+    }
+
+  CmpInst::Predicate Pred;
+  Value *Cond = PBI->getCondition();
+  if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+    return;
+  ICmpInst *Cmp = cast<ICmpInst>(Cond);
+  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+    if (isCondRelevantToAnyCallArgument(Cmp, CS))
+      BranchInsts.push_back(PBI);
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *Parent = Instr->getParent();
+  if (Instr != Parent->getFirstNonPHI())
+    return false;
+
+  for (auto &BI : *Parent) {
+    if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+      for (auto &I : CS.args())
+        if (&*I == PN) {
+          assert(PN->getNumIncomingValues() == 2 &&
+                 "Unexpected number of incoming values");
+          if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+            return false;
+          if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+            continue;
+          if (isa<Constant>(PN->getIncomingValue(0)) &&
+              isa<Constant>(PN->getIncomingValue(1)))
+            return true;
+        }
+    }
+    break;
+  }
+  return false;
+}
+
+// Return true if an argument in CS is predicated on an 'or' condition.
+// Create new call-sites with arguments constrained based on the OR condition.
+static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1,
+                                        BasicBlock *PredBB2,
+                                        Instruction *&NewCallTakenFromHeader,
+                                        Instruction *&NewCallTakenFromNextCond,
+                                        BasicBlock *&HeaderBB) {
+  SmallVector<BranchInst *, 4> BranchInsts;
+  findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB);
+  findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB);
+  if (BranchInsts.empty() || !HeaderBB)
+    return false;
+
+  // If an OR condition is detected, try to create call sites with constrained
+  // arguments (e.g., NonNull attribute or constant value).
+  return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader,
+                                               NewCallTakenFromNextCond,
+                                               BranchInsts, HeaderBB);
+}
+
+static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1,
+                                   Instruction *&CallInst2,
+                                   BasicBlock *&PredBB1, BasicBlock *&PredBB2) {
+  BasicBlock *CallSiteBB = CS.getInstruction()->getParent();
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors.");
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  BasicBlock *&HeaderBB = PredBB1;
+  if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2,
+                                   HeaderBB) &&
+      !isPredicatedOnPHI(CS))
+    return false;
+
+  if (!PredBB1)
+    PredBB1 = Preds[0];
+
+  PredBB2 = PredBB1 == Preds[0] ? Preds[1] : Preds[0];
+  return true;
+}
+
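+// findPredicatedArgument thus recognizes two independent sources of
+// constraints: an OR-predicated compare chain, which pre-builds constrained
+// clones, and a two-entry PHI of distinct constants, which needs no compare at
+// all because splitCallSite() substitutes the incoming values directly. In
+// source terms the PHI case is as simple as (illustrative):
+//
+//   int t = cond ? 0 : 1;   // lowers to a two-entry PHI of constants
+//   bar(t);                 // split into bar(0) and bar(1) on the two paths
+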
+static bool tryToSplitCallSite(CallSite CS) {
+  if (!CS.arg_size())
+    return false;
+
+  BasicBlock *PredBB1 = nullptr;
+  BasicBlock *PredBB2 = nullptr;
+  Instruction *CallInst1 = nullptr;
+  Instruction *CallInst2 = nullptr;
+  if (!canSplitCallSite(CS) ||
+      !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
+    assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
+    return false;
+  }
+  splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
+  return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+  bool Changed = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+    BasicBlock &BB = *BI++;
+    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+      Instruction *I = &*II++;
+      CallSite CS(cast<Value>(I));
+      if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+        continue;
+
+      Function *Callee = CS.getCalledFunction();
+      if (!Callee || Callee->isDeclaration())
+        continue;
+      Changed |= tryToSplitCallSite(CS);
+    }
+  }
+  return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+  static char ID;
+  CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+    initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    return doCallSiteSplitting(F, TLI);
+  }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+                      "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+                    "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+  return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+  if (!doCallSiteSplitting(F, TLI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  return PA;
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index c1034ace20685..8a5ae1b87312e 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCELegacyPassPass(Registry);
   initializeBDCELegacyPassPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
+  initializeCallSiteSplittingLegacyPassPass(Registry);
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 816f75310e305..0810a13c14182 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -76,6 +76,7 @@
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll index fc52f70ff4cc4..878198d1447b7 100644 --- a/test/Other/new-pm-lto-defaults.ll +++ b/test/Other/new-pm-lto-defaults.ll @@ -29,9 +29,14 @@ ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module +; CHECK-O2-NEXT: Starting llvm::Function pass manager run. +; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo +; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo +; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: PGOIndirectCallPromotion ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis -; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running pass: IPSCCPPass ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass @@ -42,7 +47,7 @@ ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: AAManager -; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: GlobalSplitPass diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll index 7d40ef3eea2e0..e83f0f8705532 100644 --- a/test/Other/new-pm-thinlto-defaults.ll +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -72,6 +72,7 @@ ; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass +; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll new file mode 100644 index 0000000000000..d1d854d8f457f --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll @@ -0,0 +1,339 @@ +; RUN: opt < %s -callsite-splitting -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +;CHECK-LABEL: @test_eq_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_eq_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_eq_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 
@test_eq_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %End, label %Tail + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_const_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi 
i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[%v,%Header], [%v2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_fisrtnonphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_fisrtnonphi(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + store i32 %v, i32* %a + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_3preds_constphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) { +Header: + br i1 %c1, label %Tail, label %TBB1 + +TBB1: + br i1 %c2, label %Tail, label %TBB2 + +TBB2: + br i1 %c3, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_indirectbr_phi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) { +Header: + %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail] + +TBB: + %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End] + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +define i32 @callee(i32* %a, i32 %v, i32 %p) { +entry: + %c = icmp ne i32* %a, null + br i1 %c, label %BB1, label %BB2 + +BB1: + call void @dummy(i32* %a, i32 %p) + br label %End + +BB2: + call void @dummy2(i32 %v, i32 %p) + br label %End + +End: + ret i32 %p +} + +declare void @dummy(i32*, i32) +declare void @dummy2(i32, i32) diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll new file mode 100644 index 0000000000000..419fa738563c9 --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split.ll @@ -0,0 +1,119 @@ +; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +%struct.bitmap = type { 
i32, %struct.bitmap* } + +;CHECK-LABEL: @caller +;CHECK-LABEL: NextCond: +;CHECK: br {{.*}} label %callee.exit +;CHECK-LABEL: CallSiteBB.predBB1.split: +;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false) +;CHECK-LABEL: callee.exit: +;CHECK: call void @dummy2(%struct.bitmap* %a_elt) + +define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) { +entry: + br label %Top + +Top: + %tobool1 = icmp eq %struct.bitmap* %a_elt, null + br i1 %tobool1, label %CallSiteBB, label %NextCond + +NextCond: + %cmp = icmp ne %struct.bitmap* %b_elt, null + br i1 %cmp, label %CallSiteBB, label %End + +CallSiteBB: + %p = phi i1 [0, %Top], [%c, %NextCond] + call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p) + br label %End + +End: + ret void +} + +define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) { +entry: + %tobool = icmp ne %struct.bitmap* %a_elt, null + %tobool1 = icmp ne %struct.bitmap* %b_elt, null + %or.cond = and i1 %tobool, %tobool1 + br i1 %or.cond, label %Cond, label %Big + +Cond: + %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt + br i1 %cmp, label %Small, label %Big + +Small: + call void @dummy2(%struct.bitmap* %a_elt) + br label %End + +Big: + call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt) + call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt) + call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt) + call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt) + call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt) + call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt) + call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt) + br label %End + +End: + ret void +} + +declare void @dummy2(%struct.bitmap*) +declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*) + + +;CHECK-LABEL: @caller2 +;CHECK-LABEL: CallSiteBB.predBB1.split: +;CHECK: call void @dummy4() +;CHECK-LABEL: CallSiteBB.predBB2.split: +;CHECK: call void @dummy3() +;CheCK-LABEL: CallSiteBB: +;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ] +;CHECK: call void @foo(i1 %phi.call) +define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) { +entry: + br label %Top + +Top: + %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt + br i1 %tobool1, label %CallSiteBB, label %NextCond + +NextCond: + %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt + br i1 %cmp, label %CallSiteBB, label %End + +CallSiteBB: + %phi = phi i1 [0, %Top],[1, %NextCond] + %u = call i1 @callee2(i1 %phi) + call void @foo(i1 %u) + br label %End + +End: + ret void +} + +define i1 @callee2(i1 %b) { +entry: + br i1 %b, label 
%BB1, label %BB2
+
+BB1:
+  call void @dummy3()
+  br label %End
+
+BB2:
+  call void @dummy4()
+  br label %End
+
+End:
+  ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)

From 06dbf5ad00617e8f526e9ed55343fc88fafef6a8 Mon Sep 17 00:00:00 2001
From: Jake Ehrlich
Date: Fri, 3 Nov 2017 19:15:06 +0000
Subject: [PATCH 063/238] Reland "Add support for writing 64-bit symbol tables
 for archives when offsets become too large for 32-bit"

Tests were failing because some bots were running out of address space
and memory. Additionally, the test was very slow. These issues were
solved by changing the test to take advantage of sparse files and
restricting the test to run only on 64-bit systems.

This should fix https://bugs.llvm.org//show_bug.cgi?id=34189

This change makes it so that when writing a K_GNU style archive and a
> 32-bit offset needs to be output, the archive is written in K_GNU64
style instead.

Differential Revision: https://reviews.llvm.org/D36812

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317352 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Object/ArchiveWriter.cpp         | 64 ++++++++++++++++++++++++----
 test/Object/archive-SYM64-write.test | 35 +++++++++++++++
 2 files changed, 90 insertions(+), 9 deletions(-)
 create mode 100644 test/Object/archive-SYM64-write.test

diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index 919e267680205..63f5082c29d97 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -122,11 +122,11 @@ static void printWithSpacePadding(raw_ostream &OS, T Data, unsigned Size) {
 static bool isBSDLike(object::Archive::Kind Kind) {
   switch (Kind) {
   case object::Archive::K_GNU:
+  case object::Archive::K_GNU64:
     return false;
   case object::Archive::K_BSD:
   case object::Archive::K_DARWIN:
     return true;
-  case object::Archive::K_GNU64:
   case object::Archive::K_DARWIN64:
   case object::Archive::K_COFF:
     break;
@@ -134,8 +134,8 @@ static bool isBSDLike(object::Archive::Kind Kind) {
   llvm_unreachable("not supported for writting");
 }
 
-static void print32(raw_ostream &Out, object::Archive::Kind Kind,
-                    uint32_t Val) {
+template <class T>
+static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val) {
   if (isBSDLike(Kind))
     support::endian::Writer<support::little>(Out).write(Val);
   else
@@ -216,6 +216,20 @@ static std::string computeRelativePath(StringRef From, StringRef To) {
   return Relative.str();
 }
 
+static bool is64BitKind(object::Archive::Kind Kind) {
+  switch (Kind) {
+  case object::Archive::K_GNU:
+  case object::Archive::K_BSD:
+  case object::Archive::K_DARWIN:
+  case object::Archive::K_COFF:
+    return false;
+  case object::Archive::K_DARWIN64:
+  case object::Archive::K_GNU64:
+    return true;
+  }
+  llvm_unreachable("not supported for writing");
+}
+
 static void addToStringTable(raw_ostream &Out, StringRef ArcName,
                              const NewArchiveMember &M, bool Thin) {
   StringRef ID = M.Buf->getBufferIdentifier();
@@ -288,6 +302,14 @@ static bool isArchiveSymbol(const object::BasicSymbolRef &S) {
   return true;
 }
 
+static void printNBits(raw_ostream &Out, object::Archive::Kind Kind,
+                       uint64_t Val) {
+  if (is64BitKind(Kind))
+    print<uint64_t>(Out, Kind, Val);
+  else
+    print<uint32_t>(Out, Kind, Val);
+}
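+
+// With these helpers, byte order comes from the archive flavor (GNU variants
+// are big-endian, BSD little-endian) while entry width comes from
+// is64BitKind(), so writeSymbolTable() below can emit both layouts through
+// one code path. For example (a sketch, not code in this file):
+//
+//   printNBits(Out, object::Archive::K_GNU,   Pos); // 4 big-endian bytes
+//   printNBits(Out, object::Archive::K_GNU64, Pos); // 8 big-endian bytes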
+
 static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
                              bool Deterministic, ArrayRef<MemberData> Members,
                              StringRef StringTable) {
@@ -299,9 +321,11 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
     NumSyms += M.Symbols.size();
 
   unsigned Size = 0;
-  Size += 4; // Number of entries
+  Size += is64BitKind(Kind) ? 8 : 4; // Number of entries
   if (isBSDLike(Kind))
     Size += NumSyms * 8; // Table
+  else if (is64BitKind(Kind))
+    Size += NumSyms * 8; // Table
   else
     Size += NumSyms * 4; // Table
   if (isBSDLike(Kind))
@@ -318,27 +342,30 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
   if (isBSDLike(Kind))
     printBSDMemberHeader(Out, Out.tell(), "__.SYMDEF", now(Deterministic), 0, 0,
                          0, Size);
+  else if (is64BitKind(Kind))
+    printGNUSmallMemberHeader(Out, "/SYM64", now(Deterministic), 0, 0, 0, Size);
   else
     printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, Size);
 
   uint64_t Pos = Out.tell() + Size;
 
   if (isBSDLike(Kind))
-    print32(Out, Kind, NumSyms * 8);
+    print<uint32_t>(Out, Kind, NumSyms * 8);
   else
-    print32(Out, Kind, NumSyms);
+    printNBits(Out, Kind, NumSyms);
 
   for (const MemberData &M : Members) {
     for (unsigned StringOffset : M.Symbols) {
       if (isBSDLike(Kind))
-        print32(Out, Kind, StringOffset);
-      print32(Out, Kind, Pos); // member offset
+        print<uint32_t>(Out, Kind, StringOffset);
+      printNBits(Out, Kind, Pos); // member offset
     }
     Pos += M.Header.size() + M.Data.size() + M.Padding.size();
   }
 
   if (isBSDLike(Kind))
-    print32(Out, Kind, StringTable.size()); // byte count of the string table
+    // byte count of the string table
+    print<uint32_t>(Out, Kind, StringTable.size());
 
   Out << StringTable;
 
   while (Pad--)
@@ -442,6 +469,25 @@ Error llvm::writeArchive(StringRef ArcName,
   if (!StringTableBuf.empty())
     Data.insert(Data.begin(), computeStringTable(StringTableBuf));
 
+  // We would like to detect if we need to switch to a 64-bit symbol table.
+  if (WriteSymtab) {
+    uint64_t MaxOffset = 0;
+    uint64_t LastOffset = MaxOffset;
+    for (const auto& M : Data) {
+      // Record the start offset of the member.
+      LastOffset = MaxOffset;
+      // Account for the size of each part associated with the member.
+      MaxOffset += M.Header.size() + M.Data.size() + M.Padding.size();
+      // Assume 4-byte (32-bit) symbol table entries when checking whether
+      // 32-bit offsets suffice.
+      MaxOffset += M.Symbols.size() * 4;
+    }
+    // If LastOffset isn't going to fit in a 32-bit variable, we need to switch
+    // to 64-bit. Note that the file can be larger than 4GB as long as the last
+    // member starts before the 4GB offset.
+    if (LastOffset >> 32 != 0)
+      Kind = object::Archive::K_GNU64;
+  }
+
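+  // The archive-SYM64-write.test below exercises exactly this trigger: two
+  // sparse ~2.2 GiB members mean the last member starts near 4.6 GiB, past
+  // what a 32-bit offset can express, even though no single member crosses
+  // 4 GiB on its own. The arithmetic, reduced to a sketch with hypothetical
+  // member sizes (headers, padding and symbol entries omitted):
+  //
+  //   uint64_t Sizes[] = {2306867200, 2306867200, 1024}; // 2 x 2200M + one .o
+  //   uint64_t MaxOffset = 0, LastOffset = 0;
+  //   for (uint64_t S : Sizes) {
+  //     LastOffset = MaxOffset; // offset at which this member starts
+  //     MaxOffset += S;
+  //   }
+  //   // LastOffset == 4613734400 > UINT32_MAX, so Kind becomes K_GNU64.
+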
   SmallString<128> TmpArchive;
   int TmpArchiveFD;
   if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
diff --git a/test/Object/archive-SYM64-write.test b/test/Object/archive-SYM64-write.test
new file mode 100644
index 0000000000000..d03b54c58b37d
--- /dev/null
+++ b/test/Object/archive-SYM64-write.test
@@ -0,0 +1,35 @@
+# REQUIRES: llvm-64-bits
+# REQUIRES: system-linux
+
+# RUN: yaml2obj %s > %t
+# RUN: dd if=%t of=%t bs=1 count=0 seek=2200M
+# RUN: rm -f %t.lib
+# RUN: cp %t %t2
+# RUN: llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64
+# RUN: llvm-nm --print-armap %t.lib | FileCheck %s
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .data
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000001
+    Content:         "00"
+    Size:            32
+
+# CHECK: Archive map
+# CHECK-NEXT: main in trivial-object-test.elf-x86-64
+
+# CHECK: archive-SYM64-write.test.tmp:
+
+# CHECK: archive-SYM64-write.test.tmp2:
+
+# CHECK: trivial-object-test.elf-x86-64:
+# CHECK-NEXT:                  U SomeOtherFunction
+# CHECK-NEXT: 0000000000000000 T main
+# CHECK-NEXT:                  U puts

From c86c85f907f2513916a2cbd184c8a02d7c64d5a2 Mon Sep 17 00:00:00 2001
From: Jun Bum Lim
Date: Fri, 3 Nov 2017 19:17:11 +0000
Subject: [PATCH 064/238] Revert "Add CallSiteSplitting pass"

Revert due to Buildbot failure.

This reverts commit r317351.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317353 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/InitializePasses.h               |   1 -
 include/llvm/Transforms/Scalar.h              |   8 -
 .../Transforms/Scalar/CallSiteSplitting.h     |  29 --
 lib/Passes/PassBuilder.cpp                    |   9 +-
 lib/Passes/PassRegistry.def                   |   1 -
 lib/Transforms/IPO/PassManagerBuilder.cpp     |   6 -
 lib/Transforms/Scalar/CMakeLists.txt          |   1 -
 lib/Transforms/Scalar/CallSiteSplitting.cpp   | 492 ------------------
 lib/Transforms/Scalar/Scalar.cpp              |   1 -
 test/Other/new-pm-defaults.ll                 |   1 -
 test/Other/new-pm-lto-defaults.ll             |   9 +-
 test/Other/new-pm-thinlto-defaults.ll         |   1 -
 .../callsite-split-or-phi.ll                  | 339 ------------
 .../CallSiteSplitting/callsite-split.ll       | 119 -----
 14 files changed, 3 insertions(+), 1014 deletions(-)
 delete mode 100644 include/llvm/Transforms/Scalar/CallSiteSplitting.h
 delete mode 100644 lib/Transforms/Scalar/CallSiteSplitting.cpp
 delete mode 100644 test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
 delete mode 100644 test/Transforms/CallSiteSplitting/callsite-split.ll

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 9cdb49330ae14..b8183d1c8e2f3 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -80,7 +80,6 @@ void initializeBranchFolderPassPass(PassRegistry&);
 void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
 void initializeBranchRelaxationPass(PassRegistry&);
 void initializeBreakCriticalEdgesPass(PassRegistry&);
-void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
 void initializeCFGPrinterLegacyPassPass(PassRegistry&);
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 0cf1115dc9735..a78c897683fcd 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -73,14 +73,6 @@ FunctionPass *createDeadCodeEliminationPass();
 //
 FunctionPass *createDeadStoreEliminationPass();
 
-
-//===----------------------------------------------------------------------===// -// -// CallSiteSplitting - This pass split call-site based on its known argument -// values. -FunctionPass *createCallSiteSplittingPass(); - - //===----------------------------------------------------------------------===// // // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm. This diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h deleted file mode 100644 index 5ab951a49f2c8..0000000000000 --- a/include/llvm/Transforms/Scalar/CallSiteSplitting.h +++ /dev/null @@ -1,29 +0,0 @@ -//===- CallSiteSplitting..h - Callsite Splitting ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H -#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H - -#include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/Compiler.h" -#include - -namespace llvm { - -struct CallSiteSplittingPass : PassInfoMixin { - /// \brief Run the pass over the function. - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); -}; -} // end namespace llvm - -#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 2088ea0cea269..21d95a07125c3 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -89,7 +89,6 @@ #include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" #include "llvm/Transforms/Scalar/BDCE.h" -#include "llvm/Transforms/Scalar/CallSiteSplitting.h" #include "llvm/Transforms/Scalar/ConstantHoisting.h" #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" #include "llvm/Transforms/Scalar/DCE.h" @@ -549,9 +548,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EarlyFPM.addPass(SROA()); EarlyFPM.addPass(EarlyCSEPass()); EarlyFPM.addPass(LowerExpectIntrinsicPass()); - if (Level == O3) - EarlyFPM.addPass(CallSiteSplittingPass()); - // In SamplePGO ThinLTO backend, we need instcombine before profile annotation // to convert bitcast to direct calls so that they can be inlined during the // profile annotation prepration step. @@ -924,16 +920,13 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(InferFunctionAttrsPass()); if (Level > 1) { - FunctionPassManager EarlyFPM(DebugLogging); - EarlyFPM.addPass(CallSiteSplittingPass()); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); - // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. MPM.addPass(PGOIndirectCallPromotion( true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty())); + // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. 
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 40b884351fd5b..20d1220ac3301 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -140,7 +140,6 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) -FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index b8ff614f7c8ca..828eb5eee2978 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -467,9 +467,6 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); - if (OptLevel > 2) - MPM.add(createCallSiteSplittingPass()); - MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createCalledValuePropagationPass()); MPM.add(createGlobalOptimizerPass()); // Optimize out global vars @@ -706,9 +703,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInferFunctionAttrsLegacyPass()); if (OptLevel > 1) { - // Split call-site with more constrained arguments. - PM.add(createCallSiteSplittingPass()); - // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 6a27fbca8b782..d79ae851005d3 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,7 +2,6 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp - CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp deleted file mode 100644 index 251e3322359b2..0000000000000 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ /dev/null @@ -1,492 +0,0 @@ -//===- CallSiteSplitting.cpp ----------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a transformation that tries to split a call-site to pass -// more constrained arguments if its argument is predicated in the control flow -// so that we can expose better context to the later passes (e.g, inliner, jump -// threading, or IPA-CP based function cloning, etc.). -// As of now we support two cases : -// -// 1) If a call site is dominated by an OR condition and if any of its arguments -// are predicated on this OR condition, try to split the condition with more -// constrained arguments. For example, in the code below, we try to split the -// call site since we can predicate the argument(ptr) based on the OR condition. 
-// -// Split from : -// if (!ptr || c) -// callee(ptr); -// to : -// if (!ptr) -// callee(null) // set the known constant value -// else if (c) -// callee(nonnull ptr) // set non-null attribute in the argument -// -// 2) We can also split a call-site based on constant incoming values of a PHI -// For example, -// from : -// Header: -// %c = icmp eq i32 %i1, %i2 -// br i1 %c, label %Tail, label %TBB -// TBB: -// br label Tail% -// Tail: -// %p = phi i32 [ 0, %Header], [ 1, %TBB] -// call void @bar(i32 %p) -// to -// Header: -// %c = icmp eq i32 %i1, %i2 -// br i1 %c, label %Tail-split0, label %TBB -// TBB: -// br label %Tail-split1 -// Tail-split0: -// call void @bar(i32 0) -// br label %Tail -// Tail-split1: -// call void @bar(i32 1) -// br label %Tail -// Tail: -// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ] -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Scalar/CallSiteSplitting.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" - -using namespace llvm; -using namespace PatternMatch; - -#define DEBUG_TYPE "callsite-splitting" - -STATISTIC(NumCallSiteSplit, "Number of call-site split"); - -static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI, - Value *Op) { - if (!NewCallI) - NewCallI = CallI->clone(); - CallSite CS(NewCallI); - unsigned ArgNo = 0; - for (auto &I : CS.args()) { - if (&*I == Op) - CS.addParamAttr(ArgNo, Attribute::NonNull); - ++ArgNo; - } -} - -static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI, - Value *Op, Constant *ConstValue) { - if (!NewCallI) - NewCallI = CallI->clone(); - CallSite CS(NewCallI); - unsigned ArgNo = 0; - for (auto &I : CS.args()) { - if (&*I == Op) - CS.setArgument(ArgNo, ConstValue); - ++ArgNo; - } -} - -static bool createCallSitesOnOrPredicatedArgument( - CallSite CS, Instruction *&NewCSTakenFromHeader, - Instruction *&NewCSTakenFromNextCond, - SmallVectorImpl &BranchInsts, BasicBlock *HeaderBB) { - assert(BranchInsts.size() <= 2 && - "Unexpected number of blocks in the OR predicated condition"); - Instruction *Instr = CS.getInstruction(); - BasicBlock *CallSiteBB = Instr->getParent(); - TerminatorInst *HeaderTI = HeaderBB->getTerminator(); - bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0); - - for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) { - BranchInst *PBI = BranchInsts[I]; - assert(isa(PBI->getCondition()) && - "Unexpected condition in a conditional branch."); - ICmpInst *Cmp = cast(PBI->getCondition()); - Value *Arg = Cmp->getOperand(0); - assert(isa(Cmp->getOperand(1)) && - "Expected op1 to be a constant."); - Constant *ConstVal = cast(Cmp->getOperand(1)); - CmpInst::Predicate Pred = Cmp->getPredicate(); - - if (PBI->getParent() == HeaderBB) { - Instruction *&CallTakenFromHeader = - IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond; - Instruction *&CallUntakenFromHeader = - IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader; - - assert(Pred == ICmpInst::ICMP_EQ || - Pred == ICmpInst::ICMP_NE && - "Unexpected predicate in an OR condition"); - - // Set the constant value for agruments in the call predicated based on - // the OR condition. - Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ - ? 
CallTakenFromHeader - : CallUntakenFromHeader; - setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal); - - // Add the NonNull attribute if compared with the null pointer. - if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) { - Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ - ? CallUntakenFromHeader - : CallTakenFromHeader; - addNonNullAttribute(Instr, CallToSetAttr, Arg); - } - continue; - } - - if (Pred == ICmpInst::ICMP_EQ) { - if (PBI->getSuccessor(0) == Instr->getParent()) { - // Set the constant value for the call taken from the second block in - // the OR condition. - setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal); - } else { - // Add the NonNull attribute if compared with the null pointer for the - // call taken from the second block in the OR condition. - if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) - addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg); - } - } else { - if (PBI->getSuccessor(0) == Instr->getParent()) { - // Add the NonNull attribute if compared with the null pointer for the - // call taken from the second block in the OR condition. - if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) - addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg); - } else if (Pred == ICmpInst::ICMP_NE) { - // Set the constant value for the call in the untaken path from the - // header block. - setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal); - } else - llvm_unreachable("Unexpected condition"); - } - } - return NewCSTakenFromHeader || NewCSTakenFromNextCond; -} - -static bool canSplitCallSite(CallSite CS) { - // FIXME: As of now we handle only CallInst. InvokeInst could be handled - // without too much effort. - Instruction *Instr = CS.getInstruction(); - if (!isa(Instr)) - return false; - - // Allow splitting a call-site only when there is no instruction before the - // call-site in the basic block. Based on this constraint, we only clone the - // call instruction, and we do not move a call-site across any other - // instruction. - BasicBlock *CallSiteBB = Instr->getParent(); - if (Instr != CallSiteBB->getFirstNonPHI()) - return false; - - pred_iterator PII = pred_begin(CallSiteBB); - pred_iterator PIE = pred_end(CallSiteBB); - unsigned NumPreds = std::distance(PII, PIE); - - // Allow only one extra call-site. No more than two from one call-site. - if (NumPreds != 2) - return false; - - // Cannot split an edge from an IndirectBrInst. - BasicBlock *Preds[2] = {*PII++, *PII}; - if (isa(Preds[0]->getTerminator()) || - isa(Preds[1]->getTerminator())) - return false; - - return CallSiteBB->canSplitPredecessors(); -} - -/// Return true if the CS is split into its new predecessors which are directly -/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2. -/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(), -/// especially for the OR predicated case where PredBB1 will point the header, -/// and PredBB2 will point the the second compare block. CallInst1 and CallInst2 -/// will be the new call-sites placed in the new predecessors split for PredBB1 -/// and PredBB2, repectively. Therefore, CallInst1 will be the call-site placed -/// between Header and Tail, and CallInst2 will be the call-site between TBB and -/// Tail. 
For example, in the IR below with an OR condition, the call-site can -/// be split -/// -/// from : -/// -/// Header: -/// %c = icmp eq i32* %a, null -/// br i1 %c %Tail, %TBB -/// TBB: -/// %c2 = icmp eq i32* %b, null -/// br i1 %c %Tail, %End -/// Tail: -/// %ca = call i1 @callee (i32* %a, i32* %b) -/// -/// to : -/// -/// Header: // PredBB1 is Header -/// %c = icmp eq i32* %a, null -/// br i1 %c %Tail-split1, %TBB -/// TBB: // PredBB2 is TBB -/// %c2 = icmp eq i32* %b, null -/// br i1 %c %Tail-split2, %End -/// Tail-split1: -/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1 -/// br %Tail -/// Tail-split2: -/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2 -/// br %Tail -/// Tail: -/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2] -/// -/// Note that for an OR predicated case, CallInst1 and CallInst2 should be -/// created with more constrained arguments in -/// createCallSitesOnOrPredicatedArgument(). -static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, - Instruction *CallInst1, Instruction *CallInst2) { - Instruction *Instr = CS.getInstruction(); - BasicBlock *TailBB = Instr->getParent(); - assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site"); - - BasicBlock *SplitBlock1 = - SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split"); - BasicBlock *SplitBlock2 = - SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split"); - - assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split."); - - if (!CallInst1) - CallInst1 = Instr->clone(); - if (!CallInst2) - CallInst2 = Instr->clone(); - - CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt()); - CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt()); - - CallSite CS1(CallInst1); - CallSite CS2(CallInst2); - - // Handle PHIs used as arguments in the call-site. - for (auto &PI : *TailBB) { - PHINode *PN = dyn_cast(&PI); - if (!PN) - break; - unsigned ArgNo = 0; - for (auto &CI : CS.args()) { - if (&*CI == PN) { - CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1)); - CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2)); - } - ++ArgNo; - } - } - - // Replace users of the original call with a PHI mering call-sites split. - if (Instr->getNumUses()) { - PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr); - PN->addIncoming(CallInst1, SplitBlock1); - PN->addIncoming(CallInst2, SplitBlock2); - Instr->replaceAllUsesWith(PN); - } - DEBUG(dbgs() << "split call-site : " << *Instr << " into \n"); - DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName() - << "\n"); - DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName() - << "\n"); - Instr->eraseFromParent(); - NumCallSiteSplit++; -} - -static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) { - assert(isa(Cmp->getOperand(1)) && "Expected a constant operand."); - Value *Op0 = Cmp->getOperand(0); - unsigned ArgNo = 0; - for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; - ++I, ++ArgNo) { - // Don't consider constant or arguments that are already known non-null. 
- if (isa(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull)) - continue; - - if (*I == Op0) - return true; - } - return false; -} - -static void findOrCondRelevantToCallArgument( - CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB, - SmallVectorImpl &BranchInsts, BasicBlock *&HeaderBB) { - auto *PBI = dyn_cast(PredBB->getTerminator()); - if (!PBI || !PBI->isConditional()) - return; - - if (PBI->getSuccessor(0) == OtherPredBB || - PBI->getSuccessor(1) == OtherPredBB) - if (PredBB == OtherPredBB->getSinglePredecessor()) { - assert(!HeaderBB && "Expect to find only a single header block"); - HeaderBB = PredBB; - } - - CmpInst::Predicate Pred; - Value *Cond = PBI->getCondition(); - if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant()))) - return; - ICmpInst *Cmp = cast(Cond); - if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) - if (isCondRelevantToAnyCallArgument(Cmp, CS)) - BranchInsts.push_back(PBI); -} - -// Return true if the call-site has an argument which is a PHI with only -// constant incoming values. -static bool isPredicatedOnPHI(CallSite CS) { - Instruction *Instr = CS.getInstruction(); - BasicBlock *Parent = Instr->getParent(); - if (Instr != Parent->getFirstNonPHI()) - return false; - - for (auto &BI : *Parent) { - if (PHINode *PN = dyn_cast(&BI)) { - for (auto &I : CS.args()) - if (&*I == PN) { - assert(PN->getNumIncomingValues() == 2 && - "Unexpected number of incoming values"); - if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1)) - return false; - if (PN->getIncomingValue(0) == PN->getIncomingValue(1)) - continue; - if (isa(PN->getIncomingValue(0)) && - isa(PN->getIncomingValue(1))) - return true; - } - } - break; - } - return false; -} - -// Return true if an agument in CS is predicated on an 'or' condition. -// Create new call-site with arguments constrained based on the OR condition. -static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1, - BasicBlock *PredBB2, - Instruction *&NewCallTakenFromHeader, - Instruction *&NewCallTakenFromNextCond, - BasicBlock *&HeaderBB) { - SmallVector BranchInsts; - findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB); - findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB); - if (BranchInsts.empty() || !HeaderBB) - return false; - - // If an OR condition is detected, try to create call sites with constrained - // arguments (e.g., NonNull attribute or constant value). - return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader, - NewCallTakenFromNextCond, - BranchInsts, HeaderBB); -} - -static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1, - Instruction *&CallInst2, - BasicBlock *&PredBB1, BasicBlock *&PredBB2) { - BasicBlock *CallSiteBB = CS.getInstruction()->getParent(); - pred_iterator PII = pred_begin(CallSiteBB); - pred_iterator PIE = pred_end(CallSiteBB); - assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors."); - BasicBlock *Preds[2] = {*PII++, *PII}; - BasicBlock *&HeaderBB = PredBB1; - if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2, - HeaderBB) && - !isPredicatedOnPHI(CS)) - return false; - - if (!PredBB1) - PredBB1 = Preds[0]; - - PredBB2 = PredBB1 == Preds[0] ? 
-  return true;
-}
-
-static bool tryToSplitCallSite(CallSite CS) {
-  if (!CS.arg_size())
-    return false;
-
-  BasicBlock *PredBB1 = nullptr;
-  BasicBlock *PredBB2 = nullptr;
-  Instruction *CallInst1 = nullptr;
-  Instruction *CallInst2 = nullptr;
-  if (!canSplitCallSite(CS) ||
-      !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
-    assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
-    return false;
-  }
-  splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
-  return true;
-}
-
-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
-  bool Changed = false;
-  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
-    BasicBlock &BB = *BI++;
-    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
-      Instruction *I = &*II++;
-      CallSite CS(cast<Value>(I));
-      if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
-        continue;
-
-      Function *Callee = CS.getCalledFunction();
-      if (!Callee || Callee->isDeclaration())
-        continue;
-      Changed |= tryToSplitCallSite(CS);
-    }
-  }
-  return Changed;
-}
-
-namespace {
-struct CallSiteSplittingLegacyPass : public FunctionPass {
-  static char ID;
-  CallSiteSplittingLegacyPass() : FunctionPass(ID) {
-    initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    FunctionPass::getAnalysisUsage(AU);
-  }
-
-  bool runOnFunction(Function &F) override {
-    if (skipFunction(F))
-      return false;
-
-    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-    return doCallSiteSplitting(F, TLI);
-  }
-};
-} // namespace
-
-char CallSiteSplittingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
-                      "Call-site splitting", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
-                    "Call-site splitting", false, false)
-FunctionPass *llvm::createCallSiteSplittingPass() {
-  return new CallSiteSplittingLegacyPass();
-}
-
-PreservedAnalyses CallSiteSplittingPass::run(Function &F,
-                                             FunctionAnalysisManager &AM) {
-  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
-
-  if (!doCallSiteSplitting(F, TLI))
-    return PreservedAnalyses::all();
-  PreservedAnalyses PA;
-  return PA;
-}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 8a5ae1b87312e..c1034ace20685 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,7 +35,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCELegacyPassPass(Registry);
   initializeBDCELegacyPassPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
-  initializeCallSiteSplittingLegacyPassPass(Registry);
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 0810a13c14182..816f75310e305 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -76,7 +76,6 @@
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
-; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll index 878198d1447b7..fc52f70ff4cc4 100644 --- a/test/Other/new-pm-lto-defaults.ll +++ b/test/Other/new-pm-lto-defaults.ll @@ -29,14 +29,9 @@ ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis -; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> -; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module -; CHECK-O2-NEXT: Starting llvm::Function pass manager run. -; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo -; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo -; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: PGOIndirectCallPromotion ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis +; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running pass: IPSCCPPass ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass @@ -47,7 +42,7 @@ ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: AAManager -; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: GlobalSplitPass diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll index e83f0f8705532..7d40ef3eea2e0 100644 --- a/test/Other/new-pm-thinlto-defaults.ll +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -72,7 +72,6 @@ ; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass -; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll deleted file mode 100644 index d1d854d8f457f..0000000000000 --- a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll +++ /dev/null @@ -1,339 +0,0 @@ -; RUN: opt < %s -callsite-splitting -S | FileCheck %s -; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-linaro-linux-gnueabi" - -;CHECK-LABEL: @test_eq_eq -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_eq_eq(i32* %a, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_eq -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_eq(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_ne -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_ne(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp ne i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_eq_eq_untaken -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define 
i32 @test_eq_eq_untaken(i32* %a, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, null - br i1 %tobool1, label %TBB, label %Tail - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_eq_untaken -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_eq_untaken(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %TBB, label %Tail - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_ne_ne_untaken -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_ne_ne_untaken(i32* %a, i32 %v) { -Header: - %tobool1 = icmp ne i32* %a, null - br i1 %tobool1, label %TBB, label %Tail - -TBB: - %cmp = icmp ne i32 %v, 1 - br i1 %cmp, label %End, label %Tail - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_nonconst_const_phi -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, %b - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_nonconst_nonconst_phi -;CHECK-LABEL: Tail.predBB1.split: -;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) -;CHECK-LABEL: Tail.predBB2.split: -;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2) -;CHECK-LABEL: Tail -;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] -;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] -;CHECK: ret i32 %[[MERGED]] -define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) { -Header: - %tobool1 = icmp eq i32* %a, %b - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, %v2 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi 
i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) { -Header: - %tobool1 = icmp eq i32* %a, %b - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, %v2 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[%v,%Header], [%v2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_fisrtnonphi -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_fisrtnonphi(i32* %a, i32 %v) { -Header: - %tobool1 = icmp eq i32* %a, null - br i1 %tobool1, label %Tail, label %TBB - -TBB: - %cmp = icmp eq i32 %v, 1 - br i1 %cmp, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - store i32 %v, i32* %a - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_3preds_constphi -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) { -Header: - br i1 %c1, label %Tail, label %TBB1 - -TBB1: - br i1 %c2, label %Tail, label %TBB2 - -TBB2: - br i1 %c3, label %Tail, label %End - -Tail: - %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -;CHECK-LABEL: @test_indirectbr_phi -;CHECK-NOT: Tail.predBB1.split: -;CHECK-NOT: Tail.predBB2.split: -;CHECK-LABEL: Tail: -;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) -;CHECK: ret i32 %r -define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) { -Header: - %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address - indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail] - -TBB: - %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address - indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End] - -Tail: - %p = phi i32[1,%Header], [2, %TBB] - %r = call i32 @callee(i32* %a, i32 %v, i32 %p) - ret i32 %r - -End: - ret i32 %v -} - -define i32 @callee(i32* %a, i32 %v, i32 %p) { -entry: - %c = icmp ne i32* %a, null - br i1 %c, label %BB1, label %BB2 - -BB1: - call void @dummy(i32* %a, i32 %p) - br label %End - -BB2: - call void @dummy2(i32 %v, i32 %p) - br label %End - -End: - ret i32 %p -} - -declare void @dummy(i32*, i32) -declare void @dummy2(i32, i32) diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll deleted file mode 100644 index 419fa738563c9..0000000000000 --- a/test/Transforms/CallSiteSplitting/callsite-split.ll +++ /dev/null @@ -1,119 +0,0 @@ -; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s -; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-linaro-linux-gnueabi" - -%struct.bitmap = type 
{ i32, %struct.bitmap* }
-
-;CHECK-LABEL: @caller
-;CHECK-LABEL: NextCond:
-;CHECK: br {{.*}} label %callee.exit
-;CHECK-LABEL: CallSiteBB.predBB1.split:
-;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
-;CHECK-LABEL: callee.exit:
-;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
-
-define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
-entry:
-  br label %Top
-
-Top:
-  %tobool1 = icmp eq %struct.bitmap* %a_elt, null
-  br i1 %tobool1, label %CallSiteBB, label %NextCond
-
-NextCond:
-  %cmp = icmp ne %struct.bitmap* %b_elt, null
-  br i1 %cmp, label %CallSiteBB, label %End
-
-CallSiteBB:
-  %p = phi i1 [0, %Top], [%c, %NextCond]
-  call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
-  br label %End
-
-End:
-  ret void
-}
-
-define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
-entry:
-  %tobool = icmp ne %struct.bitmap* %a_elt, null
-  %tobool1 = icmp ne %struct.bitmap* %b_elt, null
-  %or.cond = and i1 %tobool, %tobool1
-  br i1 %or.cond, label %Cond, label %Big
-
-Cond:
-  %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt
-  br i1 %cmp, label %Small, label %Big
-
-Small:
-  call void @dummy2(%struct.bitmap* %a_elt)
-  br label %End
-
-Big:
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
-  br label %End
-
-End:
-  ret void
-}
-
-declare void @dummy2(%struct.bitmap*)
-declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
-
-
-;CHECK-LABEL: @caller2
-;CHECK-LABEL: CallSiteBB.predBB1.split:
-;CHECK: call void @dummy4()
-;CHECK-LABEL: CallSiteBB.predBB2.split:
-;CHECK: call void @dummy3()
-;CHECK-LABEL: CallSiteBB:
-;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
-;CHECK: call void @foo(i1 %phi.call)
-define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
-entry:
-  br label %Top
-
-Top:
-  %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
-  br i1 %tobool1, label %CallSiteBB, label %NextCond
-
-NextCond:
-  %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
-  br i1 %cmp, label %CallSiteBB, label %End
-
-CallSiteBB:
-  %phi = phi i1 [0, %Top],[1, %NextCond]
-  %u = call i1 @callee2(i1 %phi)
-  call void @foo(i1 %u)
-  br label %End
-
-End:
-  ret void
-}
-
-define i1 @callee2(i1 %b) {
-entry:
-  br i1 %b,
label %BB1, label %BB2 - -BB1: - call void @dummy3() - br label %End - -BB2: - call void @dummy4() - br label %End - -End: - ret i1 %b -} - -declare void @dummy3() -declare void @dummy4() -declare void @foo(i1) From b24883f402ddb788a35189d091f5fb5286dc74f7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 19:37:41 +0000 Subject: [PATCH 065/238] [X86] Promote athlon, athlon-xp, k8, and k8-sse3 to types instead of subtypes in getHostCPUName. NFCI This removes the athlon type and simplifies the string decoding. We only really need these type/subtype breaks where we need to match libgcc/compiler-rt and these CPUs aren't part of that. I'm looking into moving some of this information to a .def file to share with clang's __builtin_cpu_is handling. And while these CPUs aren't part of that the less lines I have to deal with in the .def file the better. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317354 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Host.cpp | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 40ed87bf40df8..5b2a0f1d0c2c2 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -355,7 +355,10 @@ enum ProcessorTypes { INTEL_PRESCOTT, AMD_i486, AMDPENTIUM, - AMDATHLON, + AMD_ATHLON, + AMD_ATHLON_XP, + AMD_K8, + AMD_K8SSE3, INTEL_GOLDMONT, CPU_TYPE_MAX }; @@ -384,10 +387,6 @@ enum ProcessorSubtypes { AMDPENTIUM_K62, AMDPENTIUM_K63, AMDPENTIUM_GEODE, - AMDATHLON_CLASSIC, - AMDATHLON_XP, - AMDATHLON_K8, - AMDATHLON_K8SSE3, CPU_SUBTYPE_MAX }; @@ -864,20 +863,18 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, } break; case 6: - *Type = AMDATHLON; if (Features & (1 << FEATURE_SSE)) { - *Subtype = AMDATHLON_XP; + *Type = AMD_ATHLON_XP; break; // "athlon-xp" } - *Subtype = AMDATHLON_CLASSIC; + *Type = AMD_ATHLON; break; // "athlon" case 15: - *Type = AMDATHLON; if (Features & (1 << FEATURE_SSE3)) { - *Subtype = AMDATHLON_K8SSE3; + *Type = AMD_K8SSE3; break; // "k8-sse3" } - *Subtype = AMDATHLON_K8; + *Type = AMD_K8; break; // "k8" case 16: *Type = AMDFAM10H; // "amdfam10" @@ -1149,19 +1146,14 @@ StringRef sys::getHostCPUName() { default: return "pentium"; } - case AMDATHLON: - switch (Subtype) { - case AMDATHLON_CLASSIC: - return "athlon"; - case AMDATHLON_XP: - return "athlon-xp"; - case AMDATHLON_K8: - return "k8"; - case AMDATHLON_K8SSE3: - return "k8-sse3"; - default: - llvm_unreachable("Unexpected subtype!"); - } + case AMD_ATHLON: + return "athlon"; + case AMD_ATHLON_XP: + return "athlon-xp"; + case AMD_K8: + return "k8"; + case AMD_K8SSE3: + return "k8-sse3"; case AMDFAM10H: return "amdfam10"; case AMD_BTVER1: From b72a3a9da434080da25914c9eed94416b1adee40 Mon Sep 17 00:00:00 2001 From: Mitch Phillips Date: Fri, 3 Nov 2017 20:00:05 +0000 Subject: [PATCH 066/238] [cfi-verify] Add an interesting unit test where undef search length changes result. Add an interesting unit test, found by changing --search-length-undef from the default. 
Program handles it correctly but good for ensuring correctness on further changes :)

Reviewers: pcc

Subscribers: mgorny, llvm-commits, kcc, vlad.tsyrklevich

Differential Revision: https://reviews.llvm.org/D38658

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317355 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../tools/llvm-cfi-verify/FileAnalysis.cpp    | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
index 0df468e8995c4..a3da1fc3f56da 100644
--- a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
+++ b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
@@ -650,7 +650,60 @@ TEST_F(BasicFileAnalysisTest, CFIProtectionComplexExample) {
           0x0f, 0x0b, // 22: ud2
       },
       0xDEADBEEF);
+  uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+  SearchLengthForUndef = 5;
   EXPECT_FALSE(Analysis.isIndirectInstructionCFIProtected(0xDEADBEEF + 9));
+  SearchLengthForUndef = PrevSearchLengthForUndef;
+}
+
+TEST_F(BasicFileAnalysisTest, UndefSearchLengthOneTest) {
+  Analysis.parseSectionContents(
+      {
+          0x77, 0x0d,                   // 0x688118: ja 0x688127 [+12]
+          0x48, 0x89, 0xdf,             // 0x68811a: mov %rbx, %rdi
+          0xff, 0xd0,                   // 0x68811d: callq *%rax
+          0x48, 0x89, 0xdf,             // 0x68811f: mov %rbx, %rdi
+          0xe8, 0x09, 0x00, 0x00, 0x00, // 0x688122: callq 0x688130
+          0x0f, 0x0b,                   // 0x688127: ud2
+      },
+      0x688118);
+  uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+  SearchLengthForUndef = 1;
+  EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x68811d));
+  SearchLengthForUndef = PrevSearchLengthForUndef;
+}
+
+TEST_F(BasicFileAnalysisTest, UndefSearchLengthOneTestFarAway) {
+  Analysis.parseSectionContents(
+      {
+          0x74, 0x73,                         // 0x7759eb: je 0x775a60
+          0xe9, 0x1c, 0x04, 0x00, 0x00, 0x00, // 0x7759ed: jmpq 0x775e0e
+      },
+      0x7759eb);
+
+  Analysis.parseSectionContents(
+      {
+          0x0f, 0x85, 0xb2, 0x03, 0x00, 0x00, // 0x775a56: jne 0x775e0e
+          0x48, 0x83, 0xc3, 0xf4,             // 0x775a5c: add $0xfffffffffffffff4,%rbx
+          0x48, 0x8b, 0x7c, 0x24, 0x10,       // 0x775a60: mov 0x10(%rsp),%rdi
+          0x48, 0x89, 0xde,                   // 0x775a65: mov %rbx,%rsi
+          0xff, 0xd1,                         // 0x775a68: callq *%rcx
+      },
+      0x775a56);
+
+  Analysis.parseSectionContents(
+      {
+          0x0f, 0x0b, // 0x775e0e: ud2
+      },
+      0x775e0e);
+  uint64_t PrevSearchLengthForUndef = SearchLengthForUndef;
+  SearchLengthForUndef = 1;
+  EXPECT_FALSE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+  SearchLengthForUndef = 2;
+  EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+  SearchLengthForUndef = 3;
+  EXPECT_TRUE(Analysis.isIndirectInstructionCFIProtected(0x775a68));
+  SearchLengthForUndef = PrevSearchLengthForUndef;
 }
 
 } // anonymous namespace
From bdc30c02fb2f7dceab4499c871fc00aa9b7543b9 Mon Sep 17 00:00:00 2001
From: Aaron Ballman
Date: Fri, 3 Nov 2017 20:01:25 +0000
Subject: [PATCH 067/238] Add llvm::for_each as a range-based extension to
 <STLExtras.h> and make use of it in some cases where it is a more clear
 alternative to std::for_each.
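
For illustration, a minimal sketch of the intended before/after usage (this
example is not part of the patch itself; the vector and callback here are
made up):

    std::vector<int> Values = {1, 2, 3};
    // Before: begin/end iterators have to be spelled out explicitly.
    std::for_each(Values.begin(), Values.end(), [](int &V) { V *= 2; });
    // After: the range-based wrapper deduces begin/end from the range.
    llvm::for_each(Values, [](int &V) { V *= 2; });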
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317356 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/STLExtras.h                  |  7 ++++++
 lib/LTO/LTOCodeGenerator.cpp                  | 20 +++++++--------
 .../Hexagon/HexagonVectorLoopCarriedReuse.cpp | 15 ++++++-----
 lib/Transforms/Utils/SplitModule.cpp          | 18 ++++++-------
 tools/llvm-cxxdump/llvm-cxxdump.cpp           | 15 ++++++-----
 tools/llvm-mcmarkup/llvm-mcmarkup.cpp         | 13 +++++-----
 tools/llvm-nm/llvm-nm.cpp                     |  3 +--
 tools/llvm-objdump/llvm-objdump.cpp           | 15 ++++++-----
 tools/llvm-pdbutil/llvm-pdbutil.cpp           | 25 ++++++++-----------
 tools/llvm-readobj/llvm-readobj.cpp           | 15 ++++++-----
 tools/llvm-size/llvm-size.cpp                 | 15 ++++++-----
 unittests/ADT/STLExtrasTest.cpp               | 20 ++++++++++-----
 12 files changed, 92 insertions(+), 89 deletions(-)

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 3ec9dfe5de0a8..c42d976f46784 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -813,6 +813,13 @@ void DeleteContainerSeconds(Container &C) {
   C.clear();
 }
 
+/// Provide wrappers to std::for_each which take ranges instead of having to
+/// pass begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
+  return std::for_each(std::begin(Range), std::end(Range), P);
+}
+
 /// Provide wrappers to std::all_of which take ranges instead of having to pass
 /// begin/end explicitly.
 template <typename R, typename UnaryPredicate>
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 9759c0c6c1d98..87867c54fad20 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -469,17 +469,15 @@ void LTOCodeGenerator::restoreLinkageForExternals() {
     if (I == ExternalSymbols.end())
       return;
 
-    GV.setLinkage(I->second);
-  };
-
-  std::for_each(MergedModule->begin(), MergedModule->end(), externalize);
-  std::for_each(MergedModule->global_begin(), MergedModule->global_end(),
-                externalize);
-  std::for_each(MergedModule->alias_begin(), MergedModule->alias_end(),
-                externalize);
-}
-
-void LTOCodeGenerator::verifyMergedModuleOnce() {
+    GV.setLinkage(I->second);
+  };
+
+  llvm::for_each(MergedModule->functions(), externalize);
+  llvm::for_each(MergedModule->globals(), externalize);
+  llvm::for_each(MergedModule->aliases(), externalize);
+}
+
+void LTOCodeGenerator::verifyMergedModuleOnce() {
   // Only run on the first call.
  if (HasVerifiedInput)
    return;
diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index a0fdc70e141a5..52e5dcd46388b 100644
--- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -548,14 +548,13 @@ bool HexagonVectorLoopCarriedReuse::doVLCR() {
     findValueToReuse();
    if (ReuseCandidate.isDefined()) {
       reuseValue();
-      Changed = true;
-      Continue = true;
-    }
-    std::for_each(Dependences.begin(), Dependences.end(),
-                  std::default_delete<DepChain>());
-  } while (Continue);
-  return Changed;
-}
+      Changed = true;
+      Continue = true;
+    }
+    llvm::for_each(Dependences, std::default_delete<DepChain>());
+  } while (Continue);
+  return Changed;
+}
 
 void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I,
                                                         DepChain &D) {
diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp
index 07157069518ad..934a1bd73c248 100644
--- a/lib/Transforms/Utils/SplitModule.cpp
+++ b/lib/Transforms/Utils/SplitModule.cpp
@@ -141,15 +141,15 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
     }
 
     if (GV.hasLocalLinkage())
-      addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
-  };
-
-  std::for_each(M->begin(), M->end(), recordGVSet);
-  std::for_each(M->global_begin(), M->global_end(), recordGVSet);
-  std::for_each(M->alias_begin(), M->alias_end(), recordGVSet);
-
-  // Assigned all GVs to merged clusters while balancing number of objects in
-  // each.
+      addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+  };
+
+  llvm::for_each(M->functions(), recordGVSet);
+  llvm::for_each(M->globals(), recordGVSet);
+  llvm::for_each(M->aliases(), recordGVSet);
+
+  // Assigned all GVs to merged clusters while balancing number of objects in
+  // each.
   auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
                             const std::pair<unsigned, unsigned> &b) {
     if (a.second || b.second)
diff --git a/tools/llvm-cxxdump/llvm-cxxdump.cpp b/tools/llvm-cxxdump/llvm-cxxdump.cpp
index b10759ad05c03..69b1a8ef2099a 100644
--- a/tools/llvm-cxxdump/llvm-cxxdump.cpp
+++ b/tools/llvm-cxxdump/llvm-cxxdump.cpp
@@ -546,11 +546,10 @@ int main(int argc, const char *argv[]) {
   cl::ParseCommandLineOptions(argc, argv, "LLVM C++ ABI Data Dumper\n");
 
   // Default to stdin if no filename is specified.
-  if (opts::InputFilenames.size() == 0)
-    opts::InputFilenames.push_back("-");
-
-  std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
-                dumpInput);
-
-  return EXIT_SUCCESS;
-}
+  if (opts::InputFilenames.size() == 0)
+    opts::InputFilenames.push_back("-");
+
+  llvm::for_each(opts::InputFilenames, dumpInput);
+
+  return EXIT_SUCCESS;
+}
diff --git a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
index 0be3c715eee4e..db57a6bdaa82f 100644
--- a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
+++ b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
@@ -217,10 +217,9 @@ int main(int argc, char **argv) {
   ToolName = argv[0];
 
   // If no input files specified, read from stdin.
- if (InputFilenames.size() == 0) - InputFilenames.push_back("-"); - - std::for_each(InputFilenames.begin(), InputFilenames.end(), - parseMCMarkup); - return 0; -} + if (InputFilenames.size() == 0) + InputFilenames.push_back("-"); + + llvm::for_each(InputFilenames, parseMCMarkup); + return 0; +} diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 852043002846a..d2909644628c7 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -1977,8 +1977,7 @@ int main(int argc, char **argv) { if (NoDyldInfo && (AddDyldInfo || DyldInfoOnly)) error("-no-dyldinfo can't be used with -add-dyldinfo or -dyldinfo-only"); - std::for_each(InputFilenames.begin(), InputFilenames.end(), - dumpSymbolNamesFromFile); + llvm::for_each(InputFilenames, dumpSymbolNamesFromFile); if (HadError) return 1; diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 09396466c40e5..d80f1cb049da6 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -2183,11 +2183,10 @@ int main(int argc, char **argv) { && !PrintFaultMaps && DwarfDumpType == DIDT_Null) { cl::PrintHelpMessage(); - return 2; - } - - std::for_each(InputFilenames.begin(), InputFilenames.end(), - DumpInput); - - return EXIT_SUCCESS; -} + return 2; + } + + llvm::for_each(InputFilenames, DumpInput); + + return EXIT_SUCCESS; +} diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp index 8b2d5ce179f44..bee9f182e3fb5 100644 --- a/tools/llvm-pdbutil/llvm-pdbutil.cpp +++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -1199,20 +1199,17 @@ int main(int argc_, const char *argv_[]) { opts::pretty::ExcludeCompilands.push_back( "f:\\\\binaries\\\\Intermediate\\\\vctools\\\\crt_bld"); opts::pretty::ExcludeCompilands.push_back("f:\\\\dd\\\\vctools\\\\crt"); - opts::pretty::ExcludeCompilands.push_back( - "d:\\\\th.obj.x86fre\\\\minkernel"); - } - std::for_each(opts::pretty::InputFilenames.begin(), - opts::pretty::InputFilenames.end(), dumpPretty); - } else if (opts::DumpSubcommand) { - std::for_each(opts::dump::InputFilenames.begin(), - opts::dump::InputFilenames.end(), dumpRaw); - } else if (opts::BytesSubcommand) { - std::for_each(opts::bytes::InputFilenames.begin(), - opts::bytes::InputFilenames.end(), dumpBytes); - } else if (opts::DiffSubcommand) { - for (StringRef S : opts::diff::RawModiEquivalences) { - StringRef Left; + opts::pretty::ExcludeCompilands.push_back( + "d:\\\\th.obj.x86fre\\\\minkernel"); + } + llvm::for_each(opts::pretty::InputFilenames, dumpPretty); + } else if (opts::DumpSubcommand) { + llvm::for_each(opts::dump::InputFilenames, dumpRaw); + } else if (opts::BytesSubcommand) { + llvm::for_each(opts::bytes::InputFilenames, dumpBytes); + } else if (opts::DiffSubcommand) { + for (StringRef S : opts::diff::RawModiEquivalences) { + StringRef Left; StringRef Right; std::tie(Left, Right) = S.split(','); uint32_t X, Y; diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 05b7c800cc1b9..851988110ea71 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -566,14 +566,13 @@ int main(int argc, const char *argv[]) { cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n"); // Default to stdin if no filename is specified. 
-  if (opts::InputFilenames.size() == 0)
-    opts::InputFilenames.push_back("-");
-
-  std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
-                dumpInput);
-
-  if (opts::CodeViewMergedTypes) {
-    ScopedPrinter W(outs());
+  if (opts::InputFilenames.size() == 0)
+    opts::InputFilenames.push_back("-");
+
+  llvm::for_each(opts::InputFilenames, dumpInput);
+
+  if (opts::CodeViewMergedTypes) {
+    ScopedPrinter W(outs());
     dumpCodeViewMergedTypes(W, CVTypes.IDTable, CVTypes.TypeTable);
   }
 
diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index bdb118a264e8c..7a8e744d2e6e9 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -880,14 +880,13 @@ int main(int argc, char **argv) {
  }
 
   if (InputFilenames.size() == 0)
-    InputFilenames.push_back("a.out");
-
-  MoreThanOneFile = InputFilenames.size() > 1;
-  std::for_each(InputFilenames.begin(), InputFilenames.end(),
-                printFileSectionSizes);
-  if (OutputFormat == berkeley && TotalSizes)
-    printBerkelyTotals();
-
+    InputFilenames.push_back("a.out");
+
+  MoreThanOneFile = InputFilenames.size() > 1;
+  llvm::for_each(InputFilenames, printFileSectionSizes);
+  if (OutputFormat == berkeley && TotalSizes)
+    printBerkelyTotals();
+
   if (HadError)
     return 1;
 }
diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index 2e6eb6f413f6c..68cd9f5d2c8b4 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -252,12 +252,20 @@ TEST(STLExtrasTest, CountAdaptor) {
   EXPECT_EQ(3, count(v, 1));
   EXPECT_EQ(2, count(v, 2));
   EXPECT_EQ(1, count(v, 3));
-  EXPECT_EQ(1, count(v, 4));
-}
-
-TEST(STLExtrasTest, ToVector) {
-  std::vector<char> v = {'a', 'b', 'c'};
-  auto Enumerated = to_vector<4>(enumerate(v));
+  EXPECT_EQ(1, count(v, 4));
+}
+
+TEST(STLExtrasTest, for_each) {
+  std::vector<int> v{ 0, 1, 2, 3, 4 };
+  int count = 0;
+
+  llvm::for_each(v, [&count](int) { ++count; });
+  EXPECT_EQ(5, count);
+}
+
+TEST(STLExtrasTest, ToVector) {
+  std::vector<char> v = {'a', 'b', 'c'};
+  auto Enumerated = to_vector<4>(enumerate(v));
   ASSERT_EQ(3u, Enumerated.size());
   for (size_t I = 0; I < v.size(); ++I) {
     EXPECT_EQ(I, Enumerated[I].index());
From 2619256bd715b06c947e862f5f53511795dae1a3 Mon Sep 17 00:00:00 2001
From: Aaron Ballman
Date: Fri, 3 Nov 2017 20:05:51 +0000
Subject: [PATCH 068/238] Correcting some CRLFs that snuck in with my previous
 commit; NFC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317357 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/STLExtras.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index c42d976f46784..1be5bf91385b2 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -813,12 +813,12 @@ void DeleteContainerSeconds(Container &C) {
   C.clear();
 }
 
-/// Provide wrappers to std::for_each which take ranges instead of having to
-/// pass begin/end explicitly.
-template <typename R, typename UnaryPredicate>
-UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
-  return std::for_each(std::begin(Range), std::end(Range), P);
-}
+/// Provide wrappers to std::for_each which take ranges instead of having to
+/// pass begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
+  return std::for_each(std::begin(Range), std::end(Range), P);
+}
 
 /// Provide wrappers to std::all_of which take ranges instead of having to pass
 /// begin/end explicitly.
From af481e4f940025c84ce601e68fdedbc1bd22cdd2 Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Fri, 3 Nov 2017 20:09:10 +0000 Subject: [PATCH 069/238] [llvm-ar] Support an options string that start with a dash Some projects call $AR like "$AR -crs output input1 input2". Differential Revision: https://reviews.llvm.org/D39538 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317358 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-ar/default-add.test | 3 ++- tools/llvm-ar/llvm-ar.cpp | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/test/tools/llvm-ar/default-add.test b/test/tools/llvm-ar/default-add.test index 88719e4efce3c..68e41c249100c 100644 --- a/test/tools/llvm-ar/default-add.test +++ b/test/tools/llvm-ar/default-add.test @@ -4,7 +4,8 @@ RUN: yaml2obj %S/Inputs/coff.yaml -o %t-coff.o RUN: rm -f %t.ar RUN: llvm-ar crs %t.ar %t-macho.o RUN: grep -q __.SYMDEF %t.ar -RUN: llvm-ar crs %t.ar %t-coff.o +Test that an option string prefixed by a dash works. +RUN: llvm-ar -crs %t.ar %t-coff.o RUN: grep -q __.SYMDEF %t.ar RUN: rm -f %t.ar diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp index 576265cfe598f..8c19f6b6af877 100644 --- a/tools/llvm-ar/llvm-ar.cpp +++ b/tools/llvm-ar/llvm-ar.cpp @@ -127,6 +127,8 @@ static cl::extrahelp MoreHelp( " [v] - be verbose about actions taken\n" ); +static const char OptionChars[] = "dmpqrtxabiosSTucv"; + // This enumeration delineates the kinds of operations on an archive // that are permitted. enum ArchiveOperation { @@ -864,6 +866,24 @@ int main(int argc, char **argv) { Stem.find("lib") != StringRef::npos) return libDriverMain(makeArrayRef(argv, argc)); + for (int i = 1; i < argc; i++) { + // If an argument starts with a dash and only contains chars + // that belong to the options chars set, remove the dash. + // We can't handle it after the command line options parsing + // is done, since it will error out on an unrecognized string + // starting with a dash. + // Make sure this doesn't match the actual llvm-ar specific options + // that start with a dash. + StringRef S = argv[i]; + if (S.startswith("-") && + S.find_first_not_of(OptionChars, 1) == StringRef::npos) { + argv[i]++; + break; + } + if (S == "--") + break; + } + // Have the command line options parsed and handle things // like --help and --version. cl::ParseCommandLineOptions(argc, argv, From ceb5b1b4346ad8e1b2f693199153a5e68c784077 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 3 Nov 2017 20:24:19 +0000 Subject: [PATCH 070/238] Modularize: Include some required headers DenseMaps require the definition of a type to be available when using a pointer to that type as a key to know how many bits are available for tombstone/etc. 
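
For reference, a simplified sketch of the mechanism (this paraphrases the
DenseMapInfo specialization for pointer keys rather than quoting it verbatim):

    template <typename T> struct DenseMapInfo<T *> {
      // The empty/tombstone sentinel keys are manufactured from the low
      // pointer bits guaranteed free by T's alignment, so alignof(T) --
      // and therefore a complete definition of T -- must be visible here.
      static inline T *getEmptyKey() {
        uintptr_t Val = static_cast<uintptr_t>(-1);
        Val <<= PointerLikeTypeTraits<T *>::NumLowBitsAvailable;
        return reinterpret_cast<T *>(Val);
      }
      // getTombstoneKey(), getHashValue(), isEqual() omitted for brevity.
    };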
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317360 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/StackMaps.h     | 2 +-
 lib/Bitcode/Writer/ValueEnumerator.h | 2 ++
 lib/CodeGen/AsmPrinter/DwarfFile.h   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index 8263946ed9280..4407114d2741b 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -14,6 +14,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Debug.h"
 #include
 #include
@@ -25,7 +26,6 @@ namespace llvm {
 class AsmPrinter;
 class MCExpr;
 class MCStreamer;
-class MCSymbol;
 class raw_ostream;
 class TargetRegisterInfo;
 
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 730187087dc52..011356c32601e 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -18,6 +18,8 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/UniqueVector.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
 #include "llvm/IR/UseListOrder.h"
 #include
 #include
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 6e4625ba4116f..167ca13c19c11 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DIE.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/Allocator.h"
 #include
 #include
@@ -27,7 +28,6 @@ class DwarfCompileUnit;
 class DwarfUnit;
 class LexicalScope;
 class MCSection;
-class MDNode;
 
 class DwarfFile {
   // Target of Dwarf emission, used for sizing of abbreviations.
From f4beb75be0ff7db0d9c80bbb0efddcd20e7b1d59 Mon Sep 17 00:00:00 2001
From: Jun Bum Lim
Date: Fri, 3 Nov 2017 20:41:16 +0000
Subject: [PATCH 071/238] Recommit r317351 : Add CallSiteSplitting pass

This recommits r317351 after fixing a buildbot failure.

Original commit message:

Summary:
This change adds a pass which tries to split a call-site to pass
more constrained arguments if its argument is predicated in the control flow
so that we can expose better context to the later passes (e.g, inliner, jump
threading, or IPA-CP based function cloning, etc.).
As of now we support two cases :

1) If a call site is dominated by an OR condition and if any of its arguments
are predicated on this OR condition, try to split the condition with more
constrained arguments. For example, in the code below, we try to split the
call site since we can predicate the argument (ptr) based on the OR condition.
Split from :
   if (!ptr || c)
     callee(ptr);
to :
   if (!ptr)
     callee(null ptr)    // set the known constant value
   else if (c)
     callee(nonnull ptr) // set non-null attribute in the argument

2) We can also split a call-site based on constant incoming values of a PHI
For example,
from :
  BB0:
    %c = icmp eq i32 %i1, %i2
    br i1 %c, label %BB2, label %BB1
  BB1:
    br label %BB2
  BB2:
    %p = phi i32 [ 0, %BB0 ], [ 1, %BB1 ]
    call void @bar(i32 %p)
to
  BB0:
    %c = icmp eq i32 %i1, %i2
    br i1 %c, label %BB2-split0, label %BB1
  BB1:
    br label %BB2-split1
  BB2-split0:
    call void @bar(i32 0)
    br label %BB2
  BB2-split1:
    call void @bar(i32 1)
    br label %BB2
  BB2:
    %p = phi i32 [ 0, %BB2-split0 ], [ 1, %BB2-split1 ]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317362 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/InitializePasses.h               |   1 +
 include/llvm/Transforms/Scalar.h              |   8 +
 .../Transforms/Scalar/CallSiteSplitting.h     |  29 ++
 lib/Passes/PassBuilder.cpp                    |   9 +-
 lib/Passes/PassRegistry.def                   |   1 +
 lib/Transforms/IPO/PassManagerBuilder.cpp     |   6 +
 lib/Transforms/Scalar/CMakeLists.txt          |   1 +
 lib/Transforms/Scalar/CallSiteSplitting.cpp   | 493 ++++++++++++++++++
 lib/Transforms/Scalar/Scalar.cpp              |   1 +
 test/Other/new-pm-defaults.ll                 |   1 +
 test/Other/new-pm-lto-defaults.ll             |   9 +-
 test/Other/new-pm-thinlto-defaults.ll         |   1 +
 .../callsite-split-or-phi.ll                  | 339 ++++++++++++
 .../CallSiteSplitting/callsite-split.ll       | 119 +++++
 14 files changed, 1015 insertions(+), 3 deletions(-)
 create mode 100644 include/llvm/Transforms/Scalar/CallSiteSplitting.h
 create mode 100644 lib/Transforms/Scalar/CallSiteSplitting.cpp
 create mode 100644 test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
 create mode 100644 test/Transforms/CallSiteSplitting/callsite-split.ll

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index b8183d1c8e2f3..9cdb49330ae14 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -80,6 +80,7 @@ void initializeBranchFolderPassPass(PassRegistry&);
 void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
 void initializeBranchRelaxationPass(PassRegistry&);
 void initializeBreakCriticalEdgesPass(PassRegistry&);
+void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
 void initializeCFGPrinterLegacyPassPass(PassRegistry&);
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index a78c897683fcd..0cf1115dc9735 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -73,6 +73,14 @@ FunctionPass *createDeadCodeEliminationPass();
 //
 FunctionPass *createDeadStoreEliminationPass();
 
+
+//===----------------------------------------------------------------------===//
+//
+// CallSiteSplitting - This pass splits call-sites based on their known
+// argument values.
+FunctionPass *createCallSiteSplittingPass();
+
+
 //===----------------------------------------------------------------------===//
 //
 // AggressiveDCE - This pass uses the SSA based Aggressive DCE algorithm.  This
diff --git a/include/llvm/Transforms/Scalar/CallSiteSplitting.h b/include/llvm/Transforms/Scalar/CallSiteSplitting.h
new file mode 100644
index 0000000000000..5ab951a49f2c8
--- /dev/null
+++ b/include/llvm/Transforms/Scalar/CallSiteSplitting.h
@@ -0,0 +1,29 @@
+//===- CallSiteSplitting.h - Callsite Splitting -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+#define LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/Compiler.h"
+#include <vector>
+
+namespace llvm {
+
+struct CallSiteSplittingPass : PassInfoMixin<CallSiteSplittingPass> {
+  /// \brief Run the pass over the function.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_CALLSITESPLITTING__H
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 21d95a07125c3..2088ea0cea269 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -89,6 +89,7 @@
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
 #include "llvm/Transforms/Scalar/DCE.h"
@@ -548,6 +549,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   EarlyFPM.addPass(SROA());
   EarlyFPM.addPass(EarlyCSEPass());
   EarlyFPM.addPass(LowerExpectIntrinsicPass());
+  if (Level == O3)
+    EarlyFPM.addPass(CallSiteSplittingPass());
+
   // In SamplePGO ThinLTO backend, we need instcombine before profile annotation
   // to convert bitcast to direct calls so that they can be inlined during the
   // profile annotation preparation step.
@@ -920,13 +924,16 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(InferFunctionAttrsPass());
 
   if (Level > 1) {
+    FunctionPassManager EarlyFPM(DebugLogging);
+    EarlyFPM.addPass(CallSiteSplittingPass());
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
+
     // Indirect call promotion. This should promote all the targets that are
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
     // produce the same result as if we only do promotion here.
     MPM.addPass(PGOIndirectCallPromotion(
         true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty()));
-
     // Propagate constants at call sites into the functions they call. This
     // opens opportunities for globalopt (and inlining) by substituting function
     // pointers passed as arguments to direct uses of functions.
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 20d1220ac3301..40b884351fd5b 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -140,6 +140,7 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) +FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass()) FUNCTION_PASS("consthoist", ConstantHoistingPass()) FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass()) FUNCTION_PASS("dce", DCEPass()) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 828eb5eee2978..b8ff614f7c8ca 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -467,6 +467,9 @@ void PassManagerBuilder::populateModulePassManager( addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); + if (OptLevel > 2) + MPM.add(createCallSiteSplittingPass()); + MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createCalledValuePropagationPass()); MPM.add(createGlobalOptimizerPass()); // Optimize out global vars @@ -703,6 +706,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createInferFunctionAttrsLegacyPass()); if (OptLevel > 1) { + // Split call-site with more constrained arguments. + PM.add(createCallSiteSplittingPass()); + // Indirect call promotion. This should promote all the targets that are // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d79ae851005d3..6a27fbca8b782 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp + CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp new file mode 100644 index 0000000000000..2224cb2eb6231 --- /dev/null +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -0,0 +1,493 @@ +//===- CallSiteSplitting.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that tries to split a call-site to pass +// more constrained arguments if its argument is predicated in the control flow +// so that we can expose better context to the later passes (e.g, inliner, jump +// threading, or IPA-CP based function cloning, etc.). +// As of now we support two cases : +// +// 1) If a call site is dominated by an OR condition and if any of its arguments +// are predicated on this OR condition, try to split the condition with more +// constrained arguments. For example, in the code below, we try to split the +// call site since we can predicate the argument(ptr) based on the OR condition. 
+//
+// Split from :
+//   if (!ptr || c)
+//     callee(ptr);
+// to :
+//   if (!ptr)
+//     callee(null)         // set the known constant value
+//   else if (c)
+//     callee(nonnull ptr)  // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+// Header:
+//   %c = icmp eq i32 %i1, %i2
+//   br i1 %c, label %Tail, label %TBB
+// TBB:
+//   br label %Tail
+// Tail:
+//   %p = phi i32 [ 0, %Header], [ 1, %TBB]
+//   call void @bar(i32 %p)
+// to
+// Header:
+//   %c = icmp eq i32 %i1, %i2
+//   br i1 %c, label %Tail-split0, label %TBB
+// TBB:
+//   br label %Tail-split1
+// Tail-split0:
+//   call void @bar(i32 0)
+//   br label %Tail
+// Tail-split1:
+//   call void @bar(i32 1)
+//   br label %Tail
+// Tail:
+//   %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-site split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI,
+                                Value *Op) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.addParamAttr(ArgNo, Attribute::NonNull);
+    ++ArgNo;
+  }
+}
+
+static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI,
+                                  Value *Op, Constant *ConstValue) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.setArgument(ArgNo, ConstValue);
+    ++ArgNo;
+  }
+}
+
+static bool createCallSitesOnOrPredicatedArgument(
+    CallSite CS, Instruction *&NewCSTakenFromHeader,
+    Instruction *&NewCSTakenFromNextCond,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *HeaderBB) {
+  assert(BranchInsts.size() <= 2 &&
+         "Unexpected number of blocks in the OR predicated condition");
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *CallSiteBB = Instr->getParent();
+  TerminatorInst *HeaderTI = HeaderBB->getTerminator();
+  bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0);
+
+  for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) {
+    BranchInst *PBI = BranchInsts[I];
+    assert(isa<ICmpInst>(PBI->getCondition()) &&
+           "Unexpected condition in a conditional branch.");
+    ICmpInst *Cmp = cast<ICmpInst>(PBI->getCondition());
+    Value *Arg = Cmp->getOperand(0);
+    assert(isa<Constant>(Cmp->getOperand(1)) &&
+           "Expected op1 to be a constant.");
+    Constant *ConstVal = cast<Constant>(Cmp->getOperand(1));
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+
+    if (PBI->getParent() == HeaderBB) {
+      Instruction *&CallTakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond;
+      Instruction *&CallUntakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader;
+
+      assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+             "Unexpected predicate in an OR condition");
+
+      // Set the constant value for arguments in the call predicated based on
+      // the OR condition.
+      Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ
+                                         ? CallTakenFromHeader
+                                         : CallUntakenFromHeader;
+      setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal);
+
+      // Add the NonNull attribute if compared with the null pointer.
+      if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+        Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ
+                                          ? CallUntakenFromHeader
+                                          : CallTakenFromHeader;
+        addNonNullAttribute(Instr, CallToSetAttr, Arg);
+      }
+      continue;
+    }
+
+    if (Pred == ICmpInst::ICMP_EQ) {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Set the constant value for the call taken from the second block in
+        // the OR condition.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      }
+    } else {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      } else if (Pred == ICmpInst::ICMP_NE) {
+        // Set the constant value for the call in the untaken path from the
+        // header block.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else
+        llvm_unreachable("Unexpected condition");
+    }
+  }
+  return NewCSTakenFromHeader || NewCSTakenFromNextCond;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+  // without too much effort.
+  Instruction *Instr = CS.getInstruction();
+  if (!isa<CallInst>(Instr))
+    return false;
+
+  // Allow splitting a call-site only when there is no instruction before the
+  // call-site in the basic block. Based on this constraint, we only clone the
+  // call instruction, and we do not move a call-site across any other
+  // instruction.
+  BasicBlock *CallSiteBB = Instr->getParent();
+  if (Instr != CallSiteBB->getFirstNonPHI())
+    return false;
+
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  unsigned NumPreds = std::distance(PII, PIE);
+
+  // Allow only one extra call-site. No more than two from one call-site.
+  if (NumPreds != 2)
+    return false;
+
+  // Cannot split an edge from an IndirectBrInst.
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  if (isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+      isa<IndirectBrInst>(Preds[1]->getTerminator()))
+    return false;
+
+  return CallSiteBB->canSplitPredecessors();
+}
+
+/// Return true if the CS is split into its new predecessors which are directly
+/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2.
+/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(),
+/// especially for the OR predicated case where PredBB1 will point the header,
+/// and PredBB2 will point the second compare block. CallInst1 and CallInst2
+/// will be the new call-sites placed in the new predecessors split for PredBB1
+/// and PredBB2, respectively. Therefore, CallInst1 will be the call-site placed
+/// between Header and Tail, and CallInst2 will be the call-site between TBB and
+/// Tail. For example, in the IR below with an OR condition, the call-site can
+
+/// Split the call-site CS into its new predecessors, which are directly
+/// hooked to each of its original predecessors pointed to by PredBB1 and
+/// PredBB2. Note that PredBB1 and PredBB2 are decided in
+/// findPredicatedArgument(), especially for the OR predicated case where
+/// PredBB1 will point to the header, and PredBB2 will point to the second
+/// compare block. CallInst1 and CallInst2 will be the new call-sites placed in
+/// the new predecessors split for PredBB1 and PredBB2, respectively.
+/// Therefore, CallInst1 will be the call-site placed between Header and Tail,
+/// and CallInst2 will be the call-site between TBB and Tail. For example, in
+/// the IR below with an OR condition, the call-site can be split
+///
+/// from :
+///
+///   Header:
+///     %c = icmp eq i32* %a, null
+///     br i1 %c, label %Tail, label %TBB
+///   TBB:
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c2, label %Tail, label %End
+///   Tail:
+///     %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+///   Header:                                        // PredBB1 is Header
+///     %c = icmp eq i32* %a, null
+///     br i1 %c, label %Tail-split1, label %TBB
+///   TBB:                                           // PredBB2 is TBB
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c2, label %Tail-split2, label %End
+///   Tail-split1:
+///     %ca1 = call @callee (i32* null, i32* %b)         // CallInst1
+///     br label %Tail
+///   Tail-split2:
+///     %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+///     br label %Tail
+///   Tail:
+///     %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+                          Instruction *CallInst1, Instruction *CallInst2) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *TailBB = Instr->getParent();
+  assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site");
+
+  BasicBlock *SplitBlock1 =
+      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+  BasicBlock *SplitBlock2 =
+      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+  if (!CallInst1)
+    CallInst1 = Instr->clone();
+  if (!CallInst2)
+    CallInst2 = Instr->clone();
+
+  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+  CallSite CS1(CallInst1);
+  CallSite CS2(CallInst2);
+
+  // Handle PHIs used as arguments in the call-site.
+  for (auto &PI : *TailBB) {
+    PHINode *PN = dyn_cast<PHINode>(&PI);
+    if (!PN)
+      break;
+    unsigned ArgNo = 0;
+    for (auto &CI : CS.args()) {
+      if (&*CI == PN) {
+        CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+        CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+      }
+      ++ArgNo;
+    }
+  }
+
+  // Replace users of the original call with a PHI merging the split
+  // call-sites.
+  if (Instr->getNumUses()) {
+    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr);
+    PN->addIncoming(CallInst1, SplitBlock1);
+    PN->addIncoming(CallInst2, SplitBlock2);
+    Instr->replaceAllUsesWith(PN);
+  }
+  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
+               << "\n");
+  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
+               << "\n");
+  Instr->eraseFromParent();
+  NumCallSiteSplit++;
+}
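+
+// Editorial illustration (not part of the original patch): for a
+// PHI-predicated argument like
+//
+//   Tail:
+//     %p = phi i32 [ 0, %Header ], [ 1, %TBB ]
+//     %r = call i32 @callee(i32 %p)
+//
+// the PHI-handling loop in splitCallSite() rewrites each cloned call to use
+// the incoming value of %p for its split predecessor, yielding
+// @callee(i32 0) in Tail.predBB1.split and @callee(i32 1) in
+// Tail.predBB2.split, while the "phi.call" PHI merges the two return values
+// back in %Tail (see the CHECK lines of the tests below).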
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+  assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+  Value *Op0 = Cmp->getOperand(0);
+  unsigned ArgNo = 0;
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+       ++I, ++ArgNo) {
+    // Don't consider constants or arguments that are already known to be
+    // non-null.
+    if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+      continue;
+
+    if (*I == Op0)
+      return true;
+  }
+  return false;
+}
+
+static void findOrCondRelevantToCallArgument(
+    CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *&HeaderBB) {
+  auto *PBI = dyn_cast<BranchInst>(PredBB->getTerminator());
+  if (!PBI || !PBI->isConditional())
+    return;
+
+  if (PBI->getSuccessor(0) == OtherPredBB ||
+      PBI->getSuccessor(1) == OtherPredBB)
+    if (PredBB == OtherPredBB->getSinglePredecessor()) {
+      assert(!HeaderBB && "Expect to find only a single header block");
+      HeaderBB = PredBB;
+    }
+
+  CmpInst::Predicate Pred;
+  Value *Cond = PBI->getCondition();
+  if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+    return;
+  ICmpInst *Cmp = cast<ICmpInst>(Cond);
+  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+    if (isCondRelevantToAnyCallArgument(Cmp, CS))
+      BranchInsts.push_back(PBI);
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *Parent = Instr->getParent();
+  if (Instr != Parent->getFirstNonPHI())
+    return false;
+
+  for (auto &BI : *Parent) {
+    if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+      for (auto &I : CS.args())
+        if (&*I == PN) {
+          assert(PN->getNumIncomingValues() == 2 &&
+                 "Unexpected number of incoming values");
+          if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+            return false;
+          if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+            continue;
+          if (isa<Constant>(PN->getIncomingValue(0)) &&
+              isa<Constant>(PN->getIncomingValue(1)))
+            return true;
+        }
+    }
+    break;
+  }
+  return false;
+}
+
+// Return true if an argument in CS is predicated on an 'or' condition.
+// Create new call-sites with arguments constrained based on the OR condition.
+static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1,
+                                        BasicBlock *PredBB2,
+                                        Instruction *&NewCallTakenFromHeader,
+                                        Instruction *&NewCallTakenFromNextCond,
+                                        BasicBlock *&HeaderBB) {
+  SmallVector<BranchInst *, 2> BranchInsts;
+  findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB);
+  findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB);
+  if (BranchInsts.empty() || !HeaderBB)
+    return false;
+
+  // If an OR condition is detected, try to create call sites with constrained
+  // arguments (e.g., NonNull attribute or constant value).
+  return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader,
+                                               NewCallTakenFromNextCond,
+                                               BranchInsts, HeaderBB);
+}
+
+static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1,
+                                   Instruction *&CallInst2,
+                                   BasicBlock *&PredBB1, BasicBlock *&PredBB2) {
+  BasicBlock *CallSiteBB = CS.getInstruction()->getParent();
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors.");
+  (void)PIE;
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  BasicBlock *&HeaderBB = PredBB1;
+  if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2,
+                                   HeaderBB) &&
+      !isPredicatedOnPHI(CS))
+    return false;
+
+  if (!PredBB1)
+    PredBB1 = Preds[0];
+
+  PredBB2 = PredBB1 == Preds[0] ? Preds[1] : Preds[0];
+  return true;
+}
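+
+// Editorial note (a sketch of the resulting contract, not from the original
+// patch): for the OR example in the file header, findPredicatedArgument()
+// returns with PredBB1 = Header and PredBB2 = TBB, the second compare block,
+// with CallInst1/CallInst2 already cloned and constrained by
+// createCallSitesOnOrPredicatedArgument(). For the PHI case both call
+// instructions remain null here and are cloned later in splitCallSite().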
+
+static bool tryToSplitCallSite(CallSite CS) {
+  if (!CS.arg_size())
+    return false;
+
+  BasicBlock *PredBB1 = nullptr;
+  BasicBlock *PredBB2 = nullptr;
+  Instruction *CallInst1 = nullptr;
+  Instruction *CallInst2 = nullptr;
+  if (!canSplitCallSite(CS) ||
+      !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
+    assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
+    return false;
+  }
+  splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
+  return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+  bool Changed = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+    BasicBlock &BB = *BI++;
+    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+      Instruction *I = &*II++;
+      CallSite CS(cast<Value>(I));
+      if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+        continue;
+
+      Function *Callee = CS.getCalledFunction();
+      if (!Callee || Callee->isDeclaration())
+        continue;
+      Changed |= tryToSplitCallSite(CS);
+    }
+  }
+  return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+  static char ID;
+  CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+    initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    return doCallSiteSplitting(F, TLI);
+  }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+                      "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+                    "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+  return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+  if (!doCallSiteSplitting(F, TLI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  return PA;
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index c1034ace20685..8a5ae1b87312e 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCELegacyPassPass(Registry);
   initializeBDCELegacyPassPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
+  initializeCallSiteSplittingLegacyPassPass(Registry);
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 816f75310e305..0810a13c14182 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -76,6 +76,7 @@
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
+; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll index fc52f70ff4cc4..878198d1447b7 100644 --- a/test/Other/new-pm-lto-defaults.ll +++ b/test/Other/new-pm-lto-defaults.ll @@ -29,9 +29,14 @@ ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Module +; CHECK-O2-NEXT: Starting llvm::Function pass manager run. +; CHECK-O2-NEXT: Running pass: CallSiteSplittingPass on foo +; CHECK-O2-NEXT: Running analysis: TargetLibraryAnalysis on foo +; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: PGOIndirectCallPromotion ; CHECK-O2-NEXT: Running analysis: ProfileSummaryAnalysis -; CHECK-O2-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Function ; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O2-NEXT: Running pass: IPSCCPPass ; CHECK-O2-NEXT: Running pass: CalledValuePropagationPass @@ -42,7 +47,7 @@ ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: AAManager -; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: GlobalSplitPass diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll index 7d40ef3eea2e0..e83f0f8705532 100644 --- a/test/Other/new-pm-thinlto-defaults.ll +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -72,6 +72,7 @@ ; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass +; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. 
; CHECK-O-NEXT: Running pass: IPSCCPPass ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll new file mode 100644 index 0000000000000..d1d854d8f457f --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll @@ -0,0 +1,339 @@ +; RUN: opt < %s -callsite-splitting -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +;CHECK-LABEL: @test_eq_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_eq_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_eq_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 
@test_eq_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_eq_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_eq_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_ne_ne_untaken +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_ne_ne_untaken(i32* %a, i32 %v) { +Header: + %tobool1 = icmp ne i32* %a, null + br i1 %tobool1, label %TBB, label %Tail + +TBB: + %cmp = icmp ne i32 %v, 1 + br i1 %cmp, label %End, label %Tail + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_const_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi +;CHECK-LABEL: Tail.predBB1.split: +;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1) +;CHECK-LABEL: Tail.predBB2.split: +;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2) +;CHECK-LABEL: Tail +;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ] +;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +;CHECK: ret i32 %[[MERGED]] +define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi 
i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) { +Header: + %tobool1 = icmp eq i32* %a, %b + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, %v2 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[%v,%Header], [%v2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_fisrtnonphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_fisrtnonphi(i32* %a, i32 %v) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %Tail, label %TBB + +TBB: + %cmp = icmp eq i32 %v, 1 + br i1 %cmp, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + store i32 %v, i32* %a + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_3preds_constphi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) { +Header: + br i1 %c1, label %Tail, label %TBB1 + +TBB1: + br i1 %c2, label %Tail, label %TBB2 + +TBB2: + br i1 %c3, label %Tail, label %End + +Tail: + %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +;CHECK-LABEL: @test_indirectbr_phi +;CHECK-NOT: Tail.predBB1.split: +;CHECK-NOT: Tail.predBB2.split: +;CHECK-LABEL: Tail: +;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p) +;CHECK: ret i32 %r +define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) { +Header: + %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail] + +TBB: + %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address + indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End] + +Tail: + %p = phi i32[1,%Header], [2, %TBB] + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +define i32 @callee(i32* %a, i32 %v, i32 %p) { +entry: + %c = icmp ne i32* %a, null + br i1 %c, label %BB1, label %BB2 + +BB1: + call void @dummy(i32* %a, i32 %p) + br label %End + +BB2: + call void @dummy2(i32 %v, i32 %p) + br label %End + +End: + ret i32 %p +} + +declare void @dummy(i32*, i32) +declare void @dummy2(i32, i32) diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll new file mode 100644 index 0000000000000..419fa738563c9 --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-split.ll @@ -0,0 +1,119 @@ +; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linaro-linux-gnueabi" + +%struct.bitmap = type { 
i32, %struct.bitmap* }
+
+;CHECK-LABEL: @caller
+;CHECK-LABEL: NextCond:
+;CHECK: br {{.*}} label %callee.exit
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
+;CHECK-LABEL: callee.exit:
+;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
+
+define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, null
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, null
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %p = phi i1 [0, %Top], [%c, %NextCond]
+  call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
+  br label %End
+
+End:
+  ret void
+}
+
+define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
+entry:
+  %tobool = icmp ne %struct.bitmap* %a_elt, null
+  %tobool1 = icmp ne %struct.bitmap* %b_elt, null
+  %or.cond = and i1 %tobool, %tobool1
+  br i1 %or.cond, label %Cond, label %Big
+
+Cond:
+  %cmp = icmp eq %struct.bitmap* %dst_elt, %a_elt
+  br i1 %cmp, label %Small, label %Big
+
+Small:
+  call void @dummy2(%struct.bitmap* %a_elt)
+  br label %End
+
+Big:
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  br label %End
+
+End:
+  ret void
+}
+
+declare void @dummy2(%struct.bitmap*)
+declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
+
+
+;CHECK-LABEL: @caller2
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @dummy4()
+;CHECK-LABEL: CallSiteBB.predBB2.split:
+;CHECK: call void @dummy3()
+;CHECK-LABEL: CallSiteBB:
+;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
+;CHECK: call void @foo(i1 %phi.call)
+define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %phi = phi i1 [0, %Top],[1, %NextCond]
+  %u = call i1 @callee2(i1 %phi)
+  call void @foo(i1 %u)
+  br label %End
+
+End:
+  ret void
+}
+
+define i1 @callee2(i1 %b) {
+entry:
+  br i1 %b, label %BB1, label %BB2
+
+BB1:
+  call void @dummy3()
+  br label %End
+
+BB2:
+  call void @dummy4()
+  br label %End
+
+End:
+  ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)
From 79eed6909a1765163e2abb428a85b670fa3fb454 Mon Sep 17 00:00:00 2001
From: Mitch Phillips
Date: Fri, 3 Nov 2017 20:54:26 +0000
Subject: [PATCH 072/238] [cfi-verify] Add blacklist parsing for result
 filtering.

Adds blacklist parsing behaviour for filtering results into four categories:

 - Expected Protected: Things that are not in the blacklist and are protected.
 - Unexpected Protected: Things that are in the blacklist and are protected.
 - Expected Unprotected: Things that are in the blacklist and are unprotected.
 - Unexpected Unprotected: Things that are not in the blacklist and are
   unprotected.

llvm-cfi-verify can now optionally be invoked with a second command line
argument, which specifies the blacklist file that the binary was built with.

Current statistics for chromium:

Reviewers: vlad.tsyrklevich

Subscribers: mgorny, llvm-commits, pcc, kcc

Differential Revision: https://reviews.llvm.org/D39525

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317364 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/Inputs/protected-lineinfo.s           | 195 +++++++++
 .../X86/Inputs/unprotected-fullinfo.s         | 380 ++++++++++++++++++
 .../X86/Inputs/unprotected-lineinfo.s         | 159 ++++++++
 .../X86/Inputs/unprotected-nolineinfo.s       |  87 ++++
 .../X86/blacklist-expected-unprotected.s      |  17 +
 .../llvm-cfi-verify/X86/blacklist-match-fun.s |  17 +
 .../X86/blacklist-unexpected-protected.s      |  17 +
 .../X86/indirect-cf-elimination.s             |   5 +-
 .../llvm-cfi-verify/X86/protected-lineinfo.s  | 204 +---------
 .../X86/unprotected-lineinfo.s                | 168 +-------
 .../X86/unprotected-nolineinfo.s              |  91 +----
 tools/llvm-cfi-verify/CMakeLists.txt          |   2 +-
 tools/llvm-cfi-verify/LLVMBuild.txt           |   2 +-
 tools/llvm-cfi-verify/lib/CMakeLists.txt      |   3 +-
 tools/llvm-cfi-verify/lib/FileAnalysis.cpp    |  49 +--
 tools/llvm-cfi-verify/lib/FileAnalysis.h      |   9 +-
 tools/llvm-cfi-verify/lib/LLVMBuild.txt       |   2 +-
 tools/llvm-cfi-verify/llvm-cfi-verify.cpp     | 133 ++++--
 .../tools/llvm-cfi-verify/CMakeLists.txt      |   1 +
 .../tools/llvm-cfi-verify/FileAnalysis.cpp    |   1 +
 .../tools/llvm-cfi-verify/GraphBuilder.cpp    |   1 +
 21 files changed, 1037 insertions(+), 506 deletions(-)
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s
 create mode 100644 test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s

diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
new file mode 100644
index 0000000000000..f8cfcb8d15c4d
--- /dev/null
+++ b/test/tools/llvm-cfi-verify/X86/Inputs/protected-lineinfo.s
@@ -0,0 +1,195 @@
+# Source (tiny.cc):
+# void a() {}
+# void b() {}
+# int main(int argc, char** argv) {
+#   void(*ptr)();
+#   if (argc == 1)
+#     ptr = &a;
+#   else
+#     ptr = &b;
+#   ptr();
+# }
+# Compile with (output is in tiny.s.0):
+# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt
+# clang++ tiny.o -o tiny -flto -fuse-ld=gold -Wl,-plugin-opt,save-temps
+# clang++ -fsanitize=cfi -flto 
-fvisibility=hidden -c tiny.cc -o tiny.o -gmlt +# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm + + .text + .file "ld-temp.o" + .p2align 4, 0x90 + .type _Z1av.cfi,@function +_Z1av.cfi: +.Lfunc_begin0: + .file 1 "tiny.cc" + .loc 1 1 0 + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 1 1 11 prologue_end + popq %rbp + retq +.Ltmp1: +.Lfunc_end0: + .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi + .cfi_endproc + + .p2align 4, 0x90 + .type _Z1bv.cfi,@function +_Z1bv.cfi: +.Lfunc_begin1: + .loc 1 2 0 + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp2: + .loc 1 2 11 prologue_end + popq %rbp + retq +.Ltmp3: +.Lfunc_end1: + .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi + .cfi_endproc + + .hidden main + .globl main + .p2align 4, 0x90 + .type main,@function +main: +.Lfunc_begin2: + .loc 1 4 0 + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -8(%rbp) + movl %edi, -4(%rbp) + movq %rsi, -24(%rbp) +.Ltmp4: + .loc 1 6 12 prologue_end + cmpl $1, -4(%rbp) + .loc 1 6 7 is_stmt 0 + jne .LBB2_2 + .loc 1 0 7 + leaq _Z1av(%rip), %rax + .loc 1 7 9 is_stmt 1 + movq %rax, -16(%rbp) + .loc 1 7 5 is_stmt 0 + jmp .LBB2_3 +.LBB2_2: + .loc 1 0 5 + leaq _Z1bv(%rip), %rax + .loc 1 9 9 is_stmt 1 + movq %rax, -16(%rbp) +.LBB2_3: + .loc 1 0 9 is_stmt 0 + leaq .L.cfi.jumptable(%rip), %rcx + .loc 1 11 3 is_stmt 1 + movq -16(%rbp), %rax + movq %rax, %rdx + subq %rcx, %rdx + movq %rdx, %rcx + shrq $3, %rcx + shlq $61, %rdx + orq %rcx, %rdx + cmpq $1, %rdx + jbe .LBB2_5 + ud2 +.LBB2_5: + callq *%rax + .loc 1 12 1 + movl -8(%rbp), %eax + addq $32, %rsp + popq %rbp + retq +.Ltmp5: +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + + .p2align 3, 0x90 + .type .L.cfi.jumptable,@function +.L.cfi.jumptable: +.Lfunc_begin3: + .cfi_startproc + #APP + jmp _Z1av.cfi@PLT + int3 + int3 + int3 + jmp _Z1bv.cfi@PLT + int3 + int3 + int3 + + #NO_APP +.Lfunc_end3: + .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable + .cfi_endproc + + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 6.0.0 (trunk 316774)" +.Linfo_string1: + .asciz "tiny.cc" +.Linfo_string2: + .asciz "" + .section .debug_abbrev,"",@progbits + .byte 1 + .byte 17 + .byte 0 + .byte 37 + .byte 14 + .byte 19 + .byte 5 + .byte 3 + .byte 14 + .byte 16 + .byte 23 + .byte 27 + .byte 14 + .byte 17 + .byte 1 + .byte 18 + .byte 6 + .byte 0 + .byte 0 + .byte 0 + .section .debug_info,"",@progbits +.Lcu_begin0: + .long 38 + .short 4 + .long .debug_abbrev + .byte 8 + .byte 1 + .long .Linfo_string0 + .short 4 + .long .Linfo_string1 + .long .Lline_table_start0 + .long .Linfo_string2 + .quad .Lfunc_begin0 + .long .Lfunc_end2-.Lfunc_begin0 + .section .debug_ranges,"",@progbits + .section .debug_macinfo,"",@progbits +.Lcu_macro_begin0: + .byte 0 + + .type _Z1av,@function +_Z1av = .L.cfi.jumptable + .type _Z1bv,@function +_Z1bv = .L.cfi.jumptable+8 + .ident "clang version 6.0.0 (trunk 316774)" + .section ".note.GNU-stack","",@progbits + .section .debug_line,"",@progbits +.Lline_table_start0: + diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s new file mode 100644 index 0000000000000..7b5ca07d7e493 --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-fullinfo.s @@ -0,0 +1,380 @@ +# Source 
(tiny.cc): +# void a() {} +# void b() {} +# int main(int argc, char** argv) { +# void(*ptr)(); +# if (argc == 1) +# ptr = &a; +# else +# ptr = &b; +# ptr(); +# } +# Compile with: +# clang++ -g tiny.cc -S -o tiny.s + + .text + .file "tiny.cc" + .globl _Z1av # -- Begin function _Z1av + .p2align 4, 0x90 + .type _Z1av,@function +_Z1av: # @_Z1av +.Lfunc_begin0: + .file 1 "tiny.cc" + .loc 1 1 0 # tiny.cc:1:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 1 1 11 prologue_end # tiny.cc:1:11 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size _Z1av, .Lfunc_end0-_Z1av + .cfi_endproc + # -- End function + .globl _Z1bv # -- Begin function _Z1bv + .p2align 4, 0x90 + .type _Z1bv,@function +_Z1bv: # @_Z1bv +.Lfunc_begin1: + .loc 1 2 0 # tiny.cc:2:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp2: + .loc 1 2 11 prologue_end # tiny.cc:2:11 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp3: +.Lfunc_end1: + .size _Z1bv, .Lfunc_end1-_Z1bv + .cfi_endproc + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin2: + .loc 1 4 0 # tiny.cc:4:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) +.Ltmp4: + .loc 1 6 12 prologue_end # tiny.cc:6:12 + cmpl $1, -8(%rbp) +.Ltmp5: + .loc 1 6 7 is_stmt 0 # tiny.cc:6:7 + jne .LBB2_2 +# BB#1: + .loc 1 0 7 # tiny.cc:0:7 + movabsq $_Z1av, %rax +.Ltmp6: + .loc 1 7 9 is_stmt 1 # tiny.cc:7:9 + movq %rax, -24(%rbp) + .loc 1 7 5 is_stmt 0 # tiny.cc:7:5 + jmp .LBB2_3 +.LBB2_2: + .loc 1 0 5 # tiny.cc:0:5 + movabsq $_Z1bv, %rax + .loc 1 9 9 is_stmt 1 # tiny.cc:9:9 + movq %rax, -24(%rbp) +.Ltmp7: +.LBB2_3: + .loc 1 11 3 # tiny.cc:11:3 + callq *-24(%rbp) + .loc 1 12 1 # tiny.cc:12:1 + movl -4(%rbp), %eax + addq $32, %rsp + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp8: +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + # -- End function + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 6.0.0 (trunk 317104)" # string offset=0 +.Linfo_string1: + .asciz "tiny.cc" # string offset=35 +.Linfo_string2: + .asciz "/tmp/a/b" # string offset=43 +.Linfo_string3: + .asciz "_Z1av" # string offset=52 +.Linfo_string4: + .asciz "a" # string offset=58 +.Linfo_string5: + .asciz "_Z1bv" # string offset=60 +.Linfo_string6: + .asciz "b" # string offset=66 +.Linfo_string7: + .asciz "main" # string offset=68 +.Linfo_string8: + .asciz "int" # string offset=73 +.Linfo_string9: + .asciz "argc" # string offset=77 +.Linfo_string10: + .asciz "argv" # string offset=82 +.Linfo_string11: + .asciz "char" # string offset=87 +.Linfo_string12: + .asciz "ptr" # string offset=92 + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # 
DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 14 # DW_FORM_strp + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 21 # DW_TAG_subroutine_type + .byte 0 # DW_CHILDREN_no + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long 187 # Length of Unit + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0xb4 DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 4 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x19 DW_TAG_subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string3 # DW_AT_linkage_name + .long .Linfo_string4 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + # DW_AT_external + .byte 2 # Abbrev [2] 0x43:0x19 DW_TAG_subprogram + .quad .Lfunc_begin1 # DW_AT_low_pc + .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string5 # DW_AT_linkage_name + .long .Linfo_string6 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + # DW_AT_external + .byte 3 # Abbrev [3] 0x5c:0x44 DW_TAG_subprogram + .quad .Lfunc_begin2 # DW_AT_low_pc + .long .Lfunc_end2-.Lfunc_begin2 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .long .Linfo_string7 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .long 160 # DW_AT_type + # DW_AT_external + .byte 4 # Abbrev [4] 0x75:0xe DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .long .Linfo_string9 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .long 160 # DW_AT_type + .byte 4 # Abbrev [4] 0x83:0xe DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .long .Linfo_string10 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .long 167 # DW_AT_type + .byte 5 # Abbrev [5] 0x91:0xe DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 104 + .long .Linfo_string12 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 184 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 6 # Abbrev [6] 0xa0:0x7 DW_TAG_base_type + .long .Linfo_string8 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 7 # Abbrev [7] 0xa7:0x5 DW_TAG_pointer_type + .long 172 # DW_AT_type + .byte 7 # Abbrev [7] 0xac:0x5 DW_TAG_pointer_type + .long 177 # DW_AT_type + .byte 6 # Abbrev [6] 0xb1:0x7 DW_TAG_base_type + .long .Linfo_string11 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 7 # Abbrev [7] 0xb8:0x5 DW_TAG_pointer_type + .long 189 # DW_AT_type + .byte 8 # Abbrev [8] 0xbd:0x1 DW_TAG_subroutine_type + .byte 0 # End Of Children Mark + .section .debug_ranges,"",@progbits + .section .debug_macinfo,"",@progbits +.Lcu_macro_begin0: + .byte 0 # End Of Macro List Mark + .section .debug_pubnames,"",@progbits + .long .LpubNames_end0-.LpubNames_begin0 # Length of Public Names Info +.LpubNames_begin0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 191 # Compilation Unit Length + .long 42 # DIE offset + .asciz "a" # External Name + .long 67 # DIE offset + .asciz "b" # External Name + .long 92 # DIE offset + .asciz "main" # External Name + .long 0 # End Mark +.LpubNames_end0: + .section .debug_pubtypes,"",@progbits + .long .LpubTypes_end0-.LpubTypes_begin0 # Length of Public Types Info +.LpubTypes_begin0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 191 # Compilation Unit Length + .long 160 # DIE offset + .asciz "int" # 
External Name + .long 177 # DIE offset + .asciz "char" # External Name + .long 0 # End Mark +.LpubTypes_end0: + + .ident "clang version 6.0.0 (trunk 317104)" + .section ".note.GNU-stack","",@progbits + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s new file mode 100644 index 0000000000000..155f5978b465a --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-lineinfo.s @@ -0,0 +1,159 @@ +# Source (tiny.cc): +# void a() {} +# void b() {} +# int main(int argc, char** argv) { +# void(*ptr)(); +# if (argc == 1) +# ptr = &a; +# else +# ptr = &b; +# ptr(); +# } +# Compile with: +# clang++ -gmlt tiny.cc -S -o tiny.s + + .text + .file "tiny.cc" + .globl _Z1av # -- Begin function _Z1av + .p2align 4, 0x90 + .type _Z1av,@function +_Z1av: # @_Z1av +.Lfunc_begin0: + .file 1 "tiny.cc" + .loc 1 1 0 # tiny.cc:1:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 1 1 11 prologue_end # tiny.cc:1:11 + popq %rbp + retq +.Ltmp1: +.Lfunc_end0: + .size _Z1av, .Lfunc_end0-_Z1av + .cfi_endproc + # -- End function + .globl _Z1bv # -- Begin function _Z1bv + .p2align 4, 0x90 + .type _Z1bv,@function +_Z1bv: # @_Z1bv +.Lfunc_begin1: + .loc 1 2 0 # tiny.cc:2:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp2: + .loc 1 2 11 prologue_end # tiny.cc:2:11 + popq %rbp + retq +.Ltmp3: +.Lfunc_end1: + .size _Z1bv, .Lfunc_end1-_Z1bv + .cfi_endproc + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin2: + .loc 1 4 0 # tiny.cc:4:0 + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) +.Ltmp4: + .loc 1 6 12 prologue_end # tiny.cc:6:12 + cmpl $1, -8(%rbp) + .loc 1 6 7 is_stmt 0 # tiny.cc:6:7 + jne .LBB2_2 +# BB#1: + .loc 1 0 7 # tiny.cc:0:7 + movabsq $_Z1av, %rax + .loc 1 7 9 is_stmt 1 # tiny.cc:7:9 + movq %rax, -24(%rbp) + .loc 1 7 5 is_stmt 0 # tiny.cc:7:5 + jmp .LBB2_3 +.LBB2_2: + .loc 1 0 5 # tiny.cc:0:5 + movabsq $_Z1bv, %rax + .loc 1 9 9 is_stmt 1 # tiny.cc:9:9 + movq %rax, -24(%rbp) +.LBB2_3: + .loc 1 11 3 # tiny.cc:11:3 + callq *-24(%rbp) + .loc 1 12 1 # tiny.cc:12:1 + movl -4(%rbp), %eax + addq $32, %rsp + popq %rbp + retq +.Ltmp5: +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + # -- End function + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0 +.Linfo_string1: + .asciz "tiny.cc" # string offset=35 +.Linfo_string2: + .asciz "/tmp/a/b" # string offset=43 + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + 
.long 38 # Length of Unit + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 4 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc + .section .debug_ranges,"",@progbits + .section .debug_macinfo,"",@progbits +.Lcu_macro_begin0: + .byte 0 # End Of Macro List Mark + + .ident "clang version 6.0.0 (trunk 316774)" + .section ".note.GNU-stack","",@progbits + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s new file mode 100644 index 0000000000000..2d3cf2f484e77 --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/Inputs/unprotected-nolineinfo.s @@ -0,0 +1,87 @@ +# Source (tiny.cc): +# void a() {} +# void b() {} +# int main(int argc, char** argv) { +# void(*ptr)(); +# if (argc == 1) +# ptr = &a; +# else +# ptr = &b; +# ptr(); +# } +# Compile with: +# clang++ tiny.cc -S -o tiny.s + + .text + .file "tiny.cc" + .globl _Z1av # -- Begin function _Z1av + .p2align 4, 0x90 + .type _Z1av,@function +_Z1av: # @_Z1av + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + popq %rbp + retq +.Lfunc_end0: + .size _Z1av, .Lfunc_end0-_Z1av + .cfi_endproc + # -- End function + .globl _Z1bv # -- Begin function _Z1bv + .p2align 4, 0x90 + .type _Z1bv,@function +_Z1bv: # @_Z1bv + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + popq %rbp + retq +.Lfunc_end1: + .size _Z1bv, .Lfunc_end1-_Z1bv + .cfi_endproc + # -- End function + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main + .cfi_startproc +# BB#0: + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + subq $32, %rsp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) + cmpl $1, -8(%rbp) + jne .LBB2_2 +# BB#1: + movabsq $_Z1av, %rax + movq %rax, -24(%rbp) + jmp .LBB2_3 +.LBB2_2: + movabsq $_Z1bv, %rax + movq %rax, -24(%rbp) +.LBB2_3: + callq *-24(%rbp) + movl -4(%rbp), %eax + addq $32, %rsp + popq %rbp + retq +.Lfunc_end2: + .size main, .Lfunc_end2-main + .cfi_endproc + # -- End function + + .ident "clang version 6.0.0 (trunk 316774)" + .section ".note.GNU-stack","",@progbits diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s new file mode 100644 index 0000000000000..fbcfcc2a7cc08 --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/blacklist-expected-unprotected.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o +# RUN: echo "src:*tiny*" > %t.blacklist.txt +# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s + +# CHECK-LABEL: U +# CHECK-NEXT: tiny.cc:11 +# CHECK-NEXT: BLACKLIST MATCH, 'src' +# CHECK-NEXT: ====> Expected Unprotected + +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 1 (100.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) + +# Source: (blacklist.txt): +# 
src:*tiny* diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s new file mode 100644 index 0000000000000..3ea829395c4fe --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/blacklist-match-fun.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc %S/Inputs/unprotected-fullinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o +# RUN: echo "fun:*main*" > %t.blacklist.txt +# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s + +# CHECK-LABEL: U +# CHECK-NEXT: tiny.cc:11 +# CHECK-NEXT: BLACKLIST MATCH, 'fun' +# CHECK-NEXT: ====> Expected Unprotected + +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 1 (100.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) + +# Source: (blacklist.txt): +# fun:*main* diff --git a/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s new file mode 100644 index 0000000000000..c6ddf2b5d118d --- /dev/null +++ b/test/tools/llvm-cfi-verify/X86/blacklist-unexpected-protected.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o +# RUN: echo "src:*tiny*" > %t.blacklist.txt +# RUN: llvm-cfi-verify %t.o %t.blacklist.txt | FileCheck %s + +# CHECK-LABEL: P +# CHECK-NEXT: tiny.cc:11 +# CHECK-NEXT: BLACKLIST MATCH, 'src' +# CHECK-NEXT: ====> Unexpected Protected + +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 1 (100.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) + +# Source: (blacklist.txt): +# src:*tiny* diff --git a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s index bf1d87a2eb84f..e9b873471cb1b 100644 --- a/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s +++ b/test/tools/llvm-cfi-verify/X86/indirect-cf-elimination.s @@ -10,7 +10,10 @@ # reporting of the cfi-verify program. It should only find a single indirect CF # instruction at `tiny.cc:11` (see protected-lineinfo.s for the source). 
-# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%) +# CHECK: Expected Protected: 1 (100.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) .text .file "ld-temp.o" diff --git a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s index e3bb0f7af46d9..8eaf2e5e725bb 100644 --- a/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s +++ b/test/tools/llvm-cfi-verify/X86/protected-lineinfo.s @@ -1,203 +1,11 @@ -# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o +# RUN: llvm-mc %S/Inputs/protected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o # RUN: llvm-cfi-verify %t.o | FileCheck %s # CHECK-LABEL: P # CHECK-NEXT: tiny.cc:11 -# CHECK: Unprotected: 0 (0.00%), Protected: 1 (100.00%) - -# Source (tiny.cc): -# void a() {} -# void b() {} -# int main(int argc, char** argv) { -# void(*ptr)(); -# if (argc == 1) -# ptr = &a; -# else -# ptr = &b; -# ptr(); -# } -# Compile with (output is in tiny.s.0): -# clang++ -flto -fsanitize=cfi -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt -# clang++ tiny.o -o tiny -flto -fuse-ld=gold -Wl,-plugin-opt,save-temps -# clang++ -fsanitize=cfi -flto -fvisibility=hidden -c tiny.cc -o tiny.o -gmlt -# llvm-lto2 run @tiny.resolution.txt -o tiny.s -filetype=asm - - .text - .file "ld-temp.o" - .p2align 4, 0x90 - .type _Z1av.cfi,@function -_Z1av.cfi: -.Lfunc_begin0: - .file 1 "tiny.cc" - .loc 1 1 0 - .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp0: - .loc 1 1 11 prologue_end - popq %rbp - retq -.Ltmp1: -.Lfunc_end0: - .size _Z1av.cfi, .Lfunc_end0-_Z1av.cfi - .cfi_endproc - - .p2align 4, 0x90 - .type _Z1bv.cfi,@function -_Z1bv.cfi: -.Lfunc_begin1: - .loc 1 2 0 - .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp2: - .loc 1 2 11 prologue_end - popq %rbp - retq -.Ltmp3: -.Lfunc_end1: - .size _Z1bv.cfi, .Lfunc_end1-_Z1bv.cfi - .cfi_endproc - - .hidden main - .globl main - .p2align 4, 0x90 - .type main,@function -main: -.Lfunc_begin2: - .loc 1 4 0 - .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - subq $32, %rsp - movl $0, -8(%rbp) - movl %edi, -4(%rbp) - movq %rsi, -24(%rbp) -.Ltmp4: - .loc 1 6 12 prologue_end - cmpl $1, -4(%rbp) - .loc 1 6 7 is_stmt 0 - jne .LBB2_2 - .loc 1 0 7 - leaq _Z1av(%rip), %rax - .loc 1 7 9 is_stmt 1 - movq %rax, -16(%rbp) - .loc 1 7 5 is_stmt 0 - jmp .LBB2_3 -.LBB2_2: - .loc 1 0 5 - leaq _Z1bv(%rip), %rax - .loc 1 9 9 is_stmt 1 - movq %rax, -16(%rbp) -.LBB2_3: - .loc 1 0 9 is_stmt 0 - leaq .L.cfi.jumptable(%rip), %rcx - .loc 1 11 3 is_stmt 1 - movq -16(%rbp), %rax - movq %rax, %rdx - subq %rcx, %rdx - movq %rdx, %rcx - shrq $3, %rcx - shlq $61, %rdx - orq %rcx, %rdx - cmpq $1, %rdx - jbe .LBB2_5 - ud2 -.LBB2_5: - callq *%rax - .loc 1 12 1 - movl -8(%rbp), %eax - addq $32, %rsp - popq %rbp - retq -.Ltmp5: -.Lfunc_end2: - .size main, .Lfunc_end2-main - .cfi_endproc - - .p2align 3, 0x90 - .type .L.cfi.jumptable,@function -.L.cfi.jumptable: -.Lfunc_begin3: - .cfi_startproc - #APP - jmp _Z1av.cfi@PLT - int3 - int3 - int3 - jmp _Z1bv.cfi@PLT - int3 - int3 - int3 - - #NO_APP -.Lfunc_end3: - .size .L.cfi.jumptable, .Lfunc_end3-.L.cfi.jumptable - .cfi_endproc - - .section .debug_str,"MS",@progbits,1 -.Linfo_string0: - .asciz "clang version 6.0.0 (trunk 
316774)" -.Linfo_string1: - .asciz "tiny.cc" -.Linfo_string2: - .asciz "" - .section .debug_abbrev,"",@progbits - .byte 1 - .byte 17 - .byte 0 - .byte 37 - .byte 14 - .byte 19 - .byte 5 - .byte 3 - .byte 14 - .byte 16 - .byte 23 - .byte 27 - .byte 14 - .byte 17 - .byte 1 - .byte 18 - .byte 6 - .byte 0 - .byte 0 - .byte 0 - .section .debug_info,"",@progbits -.Lcu_begin0: - .long 38 - .short 4 - .long .debug_abbrev - .byte 8 - .byte 1 - .long .Linfo_string0 - .short 4 - .long .Linfo_string1 - .long .Lline_table_start0 - .long .Linfo_string2 - .quad .Lfunc_begin0 - .long .Lfunc_end2-.Lfunc_begin0 - .section .debug_ranges,"",@progbits - .section .debug_macinfo,"",@progbits -.Lcu_macro_begin0: - .byte 0 - - .type _Z1av,@function -_Z1av = .L.cfi.jumptable - .type _Z1bv,@function -_Z1bv = .L.cfi.jumptable+8 - .ident "clang version 6.0.0 (trunk 316774)" - .section ".note.GNU-stack","",@progbits - .section .debug_line,"",@progbits -.Lline_table_start0: - +# CHECK: Expected Protected: 1 (100.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 0 (0.00%) diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s index d8819e16e37a2..65782cb5e4208 100644 --- a/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s +++ b/test/tools/llvm-cfi-verify/X86/unprotected-lineinfo.s @@ -1,167 +1,11 @@ -# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o +# RUN: llvm-mc %S/Inputs/unprotected-lineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o # RUN: llvm-cfi-verify %t.o | FileCheck %s # CHECK-LABEL: U # CHECK-NEXT: tiny.cc:11 -# CHECK: Unprotected: 1 (100.00%), Protected: 0 (0.00%) - -# Source (tiny.cc): -# void a() {} -# void b() {} -# int main(int argc, char** argv) { -# void(*ptr)(); -# if (argc == 1) -# ptr = &a; -# else -# ptr = &b; -# ptr(); -# } -# Compile with: -# clang++ -gmlt tiny.cc -S -o tiny.s - - .text - .file "tiny.cc" - .globl _Z1av # -- Begin function _Z1av - .p2align 4, 0x90 - .type _Z1av,@function -_Z1av: # @_Z1av -.Lfunc_begin0: - .file 1 "tiny.cc" - .loc 1 1 0 # tiny.cc:1:0 - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp0: - .loc 1 1 11 prologue_end # tiny.cc:1:11 - popq %rbp - retq -.Ltmp1: -.Lfunc_end0: - .size _Z1av, .Lfunc_end0-_Z1av - .cfi_endproc - # -- End function - .globl _Z1bv # -- Begin function _Z1bv - .p2align 4, 0x90 - .type _Z1bv,@function -_Z1bv: # @_Z1bv -.Lfunc_begin1: - .loc 1 2 0 # tiny.cc:2:0 - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp -.Ltmp2: - .loc 1 2 11 prologue_end # tiny.cc:2:11 - popq %rbp - retq -.Ltmp3: -.Lfunc_end1: - .size _Z1bv, .Lfunc_end1-_Z1bv - .cfi_endproc - # -- End function - .globl main # -- Begin function main - .p2align 4, 0x90 - .type main,@function -main: # @main -.Lfunc_begin2: - .loc 1 4 0 # tiny.cc:4:0 - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - subq $32, %rsp - movl $0, -4(%rbp) - movl %edi, -8(%rbp) - movq %rsi, -16(%rbp) -.Ltmp4: - .loc 1 6 12 prologue_end # tiny.cc:6:12 - cmpl $1, -8(%rbp) - .loc 1 6 7 is_stmt 0 # tiny.cc:6:7 - jne .LBB2_2 -# BB#1: - .loc 1 0 7 # tiny.cc:0:7 - movabsq $_Z1av, %rax - .loc 1 7 9 is_stmt 1 # tiny.cc:7:9 - movq %rax, -24(%rbp) - .loc 1 7 5 is_stmt 0 # tiny.cc:7:5 - jmp 
.LBB2_3 -.LBB2_2: - .loc 1 0 5 # tiny.cc:0:5 - movabsq $_Z1bv, %rax - .loc 1 9 9 is_stmt 1 # tiny.cc:9:9 - movq %rax, -24(%rbp) -.LBB2_3: - .loc 1 11 3 # tiny.cc:11:3 - callq *-24(%rbp) - .loc 1 12 1 # tiny.cc:12:1 - movl -4(%rbp), %eax - addq $32, %rsp - popq %rbp - retq -.Ltmp5: -.Lfunc_end2: - .size main, .Lfunc_end2-main - .cfi_endproc - # -- End function - .section .debug_str,"MS",@progbits,1 -.Linfo_string0: - .asciz "clang version 6.0.0 (trunk 316774)" # string offset=0 -.Linfo_string1: - .asciz "tiny.cc" # string offset=35 -.Linfo_string2: - .asciz "/tmp/a/b" # string offset=43 - .section .debug_abbrev,"",@progbits - .byte 1 # Abbreviation Code - .byte 17 # DW_TAG_compile_unit - .byte 0 # DW_CHILDREN_no - .byte 37 # DW_AT_producer - .byte 14 # DW_FORM_strp - .byte 19 # DW_AT_language - .byte 5 # DW_FORM_data2 - .byte 3 # DW_AT_name - .byte 14 # DW_FORM_strp - .byte 16 # DW_AT_stmt_list - .byte 23 # DW_FORM_sec_offset - .byte 27 # DW_AT_comp_dir - .byte 14 # DW_FORM_strp - .byte 17 # DW_AT_low_pc - .byte 1 # DW_FORM_addr - .byte 18 # DW_AT_high_pc - .byte 6 # DW_FORM_data4 - .byte 0 # EOM(1) - .byte 0 # EOM(2) - .byte 0 # EOM(3) - .section .debug_info,"",@progbits -.Lcu_begin0: - .long 38 # Length of Unit - .short 4 # DWARF version number - .long .debug_abbrev # Offset Into Abbrev. Section - .byte 8 # Address Size (in bytes) - .byte 1 # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit - .long .Linfo_string0 # DW_AT_producer - .short 4 # DW_AT_language - .long .Linfo_string1 # DW_AT_name - .long .Lline_table_start0 # DW_AT_stmt_list - .long .Linfo_string2 # DW_AT_comp_dir - .quad .Lfunc_begin0 # DW_AT_low_pc - .long .Lfunc_end2-.Lfunc_begin0 # DW_AT_high_pc - .section .debug_ranges,"",@progbits - .section .debug_macinfo,"",@progbits -.Lcu_macro_begin0: - .byte 0 # End Of Macro List Mark - - .ident "clang version 6.0.0 (trunk 316774)" - .section ".note.GNU-stack","",@progbits - .section .debug_line,"",@progbits -.Lline_table_start0: +# CHECK: Expected Protected: 0 (0.00%) +# CHECK: Unexpected Protected: 0 (0.00%) +# CHECK: Expected Unprotected: 0 (0.00%) +# CHECK: Unexpected Unprotected (BAD): 1 (100.00%) diff --git a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s index c023a4a84aba4..246acf35f5bec 100644 --- a/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s +++ b/test/tools/llvm-cfi-verify/X86/unprotected-nolineinfo.s @@ -1,92 +1,5 @@ -# RUN: llvm-mc %s -filetype obj -triple x86_64-linux-elf -o %t.o +# RUN: llvm-mc %S/Inputs/unprotected-nolineinfo.s -filetype obj \ +# RUN: -triple x86_64-linux-elf -o %t.o # RUN: not llvm-cfi-verify %t.o 2>&1 | FileCheck %s # CHECK: DWARF line information missing. Did you compile with '-g'? 
- -# Source (tiny.cc): -# void a() {} -# void b() {} -# int main(int argc, char** argv) { -# void(*ptr)(); -# if (argc == 1) -# ptr = &a; -# else -# ptr = &b; -# ptr(); -# } -# Compile with: -# clang++ tiny.cc -S -o tiny.s - - .text - .file "tiny.cc" - .globl _Z1av # -- Begin function _Z1av - .p2align 4, 0x90 - .type _Z1av,@function -_Z1av: # @_Z1av - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - popq %rbp - retq -.Lfunc_end0: - .size _Z1av, .Lfunc_end0-_Z1av - .cfi_endproc - # -- End function - .globl _Z1bv # -- Begin function _Z1bv - .p2align 4, 0x90 - .type _Z1bv,@function -_Z1bv: # @_Z1bv - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - popq %rbp - retq -.Lfunc_end1: - .size _Z1bv, .Lfunc_end1-_Z1bv - .cfi_endproc - # -- End function - .globl main # -- Begin function main - .p2align 4, 0x90 - .type main,@function -main: # @main - .cfi_startproc -# BB#0: - pushq %rbp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - subq $32, %rsp - movl $0, -4(%rbp) - movl %edi, -8(%rbp) - movq %rsi, -16(%rbp) - cmpl $1, -8(%rbp) - jne .LBB2_2 -# BB#1: - movabsq $_Z1av, %rax - movq %rax, -24(%rbp) - jmp .LBB2_3 -.LBB2_2: - movabsq $_Z1bv, %rax - movq %rax, -24(%rbp) -.LBB2_3: - callq *-24(%rbp) - movl -4(%rbp), %eax - addq $32, %rsp - popq %rbp - retq -.Lfunc_end2: - .size main, .Lfunc_end2-main - .cfi_endproc - # -- End function - - .ident "clang version 6.0.0 (trunk 316774)" - .section ".note.GNU-stack","",@progbits diff --git a/tools/llvm-cfi-verify/CMakeLists.txt b/tools/llvm-cfi-verify/CMakeLists.txt index 07c6504bf48e9..de6a46e785955 100644 --- a/tools/llvm-cfi-verify/CMakeLists.txt +++ b/tools/llvm-cfi-verify/CMakeLists.txt @@ -4,11 +4,11 @@ set(LLVM_LINK_COMPONENTS AllTargetsDescs AllTargetsDisassemblers AllTargetsInfos - DebugInfoDWARF MC MCParser Object Support + Symbolize ) add_llvm_tool(llvm-cfi-verify diff --git a/tools/llvm-cfi-verify/LLVMBuild.txt b/tools/llvm-cfi-verify/LLVMBuild.txt index 5c4ce26309031..d5e9323027289 100644 --- a/tools/llvm-cfi-verify/LLVMBuild.txt +++ b/tools/llvm-cfi-verify/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Tool name = llvm-cfi-verify parent = Tools -required_libraries = all-targets DebugInfoDWARF MC MCDisassembler MCParser Support +required_libraries = all-targets MC MCDisassembler MCParser Support Symbolize diff --git a/tools/llvm-cfi-verify/lib/CMakeLists.txt b/tools/llvm-cfi-verify/lib/CMakeLists.txt index c90e4ed485ea9..030bfa5d6c7ef 100644 --- a/tools/llvm-cfi-verify/lib/CMakeLists.txt +++ b/tools/llvm-cfi-verify/lib/CMakeLists.txt @@ -11,5 +11,6 @@ llvm_map_components_to_libnames(libs MC MCParser Object - Support) + Support + Symbolize) target_link_libraries(LLVMCFIVerify ${libs}) diff --git a/tools/llvm-cfi-verify/lib/FileAnalysis.cpp b/tools/llvm-cfi-verify/lib/FileAnalysis.cpp index 278e861dfd3a6..0d4e1f497ff83 100644 --- a/tools/llvm-cfi-verify/lib/FileAnalysis.cpp +++ b/tools/llvm-cfi-verify/lib/FileAnalysis.cpp @@ -39,22 +39,20 @@ #include using Instr = llvm::cfi_verify::FileAnalysis::Instr; +using LLVMSymbolizer = llvm::symbolize::LLVMSymbolizer; namespace llvm { namespace cfi_verify { -static cl::opt IgnoreDWARF( +bool IgnoreDWARFFlag; + +static cl::opt IgnoreDWARFArg( "ignore-dwarf", cl::desc( "Ignore all DWARF data. 
This relaxes the requirements for all " "statically linked libraries to have been compiled with '-g', but " "will result in false positives for 'CFI unprotected' instructions."), - cl::init(false)); - -cl::opt DWARFSearchRange( - "dwarf-search-range", - cl::desc("Address search range used to determine if instruction is valid."), - cl::init(0x10)); + cl::location(IgnoreDWARFFlag), cl::init(false)); Expected FileAnalysis::Create(StringRef Filename) { // Open the filename provided. @@ -256,12 +254,16 @@ const MCInstrAnalysis *FileAnalysis::getMCInstrAnalysis() const { return MIA.get(); } +LLVMSymbolizer &FileAnalysis::getSymbolizer() { return *Symbolizer; } + Error FileAnalysis::initialiseDisassemblyMembers() { std::string TripleName = ObjectTriple.getTriple(); ArchName = ""; MCPU = ""; std::string ErrorString; + Symbolizer.reset(new LLVMSymbolizer()); + ObjectTarget = TargetRegistry::lookupTarget(ArchName, ObjectTriple, ErrorString); if (!ObjectTarget) @@ -308,8 +310,8 @@ Error FileAnalysis::initialiseDisassemblyMembers() { } Error FileAnalysis::parseCodeSections() { - if (!IgnoreDWARF) { - DWARF.reset(DWARFContext::create(*Object).release()); + if (!IgnoreDWARFFlag) { + std::unique_ptr DWARF = DWARFContext::create(*Object); if (!DWARF) return make_error("Could not create DWARF information.", inconvertibleErrorCode()); @@ -347,21 +349,9 @@ Error FileAnalysis::parseCodeSections() { return Error::success(); } -DILineInfoTable FileAnalysis::getLineInfoForAddressRange(uint64_t Address) { - if (!hasLineTableInfo()) - return DILineInfoTable(); - - return DWARF->getLineInfoForAddressRange(Address, DWARFSearchRange); -} - -bool FileAnalysis::hasValidLineInfoForAddressRange(uint64_t Address) { - return !getLineInfoForAddressRange(Address).empty(); -} - -bool FileAnalysis::hasLineTableInfo() const { return DWARF != nullptr; } - void FileAnalysis::parseSectionContents(ArrayRef SectionBytes, uint64_t SectionAddress) { + assert(Symbolizer && "Symbolizer is uninitialised."); MCInst Instruction; Instr InstrMeta; uint64_t InstructionSize; @@ -381,8 +371,19 @@ void FileAnalysis::parseSectionContents(ArrayRef SectionBytes, InstrMeta.Valid = ValidInstruction; // Check if this instruction exists in the range of the DWARF metadata. - if (hasLineTableInfo() && !hasValidLineInfoForAddressRange(VMAddress)) - continue; + if (!IgnoreDWARFFlag) { + auto LineInfo = + Symbolizer->symbolizeCode(Object->getFileName(), VMAddress); + if (!LineInfo) { + handleAllErrors(LineInfo.takeError(), [](const ErrorInfoBase &E) { + errs() << "Symbolizer failed to get line: " << E.message() << "\n"; + }); + continue; + } + + if (LineInfo->FileName == "") + continue; + } addInstruction(InstrMeta); diff --git a/tools/llvm-cfi-verify/lib/FileAnalysis.h b/tools/llvm-cfi-verify/lib/FileAnalysis.h index 9945a2110a286..e0eecb037c371 100644 --- a/tools/llvm-cfi-verify/lib/FileAnalysis.h +++ b/tools/llvm-cfi-verify/lib/FileAnalysis.h @@ -12,7 +12,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" @@ -44,6 +44,8 @@ namespace llvm { namespace cfi_verify { +extern bool IgnoreDWARFFlag; + // Disassembler and analysis tool for machine code files. Keeps track of non- // sequential control flows, including indirect control flow instructions. 
class FileAnalysis { @@ -120,6 +122,7 @@ class FileAnalysis { const MCRegisterInfo *getRegisterInfo() const; const MCInstrInfo *getMCInstrInfo() const; const MCInstrAnalysis *getMCInstrAnalysis() const; + symbolize::LLVMSymbolizer &getSymbolizer(); // Returns true if this class is using DWARF line tables for elimination. bool hasLineTableInfo() const; @@ -175,8 +178,8 @@ class FileAnalysis { std::unique_ptr MIA; std::unique_ptr Printer; - // DWARF debug information. - std::unique_ptr DWARF; + // Symbolizer used for debug information parsing. + std::unique_ptr Symbolizer; // A mapping between the virtual memory address to the instruction metadata // struct. TODO(hctim): Reimplement this as a sorted vector to avoid per- diff --git a/tools/llvm-cfi-verify/lib/LLVMBuild.txt b/tools/llvm-cfi-verify/lib/LLVMBuild.txt index 99b678fc88a16..c0ae1905521a2 100644 --- a/tools/llvm-cfi-verify/lib/LLVMBuild.txt +++ b/tools/llvm-cfi-verify/lib/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = CFIVerify parent = Libraries -required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Support +required_libraries = DebugInfoDWARF MC MCDisassembler MCParser Support Symbolize diff --git a/tools/llvm-cfi-verify/llvm-cfi-verify.cpp b/tools/llvm-cfi-verify/llvm-cfi-verify.cpp index d4a46fcc226b6..a3c202f53bbc0 100644 --- a/tools/llvm-cfi-verify/llvm-cfi-verify.cpp +++ b/tools/llvm-cfi-verify/llvm-cfi-verify.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/SpecialCaseList.h" #include @@ -32,48 +33,120 @@ using namespace llvm::cfi_verify; cl::opt InputFilename(cl::Positional, cl::desc(""), cl::Required); +cl::opt BlacklistFilename(cl::Positional, + cl::desc("[blacklist file]"), + cl::init("-")); ExitOnError ExitOnErr; -void printIndirectCFInstructions(FileAnalysis &Analysis) { - uint64_t ProtectedCount = 0; - uint64_t UnprotectedCount = 0; +void printIndirectCFInstructions(FileAnalysis &Analysis, + const SpecialCaseList *SpecialCaseList) { + uint64_t ExpectedProtected = 0; + uint64_t UnexpectedProtected = 0; + uint64_t ExpectedUnprotected = 0; + uint64_t UnexpectedUnprotected = 0; + + symbolize::LLVMSymbolizer &Symbolizer = Analysis.getSymbolizer(); for (uint64_t Address : Analysis.getIndirectInstructions()) { const auto &InstrMeta = Analysis.getInstructionOrDie(Address); - if (Analysis.isIndirectInstructionCFIProtected(Address)) { + bool CFIProtected = Analysis.isIndirectInstructionCFIProtected(Address); + + if (CFIProtected) outs() << "P "; - ProtectedCount++; - } else { + else outs() << "U "; - UnprotectedCount++; - } outs() << format_hex(Address, 2) << " | " << Analysis.getMCInstrInfo()->getName( InstrMeta.Instruction.getOpcode()) - << " "; - outs() << "\n"; - - if (Analysis.hasLineTableInfo()) { - for (const auto &LineKV : Analysis.getLineInfoForAddressRange(Address)) { - outs() << " " << format_hex(LineKV.first, 2) << " = " - << LineKV.second.FileName << ":" << LineKV.second.Line << ":" - << LineKV.second.Column << " (" << LineKV.second.FunctionName - << ")\n"; + << " \n"; + + if (IgnoreDWARFFlag) { + if (CFIProtected) + ExpectedProtected++; + else + UnexpectedUnprotected++; + continue; + } + + auto InliningInfo = Symbolizer.symbolizeInlinedCode(InputFilename, Address); + if (!InliningInfo || InliningInfo->getNumberOfFrames() == 0) { + errs() << "Failed to symbolise " << format_hex(Address, 2) + << " with line tables from " << InputFilename << "\n"; + exit(EXIT_FAILURE); + } + + const auto 
&LineInfo = + InliningInfo->getFrame(InliningInfo->getNumberOfFrames() - 1); + + // Print the inlining symbolisation of this instruction. + for (uint32_t i = 0; i < InliningInfo->getNumberOfFrames(); ++i) { + const auto &Line = InliningInfo->getFrame(i); + outs() << " " << format_hex(Address, 2) << " = " << Line.FileName << ":" + << Line.Line << ":" << Line.Column << " (" << Line.FunctionName + << ")\n"; + } + + if (!SpecialCaseList) { + if (CFIProtected) + ExpectedProtected++; + else + UnexpectedUnprotected++; + continue; + } + + bool MatchesBlacklistRule = false; + if (SpecialCaseList->inSection("cfi-icall", "src", LineInfo.FileName) || + SpecialCaseList->inSection("cfi-vcall", "src", LineInfo.FileName)) { + outs() << "BLACKLIST MATCH, 'src'\n"; + MatchesBlacklistRule = true; + } + + if (SpecialCaseList->inSection("cfi-icall", "fun", LineInfo.FunctionName) || + SpecialCaseList->inSection("cfi-vcall", "fun", LineInfo.FunctionName)) { + outs() << "BLACKLIST MATCH, 'fun'\n"; + MatchesBlacklistRule = true; + } + + if (MatchesBlacklistRule) { + if (CFIProtected) { + UnexpectedProtected++; + outs() << "====> Unexpected Protected\n"; + } else { + ExpectedUnprotected++; + outs() << "====> Expected Unprotected\n"; + } + } else { + if (CFIProtected) { + ExpectedProtected++; + outs() << "====> Expected Protected\n"; + } else { + UnexpectedUnprotected++; + outs() << "====> Unexpected Unprotected\n"; } } } - if (ProtectedCount || UnprotectedCount) - outs() << formatv( - "Unprotected: {0} ({1:P}), Protected: {2} ({3:P})\n", UnprotectedCount, - (((double)UnprotectedCount) / (UnprotectedCount + ProtectedCount)), - ProtectedCount, - (((double)ProtectedCount) / (UnprotectedCount + ProtectedCount))); - else + uint64_t IndirectCFInstructions = ExpectedProtected + UnexpectedProtected + + ExpectedUnprotected + UnexpectedUnprotected; + + if (IndirectCFInstructions == 0) outs() << "No indirect CF instructions found.\n"; + + outs() << formatv("Expected Protected: {0} ({1:P})\n" + "Unexpected Protected: {2} ({3:P})\n" + "Expected Unprotected: {4} ({5:P})\n" + "Unexpected Unprotected (BAD): {6} ({7:P})\n", + ExpectedProtected, + ((double)ExpectedProtected) / IndirectCFInstructions, + UnexpectedProtected, + ((double)UnexpectedProtected) / IndirectCFInstructions, + ExpectedUnprotected, + ((double)ExpectedUnprotected) / IndirectCFInstructions, + UnexpectedUnprotected, + ((double)UnexpectedUnprotected) / IndirectCFInstructions); } int main(int argc, char **argv) { @@ -89,8 +162,18 @@ int main(int argc, char **argv) { InitializeAllAsmParsers(); InitializeAllDisassemblers(); + std::unique_ptr SpecialCaseList; + if (BlacklistFilename != "-") { + std::string Error; + SpecialCaseList = SpecialCaseList::create({BlacklistFilename}, Error); + if (!SpecialCaseList) { + errs() << "Failed to get blacklist: " << Error << "\n"; + exit(EXIT_FAILURE); + } + } + FileAnalysis Analysis = ExitOnErr(FileAnalysis::Create(InputFilename)); - printIndirectCFInstructions(Analysis); + printIndirectCFInstructions(Analysis, SpecialCaseList.get()); return EXIT_SUCCESS; } diff --git a/unittests/tools/llvm-cfi-verify/CMakeLists.txt b/unittests/tools/llvm-cfi-verify/CMakeLists.txt index ad3266c277767..adb7a55327ae8 100644 --- a/unittests/tools/llvm-cfi-verify/CMakeLists.txt +++ b/unittests/tools/llvm-cfi-verify/CMakeLists.txt @@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS MCParser Object Support + Symbolize ) add_llvm_unittest(CFIVerifyTests diff --git a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp 
index a3da1fc3f56da..00346ab5a14e6 100644
--- a/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
+++ b/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp
@@ -64,6 +64,7 @@ class ELFx86TestFileAnalysis : public FileAnalysis {
 class BasicFileAnalysisTest : public ::testing::Test {
 protected:
   virtual void SetUp() {
+    IgnoreDWARFFlag = true;
     SuccessfullyInitialised = true;
     if (auto Err = Analysis.initialiseDisassemblyMembers()) {
       handleAllErrors(std::move(Err), [&](const UnsupportedDisassembly &E) {
diff --git a/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp b/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
index b200677dd09b3..a7d09b547814a 100644
--- a/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
+++ b/unittests/tools/llvm-cfi-verify/GraphBuilder.cpp
@@ -126,6 +126,7 @@ class ELFx86TestFileAnalysis : public FileAnalysis {
 class BasicGraphBuilderTest : public ::testing::Test {
 protected:
   virtual void SetUp() {
+    IgnoreDWARFFlag = true;
    SuccessfullyInitialised = true;
     if (auto Err = Analysis.initialiseDisassemblyMembers()) {
       handleAllErrors(std::move(Err), [&](const UnsupportedDisassembly &E) {

From 352adf2ec9cb3ee1fc07370e4d6b34028dd80bf3 Mon Sep 17 00:00:00 2001
From: David Blaikie
Date: Fri, 3 Nov 2017 20:57:09 +0000
Subject: [PATCH 073/238] llvm-objcopy: Fix unused-lambda-capture warning by
 removing the unused lambda capture

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317365 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-objcopy/llvm-objcopy.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index 52091d3e183e7..5a09f8f18dbbb 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -160,7 +160,7 @@ void CopyBinary(const ELFObjectFile<ELF64LE> &ObjFile) {
   }
 
   if (StripDWO || !SplitDWO.empty())
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
       return IsDWOSection(Sec) || RemovePred(Sec);
     };
 
From 7711c315b294abaa47e3933ec470e04fa5b8ae80 Mon Sep 17 00:00:00 2001
From: David Blaikie
Date: Fri, 3 Nov 2017 20:57:10 +0000
Subject: [PATCH 074/238] GCOV: Move GCOV from IR & Support into ProfileData to
 fix layering

This class was split between libIR and libSupport, which breaks under
modular code generation. Move it into the one library that uses it,
ProfileData, to resolve this issue.
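For out-of-tree consumers the API itself is unchanged; only the include path
and the link-time library dependency move. Below is a minimal sketch of
reading coverage notes through the relocated header; the file name, the
helper function, and the error handling are illustrative assumptions, not
part of this patch:

  #include "llvm/ProfileData/GCOV.h"
  #include "llvm/Support/MemoryBuffer.h"

  using namespace llvm;

  // Parse a .gcno file via the moved header; a tool doing this now links
  // against ProfileData (plus Support) rather than the old IR/Support split.
  static bool readNotes(GCOVFile &GF) {
    auto BufOrErr = MemoryBuffer::getFileOrSTDIN("example.gcno");
    if (!BufOrErr)
      return false;                      // file could not be opened
    GCOVBuffer GB(BufOrErr.get().get()); // GCOVBuffer wraps a MemoryBuffer*
    return GF.readGCNO(GB);              // magic/version checks happen here
  }

  int main() {
    GCOVFile GF;
    return readNotes(GF) ? 0 : 1;
  }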
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317366 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/{Support => ProfileData}/GCOV.h | 4 ++-- include/llvm/ProfileData/SampleProfReader.h | 2 +- lib/IR/CMakeLists.txt | 1 - lib/ProfileData/CMakeLists.txt | 1 + lib/{IR => ProfileData}/GCOV.cpp | 2 +- tools/llvm-cov/gcov.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) rename include/llvm/{Support => ProfileData}/GCOV.h (99%) rename lib/{IR => ProfileData}/GCOV.cpp (99%) diff --git a/include/llvm/Support/GCOV.h b/include/llvm/ProfileData/GCOV.h similarity index 99% rename from include/llvm/Support/GCOV.h rename to include/llvm/ProfileData/GCOV.h index 02016e7dbd624..497f80b87b267 100644 --- a/include/llvm/Support/GCOV.h +++ b/include/llvm/ProfileData/GCOV.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_GCOV_H -#define LLVM_SUPPORT_GCOV_H +#ifndef LLVM_PROFILEDATA_GCOV_H +#define LLVM_PROFILEDATA_GCOV_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h index 9c1f357cbbd16..0e9ab2dc60ee1 100644 --- a/include/llvm/ProfileData/SampleProfReader.h +++ b/include/llvm/ProfileData/SampleProfReader.h @@ -217,10 +217,10 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/ProfileSummary.h" +#include "llvm/ProfileData/GCOV.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/GCOV.h" #include "llvm/Support/MemoryBuffer.h" #include #include diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt index eb4b9143090cf..17822bbbb5cb0 100644 --- a/lib/IR/CMakeLists.txt +++ b/lib/IR/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_library(LLVMCore DiagnosticPrinter.cpp Dominators.cpp Function.cpp - GCOV.cpp GVMaterializer.cpp Globals.cpp IRBuilder.cpp diff --git a/lib/ProfileData/CMakeLists.txt b/lib/ProfileData/CMakeLists.txt index cd65762ae6a00..3a981d8acf425 100644 --- a/lib/ProfileData/CMakeLists.txt +++ b/lib/ProfileData/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMProfileData + GCOV.cpp InstrProf.cpp InstrProfReader.cpp InstrProfWriter.cpp diff --git a/lib/IR/GCOV.cpp b/lib/ProfileData/GCOV.cpp similarity index 99% rename from lib/IR/GCOV.cpp rename to lib/ProfileData/GCOV.cpp index d4b4552282252..d6e44389f2bef 100644 --- a/lib/IR/GCOV.cpp +++ b/lib/ProfileData/GCOV.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/GCOV.h" +#include "llvm/ProfileData/GCOV.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" diff --git a/tools/llvm-cov/gcov.cpp b/tools/llvm-cov/gcov.cpp index 4df7f015fd188..7776f2aa9a684 100644 --- a/tools/llvm-cov/gcov.cpp +++ b/tools/llvm-cov/gcov.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ProfileData/GCOV.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/GCOV.h" #include "llvm/Support/Path.h" #include using namespace llvm; From 4fedc84270af2e8925439152e8c324487d01c8d7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 21:08:13 +0000 Subject: [PATCH 075/238] [SimplifyCFG] When merging conditional stores, don't count the 
store we're merging against the PHINodeFoldingThreshold

Merging conditional stores checks whether the code is if-convertible after
the store is moved. But the store hasn't been moved yet, so it is being
counted against the threshold.

The patch adds 1 to the threshold comparison to make sure we don't count the
store.

I've adjusted a test to use a lower threshold to ensure we still perform the
conversion at that lower threshold.

Differential Revision: https://reviews.llvm.org/D39570

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317368 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/SimplifyCFG.cpp               | 4 +++-
 test/Transforms/SimplifyCFG/merge-cond-stores-2.ll | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 3c4dae92ebf3e..e0045e9f48a4e 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2901,7 +2901,9 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
       else
         return false;
     }
-    return N <= PHINodeFoldingThreshold;
+    // The store we want to merge is counted in N, so add 1 to make sure
+    // we're counting the instructions that would be left.
+    return N <= (PHINodeFoldingThreshold + 1);
   };
 
   if (!MergeCondStoresAggressively &&
diff --git a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
index a2b9403800168..a2ca63d0a2df9 100644
--- a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
+++ b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 | FileCheck %s
+; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=1 | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv7--linux-gnueabihf"

From 9cf32a0f1d22c5ece8a581e98166ddac8a6e61a7 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne
Date: Fri, 3 Nov 2017 21:30:06 +0000
Subject: [PATCH 076/238] Revert r317046, "Object: Move some code from ELF.h
 into ELF.cpp."

This change resulted in a measured 1.5-2% perf regression linking chrome.
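The likely mechanism for the slowdown (an inference on our part, not
something stated in the log) is that r317046 moved template member
definitions out of the header behind explicit instantiations, so hot call
sites in the linker could no longer inline them. A minimal sketch of the two
placements, using illustrative names rather than the real ELFFile API:

  #include <cstdint>

  // Header-only placement (restored by this revert): the definition is
  // visible in every translation unit, so callers can inline it freely.
  template <class SizeT> struct FileSketch {
    SizeT SectionTableOffset;
    SizeT EntrySize;
    // With the body in the header this folds down to a few instructions.
    SizeT sectionOffset(SizeT Index) const {
      return SectionTableOffset + Index * EntrySize;
    }
  };

  // Out-of-line placement (what r317046 introduced): other TUs see only a
  // declaration; the body lives in one .cpp behind an explicit instantiation,
  //   template struct FileSketch<uint64_t>;   // in the lone .cpp file
  // so every external call crosses a function boundary instead of inlining.

  int main() {
    FileSketch<uint64_t> F{0x40, 64};
    return F.sectionOffset(2) == 0x40 + 2 * 64 ? 0 : 1;
  }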
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317371 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Object/ELF.h | 263 ++++++++++++++++++++++++++++++++++++++ lib/Object/ELF.cpp | 263 -------------------------------------- 2 files changed, 263 insertions(+), 263 deletions(-) diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index 92fb46e8e9351..c24b6310465e4 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -204,6 +204,46 @@ getExtendedSymbolTableIndex(const typename ELFT::Sym *Sym, return ShndxTable[Index]; } +template +Expected +ELFFile::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, + ArrayRef ShndxTable) const { + uint32_t Index = Sym->st_shndx; + if (Index == ELF::SHN_XINDEX) { + auto ErrorOrIndex = getExtendedSymbolTableIndex( + Sym, Syms.begin(), ShndxTable); + if (!ErrorOrIndex) + return ErrorOrIndex.takeError(); + return *ErrorOrIndex; + } + if (Index == ELF::SHN_UNDEF || Index >= ELF::SHN_LORESERVE) + return 0; + return Index; +} + +template +Expected +ELFFile::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab, + ArrayRef ShndxTable) const { + auto SymsOrErr = symbols(SymTab); + if (!SymsOrErr) + return SymsOrErr.takeError(); + return getSection(Sym, *SymsOrErr, ShndxTable); +} + +template +Expected +ELFFile::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols, + ArrayRef ShndxTable) const { + auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable); + if (!IndexOrErr) + return IndexOrErr.takeError(); + uint32_t Index = *IndexOrErr; + if (Index == 0) + return nullptr; + return getSection(Index); +} + template inline Expected getSymbol(typename ELFT::SymRange Symbols, uint32_t Index) { @@ -212,6 +252,15 @@ getSymbol(typename ELFT::SymRange Symbols, uint32_t Index) { return &Symbols[Index]; } +template +Expected +ELFFile::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const { + auto SymtabOrErr = symbols(Sec); + if (!SymtabOrErr) + return SymtabOrErr.takeError(); + return object::getSymbol(*SymtabOrErr, Index); +} + template template Expected> @@ -232,6 +281,119 @@ ELFFile::getSectionContentsAsArray(const Elf_Shdr *Sec) const { return makeArrayRef(Start, Size / sizeof(T)); } +template +Expected> +ELFFile::getSectionContents(const Elf_Shdr *Sec) const { + return getSectionContentsAsArray(Sec); +} + +template +StringRef ELFFile::getRelocationTypeName(uint32_t Type) const { + return getELFRelocationTypeName(getHeader()->e_machine, Type); +} + +template +void ELFFile::getRelocationTypeName(uint32_t Type, + SmallVectorImpl &Result) const { + if (!isMipsELF64()) { + StringRef Name = getRelocationTypeName(Type); + Result.append(Name.begin(), Name.end()); + } else { + // The Mips N64 ABI allows up to three operations to be specified per + // relocation record. Unfortunately there's no easy way to test for the + // presence of N64 ELFs as they have no special flag that identifies them + // as being N64. We can safely assume at the moment that all Mips + // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough + // information to disambiguate between old vs new ABIs. + uint8_t Type1 = (Type >> 0) & 0xFF; + uint8_t Type2 = (Type >> 8) & 0xFF; + uint8_t Type3 = (Type >> 16) & 0xFF; + + // Concat all three relocation type names. 
+ StringRef Name = getRelocationTypeName(Type1); + Result.append(Name.begin(), Name.end()); + + Name = getRelocationTypeName(Type2); + Result.append(1, '/'); + Result.append(Name.begin(), Name.end()); + + Name = getRelocationTypeName(Type3); + Result.append(1, '/'); + Result.append(Name.begin(), Name.end()); + } +} + +template +Expected +ELFFile::getRelocationSymbol(const Elf_Rel *Rel, + const Elf_Shdr *SymTab) const { + uint32_t Index = Rel->getSymbol(isMips64EL()); + if (Index == 0) + return nullptr; + return getEntry(SymTab, Index); +} + +template +Expected +ELFFile::getSectionStringTable(Elf_Shdr_Range Sections) const { + uint32_t Index = getHeader()->e_shstrndx; + if (Index == ELF::SHN_XINDEX) + Index = Sections[0].sh_link; + + if (!Index) // no section string table. + return ""; + if (Index >= Sections.size()) + return createError("invalid section index"); + return getStringTable(&Sections[Index]); +} + +template ELFFile::ELFFile(StringRef Object) : Buf(Object) {} + +template +Expected> ELFFile::create(StringRef Object) { + if (sizeof(Elf_Ehdr) > Object.size()) + return createError("Invalid buffer"); + return ELFFile(Object); +} + +template +Expected ELFFile::sections() const { + const uintX_t SectionTableOffset = getHeader()->e_shoff; + if (SectionTableOffset == 0) + return ArrayRef(); + + if (getHeader()->e_shentsize != sizeof(Elf_Shdr)) + return createError( + "invalid section header entry size (e_shentsize) in ELF header"); + + const uint64_t FileSize = Buf.size(); + + if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize) + return createError("section header table goes past the end of the file"); + + // Invalid address alignment of section headers + if (SectionTableOffset & (alignof(Elf_Shdr) - 1)) + return createError("invalid alignment of section headers"); + + const Elf_Shdr *First = + reinterpret_cast(base() + SectionTableOffset); + + uintX_t NumSections = getHeader()->e_shnum; + if (NumSections == 0) + NumSections = First->sh_size; + + if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) + return createError("section table goes past the end of file"); + + const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr); + + // Section table goes past end of file! 
+ if (SectionTableOffset + SectionTableSize > FileSize) + return createError("section table goes past the end of file"); + + return makeArrayRef(First, NumSections); +} + template template Expected ELFFile::getEntry(uint32_t Section, @@ -254,6 +416,107 @@ Expected ELFFile::getEntry(const Elf_Shdr *Section, return reinterpret_cast(base() + Pos); } +template +Expected +ELFFile::getSection(uint32_t Index) const { + auto TableOrErr = sections(); + if (!TableOrErr) + return TableOrErr.takeError(); + return object::getSection(*TableOrErr, Index); +} + +template +Expected +ELFFile::getStringTable(const Elf_Shdr *Section) const { + if (Section->sh_type != ELF::SHT_STRTAB) + return createError("invalid sh_type for string table, expected SHT_STRTAB"); + auto V = getSectionContentsAsArray(Section); + if (!V) + return V.takeError(); + ArrayRef Data = *V; + if (Data.empty()) + return createError("empty string table"); + if (Data.back() != '\0') + return createError("string table non-null terminated"); + return StringRef(Data.begin(), Data.size()); +} + +template +Expected> +ELFFile::getSHNDXTable(const Elf_Shdr &Section) const { + auto SectionsOrErr = sections(); + if (!SectionsOrErr) + return SectionsOrErr.takeError(); + return getSHNDXTable(Section, *SectionsOrErr); +} + +template +Expected> +ELFFile::getSHNDXTable(const Elf_Shdr &Section, + Elf_Shdr_Range Sections) const { + assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX); + auto VOrErr = getSectionContentsAsArray(&Section); + if (!VOrErr) + return VOrErr.takeError(); + ArrayRef V = *VOrErr; + auto SymTableOrErr = object::getSection(Sections, Section.sh_link); + if (!SymTableOrErr) + return SymTableOrErr.takeError(); + const Elf_Shdr &SymTable = **SymTableOrErr; + if (SymTable.sh_type != ELF::SHT_SYMTAB && + SymTable.sh_type != ELF::SHT_DYNSYM) + return createError("invalid sh_type"); + if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym))) + return createError("invalid section contents size"); + return V; +} + +template +Expected +ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec) const { + auto SectionsOrErr = sections(); + if (!SectionsOrErr) + return SectionsOrErr.takeError(); + return getStringTableForSymtab(Sec, *SectionsOrErr); +} + +template +Expected +ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec, + Elf_Shdr_Range Sections) const { + + if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM) + return createError( + "invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM"); + auto SectionOrErr = object::getSection(Sections, Sec.sh_link); + if (!SectionOrErr) + return SectionOrErr.takeError(); + return getStringTable(*SectionOrErr); +} + +template +Expected +ELFFile::getSectionName(const Elf_Shdr *Section) const { + auto SectionsOrErr = sections(); + if (!SectionsOrErr) + return SectionsOrErr.takeError(); + auto Table = getSectionStringTable(*SectionsOrErr); + if (!Table) + return Table.takeError(); + return getSectionName(Section, *Table); +} + +template +Expected ELFFile::getSectionName(const Elf_Shdr *Section, + StringRef DotShstrtab) const { + uint32_t Offset = Section->sh_name; + if (Offset == 0) + return StringRef(); + if (Offset >= DotShstrtab.size()) + return createError("invalid string offset"); + return StringRef(DotShstrtab.data() + Offset); +} + /// This function returns the hash value for a symbol in the .dynsym section /// Name of the API remains consistent as specified in the libelf /// REF : http://www.sco.com/developers/gabi/latest/ch5.dynamic.html#hash diff --git a/lib/Object/ELF.cpp 
b/lib/Object/ELF.cpp index ef8c844a66f10..c72a1258c1eef 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -214,269 +214,6 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { } } -template -Expected -ELFFile::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, - ArrayRef ShndxTable) const { - uint32_t Index = Sym->st_shndx; - if (Index == ELF::SHN_XINDEX) { - auto ErrorOrIndex = getExtendedSymbolTableIndex( - Sym, Syms.begin(), ShndxTable); - if (!ErrorOrIndex) - return ErrorOrIndex.takeError(); - return *ErrorOrIndex; - } - if (Index == ELF::SHN_UNDEF || Index >= ELF::SHN_LORESERVE) - return 0; - return Index; -} - -template -Expected -ELFFile::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab, - ArrayRef ShndxTable) const { - auto SymsOrErr = symbols(SymTab); - if (!SymsOrErr) - return SymsOrErr.takeError(); - return getSection(Sym, *SymsOrErr, ShndxTable); -} - -template -Expected -ELFFile::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols, - ArrayRef ShndxTable) const { - auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable); - if (!IndexOrErr) - return IndexOrErr.takeError(); - uint32_t Index = *IndexOrErr; - if (Index == 0) - return nullptr; - return getSection(Index); -} - -template -Expected -ELFFile::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const { - auto SymtabOrErr = symbols(Sec); - if (!SymtabOrErr) - return SymtabOrErr.takeError(); - return object::getSymbol(*SymtabOrErr, Index); -} - -template -Expected> -ELFFile::getSectionContents(const Elf_Shdr *Sec) const { - return getSectionContentsAsArray(Sec); -} - -template -StringRef ELFFile::getRelocationTypeName(uint32_t Type) const { - return getELFRelocationTypeName(getHeader()->e_machine, Type); -} - -template -void ELFFile::getRelocationTypeName(uint32_t Type, - SmallVectorImpl &Result) const { - if (!isMipsELF64()) { - StringRef Name = getRelocationTypeName(Type); - Result.append(Name.begin(), Name.end()); - } else { - // The Mips N64 ABI allows up to three operations to be specified per - // relocation record. Unfortunately there's no easy way to test for the - // presence of N64 ELFs as they have no special flag that identifies them - // as being N64. We can safely assume at the moment that all Mips - // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough - // information to disambiguate between old vs new ABIs. - uint8_t Type1 = (Type >> 0) & 0xFF; - uint8_t Type2 = (Type >> 8) & 0xFF; - uint8_t Type3 = (Type >> 16) & 0xFF; - - // Concat all three relocation type names. - StringRef Name = getRelocationTypeName(Type1); - Result.append(Name.begin(), Name.end()); - - Name = getRelocationTypeName(Type2); - Result.append(1, '/'); - Result.append(Name.begin(), Name.end()); - - Name = getRelocationTypeName(Type3); - Result.append(1, '/'); - Result.append(Name.begin(), Name.end()); - } -} - -template -Expected -ELFFile::getRelocationSymbol(const Elf_Rel *Rel, - const Elf_Shdr *SymTab) const { - uint32_t Index = Rel->getSymbol(isMips64EL()); - if (Index == 0) - return nullptr; - return getEntry(SymTab, Index); -} - -template -Expected -ELFFile::getSectionStringTable(Elf_Shdr_Range Sections) const { - uint32_t Index = getHeader()->e_shstrndx; - if (Index == ELF::SHN_XINDEX) - Index = Sections[0].sh_link; - - if (!Index) // no section string table. 
- return ""; - if (Index >= Sections.size()) - return createError("invalid section index"); - return getStringTable(&Sections[Index]); -} - -template ELFFile::ELFFile(StringRef Object) : Buf(Object) {} - -template -Expected> ELFFile::create(StringRef Object) { - if (sizeof(Elf_Ehdr) > Object.size()) - return createError("Invalid buffer"); - return ELFFile(Object); -} - -template -Expected ELFFile::sections() const { - const uintX_t SectionTableOffset = getHeader()->e_shoff; - if (SectionTableOffset == 0) - return ArrayRef(); - - if (getHeader()->e_shentsize != sizeof(Elf_Shdr)) - return createError( - "invalid section header entry size (e_shentsize) in ELF header"); - - const uint64_t FileSize = Buf.size(); - - if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize) - return createError("section header table goes past the end of the file"); - - // Invalid address alignment of section headers - if (SectionTableOffset & (alignof(Elf_Shdr) - 1)) - return createError("invalid alignment of section headers"); - - const Elf_Shdr *First = - reinterpret_cast(base() + SectionTableOffset); - - uintX_t NumSections = getHeader()->e_shnum; - if (NumSections == 0) - NumSections = First->sh_size; - - if (NumSections > UINT64_MAX / sizeof(Elf_Shdr)) - return createError("section table goes past the end of file"); - - const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr); - - // Section table goes past end of file! - if (SectionTableOffset + SectionTableSize > FileSize) - return createError("section table goes past the end of file"); - - return makeArrayRef(First, NumSections); -} - -template -Expected -ELFFile::getSection(uint32_t Index) const { - auto TableOrErr = sections(); - if (!TableOrErr) - return TableOrErr.takeError(); - return object::getSection(*TableOrErr, Index); -} - -template -Expected -ELFFile::getStringTable(const Elf_Shdr *Section) const { - if (Section->sh_type != ELF::SHT_STRTAB) - return createError("invalid sh_type for string table, expected SHT_STRTAB"); - auto V = getSectionContentsAsArray(Section); - if (!V) - return V.takeError(); - ArrayRef Data = *V; - if (Data.empty()) - return createError("empty string table"); - if (Data.back() != '\0') - return createError("string table non-null terminated"); - return StringRef(Data.begin(), Data.size()); -} - -template -Expected> -ELFFile::getSHNDXTable(const Elf_Shdr &Section) const { - auto SectionsOrErr = sections(); - if (!SectionsOrErr) - return SectionsOrErr.takeError(); - return getSHNDXTable(Section, *SectionsOrErr); -} - -template -Expected> -ELFFile::getSHNDXTable(const Elf_Shdr &Section, - Elf_Shdr_Range Sections) const { - assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX); - auto VOrErr = getSectionContentsAsArray(&Section); - if (!VOrErr) - return VOrErr.takeError(); - ArrayRef V = *VOrErr; - auto SymTableOrErr = object::getSection(Sections, Section.sh_link); - if (!SymTableOrErr) - return SymTableOrErr.takeError(); - const Elf_Shdr &SymTable = **SymTableOrErr; - if (SymTable.sh_type != ELF::SHT_SYMTAB && - SymTable.sh_type != ELF::SHT_DYNSYM) - return createError("invalid sh_type"); - if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym))) - return createError("invalid section contents size"); - return V; -} - -template -Expected -ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec) const { - auto SectionsOrErr = sections(); - if (!SectionsOrErr) - return SectionsOrErr.takeError(); - return getStringTableForSymtab(Sec, *SectionsOrErr); -} - -template -Expected -ELFFile::getStringTableForSymtab(const Elf_Shdr &Sec, - 
Elf_Shdr_Range Sections) const { - - if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM) - return createError( - "invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM"); - auto SectionOrErr = object::getSection(Sections, Sec.sh_link); - if (!SectionOrErr) - return SectionOrErr.takeError(); - return getStringTable(*SectionOrErr); -} - -template -Expected -ELFFile::getSectionName(const Elf_Shdr *Section) const { - auto SectionsOrErr = sections(); - if (!SectionsOrErr) - return SectionsOrErr.takeError(); - auto Table = getSectionStringTable(*SectionsOrErr); - if (!Table) - return Table.takeError(); - return getSectionName(Section, *Table); -} - -template -Expected ELFFile::getSectionName(const Elf_Shdr *Section, - StringRef DotShstrtab) const { - uint32_t Offset = Section->sh_name; - if (Offset == 0) - return StringRef(); - if (Offset >= DotShstrtab.size()) - return createError("invalid string offset"); - return StringRef(DotShstrtab.data() + Offset); -} - template Expected> ELFFile::android_relas(const Elf_Shdr *Sec) const { From ce8f24e6d75e12371b723070fbcd546ebb01598b Mon Sep 17 00:00:00 2001 From: Kevin Enderby Date: Fri, 3 Nov 2017 21:32:44 +0000 Subject: [PATCH 077/238] Fix a crash in llvm-objdump when printing a bad x86_64 relocation in a Mach-O file with a bad section number. rdar://35207539 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317373 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/Inputs/macho-invalid-reloc-section-index | Bin 0 -> 2768 bytes .../tools/llvm-objdump/X86/malformed-machos.test | 3 +++ tools/llvm-objdump/llvm-objdump.cpp | 15 +++++++++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index diff --git a/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-reloc-section-index new file mode 100644 index 0000000000000000000000000000000000000000..a9d0b48449b733af6a717bf6e71425bc17e2be0a GIT binary patch literal 2768 zcmd5-O-vI(6dwKrY7Y<&Dq`G5RE!C>*cMbuFo2ebLTpPg6t@j6P?46DlJww##2AB3 z4DrMfJelahqi`@yh%p*bFWxxlp@|-dTs){r@q4o~bt!GcCI(+J`~GI$?!5Wt+t=yk z=Zvx9BF4(0OEAogl`TLQV~AMUd&JIy}~9IAS@~WLAEQ#cH;+*lc!lv!ZBm z?P)Qa?N+OzSel!}WS>D*8(r5fcXnM+T)kJ_U7b|8fgX?zj5?uhLc4|bLMQQsW6)}^ z>ryZtyBLghOh==^aI8CkM`$DzjBd~MMEn7_e{_4IKNy_~MZ&IFG!#Z!or?LRF|H$% zcnKu$KJB^pX}8O+_zc5Zy~c=Eia+1|SWly=Mbv2YoOkszWi~hI$qr!5b1V0wM}!+_ zF+}=wP&7msdx)VuG(!_rb$1O=BeO)Vm-XT(P)c)3*^GWFfnh(!0`hw(fxryZK@5to zFP(ik{jkmZxco*aofsTEhFBHGJmyzGpAbH+--&pfvngkSW6qgi&O6Ci$J9egFhQ5$%bU1njt-$~tE*vYdq0eCtmPhK3N-nx!apPY zlfoYmzHpfj2Bv3ZJj z*YF&f)Zm;K#|rK6&p4L-mAI3Bp*WW7Wj!YjfuBPj91kE#m?pd{&=SXMmBY&)zPBtb ztro4{JOAyI#Bm?y>2XZ!cOrVUZpAU7Qo|)mIb;F>kS+wnay^x9 zMIFL{o`WU-`j8|3eFPncmbgtl{rc*K%@M!5_ND(xuzpqI)+=gB{-yPr^m{xWX$zgn zG3J>~EmBlZytAoiMub4Hu;|CZtwp*PoMh>nF^v15(Kn+>XyQupYYYayFXGc0#1ss& zk3&mdJp^0NsXq9a!X=wacY{a&mOoG1K&(2vN&YMpXVf>s8RrjXHR>zSN zXAMY@-40FvcZsue*N>X61Z?*o%y@rRc8_jIoYi91~{z}Xzt!E(Y{1}|i1 Jb73wX`~|bv!9V~2 literal 0 HcmV?d00001 diff --git a/test/tools/llvm-objdump/X86/malformed-machos.test b/test/tools/llvm-objdump/X86/malformed-machos.test index 292666a37254c..e29df464a4ef7 100644 --- a/test/tools/llvm-objdump/X86/malformed-machos.test +++ b/test/tools/llvm-objdump/X86/malformed-machos.test @@ -66,3 +66,6 @@ INVALID-SYMBOL-LIB_ORDINAL: macho-invalid-symbol-lib_ordinal': truncated or malf RUN: not llvm-objdump -macho -objc-meta-data %p/Inputs/macho-invalid-bind-entry 2>&1 | FileCheck -check-prefix 
INVALID-BIND-ENTRY %s
INVALID-BIND-ENTRY: macho-invalid-bind-entry': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad library ordinal: 83 (max 0) for opcode at: 0x0)
+
+RUN: llvm-objdump -macho -r %p/Inputs/macho-invalid-reloc-section-index | FileCheck -check-prefix INVALID-RELOC-SECTION-INDEX %s
+INVALID-RELOC-SECTION-INDEX: 0000000000000021 X86_64_RELOC_UNSIGNED 8388613 (?,?)
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index d80f1cb049da6..02eaa89f088a5 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -865,8 +865,19 @@ static void printRelocationTargetName(const MachOObjectFile *O,
   } else {
     section_iterator SI = O->section_begin();
     // Adjust for the fact that sections are 1-indexed.
-    advance(SI, Val - 1);
-    SI->getName(S);
+    if (Val == 0) {
+      fmt << "0 (?,?)";
+      return;
+    }
+    uint32_t i = Val - 1;
+    while (i != 0 && SI != O->section_end()) {
+      i--;
+      advance(SI, 1);
+    }
+    if (SI == O->section_end())
+      fmt << Val << " (?,?)";
+    else
+      SI->getName(S);
   }
 
   fmt << S;

From cdc57825ed64b0995a34dcbf9f330e8b2d5cd5bd Mon Sep 17 00:00:00 2001
From: Sean Fertile
Date: Fri, 3 Nov 2017 21:45:55 +0000
Subject: [PATCH 078/238] [LTO][ThinLTO] Use the linker resolutions to mark
 global values as dso_local.

Now that we have a way to mark GlobalValues as local, we can use the symbol
resolutions that the linker plugin provides as part of the LTO/ThinLTO link
step to refine the compiler's view of which symbols will end up being local.

Differential Revision: https://reviews.llvm.org/D35702

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317374 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/ModuleSummaryIndex.h          | 12 +++++--
 include/llvm/IR/ModuleSummaryIndexYAML.h      |  8 +++--
 lib/Analysis/ModuleSummaryAnalysis.cpp        |  9 +++---
 lib/Bitcode/Reader/BitcodeReader.cpp          |  4 ++-
 lib/Bitcode/Writer/BitcodeWriter.cpp          |  2 ++
 lib/LTO/LTO.cpp                               | 21 +++++++++---
 lib/Transforms/Utils/FunctionImportUtils.cpp  | 17 ++++++++++
 test/Bitcode/thinlto-summary-local-5.0.ll     | 22 +++++++++++++
 test/Bitcode/thinlto-summary-local-5.0.ll.bc  | Bin 0 -> 1028 bytes
 test/LTO/Resolution/X86/comdat-mixed-lto.ll   |  2 +-
 test/LTO/Resolution/X86/comdat.ll             |  4 +--
 test/LTO/Resolution/X86/commons.ll            |  2 +-
 test/ThinLTO/X86/deadstrip.ll                 | 30 +++++++++++-------
 test/ThinLTO/X86/funcimport2.ll               |  4 +--
 test/ThinLTO/X86/internalize.ll               |  9 ++++--
 test/ThinLTO/X86/reference_non_importable.ll  |  2 +-
 .../Transforms/LowerTypeTests/import-unsat.ll |  1 +
 .../PGOProfile/thinlto_samplepgo_icp2.ll      |  2 +-
 .../WholeProgramDevirt/import-indir.ll        |  1 +
 19 files changed, 115 insertions(+), 37 deletions(-)
 create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll
 create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll.bc

diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 2d664f41e3ce5..b1e58a2a0d9b3 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -148,11 +148,15 @@ class GlobalValueSummary {
     /// In combined summary, indicate that the global value is live.
     unsigned Live : 1;
 
+    /// Indicates that the linker resolved the symbol to a definition from
+    /// within the same linkage unit.
+    unsigned DSOLocal : 1;
+
     /// Convenience Constructors
     explicit GVFlags(GlobalValue::LinkageTypes Linkage,
-                     bool NotEligibleToImport, bool Live)
+                     bool NotEligibleToImport, bool Live, bool IsLocal)
         : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport),
-          Live(Live) {}
+          Live(Live), DSOLocal(IsLocal) {}
   };
 
 private:
@@ -229,6 +233,10 @@ class GlobalValueSummary {
 
   void setLive(bool Live) { Flags.Live = Live; }
 
+  void setDSOLocal(bool Local) { Flags.DSOLocal = Local; }
+
+  bool isDSOLocal() const { return Flags.DSOLocal; }
+
   /// Flag that this global value cannot be imported.
   void setNotEligibleToImport() { Flags.NotEligibleToImport = true; }
 
diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h
index 2f9990ca03d85..4687f2d53e7ed 100644
--- a/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -135,7 +135,7 @@ template <> struct MappingTraits<TypeIdSummary> {
 
 struct FunctionSummaryYaml {
   unsigned Linkage;
-  bool NotEligibleToImport, Live;
+  bool NotEligibleToImport, Live, IsLocal;
   std::vector<uint64_t> TypeTests;
   std::vector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
       TypeCheckedLoadVCalls;
@@ -177,6 +177,7 @@ template <> struct MappingTraits<FunctionSummaryYaml> {
     io.mapOptional("Linkage", summary.Linkage);
     io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport);
     io.mapOptional("Live", summary.Live);
+    io.mapOptional("Local", summary.IsLocal);
     io.mapOptional("TypeTests", summary.TypeTests);
     io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls);
     io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls);
@@ -211,7 +212,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
       Elem.SummaryList.push_back(llvm::make_unique<FunctionSummary>(
           GlobalValueSummary::GVFlags(
               static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
-              FSum.NotEligibleToImport, FSum.Live),
+              FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal),
           0, FunctionSummary::FFlags{}, ArrayRef<ValueInfo>{},
          ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests),
           std::move(FSum.TypeTestAssumeVCalls),
@@ -228,7 +229,8 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
         FSums.push_back(FunctionSummaryYaml{
             FSum->flags().Linkage,
             static_cast<bool>(FSum->flags().NotEligibleToImport),
-            static_cast<bool>(FSum->flags().Live), FSum->type_tests(),
+            static_cast<bool>(FSum->flags().Live),
+            static_cast<bool>(FSum->flags().DSOLocal), FSum->type_tests(),
             FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(),
             FSum->type_test_assume_const_vcalls(),
             FSum->type_checked_load_const_vcalls()});
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index afd575e7273cf..82db09ca97b07 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
       // FIXME: refactor this to use the same code that inliner is using.
      F.isVarArg();
   GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport,
-                                    /* Live = */ false);
+                                    /* Live = */ false, F.isDSOLocal());
   FunctionSummary::FFlags FunFlags{
       F.hasFnAttribute(Attribute::ReadNone),
       F.hasFnAttribute(Attribute::ReadOnly),
@@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
   findRefEdges(Index, &V, RefEdges, Visited);
   bool NonRenamableLocal = isNonRenamableLocal(V);
   GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
-                                    /* Live = */ false);
+                                    /* Live = */ false, V.isDSOLocal());
   auto GVarSummary =
       llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
   if (NonRenamableLocal)
@@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
                     DenseSet<GlobalValue::GUID> &CantBePromoted) {
   bool NonRenamableLocal = isNonRenamableLocal(A);
   GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal,
-                                    /* Live = */ false);
+                                    /* Live = */ false, A.isDSOLocal());
   auto AS = llvm::make_unique<AliasSummary>(Flags);
   auto *Aliasee = A.getBaseObject();
   auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee);
@@ -410,7 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
       assert(GV->isDeclaration() && "Def in module asm already has definition");
       GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage,
                                           /* NotEligibleToImport = */ true,
-                                          /* Live = */ true);
+                                          /* Live = */ true,
+                                          /* Local */ GV->isDSOLocal());
       CantBePromoted.insert(GlobalValue::getGUID(Name));
       // Create the appropriate summary type.
       if (Function *F = dyn_cast<Function>(GV)) {
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index c2272260f44c5..d0f11db8f61f3 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -889,7 +889,9 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
   // to work correctly on earlier versions, we must conservatively treat all
   // values as live.
   bool Live = (RawFlags & 0x2) || Version < 3;
-  return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live);
+  bool Local = (RawFlags & 0x4);
+
+  return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local);
 }
 
 static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1e491aa066ee5..c5d376c9426b8 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -955,6 +955,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
   RawFlags |= Flags.NotEligibleToImport; // bool
   RawFlags |= (Flags.Live << 1);
+  RawFlags |= (Flags.DSOLocal << 2);
+
   // Linkage don't need to be remapped at that time for the summary. Any future
   // change to the getEncodedLinkage() function will need to be taken into
   // account here as well.
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 017dd201f9c86..9c737795b5a99 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -630,6 +630,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
         NonPrevailingComdats.insert(GV->getComdat());
       cast<GlobalObject>(GV)->setComdat(nullptr);
     }
+
+    // Set the 'local' flag based on the linker resolution for this symbol.
+    GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit);
   }
 
   // Common resolution: collect the maximum size/alignment over all commons.
  // We also record if we see an instance of a common as prevailing, so that
@@ -643,7 +646,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
       CommonRes.Prevailing |= Res.Prevailing;
     }
 
-    // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit.
   }
   if (!M.getComdatSymbolTable().empty())
     for (GlobalValue &GV : M.global_values())
@@ -698,10 +700,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
     assert(ResI != ResE);
     SymbolResolution Res = *ResI++;
 
-    if (Res.Prevailing) {
-      if (!Sym.getIRName().empty()) {
-        auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
-            Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+    if (!Sym.getIRName().empty()) {
+      auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+          Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+      if (Res.Prevailing) {
         ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
 
         // For linker redefined symbols (via --wrap or --defsym) we want to
@@ -713,6 +715,15 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                 GUID, BM.getModuleIdentifier()))
           S->setLinkage(GlobalValue::WeakAnyLinkage);
       }
+
+      // If the linker resolved the symbol to a local definition then mark it
+      // as local in the summary for the module we are adding.
+      if (Res.FinalDefinitionInLinkageUnit) {
+        if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
+                GUID, BM.getModuleIdentifier())) {
+          S->setDSOLocal(true);
+        }
+      }
     }
   }
 
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index fbb61ac1ae914..2e6fc4e8482e1 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -203,6 +203,23 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
 }
 
 void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+
+  // Check the summaries to see if the symbol gets resolved to a known local
+  // definition.
+  if (GV.hasName()) {
+    ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID());
+    if (VI) {
+      // Need to check all summaries are local in case of hash collisions.
+      bool IsLocal = VI.getSummaryList().size() &&
+                     llvm::all_of(VI.getSummaryList(),
+                                  [](const std::unique_ptr<GlobalValueSummary> &Summary) {
+                                    return Summary->isDSOLocal();
+                                  });
+      if (IsLocal)
+        GV.setDSOLocal(true);
+    }
+  }
+
   bool DoPromote = false;
   if (GV.hasLocalLinkage() &&
       ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) {
diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll
new file mode 100644
index 0000000000000..cbc48d23df3c7
--- /dev/null
+++ b/test/Bitcode/thinlto-summary-local-5.0.ll
@@ -0,0 +1,22 @@
+; Bitcode compatibility test for dso_local flag in thin-lto summaries.
+; Checks that older bitcode summaries without the dso_local op are still
+; properly parsed and don't set GlobalValues as dso_local.
+ +; RUN: llvm-dis < %s.bc | FileCheck %s +; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN + +define void @foo() { +;CHECK-DAG:define void @foo() + ret void +} + +@bar = global i32 0 +;CHECK-DAG: @bar = global i32 0 + +@baz = alias i32, i32* @bar +;CHECK-DAG: @bar = global i32 0 + +;BCAN: +;BCAN-NEXT: +;BCAN-NEXT: diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc new file mode 100644 index 0000000000000000000000000000000000000000..8dc7ca0a74b760ce63a2967d78da0abefd37c9aa GIT binary patch literal 1028 zcmZ8fZ)h837=N#q=59%LxpqMlc6Z!$Q}V&g<7!#zCD7!w)>$ECei%$K7eh=XO|NN8 z(+nipWxY&;-e!@2{jlg93>7N)L2zK3)EY`0^}k3d3(dMAwFFnu5BuPIai4s6-{*PX zhv)Zu-{((ITG<|Q0B{%p5V7%8VBPXb*!pP*O9Erv(TRgi^S!Wrf73k0VhWEbA#9E$ zf)4gHL;%3q0Qd!3YYzu{pgjUNRycMk&@eq~CWYb4NYVC4FYaPYZ$x=vwMGgo5`PELlrlr;m=Xu4b@jf#b(Em31jEt*vYt-f;z7$x~7D&j2p}R z(+g?m^4iO|HsjP%6>Yv!cx=Bz?B62(S|Rp$G}5S2?JCvm>Mfbt;Hb3%y&;=_)}`Tw zG+gQGFY4Y!$y-9j#ros$(fjAcA5n3+Zu@3ZQ_g59MNP-CY*we%Iub@Q}+AK3p4 z5pA8=8L_umVh?7)y6LS|rWRnn=cug$vvrGVx9HyyCgz;*E=x}?sbVv9{6QjNZ7z#t zD-+`fy4|P%^BS`uQ+GLfdzF3zVT=!RbVoM7a6$Dp>*BKHhXuJ`^R*?h1h4h-*A|$m zHV6NMV>vgLf-M8rVegE&vRYDZSQ9LFMyW`CnA);5yjefLORZNA-whZo+vVv0-U8o| z0GJ^LYx=IL;u0(X>c67;U@`Q%F?TWxfMJiAke(_$8~h)d)Fq_YAO|AFwu60Yf$TeW z{5If((Xj>^jN%i31bGCk9D<=>24djE0f8U~ipUQJRaahuBl!?K^`na)8;*#aicL>P zB literal 0 HcmV?d00001 diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll index f6ee22e4161d9..d6022c643519f 100644 --- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll +++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll @@ -17,7 +17,7 @@ ; would clash with the copy from this module. ; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s ; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" { -; CHECK: define available_externally void @testglobfunc() section ".text.startup" { +; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" { ; ModuleID = 'comdat-mixed-lto.o' source_filename = "comdat-mixed-lto.cpp" diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll index 60d082b3e0f78..94f2838423122 100644 --- a/test/LTO/Resolution/X86/comdat.ll +++ b/test/LTO/Resolution/X86/comdat.ll @@ -70,14 +70,14 @@ bb11: ; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}} ; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*) -; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) { +; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) { ; CHECK-NEXT: bb10: ; CHECK-NEXT: br label %bb11{{$}} ; CHECK: bb11: ; CHECK-NEXT: ret i32 42 ; CHECK-NEXT: } -; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) { +; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) { ; CHECK-NEXT: bb20: ; CHECK-NEXT: store i8* %this, i8** null ; CHECK-NEXT: br label %bb21 diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll index 28bf1ada4a862..8adfb87d6edf5 100644 --- a/test/LTO/Resolution/X86/commons.ll +++ b/test/LTO/Resolution/X86/commons.ll @@ -4,7 +4,7 @@ ; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s ; A strong definition should override the common -; CHECK: @x = global i32 42, align 4 +; CHECK: @x = dso_local global i32 42, align 4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll index c19ccb01be3cf..90de3bb9a3223 100644 --- 
a/test/ThinLTO/X86/deadstrip.ll +++ b/test/ThinLTO/X86/deadstrip.ll @@ -18,8 +18,8 @@ ; RUN: -r %t2.bc,_boo,pl \ ; RUN: -r %t2.bc,_dead_func,pl \ ; RUN: -r %t2.bc,_another_dead_func,pl -; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s -; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2 +; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2 +; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM ; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED @@ -27,14 +27,14 @@ ; COMBINED-DAG: Date: Fri, 3 Nov 2017 21:55:03 +0000 Subject: [PATCH 079/238] Invoke salvageDebugInfo from CodeGenPrepare's SinkCast() This preserves the debug info for the cast operation in the original location. rdar://problem/33460652 Reapplied r317340 with the test moved into an ARM-specific directory. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317375 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 1 + lib/Transforms/Utils/Local.cpp | 2 +- test/DebugInfo/ARM/salvage-debug-info.ll | 118 +++++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 test/DebugInfo/ARM/salvage-debug-info.ll diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 973ddebd987cf..73f014704b879 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1171,6 +1171,7 @@ static bool SinkCast(CastInst *CI) { // If we removed all uses, nuke the cast. if (CI->use_empty()) { + salvageDebugInfo(*CI); CI->eraseFromParent(); MadeChange = true; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 8c643c93ec4dc..cb7978f76aa0b 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1366,7 +1366,7 @@ void llvm::salvageDebugInfo(Instruction &I) { return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V)); }; - if (isa<BitCastInst>(&I)) { + if (isa<BitCastInst>(&I) || isa<IntToPtrInst>(&I)) { findDbgValues(DbgValues, &I); for (auto *DVI : DbgValues) { // Bitcasts are entirely irrelevant for debug info.
Rewrite the dbg.value diff --git a/test/DebugInfo/ARM/salvage-debug-info.ll b/test/DebugInfo/ARM/salvage-debug-info.ll new file mode 100644 index 0000000000000..5509b92a5c130 --- /dev/null +++ b/test/DebugInfo/ARM/salvage-debug-info.ll @@ -0,0 +1,118 @@ +; RUN: opt -codegenprepare -S %s -o - | FileCheck %s +; typedef struct info { +; unsigned long long size; +; } info_t; +; extern unsigned p; +; extern unsigned n; +; void f() { +; unsigned int i; +; if (p) { +; info_t *info = (info_t *)p; +; for (i = 0; i < n; i++) +; use(info[i].size); +; } +; } +source_filename = "debug.i" +target datalayout = "e-m:o-p:32:32-i64:64-a:0:32-n32-S128" +target triple = "thumbv7k-apple-ios10.0.0" + +%struct.info = type { i64 } + +@p = external local_unnamed_addr global i32, align 4 +@n = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind ssp uwtable +define void @f() local_unnamed_addr #0 !dbg !16 { +entry: + %0 = load i32, i32* @p, align 4, !dbg !25 + %tobool = icmp eq i32 %0, 0, !dbg !25 + br i1 %tobool, label %if.end, label %if.then, !dbg !26 + +if.then: ; preds = %entry + %1 = inttoptr i32 %0 to %struct.info*, !dbg !27 + tail call void @llvm.dbg.value(metadata %struct.info* %1, metadata !22, metadata !DIExpression()), !dbg !28 + ; CHECK: call void @llvm.dbg.value(metadata i32 %0, metadata !22, metadata !DIExpression()) + tail call void @llvm.dbg.value(metadata i32 0, metadata !20, metadata !DIExpression()), !dbg !29 + %2 = load i32, i32* @n, align 4, !dbg !30 + %cmp5 = icmp eq i32 %2, 0, !dbg !33 + br i1 %cmp5, label %if.end, label %for.body.preheader, !dbg !34 + +for.body.preheader: ; preds = %if.then + ; CHECK: for.body.preheader: + ; CHECK: %2 = inttoptr i32 %0 to %struct.info* + br label %for.body, !dbg !35 + +for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv = phi %struct.info* [ %1, %for.body.preheader ], [ %scevgep, %for.body ] + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %lsr.iv7 = bitcast %struct.info* %lsr.iv to i64* + tail call void @llvm.dbg.value(metadata i32 %i.06, metadata !20, metadata !DIExpression()), !dbg !29 + %3 = load i64, i64* %lsr.iv7, align 8, !dbg !35 + %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64)*)(i64 %3) #3, !dbg !36 + %inc = add nuw i32 %i.06, 1, !dbg !37 + tail call void @llvm.dbg.value(metadata i32 %inc, metadata !20, metadata !DIExpression()), !dbg !29 + %4 = load i32, i32* @n, align 4, !dbg !30 + %scevgep = getelementptr %struct.info, %struct.info* %lsr.iv, i32 1, !dbg !33 + %cmp = icmp ult i32 %inc, %4, !dbg !33 + br i1 %cmp, label %for.body, label %if.end.loopexit, !dbg !34, !llvm.loop !38 + +if.end.loopexit: ; preds = %for.body + br label %if.end, !dbg !40 + +if.end: ; preds = %if.end.loopexit, %if.then, %entry + ret void, !dbg !40 +} +declare i32 @use(...) 
local_unnamed_addr #1 + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #2 + +attributes #0 = { nounwind ssp uwtable } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nobuiltin nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!10, !11, !12, !13, !14} +!llvm.ident = !{!15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "debug.i", directory: "/Data/radar/35321562") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32) +!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "info_t", file: !1, line: 3, baseType: !6) +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "info", file: !1, line: 1, size: 64, elements: !7) +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "size", scope: !6, file: !1, line: 2, baseType: !9, size: 64) +!9 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +!10 = !{i32 2, !"Dwarf Version", i32 4} +!11 = !{i32 2, !"Debug Info Version", i32 3} +!12 = !{i32 1, !"wchar_size", i32 4} +!13 = !{i32 1, !"min_enum_size", i32 4} +!14 = !{i32 7, !"PIC Level", i32 2} +!15 = !{!"clang version 6.0.0 (trunk 317231) (llvm/trunk 317262)"} +!16 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !17, isLocal: false, isDefinition: true, scopeLine: 6, isOptimized: true, unit: !0, variables: !19) +!17 = !DISubroutineType(types: !18) +!18 = !{null} +!19 = !{!20, !22} +!20 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 7, type: !21) +!21 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!22 = !DILocalVariable(name: "info", scope: !23, file: !1, line: 9, type: !4) +!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 8, column: 10) +!24 = distinct !DILexicalBlock(scope: !16, file: !1, line: 8, column: 7) +!25 = !DILocation(line: 8, column: 7, scope: !24) +!26 = !DILocation(line: 8, column: 7, scope: !16) +!27 = !DILocation(line: 9, column: 20, scope: !23) +!28 = !DILocation(line: 9, column: 13, scope: !23) +!29 = !DILocation(line: 7, column: 16, scope: !16) +!30 = !DILocation(line: 10, column: 21, scope: !31) +!31 = distinct !DILexicalBlock(scope: !32, file: !1, line: 10, column: 5) +!32 = distinct !DILexicalBlock(scope: !23, file: !1, line: 10, column: 5) +!33 = !DILocation(line: 10, column: 19, scope: !31) +!34 = !DILocation(line: 10, column: 5, scope: !32) +!35 = !DILocation(line: 11, column: 19, scope: !31) +!36 = !DILocation(line: 11, column: 7, scope: !31) +!37 = !DILocation(line: 10, column: 25, scope: !31) +!38 = distinct !{!38, !34, !39} +!39 = !DILocation(line: 11, column: 23, scope: !32) +!40 = !DILocation(line: 13, column: 1, scope: !16) From 803f827385f6dce7f4b44867efdc84b332fd82d2 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 3 Nov 2017 22:32:11 +0000 Subject: [PATCH 080/238] Move TargetFrameLowering.h to CodeGen where it's implemented This header already includes a CodeGen header and is implemented in lib/CodeGen, so move the header there to match. This fixes a link error with modular codegeneration builds - where a header and its implementation are circularly dependent and so need to be in the same library, not split between two like this. 
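The layering problem described above can be pictured with the include edges involved. The following is a toy model of my own, not LLVM code, compressing the two libraries into namespaces:

// Before the move: the header lives under include/llvm/Target but already
// depends on CodeGen types, while its out-of-line definitions are compiled
// into lib/CodeGen (TargetFrameLoweringImpl.cpp).
namespace CodeGen {
struct MachineBasicBlock {}; // stands in for llvm/CodeGen/MachineBasicBlock.h
} // namespace CodeGen

namespace Target {
struct TargetFrameLowering {
  virtual ~TargetFrameLowering();                              // defined in "CodeGen"
  virtual void emitPrologue(CodeGen::MachineBasicBlock &) = 0; // Target -> CodeGen
};
} // namespace Target

// The implementation gives CodeGen -> Target, and the header gives
// Target -> CodeGen, so under modular code generation the two modules are
// circularly dependent. Moving the header under include/llvm/CodeGen keeps
// declaration and definition in one library, which is all this patch does.
Target::TargetFrameLowering::~TargetFrameLowering() = default;

int main() { return 0; }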
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317379 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/{Target => CodeGen}/TargetFrameLowering.h | 6 +++--- lib/CodeGen/AsmPrinter/ARMException.cpp | 2 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 2 +- lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 2 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 2 +- lib/CodeGen/AsmPrinter/WinException.cpp | 2 +- lib/CodeGen/FEntryInserter.cpp | 2 +- lib/CodeGen/GCRootLowering.cpp | 2 +- lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 +- lib/CodeGen/LiveDebugValues.cpp | 2 +- lib/CodeGen/LocalStackSlotAllocation.cpp | 2 +- lib/CodeGen/MachineFrameInfo.cpp | 2 +- lib/CodeGen/MachineFunction.cpp | 2 +- lib/CodeGen/PatchableFunction.cpp | 2 +- lib/CodeGen/PrologEpilogInserter.cpp | 2 +- lib/CodeGen/RegUsageInfoCollector.cpp | 2 +- lib/CodeGen/RegisterClassInfo.cpp | 2 +- lib/CodeGen/RegisterScavenging.cpp | 2 +- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 2 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 +- lib/CodeGen/ShrinkWrap.cpp | 2 +- lib/CodeGen/TargetFrameLoweringImpl.cpp | 2 +- lib/CodeGen/TargetInstrInfo.cpp | 2 +- lib/CodeGen/TargetOptionsImpl.cpp | 2 +- lib/CodeGen/TargetRegisterInfo.cpp | 2 +- lib/Target/AArch64/AArch64FrameLowering.h | 2 +- lib/Target/AArch64/AArch64RegisterInfo.cpp | 2 +- lib/Target/AMDGPU/AMDGPUFrameLowering.h | 2 +- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 +- lib/Target/ARC/ARCFrameLowering.h | 2 +- lib/Target/ARC/ARCRegisterInfo.cpp | 2 +- lib/Target/ARM/ARMFrameLowering.h | 2 +- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 2 +- lib/Target/ARM/ThumbRegisterInfo.cpp | 2 +- lib/Target/AVR/AVRFrameLowering.h | 2 +- lib/Target/AVR/AVRRegisterInfo.cpp | 2 +- lib/Target/BPF/BPFFrameLowering.h | 2 +- lib/Target/BPF/BPFRegisterInfo.cpp | 2 +- lib/Target/Hexagon/HexagonFrameLowering.h | 2 +- lib/Target/Lanai/LanaiFrameLowering.h | 2 +- lib/Target/Lanai/LanaiRegisterInfo.cpp | 2 +- lib/Target/Lanai/LanaiSubtarget.h | 2 +- lib/Target/Lanai/LanaiTargetMachine.h | 2 +- lib/Target/MSP430/MSP430FrameLowering.h | 2 +- lib/Target/MSP430/MSP430TargetMachine.h | 2 +- lib/Target/Mips/Mips16FrameLowering.cpp | 2 +- lib/Target/Mips/Mips16RegisterInfo.cpp | 2 +- lib/Target/Mips/MipsFrameLowering.h | 2 +- lib/Target/Mips/MipsISelLowering.cpp | 2 +- lib/Target/Mips/MipsRegisterInfo.cpp | 2 +- lib/Target/Mips/MipsSERegisterInfo.cpp | 2 +- lib/Target/NVPTX/NVPTXFrameLowering.h | 2 +- lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 2 +- lib/Target/NVPTX/NVPTXTargetMachine.h | 2 +- lib/Target/Nios2/Nios2FrameLowering.h | 2 +- lib/Target/PowerPC/PPCBranchCoalescing.cpp | 2 +- lib/Target/PowerPC/PPCFrameLowering.h | 2 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 2 +- lib/Target/RISCV/RISCVFrameLowering.h | 2 +- lib/Target/RISCV/RISCVRegisterInfo.cpp | 2 +- lib/Target/Sparc/SparcFrameLowering.h | 2 +- lib/Target/Sparc/SparcSubtarget.h | 2 +- lib/Target/SystemZ/SystemZFrameLowering.h | 2 +- lib/Target/SystemZ/SystemZRegisterInfo.cpp | 2 +- lib/Target/WebAssembly/WebAssemblyFrameLowering.h | 2 +- lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp | 2 +- lib/Target/X86/X86FrameLowering.h | 2 +- lib/Target/X86/X86RegisterInfo.cpp | 2 +- lib/Target/XCore/XCoreFrameLowering.h | 2 +- lib/Target/XCore/XCoreRegisterInfo.cpp | 2 +- unittests/CodeGen/MachineInstrTest.cpp | 2 +- 73 files changed, 75 insertions(+), 75 deletions(-) rename include/llvm/{Target => CodeGen}/TargetFrameLowering.h (98%) diff 
--git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/CodeGen/TargetFrameLowering.h similarity index 98% rename from include/llvm/Target/TargetFrameLowering.h rename to include/llvm/CodeGen/TargetFrameLowering.h index 31017cbc27b85..5cf4627f3c964 100644 --- a/include/llvm/Target/TargetFrameLowering.h +++ b/include/llvm/CodeGen/TargetFrameLowering.h @@ -1,4 +1,4 @@ -//===-- llvm/Target/TargetFrameLowering.h ---------------------------*- C++ -*-===// +//===-- llvm/CodeGen/TargetFrameLowering.h ---------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_TARGETFRAMELOWERING_H -#define LLVM_TARGET_TARGETFRAMELOWERING_H +#ifndef LLVM_CODEGEN_TARGETFRAMELOWERING_H +#define LLVM_CODEGEN_TARGETFRAMELOWERING_H #include "llvm/CodeGen/MachineBasicBlock.h" #include diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 8b1376ab363d7..973816d563555 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -29,7 +29,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index a35fcdaaf9aa5..3081e76158629 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -100,7 +100,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 67bab8c768418..5aa3f4ae1030a 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -68,7 +68,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/SMLoc.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index dd7f7931b06b8..1a6cb9679925a 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -31,7 +31,7 @@ #include "llvm/MC/MachineLocation.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 06b5b06c41bf3..603d0f7f4700b 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -36,7 +36,7 @@ #include "llvm/MC/MCStreamer.h" #include 
"llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp index 5d485f213573d..35ce1fec3858d 100644 --- a/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/lib/CodeGen/AsmPrinter/WinException.cpp @@ -33,7 +33,7 @@ #include "llvm/MC/MCWin64EH.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp index 9781338f95262..3b38b5966b681 100644 --- a/lib/CodeGen/FEntryInserter.cpp +++ b/lib/CodeGen/FEntryInserter.cpp @@ -17,7 +17,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp index 35246545ca91c..9c0eea7877780 100644 --- a/lib/CodeGen/GCRootLowering.cpp +++ b/lib/CodeGen/GCRootLowering.cpp @@ -24,7 +24,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index 8e31ed0a01539..45eb605c3c2c9 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -54,7 +54,7 @@ #include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp index a45b1e39feed0..bf6d53889376c 100644 --- a/lib/CodeGen/LiveDebugValues.cpp +++ b/lib/CodeGen/LiveDebugValues.cpp @@ -46,7 +46,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index 2eab0376da2fb..33ae476bf4a0b 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" #include 
"llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/MachineFrameInfo.cpp b/lib/CodeGen/MachineFrameInfo.cpp index be8adf75fb7ca..ba38005a93fe2 100644 --- a/lib/CodeGen/MachineFrameInfo.cpp +++ b/lib/CodeGen/MachineFrameInfo.cpp @@ -18,7 +18,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 250a10c7d0768..570c410e1fe24 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -58,7 +58,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp index 513e82716564e..b0424e70a47ab 100644 --- a/lib/CodeGen/PatchableFunction.cpp +++ b/lib/CodeGen/PatchableFunction.cpp @@ -16,7 +16,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index d611c9b45c51a..92a2566f0c18f 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -55,7 +55,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOpcodes.h" diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp index 214c6d2c820d3..3aaa5a4738d5f 100644 --- a/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/lib/CodeGen/RegUsageInfoCollector.cpp @@ -27,7 +27,7 @@ #include "llvm/CodeGen/RegisterUsageInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" using namespace llvm; diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 956dec39fc381..8e463ff272d23 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -24,7 +24,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 844ddb9ed3ffa..32194e6d76fcd 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -33,7 +33,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include 
"llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index b736037d71ddc..283ef1efd4638 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -32,7 +32,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ff49134f7b997..356f25850460d 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -40,7 +40,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ccc06fa3ee150..c8abe25b7c6b1 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -98,7 +98,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp index 5fb6afee88a84..1c6fb1ce78527 100644 --- a/lib/CodeGen/ShrinkWrap.cpp +++ b/lib/CodeGen/ShrinkWrap.cpp @@ -73,7 +73,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp index 9dd98b4020d25..64962a5b796ab 100644 --- a/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -20,7 +20,7 @@ #include "llvm/IR/Function.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Compiler.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index bac12efd6395d..3010ab23828f8 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -26,7 +26,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include 
"llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp index ed845e1706f8c..99ff4931e2fd6 100644 --- a/lib/CodeGen/TargetOptionsImpl.cpp +++ b/lib/CodeGen/TargetOptionsImpl.cpp @@ -15,7 +15,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index 55318237e95e4..758fdabf5dd87 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -27,7 +27,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index c351efb0c39b1..55a256867fabb 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 91b1481f5ef07..1059bc37c8f28 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -26,7 +26,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 2329fffd52121..91fe921bfeecd 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 9fc9592bdc578..83122281d2b2b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -23,7 +23,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include using namespace llvm; diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h index ac5378adbd8a4..c042bec016cac 100644 --- a/lib/Target/ARC/ARCFrameLowering.h +++ b/lib/Target/ARC/ARCFrameLowering.h @@ -17,7 +17,7 @@ #include "ARC.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include 
"llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp index 66f95911d3e88..bed47a0eab5bb 100644 --- a/lib/Target/ARC/ARCRegisterInfo.cpp +++ b/lib/Target/ARC/ARCRegisterInfo.cpp @@ -25,7 +25,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 2c10031e3f8c7..1f18e2bf80c46 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -11,7 +11,7 @@ #define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include namespace llvm { diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 4aa7e1503427e..2b4cdb7d97cec 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -53,7 +53,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp index 15a5675233364..d2bebb9eeeca1 100644 --- a/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -29,7 +29,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/AVR/AVRFrameLowering.h b/lib/Target/AVR/AVRFrameLowering.h index 30ef441183a9d..a0ba6c9512765 100644 --- a/lib/Target/AVR/AVRFrameLowering.h +++ b/lib/Target/AVR/AVRFrameLowering.h @@ -10,7 +10,7 @@ #ifndef LLVM_AVR_FRAME_LOWERING_H #define LLVM_AVR_FRAME_LOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 7099b29a8bcdf..b6ac93452cb1e 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -18,7 +18,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/IR/Function.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "AVR.h" #include "AVRInstrInfo.h" diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h index 5db963f518b1d..b4ffa0713fa6a 100644 --- a/lib/Target/BPF/BPFFrameLowering.h +++ b/lib/Target/BPF/BPFFrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H #define LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class BPFSubtarget; diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp index 273843e92701e..cef905170f46b 100644 --- 
a/lib/Target/BPF/BPFRegisterInfo.cpp +++ b/lib/Target/BPF/BPFRegisterInfo.cpp @@ -20,7 +20,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_REGINFO_TARGET_DESC diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index 296edbe1effb4..988718860c5b8 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -15,7 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include namespace llvm { diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h index 2f9b6c3c158f1..ca690d513fc2d 100644 --- a/lib/Target/Lanai/LanaiFrameLowering.h +++ b/lib/Target/Lanai/LanaiFrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H #include "Lanai.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp index 6ea477dce3e8e..7d444a46d0f70 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.cpp +++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp @@ -23,7 +23,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_REGINFO_TARGET_DESC diff --git a/lib/Target/Lanai/LanaiSubtarget.h b/lib/Target/Lanai/LanaiSubtarget.h index 2732ef3097ecb..313d950e8aa7d 100644 --- a/lib/Target/Lanai/LanaiSubtarget.h +++ b/lib/Target/Lanai/LanaiSubtarget.h @@ -19,7 +19,7 @@ #include "LanaiInstrInfo.h" #include "LanaiSelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index ce1271d9deaa2..2fb1a05361041 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -19,7 +19,7 @@ #include "LanaiInstrInfo.h" #include "LanaiSelectionDAGInfo.h" #include "LanaiSubtarget.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index fdc4aa52a1950..8807101f37cab 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H #include "MSP430.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class MSP430FrameLowering : public TargetFrameLowering { diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index 97b5e810a1d3b..4935b80cfdd99 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -16,7 +16,7 @@ #define 
LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H #include "MSP430Subtarget.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index 76bca3df2bcdc..cb59e2ddb1c6a 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -30,7 +30,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MachineLocation.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include #include #include diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp index 44771cbe8be18..0ee0d73dc0a06 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -29,7 +29,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index 8c4214c4c21d0..883c3267d51a7 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H #include "Mips.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class MipsSubtarget; diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 38b3c3fb16020..22a5a80a75c14 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -62,7 +62,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 9c64a0ecbb152..ec966afee0e97 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -28,7 +28,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index 86bd24166bb6d..bd65cbf74af13 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h index 320ca9a2f095d..a802cf85d2e00 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.h +++ 
b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H #define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class NVPTXSubtarget; diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 88288abe64f93..3957d42665396 100644 --- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -20,7 +20,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 7674135f0a7c3..54a72a688ee39 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -17,7 +17,7 @@ #include "ManagedStringPool.h" #include "NVPTXSubtarget.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/Nios2/Nios2FrameLowering.h b/lib/Target/Nios2/Nios2FrameLowering.h index 2aaea678d9eed..2d9e84b2c72b3 100644 --- a/lib/Target/Nios2/Nios2FrameLowering.h +++ b/lib/Target/Nios2/Nios2FrameLowering.h @@ -14,7 +14,7 @@ #define LLVM_LIB_TARGET_NIOS2_NIOS2FRAMELOWERING_H #include "Nios2.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class Nios2Subtarget; diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp index 33085a4236198..1ba82042e6ef6 100644 --- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -22,7 +22,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index fa813db5fef36..f845d5a9ac64a 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -15,7 +15,7 @@ #include "PPC.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index d46c1383297ff..e476ca0494d54 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -37,7 +37,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h index 14772ddac4acd..71f85864a39cb 100644 --- a/lib/Target/RISCV/RISCVFrameLowering.h +++ b/lib/Target/RISCV/RISCVFrameLowering.h @@ -14,7 +14,7 
@@ #ifndef LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H #define LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class RISCVSubtarget; diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp index 4f6c528061cb4..740b206b80283 100644 --- a/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -19,7 +19,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_REGINFO_TARGET_DESC diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index ac0e69ccde1e7..6098afa689852 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H #include "Sparc.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index bfbdb8d0b44bd..ad6b55a9fc945 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -19,7 +19,7 @@ #include "SparcInstrInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetSubtargetInfo.h" #include diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h index 91c5a5d53a157..a75d111b02949 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/lib/Target/SystemZ/SystemZFrameLowering.h @@ -11,7 +11,7 @@ #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H #include "llvm/ADT/IndexedMap.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class SystemZTargetMachine; diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 05f93ce516210..a44fae523fec4 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -13,7 +13,7 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" using namespace llvm; diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index bf326fce88fa9..4cc7f5ae058a9 100644 --- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -16,7 +16,7 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { class MachineFrameInfo; diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 9367464c806eb..5e7ebd19fac71 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -24,7 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" 
#include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 38ac96e16d4e0..909319fc18fcb 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 1f49650340e73..a9ea94337b968 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -33,7 +33,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h index 27584f4e2b6c0..e98e9cda11db6 100644 --- a/lib/Target/XCore/XCoreFrameLowering.h +++ b/lib/Target/XCore/XCoreFrameLowering.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H #define LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" namespace llvm { diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index d34e928b14f70..a6cf68370093e 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" diff --git a/unittests/CodeGen/MachineInstrTest.cpp b/unittests/CodeGen/MachineInstrTest.cpp index 89041e2ab22b6..ac2fffe8502b9 100644 --- a/unittests/CodeGen/MachineInstrTest.cpp +++ b/unittests/CodeGen/MachineInstrTest.cpp @@ -12,7 +12,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" From be2858c001c1456d7cbaee597866acdcdc74a229 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 3 Nov 2017 22:48:13 +0000 Subject: [PATCH 081/238] [X86] Give unary PERMI priority over SHUF128 in lowerV8I64VectorShuffle to make it possible to fold a load. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317382 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- test/CodeGen/X86/vector-shuffle-512-v8.ll | 21 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d65a65e365c42..ea97dc2dccdac 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13709,10 +13709,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) - return Shuf128; - if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four @@ -13734,6 +13730,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; + // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 6c9805597215f..1d17ef109d263 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -1165,14 +1165,31 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_01014545: ; AVX512F: # BB#0: -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_01014545: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-32-NEXT: retl + + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01014545_mem: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] ; AVX512F-32-NEXT: retl + %a = load <8 x i64>, <8 x i64>* %ptr %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> ret <8 x i64> %shuffle } From 1024a3777d981698e8a60c5b86c21ea5b05c5835 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Fri, 3 Nov 2017 23:03:38 +0000 Subject: [PATCH 082/238] [CallSiteSplitting] Silence GCC's -Wparentheses. NFCI.
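Background on the warning this patch silences, since the hunk below reads subtly: && binds tighter than ||, so without the added parentheses the assert condition parses as Pred == ICMP_EQ || (Pred == ICMP_NE && "message"). A minimal self-contained C++ illustration follows; the function, names, and values are mine, not ICmpInst's.

#include <cassert>

static void check(int Pred) {
  const int EQ = 0, NE = 1;
  // The string literal is a non-null pointer and therefore always true, so
  // the unparenthesized form behaves identically -- but the grouping is
  // surprising, which is what GCC's -Wparentheses flags:
  //   assert(Pred == EQ || Pred == NE && "Unexpected predicate");
  // Grouping the comparisons first, then attaching the message, is the fix:
  assert((Pred == EQ || Pred == NE) && "Unexpected predicate");
  (void)Pred; (void)EQ; (void)NE; // keep -Wunused quiet under NDEBUG
}

int main() {
  check(0);
  return 0;
}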
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317385 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/CallSiteSplitting.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 2224cb2eb6231..5e6bfc73ca5db 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -126,8 +126,8 @@ static bool createCallSitesOnOrPredicatedArgument( Instruction *&CallUntakenFromHeader = IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader; - assert(Pred == ICmpInst::ICMP_EQ || - Pred == ICmpInst::ICMP_NE && + assert((Pred == ICmpInst::ICMP_EQ || + Pred == ICmpInst::ICMP_NE) && "Unexpected predicate in an OR condition"); // Set the constant value for agruments in the call predicated based on From f0732934fcd3a3abd9af1344c71de28778c1a9a2 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Sat, 4 Nov 2017 00:44:01 +0000 Subject: [PATCH 083/238] [CallSiteSplitting] clang-format my last commit. NFCI. Thanks to Rui for pointing it out. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317393 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/CallSiteSplitting.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 5e6bfc73ca5db..b70ed8d7d4cde 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -126,9 +126,8 @@ static bool createCallSitesOnOrPredicatedArgument( Instruction *&CallUntakenFromHeader = IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader; - assert((Pred == ICmpInst::ICMP_EQ || - Pred == ICmpInst::ICMP_NE) && - "Unexpected predicate in an OR condition"); + assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && + "Unexpected predicate in an OR condition"); // Set the constant value for agruments in the call predicated based on // the OR condition. From f1b2e0b26a4eac07e92c73c5aeaac14f83724198 Mon Sep 17 00:00:00 2001 From: Sean Fertile Date: Sat, 4 Nov 2017 01:54:20 +0000 Subject: [PATCH 084/238] Revert "[LTO][ThinLTO] Use the linker resolutions to mark global values ..." Changes more tests than expected on one of the build bots. Reverting to investigate.
This reverts https://llvm.org/svn/llvm-project/llvm/trunk@317374 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317395 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/ModuleSummaryIndex.h | 12 ++----- include/llvm/IR/ModuleSummaryIndexYAML.h | 8 ++--- lib/Analysis/ModuleSummaryAnalysis.cpp | 9 +++--- lib/Bitcode/Reader/BitcodeReader.cpp | 4 +-- lib/Bitcode/Writer/BitcodeWriter.cpp | 2 -- lib/LTO/LTO.cpp | 21 +++--------- lib/Transforms/Utils/FunctionImportUtils.cpp | 17 ---------- test/Bitcode/thinlto-summary-local-5.0.ll | 22 ------------- test/Bitcode/thinlto-summary-local-5.0.ll.bc | Bin 1028 -> 0 bytes test/LTO/Resolution/X86/comdat-mixed-lto.ll | 2 +- test/LTO/Resolution/X86/comdat.ll | 4 +-- test/LTO/Resolution/X86/commons.ll | 2 +- test/ThinLTO/X86/deadstrip.ll | 30 +++++++----------- test/ThinLTO/X86/funcimport2.ll | 4 +-- test/ThinLTO/X86/internalize.ll | 9 ++---- test/ThinLTO/X86/reference_non_importable.ll | 2 +- .../Transforms/LowerTypeTests/import-unsat.ll | 1 - .../PGOProfile/thinlto_samplepgo_icp2.ll | 2 +- .../WholeProgramDevirt/import-indir.ll | 1 - 19 files changed, 37 insertions(+), 115 deletions(-) delete mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll delete mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll.bc diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index b1e58a2a0d9b3..2d664f41e3ce5 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -148,15 +148,11 @@ class GlobalValueSummary { /// In combined summary, indicate that the global value is live. unsigned Live : 1; - /// Indicates that the linker resolved the symbol to a definition from - /// within the same linkage unit. - unsigned DSOLocal : 1; - /// Convenience Constructors explicit GVFlags(GlobalValue::LinkageTypes Linkage, - bool NotEligibleToImport, bool Live, bool IsLocal) + bool NotEligibleToImport, bool Live) : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport), - Live(Live), DSOLocal(IsLocal) {} + Live(Live) {} }; private: @@ -233,10 +229,6 @@ class GlobalValueSummary { void setLive(bool Live) { Flags.Live = Live; } - void setDSOLocal(bool Local) { Flags.DSOLocal = Local; } - - bool isDSOLocal() const { return Flags.DSOLocal; } - /// Flag that this global value cannot be imported. 
void setNotEligibleToImport() { Flags.NotEligibleToImport = true; } diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index 4687f2d53e7ed..2f9990ca03d85 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -135,7 +135,7 @@ template <> struct MappingTraits { struct FunctionSummaryYaml { unsigned Linkage; - bool NotEligibleToImport, Live, IsLocal; + bool NotEligibleToImport, Live; std::vector TypeTests; std::vector TypeTestAssumeVCalls, TypeCheckedLoadVCalls; @@ -177,7 +177,6 @@ template <> struct MappingTraits { io.mapOptional("Linkage", summary.Linkage); io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport); io.mapOptional("Live", summary.Live); - io.mapOptional("Local", summary.IsLocal); io.mapOptional("TypeTests", summary.TypeTests); io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls); io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls); @@ -212,7 +211,7 @@ template <> struct CustomMappingTraits { Elem.SummaryList.push_back(llvm::make_unique( GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), - FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal), + FSum.NotEligibleToImport, FSum.Live), 0, FunctionSummary::FFlags{}, ArrayRef{}, ArrayRef{}, std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls), @@ -229,8 +228,7 @@ template <> struct CustomMappingTraits { FSums.push_back(FunctionSummaryYaml{ FSum->flags().Linkage, static_cast(FSum->flags().NotEligibleToImport), - static_cast(FSum->flags().Live), - static_cast(FSum->flags().DSOLocal), FSum->type_tests(), + static_cast(FSum->flags().Live), FSum->type_tests(), FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(), FSum->type_test_assume_const_vcalls(), FSum->type_checked_load_const_vcalls()}); diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 82db09ca97b07..afd575e7273cf 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, // FIXME: refactor this to use the same code that inliner is using. 
F.isVarArg(); GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, - /* Live = */ false, F.isDSOLocal()); + /* Live = */ false); FunctionSummary::FFlags FunFlags{ F.hasFnAttribute(Attribute::ReadNone), F.hasFnAttribute(Attribute::ReadOnly), @@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, findRefEdges(Index, &V, RefEdges, Visited); bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, - /* Live = */ false, V.isDSOLocal()); + /* Live = */ false); auto GVarSummary = llvm::make_unique(Flags, RefEdges.takeVector()); if (NonRenamableLocal) @@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, DenseSet &CantBePromoted) { bool NonRenamableLocal = isNonRenamableLocal(A); GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, - /* Live = */ false, A.isDSOLocal()); + /* Live = */ false); auto AS = llvm::make_unique(Flags); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); @@ -410,8 +410,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( assert(GV->isDeclaration() && "Def in module asm already has definition"); GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, /* NotEligibleToImport = */ true, - /* Live = */ true, - /* Local */ GV->isDSOLocal()); + /* Live = */ true); CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (Function *F = dyn_cast(GV)) { diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index d0f11db8f61f3..c2272260f44c5 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -889,9 +889,7 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to work correctly on earlier versions, we must conservatively treat all // values as live. bool Live = (RawFlags & 0x2) || Version < 3; - bool Local = (RawFlags & 0x4); - - return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local); + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index c5d376c9426b8..1e491aa066ee5 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -955,8 +955,6 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { RawFlags |= Flags.NotEligibleToImport; // bool RawFlags |= (Flags.Live << 1); - RawFlags |= (Flags.DSOLocal << 2); - // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 9c737795b5a99..017dd201f9c86 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -630,9 +630,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, NonPrevailingComdats.insert(GV->getComdat()); cast(GV)->setComdat(nullptr); } - - // Set the 'local' flag based on the linker resolution for this symbol. - GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit); } // Common resolution: collect the maximum size/alignment over all commons. 
// We also record if we see an instance of a common as prevailing, so that @@ -646,6 +643,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, CommonRes.Prevailing |= Res.Prevailing; } + // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit. } if (!M.getComdatSymbolTable().empty()) for (GlobalValue &GV : M.global_values()) @@ -700,10 +698,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, assert(ResI != ResE); SymbolResolution Res = *ResI++; - if (!Sym.getIRName().empty()) { - auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( - Sym.getIRName(), GlobalValue::ExternalLinkage, "")); - if (Res.Prevailing) { + if (Res.Prevailing) { + if (!Sym.getIRName().empty()) { + auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( + Sym.getIRName(), GlobalValue::ExternalLinkage, "")); ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); // For linker redefined symbols (via --wrap or --defsym) we want to @@ -715,15 +713,6 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, GUID, BM.getModuleIdentifier())) S->setLinkage(GlobalValue::WeakAnyLinkage); } - - // If the linker resolved the symbol to a local definition then mark it - // as local in the summary for the module we are adding. - if (Res.FinalDefinitionInLinkageUnit) { - if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( - GUID, BM.getModuleIdentifier())) { - S->setDSOLocal(true); - } - } } } diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 2e6fc4e8482e1..fbb61ac1ae914 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -203,23 +203,6 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, } void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { - - // Check the summaries to see if the symbol gets resolved to a known local - // definition. - if (GV.hasName()) { - ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID()); - if (VI) { - // Need to check all summaries are local in case of hash collisions. - bool IsLocal = VI.getSummaryList().size() && - llvm::all_of(VI.getSummaryList(), - [](const std::unique_ptr &Summary) { - return Summary->isDSOLocal(); - }); - if (IsLocal) - GV.setDSOLocal(true); - } - } - bool DoPromote = false; if (GV.hasLocalLinkage() && ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll deleted file mode 100644 index cbc48d23df3c7..0000000000000 --- a/test/Bitcode/thinlto-summary-local-5.0.ll +++ /dev/null @@ -1,22 +0,0 @@ -; Bitcode compatibility test for dso_local flag in thin-lto summaries. -; Checks that older bitcode summaries without the dso_local op are still -; properly parsed and don't set GlobalValues as dso_local. 
- -; RUN: llvm-dis < %s.bc | FileCheck %s -; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN - -define void @foo() { -;CHECK-DAG:define void @foo() - ret void -} - -@bar = global i32 0 -;CHECK-DAG: @bar = global i32 0 - -@baz = alias i32, i32* @bar -;CHECK-DAG: @bar = global i32 0 - -;BCAN: -;BCAN-NEXT: -;BCAN-NEXT: diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc deleted file mode 100644 index 8dc7ca0a74b760ce63a2967d78da0abefd37c9aa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1028 zcmZ8fZ)h837=N#q=59%LxpqMlc6Z!$Q}V&g<7!#zCD7!w)>$ECei%$K7eh=XO|NN8 z(+nipWxY&;-e!@2{jlg93>7N)L2zK3)EY`0^}k3d3(dMAwFFnu5BuPIai4s6-{*PX zhv)Zu-{((ITG<|Q0B{%p5V7%8VBPXb*!pP*O9Erv(TRgi^S!Wrf73k0VhWEbA#9E$ zf)4gHL;%3q0Qd!3YYzu{pgjUNRycMk&@eq~CWYb4NYVC4FYaPYZ$x=vwMGgo5`PELlrlr;m=Xu4b@jf#b(Em31jEt*vYt-f;z7$x~7D&j2p}R z(+g?m^4iO|HsjP%6>Yv!cx=Bz?B62(S|Rp$G}5S2?JCvm>Mfbt;Hb3%y&;=_)}`Tw zG+gQGFY4Y!$y-9j#ros$(fjAcA5n3+Zu@3ZQ_g59MNP-CY*we%Iub@Q}+AK3p4 z5pA8=8L_umVh?7)y6LS|rWRnn=cug$vvrGVx9HyyCgz;*E=x}?sbVv9{6QjNZ7z#t zD-+`fy4|P%^BS`uQ+GLfdzF3zVT=!RbVoM7a6$Dp>*BKHhXuJ`^R*?h1h4h-*A|$m zHV6NMV>vgLf-M8rVegE&vRYDZSQ9LFMyW`CnA);5yjefLORZNA-whZo+vVv0-U8o| z0GJ^LYx=IL;u0(X>c67;U@`Q%F?TWxfMJiAke(_$8~h)d)Fq_YAO|AFwu60Yf$TeW z{5If((Xj>^jN%i31bGCk9D<=>24djE0f8U~ipUQJRaahuBl!?K^`na)8;*#aicL>P zB diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll index d6022c643519f..f6ee22e4161d9 100644 --- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll +++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll @@ -17,7 +17,7 @@ ; would clash with the copy from this module. ; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s ; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" { -; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" { +; CHECK: define available_externally void @testglobfunc() section ".text.startup" { ; ModuleID = 'comdat-mixed-lto.o' source_filename = "comdat-mixed-lto.cpp" diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll index 94f2838423122..60d082b3e0f78 100644 --- a/test/LTO/Resolution/X86/comdat.ll +++ b/test/LTO/Resolution/X86/comdat.ll @@ -70,14 +70,14 @@ bb11: ; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}} ; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*) -; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) { +; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) { ; CHECK-NEXT: bb10: ; CHECK-NEXT: br label %bb11{{$}} ; CHECK: bb11: ; CHECK-NEXT: ret i32 42 ; CHECK-NEXT: } -; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) { +; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) { ; CHECK-NEXT: bb20: ; CHECK-NEXT: store i8* %this, i8** null ; CHECK-NEXT: br label %bb21 diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll index 8adfb87d6edf5..28bf1ada4a862 100644 --- a/test/LTO/Resolution/X86/commons.ll +++ b/test/LTO/Resolution/X86/commons.ll @@ -4,7 +4,7 @@ ; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s ; A strong definition should override the common -; CHECK: @x = dso_local global i32 42, align 4 +; CHECK: @x = global i32 42, align 4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll index 90de3bb9a3223..c19ccb01be3cf 100644 --- 
a/test/ThinLTO/X86/deadstrip.ll +++ b/test/ThinLTO/X86/deadstrip.ll @@ -18,8 +18,8 @@ ; RUN: -r %t2.bc,_boo,pl \ ; RUN: -r %t2.bc,_dead_func,pl \ ; RUN: -r %t2.bc,_another_dead_func,pl -; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2 -; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2 +; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s +; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM ; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED @@ -27,14 +27,14 @@ ; COMBINED-DAG: Date: Sat, 4 Nov 2017 06:00:11 +0000 Subject: [PATCH 085/238] llvm/test/Object/archive-SYM64-write.test: Delete large temp files. They are 8GiB total. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317401 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Object/archive-SYM64-write.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/Object/archive-SYM64-write.test b/test/Object/archive-SYM64-write.test index d03b54c58b37d..161d6cb819160 100644 --- a/test/Object/archive-SYM64-write.test +++ b/test/Object/archive-SYM64-write.test @@ -8,6 +8,9 @@ # RUN: llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64 # RUN: llvm-nm --print-armap %t.lib | FileCheck %s +# Delete temp files. They are too large. +# RUN: rm -f %t %t2 %t.lib + !ELF FileHeader: Class: ELFCLASS64 From 5473af6661103fb6509c89880a08b2f24a3f760a Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sat, 4 Nov 2017 06:03:29 +0000 Subject: [PATCH 086/238] CMake: Make LLVM_BUILD_32_BITS aware of large files. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317402 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/HandleLLVMOptions.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 03b9664524921..cf1ece24848d2 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -232,6 +232,10 @@ if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -m32") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -m32") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -m32") + + # FIXME: CMAKE_SIZEOF_VOID_P is still 8 + add_definitions(-D_LARGEFILE_SOURCE) + add_definitions(-D_FILE_OFFSET_BITS=64) endif( LLVM_BUILD_32_BITS ) endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) @@ -242,6 +246,7 @@ if (ANDROID AND (ANDROID_NATIVE_API_LEVEL LESS 24)) set(LLVM_FORCE_SMALLFILE_FOR_ANDROID TRUE) endif() if( CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT LLVM_FORCE_SMALLFILE_FOR_ANDROID) + # FIXME: It isn't handled in LLVM_BUILD_32_BITS. add_definitions( -D_LARGEFILE_SOURCE ) add_definitions( -D_FILE_OFFSET_BITS=64 ) endif() From 19bc3f9a843b7072d63c2545d771b08d8d821d8d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Nov 2017 06:44:47 +0000 Subject: [PATCH 087/238] [X86] Teach shuffle lowering to use 256-bit SHUF128 when possible. This allows masked operations to be used and allows the register allocator to use YMM16-31 if necessary. As a follow-up, I'll look into teaching EVEX->VEX how to turn this back into PERM2X128 if any of the additional features don't work out.
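The immediate encoding in the new VLX path is compact: after widening to 128-bit lanes, lanes 0-1 name the halves of the first operand and lanes 2-3 the halves of the second, so bit 0 of the immediate picks the half taken from V1 and bit 1 the half taken from V2. A small standalone check of that arithmetic, with shuf128Imm as an illustrative name (the in-tree code computes this inline in lowerV2X128VectorShuffle, as the hunk below shows):

    // shuf128_imm.cpp -- recompute the 256-bit SHUF128 immediate from a
    // widened lane mask {Lo, Hi}, mirroring the PermMask expression below.
    #include <cstdio>

    static int shuf128Imm(int Lo, int Hi) {
      if (!(Lo < 2 && Hi >= 2))
        return -1;                      // shape not handled by this path
      return ((Lo % 2) << 0) | ((Hi % 2) << 1);
    }

    int main() {
      // ymm0[2,3],ymm1[0,1]: high half of V1, then low half of V2.
      // Widened mask {1, 2} -> imm = (1 << 0) | (0 << 1) = 1.
      std::printf("imm = %d\n", shuf128Imm(1, 2));
      return 0;
    }

So the vshufi64x2 ymm0 = ymm0[2,3],ymm1[0,1] pattern that recurs throughout the updated tests encodes as immediate 1.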
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317403 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 10 + test/CodeGen/X86/avx-schedule.ll | 4 +- test/CodeGen/X86/avx2-schedule.ll | 4 +- test/CodeGen/X86/avx512-shuffle-schedule.ll | 768 ++++++++---------- .../X86/avx512-shuffles/shuffle-vec.ll | 384 ++++----- test/CodeGen/X86/vector-shuffle-256-v16.ll | 18 +- test/CodeGen/X86/vector-shuffle-256-v32.ll | 32 +- test/CodeGen/X86/vector-shuffle-256-v4.ll | 52 +- test/CodeGen/X86/vector-shuffle-256-v8.ll | 90 +- 9 files changed, 678 insertions(+), 684 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ea97dc2dccdac..3883415501ba7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -12384,6 +12384,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } } + + // Try to use SHUF128 if possible. + if (Subtarget.hasVLX()) { + if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { + unsigned PermMask = ((WidenedMask[0] % 2) << 0) | + ((WidenedMask[1] % 2) << 1); + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); + } + } } // Otherwise form a 128-bit permutation. After accounting for undefs, diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index 44d13db65c9d6..a3e6a18fbc904 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -3447,8 +3447,8 @@ define <4 x double> @test_perm2f128(<4 x double> %a0, <4 x double> %a1, <4 x dou ; ; SKX-LABEL: test_perm2f128: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index cec8ca94409d2..8febe046d8106 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -2531,8 +2531,8 @@ define <4 x i64> @test_perm2i128(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; SKX-LABEL: test_perm2i128: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll index c59fb5b97bcae..d1b6e1f7bd349 100755 --- a/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -9520,12 +9520,12 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; GENERIC-LABEL: test2_8xfloat_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; 
SKX-LABEL: test2_8xfloat_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -9533,18 +9533,18 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9555,18 +9555,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9576,18 +9574,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; 
GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9598,18 +9596,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9619,18 +9615,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq 
# sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9641,18 +9637,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9662,12 +9656,12 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; GENERIC-LABEL: test2_8xfloat_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -9675,18 +9669,18 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: 
[1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9697,18 +9691,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9718,12 +9710,12 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9732,18 +9724,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9755,18 +9747,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9778,18 +9768,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9801,18 +9791,16 
@@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9824,18 +9812,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9847,18 +9835,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} 
{z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9870,12 +9856,12 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9884,18 +9870,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -9907,18 +9893,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x 
i32> @@ -10324,12 +10308,12 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; GENERIC-LABEL: test_4xdouble_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -10337,18 +10321,18 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10359,18 +10343,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = 
ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10380,18 +10362,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10402,18 +10384,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10423,18 +10403,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10445,18 +10425,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10466,12 +10444,12 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
 ; GENERIC-LABEL: test_4xdouble_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
@@ -10479,18 +10457,18 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
 define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10501,18 +10479,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10522,12 +10498,12 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <
 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10536,18 +10512,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10559,18 +10535,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10582,18 +10556,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10605,18 +10579,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10628,18 +10600,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10651,18 +10623,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10674,12 +10644,12 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec
 define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10688,18 +10658,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10711,18 +10681,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -11128,12 +11096,12 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
 define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
 ; GENERIC-LABEL: test_8xi32_shuff_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %res
@@ -11141,18 +11109,18 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
 define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11163,18 +11131,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11184,18 +11150,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11206,18 +11172,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11227,18 +11191,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11249,18 +11213,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11270,12 +11232,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
 ; GENERIC-LABEL: test_8xi32_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i32> %res
@@ -11283,18 +11245,18 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
 define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11305,18 +11267,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11326,12 +11286,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11340,18 +11300,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11363,18 +11323,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11386,18 +11344,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11409,18 +11367,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11432,18 +11388,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11455,18 +11411,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11478,12 +11432,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
 define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11492,18 +11446,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11515,18 +11469,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
 ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11932,12 +11884,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
 ; GENERIC-LABEL: test_4xi64_shuff_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_shuff_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x i64> %res
@@ -11945,18 +11897,18 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
 define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11967,18 +11919,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11988,18 +11938,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
 define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12010,18 +11960,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12031,18 +11979,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
 define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12053,18 +12001,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12074,12 +12020,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
 define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
 ; GENERIC-LABEL: test_4xi64_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
@@ -12087,18 +12033,18 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
 define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12109,18 +12055,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12130,12 +12074,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
 define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12144,18 +12088,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12167,18 +12111,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12190,18 +12132,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12213,18 +12155,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12236,18 +12176,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12259,18 +12199,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12282,12 +12220,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
 define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12296,18 +12234,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12319,18 +12257,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
 ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
 ; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
index c957a85a88520..0a4334e810803 100644
--- a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
+++ b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
@@ -6,7 +6,7 @@
 define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
 ; CHECK-LABEL: test_8xfloat_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    retq
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
@@ -14,10 +14,10 @@ define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec
 define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -28,10 +28,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x floa
 define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -41,10 +40,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x
 define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -55,10 +54,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x floa
 define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128
{{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -68,10 +66,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -82,10 +80,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -95,7 +92,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -103,10 +100,10 @@ define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -117,10 +114,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> 
%vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -130,7 +126,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -139,10 +135,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -154,10 +150,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -169,10 +164,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -184,10 +179,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: 
test_8xfloat_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -199,10 +193,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -214,10 +208,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -229,7 +222,7 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -238,10 +231,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -253,10 +246,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -522,7 +514,7 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -530,10 +522,10 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -544,10 +536,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -557,10 +548,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -571,10 +562,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: 
vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -584,10 +574,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -598,10 +588,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -611,7 +600,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, < define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -619,10 +608,10 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -633,10 +622,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d define <4 x double> 
@test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -646,7 +634,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, < define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -655,10 +643,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -670,10 +658,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -685,10 +672,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -700,10 +687,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x 
double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -715,10 +701,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -730,10 +716,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -745,7 +730,7 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -754,10 +739,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -769,10 +754,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; CHECK: # 
BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -1038,7 +1022,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res @@ -1046,10 +1030,10 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1060,10 +1044,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1073,10 +1056,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1087,10 +1070,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 
{{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1100,10 +1082,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1114,10 +1096,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1127,7 +1108,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res @@ -1135,10 +1116,10 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1149,10 +1130,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: 
test_8xi32_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1162,7 +1142,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1171,10 +1151,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1186,10 +1166,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1201,10 +1180,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1216,10 +1195,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1231,10 +1209,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1246,10 +1224,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1261,7 +1238,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1270,10 +1247,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1285,10 +1262,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf 
= shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1554,7 +1530,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { ; CHECK-LABEL: test_4xi64_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res @@ -1562,10 +1538,10 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1576,10 +1552,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1589,10 +1564,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1603,10 +1578,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x 
i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1616,10 +1590,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1630,10 +1604,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1643,7 +1616,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { ; CHECK-LABEL: test_4xi64_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res @@ -1651,10 +1624,10 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1665,10 +1638,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq 
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1678,7 +1650,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; CHECK-LABEL: test_4xi64_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1687,10 +1659,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1702,10 +1674,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1717,10 +1688,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1732,10 +1703,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1747,10 +1717,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 
x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1762,10 +1732,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1777,7 +1746,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; CHECK-LABEL: test_4xi64_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1786,10 +1755,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> @@ -1801,10 +1770,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index dd329d21dc977..cbc190d0db35f 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -3963,10 +3963,20 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i } define <16 x i16> 
@concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: retq %ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> %bc0hi = bitcast <8 x i16> %ahi to <16 x i8> diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 3c69f6160ddb4..c5c2312b161e3 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1682,11 +1682,17 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1702,11 +1708,17 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: -; AVX2OR512VL: # BB#0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: 
shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index cf1aaca4ee20e..6e0e80b401650 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -418,21 +418,45 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_3254: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_3254: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3254: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3254: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_3276: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_3276: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3276: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3276: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -1053,8 +1077,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_3254: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1075,8 +1099,8 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_3276: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index b95e7cf008aa2..38891b465776d 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -768,21 +768,33 @@ define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_7654fedc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_7654fedc: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_7654fedc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_fedc7654: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8f32_fedc7654: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_fedc7654: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1789,21 +1801,33 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) { } define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_7654fedc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i32_7654fedc: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_7654fedc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_fedc7654: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v8i32_fedc7654: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_fedc7654: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2177,10 +2201,15 @@ define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) { } define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) { -; ALL-LABEL: concat_v8i32_4567CDEF_bc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufi64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: retq %a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> %a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> %bc0hi = bitcast <4 x i32> %a0hi to <2 x i64> @@ -2191,10 +2220,15 @@ define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) { } define <8 x float> @concat_v8f32_4567CDEF_bc(<8 x float> %f0, <8 x float> %f1) { -; ALL-LABEL: concat_v8f32_4567CDEF_bc: -; ALL: # BB#0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1OR2-LABEL: concat_v8f32_4567CDEF_bc: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: concat_v8f32_4567CDEF_bc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: retq %a0 = bitcast <8 x float> %f0 to <4 x i64> %a1 = bitcast <8 x float> %f1 to <8 x i32> %a0hi = shufflevector <4 x i64> %a0, <4 x i64> undef, <2 x i32> From ce4da272347af8a89a8f2dc388565527a22aa75c Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sat, 4 Nov 2017 06:55:55 +0000 Subject: [PATCH 088/238] llvm/test/lit.cfg.py: Don't set the feature "llvm-64-bits" if -m32 is specified. FIXME: LLVM_BUILD_32_BITS should modify host_triple. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317404 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/lit.cfg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 57dc1f0704920..73a3b4b58a8c7 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -169,7 +169,8 @@ def get_asan_rtlib(): # Features known_arches = ["x86_64", "mips64", "ppc64", "aarch64"] -if any(config.llvm_host_triple.startswith(x) for x in known_arches): +if (config.host_ldflags.find("-m32") < 0 + and any(config.llvm_host_triple.startswith(x) for x in known_arches)): config.available_features.add("llvm-64-bits") # Others/can-execute.txt From dcf1ffe8a0867a311092f2379195e9b646e42c1d Mon Sep 17 00:00:00 2001 From: Sean Fertile Date: Sat, 4 Nov 2017 17:04:39 +0000 Subject: [PATCH 089/238] [LTO][ThinLTO] Use the linker resolutions to mark global values as dso_local. Now that we have a way to mark GlobalValues as local, we can use the symbol resolutions that the linker plugin provides as part of the LTO/ThinLTO link step to refine the compiler's view on what symbols will end up being local. Originally committed as r317374, but reverted in r317395 to update some missed tests.
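As a minimal sketch of the effect (the module and the names @foo and @bar are hypothetical, not taken from this patch): when the linker reports a symbol's resolution as FinalDefinitionInLinkageUnit, the value is marked dso_local in the IR, which lets the backend emit direct references instead of indirecting through the GOT or PLT:

  ; @bar resolved within the linkage unit -> marked dso_local
  @bar = dso_local global i32 42

  ; @foo likewise gets the dso_local marker on its definition
  define dso_local i32 @foo() {
    %v = load i32, i32* @bar
    ret i32 %v
  }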
Differential Revision: https://reviews.llvm.org/D35702 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317408 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/ModuleSummaryIndex.h | 12 +++++-- include/llvm/IR/ModuleSummaryIndexYAML.h | 8 +++-- lib/Analysis/ModuleSummaryAnalysis.cpp | 9 +++--- lib/Bitcode/Reader/BitcodeReader.cpp | 4 ++- lib/Bitcode/Writer/BitcodeWriter.cpp | 2 ++ lib/LTO/LTO.cpp | 21 +++++++++--- lib/Transforms/Utils/FunctionImportUtils.cpp | 17 ++++++++++ test/Bitcode/thinlto-summary-local-5.0.ll | 22 +++++++++++++ test/Bitcode/thinlto-summary-local-5.0.ll.bc | Bin 0 -> 1028 bytes test/LTO/Resolution/X86/comdat-mixed-lto.ll | 2 +- test/LTO/Resolution/X86/comdat.ll | 4 +-- test/LTO/Resolution/X86/commons.ll | 2 +- test/ThinLTO/X86/deadstrip.ll | 30 +++++++++++------- test/ThinLTO/X86/funcimport2.ll | 4 +-- test/ThinLTO/X86/internalize.ll | 9 ++++-- test/ThinLTO/X86/reference_non_importable.ll | 2 +- .../Transforms/LowerTypeTests/import-unsat.ll | 1 + .../PGOProfile/thinlto_samplepgo_icp2.ll | 2 +- .../WholeProgramDevirt/import-indir.ll | 1 + test/tools/gold/X86/asm_undefined2.ll | 3 +- test/tools/gold/X86/coff.ll | 2 +- test/tools/gold/X86/common.ll | 2 +- test/tools/gold/X86/emit-llvm.ll | 6 ++-- test/tools/gold/X86/global_with_section.ll | 16 +++++----- test/tools/gold/X86/parallel.ll | 8 ++--- .../gold/X86/thinlto_linkonceresolution.ll | 2 +- test/tools/gold/X86/thinlto_weak_library.ll | 2 +- test/tools/gold/X86/visibility.ll | 2 +- 28 files changed, 137 insertions(+), 58 deletions(-) create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll create mode 100644 test/Bitcode/thinlto-summary-local-5.0.ll.bc diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index 2d664f41e3ce5..b1e58a2a0d9b3 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -148,11 +148,15 @@ class GlobalValueSummary { /// In combined summary, indicate that the global value is live. unsigned Live : 1; + /// Indicates that the linker resolved the symbol to a definition from + /// within the same linkage unit. + unsigned DSOLocal : 1; + /// Convenience Constructors explicit GVFlags(GlobalValue::LinkageTypes Linkage, - bool NotEligibleToImport, bool Live) + bool NotEligibleToImport, bool Live, bool IsLocal) : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport), - Live(Live) {} + Live(Live), DSOLocal(IsLocal) {} }; private: @@ -229,6 +233,10 @@ class GlobalValueSummary { void setLive(bool Live) { Flags.Live = Live; } + void setDSOLocal(bool Local) { Flags.DSOLocal = Local; } + + bool isDSOLocal() const { return Flags.DSOLocal; } + /// Flag that this global value cannot be imported. 
void setNotEligibleToImport() { Flags.NotEligibleToImport = true; } diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h index 2f9990ca03d85..4687f2d53e7ed 100644 --- a/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -135,7 +135,7 @@ template <> struct MappingTraits { struct FunctionSummaryYaml { unsigned Linkage; - bool NotEligibleToImport, Live; + bool NotEligibleToImport, Live, IsLocal; std::vector TypeTests; std::vector TypeTestAssumeVCalls, TypeCheckedLoadVCalls; @@ -177,6 +177,7 @@ template <> struct MappingTraits { io.mapOptional("Linkage", summary.Linkage); io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport); io.mapOptional("Live", summary.Live); + io.mapOptional("Local", summary.IsLocal); io.mapOptional("TypeTests", summary.TypeTests); io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls); io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls); @@ -211,7 +212,7 @@ template <> struct CustomMappingTraits { Elem.SummaryList.push_back(llvm::make_unique( GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), - FSum.NotEligibleToImport, FSum.Live), + FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal), 0, FunctionSummary::FFlags{}, ArrayRef{}, ArrayRef{}, std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls), @@ -228,7 +229,8 @@ template <> struct CustomMappingTraits { FSums.push_back(FunctionSummaryYaml{ FSum->flags().Linkage, static_cast(FSum->flags().NotEligibleToImport), - static_cast(FSum->flags().Live), FSum->type_tests(), + static_cast(FSum->flags().Live), + static_cast(FSum->flags().DSOLocal), FSum->type_tests(), FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(), FSum->type_test_assume_const_vcalls(), FSum->type_checked_load_const_vcalls()}); diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index afd575e7273cf..82db09ca97b07 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, // FIXME: refactor this to use the same code that inliner is using. 
F.isVarArg(); GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, - /* Live = */ false); + /* Live = */ false, F.isDSOLocal()); FunctionSummary::FFlags FunFlags{ F.hasFnAttribute(Attribute::ReadNone), F.hasFnAttribute(Attribute::ReadOnly), @@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, findRefEdges(Index, &V, RefEdges, Visited); bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, - /* Live = */ false); + /* Live = */ false, V.isDSOLocal()); auto GVarSummary = llvm::make_unique(Flags, RefEdges.takeVector()); if (NonRenamableLocal) @@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, DenseSet &CantBePromoted) { bool NonRenamableLocal = isNonRenamableLocal(A); GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, - /* Live = */ false); + /* Live = */ false, A.isDSOLocal()); auto AS = llvm::make_unique(Flags); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); @@ -410,7 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( assert(GV->isDeclaration() && "Def in module asm already has definition"); GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, /* NotEligibleToImport = */ true, - /* Live = */ true); + /* Live = */ true, + /* Local */ GV->isDSOLocal()); CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (Function *F = dyn_cast(GV)) { diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index c2272260f44c5..d0f11db8f61f3 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -889,7 +889,9 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to work correctly on earlier versions, we must conservatively treat all // values as live. bool Live = (RawFlags & 0x2) || Version < 3; - return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live); + bool Local = (RawFlags & 0x4); + + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1e491aa066ee5..c5d376c9426b8 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -955,6 +955,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { RawFlags |= Flags.NotEligibleToImport; // bool RawFlags |= (Flags.Live << 1); + RawFlags |= (Flags.DSOLocal << 2); + // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 017dd201f9c86..9c737795b5a99 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -630,6 +630,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, NonPrevailingComdats.insert(GV->getComdat()); cast(GV)->setComdat(nullptr); } + + // Set the 'local' flag based on the linker resolution for this symbol. + GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit); } // Common resolution: collect the maximum size/alignment over all commons. 
// We also record if we see an instance of a common as prevailing, so that @@ -643,7 +646,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, CommonRes.Prevailing |= Res.Prevailing; } - // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit. } if (!M.getComdatSymbolTable().empty()) for (GlobalValue &GV : M.global_values()) @@ -698,10 +700,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, assert(ResI != ResE); SymbolResolution Res = *ResI++; - if (Res.Prevailing) { - if (!Sym.getIRName().empty()) { - auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( - Sym.getIRName(), GlobalValue::ExternalLinkage, "")); + if (!Sym.getIRName().empty()) { + auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier( + Sym.getIRName(), GlobalValue::ExternalLinkage, "")); + if (Res.Prevailing) { ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); // For linker redefined symbols (via --wrap or --defsym) we want to @@ -713,6 +715,15 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef Syms, GUID, BM.getModuleIdentifier())) S->setLinkage(GlobalValue::WeakAnyLinkage); } + + // If the linker resolved the symbol to a local definition then mark it + // as local in the summary for the module we are adding. + if (Res.FinalDefinitionInLinkageUnit) { + if (auto S = ThinLTO.CombinedIndex.findSummaryInModule( + GUID, BM.getModuleIdentifier())) { + S->setDSOLocal(true); + } + } } } diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index fbb61ac1ae914..2e6fc4e8482e1 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -203,6 +203,23 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, } void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { + + // Check the summaries to see if the symbol gets resolved to a known local + // definition. + if (GV.hasName()) { + ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID()); + if (VI) { + // Need to check all summaries are local in case of hash collisions. + bool IsLocal = VI.getSummaryList().size() && + llvm::all_of(VI.getSummaryList(), + [](const std::unique_ptr &Summary) { + return Summary->isDSOLocal(); + }); + if (IsLocal) + GV.setDSOLocal(true); + } + } + bool DoPromote = false; if (GV.hasLocalLinkage() && ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll b/test/Bitcode/thinlto-summary-local-5.0.ll new file mode 100644 index 0000000000000..cbc48d23df3c7 --- /dev/null +++ b/test/Bitcode/thinlto-summary-local-5.0.ll @@ -0,0 +1,22 @@ +; Bitcode compatibility test for dso_local flag in thin-lto summaries. +; Checks that older bitcode summaries without the dso_local op are still +; properly parsed and don't set GlobalValues as dso_local. 
+ +; RUN: llvm-dis < %s.bc | FileCheck %s +; RUN: llvm-bcanalyzer -dump %s.bc | FileCheck %s --check-prefix=BCAN + +define void @foo() { +;CHECK-DAG:define void @foo() + ret void +} + +@bar = global i32 0 +;CHECK-DAG: @bar = global i32 0 + +@baz = alias i32, i32* @bar +;CHECK-DAG: @bar = global i32 0 + +;BCAN: +;BCAN-NEXT: +;BCAN-NEXT: diff --git a/test/Bitcode/thinlto-summary-local-5.0.ll.bc b/test/Bitcode/thinlto-summary-local-5.0.ll.bc new file mode 100644 index 0000000000000000000000000000000000000000..8dc7ca0a74b760ce63a2967d78da0abefd37c9aa GIT binary patch literal 1028 zcmZ8fZ)h837=N#q=59%LxpqMlc6Z!$Q}V&g<7!#zCD7!w)>$ECei%$K7eh=XO|NN8 z(+nipWxY&;-e!@2{jlg93>7N)L2zK3)EY`0^}k3d3(dMAwFFnu5BuPIai4s6-{*PX zhv)Zu-{((ITG<|Q0B{%p5V7%8VBPXb*!pP*O9Erv(TRgi^S!Wrf73k0VhWEbA#9E$ zf)4gHL;%3q0Qd!3YYzu{pgjUNRycMk&@eq~CWYb4NYVC4FYaPYZ$x=vwMGgo5`PELlrlr;m=Xu4b@jf#b(Em31jEt*vYt-f;z7$x~7D&j2p}R z(+g?m^4iO|HsjP%6>Yv!cx=Bz?B62(S|Rp$G}5S2?JCvm>Mfbt;Hb3%y&;=_)}`Tw zG+gQGFY4Y!$y-9j#ros$(fjAcA5n3+Zu@3ZQ_g59MNP-CY*we%Iub@Q}+AK3p4 z5pA8=8L_umVh?7)y6LS|rWRnn=cug$vvrGVx9HyyCgz;*E=x}?sbVv9{6QjNZ7z#t zD-+`fy4|P%^BS`uQ+GLfdzF3zVT=!RbVoM7a6$Dp>*BKHhXuJ`^R*?h1h4h-*A|$m zHV6NMV>vgLf-M8rVegE&vRYDZSQ9LFMyW`CnA);5yjefLORZNA-whZo+vVv0-U8o| z0GJ^LYx=IL;u0(X>c67;U@`Q%F?TWxfMJiAke(_$8~h)d)Fq_YAO|AFwu60Yf$TeW z{5If((Xj>^jN%i31bGCk9D<=>24djE0f8U~ipUQJRaahuBl!?K^`na)8;*#aicL>P zB literal 0 HcmV?d00001 diff --git a/test/LTO/Resolution/X86/comdat-mixed-lto.ll b/test/LTO/Resolution/X86/comdat-mixed-lto.ll index f6ee22e4161d9..d6022c643519f 100644 --- a/test/LTO/Resolution/X86/comdat-mixed-lto.ll +++ b/test/LTO/Resolution/X86/comdat-mixed-lto.ll @@ -17,7 +17,7 @@ ; would clash with the copy from this module. ; RUN: llvm-dis %t3.0.0.preopt.bc -o - | FileCheck %s ; CHECK: define internal void @__cxx_global_var_init() section ".text.startup" { -; CHECK: define available_externally void @testglobfunc() section ".text.startup" { +; CHECK: define available_externally dso_local void @testglobfunc() section ".text.startup" { ; ModuleID = 'comdat-mixed-lto.o' source_filename = "comdat-mixed-lto.cpp" diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll index 60d082b3e0f78..94f2838423122 100644 --- a/test/LTO/Resolution/X86/comdat.ll +++ b/test/LTO/Resolution/X86/comdat.ll @@ -70,14 +70,14 @@ bb11: ; CHECK-DAG: @a23 = alias i32 (i8*), i32 (i8*)* @f1.2{{$}} ; CHECK-DAG: @a24 = alias i16, bitcast (i32 (i8*)* @f1.2 to i16*) -; CHECK: define weak_odr i32 @f1(i8*) comdat($c1) { +; CHECK: define weak_odr dso_local i32 @f1(i8*) comdat($c1) { ; CHECK-NEXT: bb10: ; CHECK-NEXT: br label %bb11{{$}} ; CHECK: bb11: ; CHECK-NEXT: ret i32 42 ; CHECK-NEXT: } -; CHECK: define internal i32 @f1.2(i8* %this) comdat($c2) { +; CHECK: define internal dso_local i32 @f1.2(i8* %this) comdat($c2) { ; CHECK-NEXT: bb20: ; CHECK-NEXT: store i8* %this, i8** null ; CHECK-NEXT: br label %bb21 diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll index 28bf1ada4a862..8adfb87d6edf5 100644 --- a/test/LTO/Resolution/X86/commons.ll +++ b/test/LTO/Resolution/X86/commons.ll @@ -4,7 +4,7 @@ ; RUN: llvm-dis -o - %t.out.0.0.preopt.bc | FileCheck %s ; A strong definition should override the common -; CHECK: @x = global i32 42, align 4 +; CHECK: @x = dso_local global i32 42, align 4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll index c19ccb01be3cf..90de3bb9a3223 100644 --- 
a/test/ThinLTO/X86/deadstrip.ll +++ b/test/ThinLTO/X86/deadstrip.ll @@ -18,8 +18,8 @@ ; RUN: -r %t2.bc,_boo,pl \ ; RUN: -r %t2.bc,_dead_func,pl \ ; RUN: -r %t2.bc,_another_dead_func,pl -; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s -; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2 +; RUN: llvm-dis < %t.out.0.3.import.bc | FileCheck %s --check-prefix=LTO2 +; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM ; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED @@ -27,14 +27,14 @@ ; COMBINED-DAG: