diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index f9bd233cf8ecf..351ba623e2b6d 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -47,6 +47,7 @@ set(sources X86FixupVectorConstants.cpp X86AvoidStoreForwardingBlocks.cpp X86DynAllocaExpander.cpp + X86EliminateRedundantZeroExtend.cpp X86FixupSetCC.cpp X86FlagsCopyLowering.cpp X86FloatingPoint.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 6261fadf10a7a..cd59eb5c80149 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -127,6 +127,10 @@ FunctionPass *createX86CmovConverterPass(); /// the upper portions of registers, and to save code size. FunctionPass *createX86FixupBWInsts(); +/// Return a Machine IR pass that eliminates redundant zero-extension +/// instructions where the upper bits are already known to be zero. +FunctionPass *createX86EliminateRedundantZeroExtend(); + /// Return a Machine IR pass that reassigns instruction chains from one domain /// to another, when profitable. FunctionPass *createX86DomainReassignmentPass(); diff --git a/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp new file mode 100644 index 0000000000000..72717b1c64794 --- /dev/null +++ b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp @@ -0,0 +1,292 @@ +//===-- X86EliminateRedundantZeroExtend.cpp - Eliminate Redundant ZExt ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This pass eliminates redundant zero-extension instructions where the source +/// register is a sub-register of the destination and the destination's upper +/// bits are known to be zero. +/// +/// For example: +/// movzbl (%rdi), %ecx ; ECX = zero-extend byte, upper 24 bits are zero +/// ... +/// movzbl %cl, %ecx ; Redundant! CL is part of ECX, upper bits already 0 +/// +/// This pattern commonly occurs in loops processing byte values. +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-eliminate-zext" +#define PASS_NAME "X86 Eliminate Redundant Zero Extension" + +namespace { +class EliminateRedundantZeroExtend : public MachineFunctionPass { +public: + static char ID; + EliminateRedundantZeroExtend() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return PASS_NAME; } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().setNoVRegs(); + } + +private: + const X86InstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + + /// Check if the register's upper bits are known to be zero at this point. + /// This checks backward from MI to find the most recent definition of Reg. 
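+  /// The scan is local: it inspects this block and, failing that, a single
+  /// level of predecessors, so a false result means "not provably zero"
+  /// rather than "known nonzero".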
+  bool hasZeroUpperBits(Register Reg, const MachineInstr &MI,
+                        const MachineBasicBlock &MBB) const;
+
+  /// Try to eliminate a redundant MOVZX instruction.
+  bool tryEliminateRedundantZeroExtend(MachineInstr &MI,
+                                       MachineBasicBlock &MBB) const;
+};
+
+char EliminateRedundantZeroExtend::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86EliminateRedundantZeroExtend() {
+  return new EliminateRedundantZeroExtend();
+}
+
+/// Return true if \p Inst writes its 32-bit destination with bits 8-31
+/// guaranteed zero. Note that MOVZX32rr16/MOVZX32rm16 do not qualify: they
+/// zero bits 16-31 but copy bits 8-15 from the source. Likewise, 32-bit
+/// ADD/SUB/LEA only zero bits 32-63 of the 64-bit super-register; they may
+/// leave bits 8-31 set.
+static bool isByteZeroExtendingDef(const MachineInstr &Inst) {
+  switch (Inst.getOpcode()) {
+  // These zero-extend a byte to all 32 bits.
+  case X86::MOVZX32rm8:
+  case X86::MOVZX32rr8:
+  // MOV32r0 explicitly zeros the register.
+  case X86::MOV32r0:
+    return true;
+  // XOR with self zeros the register.
+  case X86::XOR32rr:
+    return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg();
+  default:
+    // Any other definition might set bits 8-31, so it is not safe.
+    return false;
+  }
+}
+
+bool EliminateRedundantZeroExtend::hasZeroUpperBits(
+    Register Reg, const MachineInstr &MI, const MachineBasicBlock &MBB) const {
+  // Walk backward from MI to find the most recent definition of Reg.
+  MachineBasicBlock::const_reverse_iterator I = ++MI.getReverseIterator();
+  MachineBasicBlock::const_reverse_iterator E = MBB.rend();
+  for (; I != E; ++I) {
+    const MachineInstr &Inst = *I;
+
+    // Check whether this instruction defines Reg. Note that operands()
+    // already includes the implicit operands, so implicit defs are covered
+    // here as well.
+    for (const MachineOperand &MO : Inst.operands()) {
+      // A register mask (e.g. on a call) may clobber Reg.
+      if (MO.isRegMask() && MO.clobbersPhysReg(Reg.asMCReg()))
+        return false;
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+
+      Register DefReg = MO.getReg();
+      // Found the most recent definition: safe iff it zeros bits 8-31.
+      if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg))
+        return isByteZeroExtendingDef(Inst);
+
+      // Any other overlapping write is a partial register update - the
+      // upper bits are unknown.
+      if (TRI->regsOverlap(DefReg, Reg))
+        return false;
+    }
+  }
+
+  // Didn't find a definition in this block - check predecessors. The upper
+  // bits are zero only if every predecessor establishes that.
+  if (MBB.pred_empty())
+    return false;
+
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    bool FoundZeroExtend = false;
+
+    // SAFETY CHECK: If only the low byte is live-in to the predecessor, we
+    // assume the parent register was zero-extended in an earlier block, and
+    // then verify that assumption below. The reasoning:
+    // 1. After register allocation, if $cl is live-in but $ecx is not, only
+    //    the low 8 bits carry a meaningful value.
+    // 2. The register allocator ensures no other code modifies $ecx between
+    //    the zero-extension and this point (otherwise $ecx would be live).
+    // 3. Any write to $ch or to the upper bits appears as a def overlapping
+    //    $ecx, which the verification scan below rejects.
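+    //
+    // The shape this is intended to catch is a cmpxchg-style loop (see
+    // atomic-rm-bit-test.ll in this patch), sketched here:
+    //
+    //   bb.0: movzbl (%esi), %eax        ; EAX zero-extended
+    //   bb.1: lock cmpxchgb %cl, (%esi)  ; writes only AL; AL is live-in
+    //   bb.2: movzbl %al, %eax           ; bits 8-31 of EAX are still zero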
+    // Only the 8-bit sub-register justifies the assumption: a live-in 16-bit
+    // sub-register would say nothing about bits 8-15.
+    Register SubReg8 = TRI->getSubReg(Reg, X86::sub_8bit);
+    bool SubRegLiveIn = SubReg8 && Pred->isLiveIn(SubReg8);
+
+    if (SubRegLiveIn) {
+      // Verify the assumption: no instruction in this predecessor may write
+      // Reg, or anything overlapping it, other than the low-byte
+      // sub-register itself. If one does, conservatively bail out.
+      bool SafeToAssume = true;
+      for (const MachineInstr &Inst : *Pred) {
+        for (const MachineOperand &MO : Inst.operands()) {
+          if (MO.isRegMask() && MO.clobbersPhysReg(Reg.asMCReg())) {
+            SafeToAssume = false;
+            break;
+          }
+          if (MO.isReg() && MO.isDef()) {
+            Register DefReg = MO.getReg();
+            // A write to anything overlapping Reg other than the low-byte
+            // sub-register may change bits 8-31.
+            if ((DefReg == Reg || TRI->regsOverlap(DefReg, Reg)) &&
+                DefReg != SubReg8) {
+              SafeToAssume = false;
+              break;
+            }
+          }
+        }
+        if (!SafeToAssume)
+          break;
+      }
+
+      if (SafeToAssume) {
+        FoundZeroExtend = true;
+        goto next_predecessor;
+      }
+    }
+
+    // Walk backward through the predecessor to find its last definition of
+    // Reg.
+    for (const MachineInstr &Inst : llvm::reverse(*Pred)) {
+      for (const MachineOperand &MO : Inst.operands()) {
+        if (MO.isRegMask() && MO.clobbersPhysReg(Reg.asMCReg()))
+          return false;
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+
+        Register DefReg = MO.getReg();
+        if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+          // Found the definition in this predecessor.
+          if (!isByteZeroExtendingDef(Inst))
+            return false;
+          FoundZeroExtend = true;
+          goto next_predecessor;
+        }
+
+        // Partial register update - bits 8-31 are unknown.
+        if (TRI->regsOverlap(DefReg, Reg))
+          return false;
+      }
+    }
+
+  next_predecessor:
+    // If we didn't find a zero-extending definition in this predecessor,
+    // fail.
+    if (!FoundZeroExtend)
+      return false;
+  }
+
+  // All predecessors establish zero upper bits.
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::tryEliminateRedundantZeroExtend(
+    MachineInstr &MI, MachineBasicBlock &MBB) const {
+  // Only handle MOVZX32rr8 for now. Extending this to MOVZX32rr16 would
+  // require a variant of hasZeroUpperBits that reasons about bits 16-31.
+  if (MI.getOpcode() != X86::MOVZX32rr8)
+    return false;
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  // The source must be the *low* 8-bit sub-register of the destination:
+  // movzbl %cl, %ecx is a no-op when bits 8-31 of ECX are zero, but
+  // movzbl %ch, %ecx never is.
+  if (TRI->getSubReg(DstReg, X86::sub_8bit) != SrcReg)
+    return false;
+
+  // Check if the destination's upper bits are already zero.
+  if (!hasZeroUpperBits(DstReg, MI, MBB))
+    return false;
+
+  // The MOVZX is redundant: SrcReg is the low byte of DstReg and DstReg's
+  // upper bits are already zero, so this instruction does nothing.
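+  //
+  // For example, in a byte-processing loop:
+  //   movzbl (%rdi), %ecx  ; ECX = zero-extended byte, bits 8-31 are zero
+  //   ...
+  //   movzbl %cl, %ecx     ; no-op on this path, so erase it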
+  LLVM_DEBUG(dbgs() << "Eliminating redundant zero-extend: " << MI);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Use an early-increment range so that erasing MI inside the callee
+    // does not invalidate the iteration.
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
+      if (tryEliminateRedundantZeroExtend(MI, MBB))
+        Changed = true;
+  }
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d97ccea..72835150e8277 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -558,6 +558,7 @@ void X86PassConfig::addPreEmitPass() {
   if (getOptLevel() != CodeGenOptLevel::None) {
     addPass(createX86FixupBWInsts());
+    addPass(createX86EliminateRedundantZeroExtend());
     addPass(createX86PadShortFunctions());
     addPass(createX86FixupLEAs());
     addPass(createX86FixupInstTuning());
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index b4d40fee01e41..a283a002d9818 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -177,7 +177,6 @@ define zeroext i8 @atomic_shl1_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-NEXT:    lock cmpxchgb %cl, (%esi)
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:    # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    testl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
@@ -198,7 +197,6 @@ define zeroext i8 @atomic_shl1_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
 ; X64-NEXT:    lock cmpxchgb %cl, (%rdi)
 ; X64-NEXT:    jne .LBB3_1
 ; X64-NEXT:    # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
@@ -233,7 +231,6 @@ define zeroext i8 @atomic_shl1_mask0_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounw
 ; X86-NEXT:    lock cmpxchgb %cl, (%esi)
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:    # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movzbl %dl, %ecx
 ; X86-NEXT:    btl %ecx, %eax
 ; X86-NEXT:    setae %al
@@ -255,7 +252,6 @@ define zeroext i8 @atomic_shl1_mask0_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounw
 ; X64-NEXT:    lock cmpxchgb %cl, (%rdi)
 ; X64-NEXT:    jne .LBB4_1
 ; X64-NEXT:    # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    btl %ecx, %eax
 ; X64-NEXT:    setae %al
@@ -291,7 +287,6 @@ define zeroext i8 @atomic_shl1_mask01_xor_8_gpr_valz(ptr %v, i8 zeroext %c) noun
 ; X86-NEXT:    lock cmpxchgb %cl, (%edx)
 ; X86-NEXT:    jne .LBB5_1
 ; X86-NEXT:    # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    testl %eax, %ebx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %ebx
@@ -313,7 +308,6 @@ define zeroext i8 @atomic_shl1_mask01_xor_8_gpr_valz(ptr %v, i8 zeroext %c) noun
 ; X64-NEXT:    lock cmpxchgb %cl, (%rdi)
 ; X64-NEXT:    jne .LBB5_1
 ; X64-NEXT:    # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
@@ -349,7 +343,6 @@ define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-NEXT:    lock cmpxchgb %ch, (%edx)
 ; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:    # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    testl %eax, %ebx
 ; X86-NEXT:    je .LBB6_3
 ; X86-NEXT:    # %bb.4: # %if.then
@@ -378,7 +371,6 @@ define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
 ; X64-NEXT:    lock cmpxchgb %r8b, (%rdi)
 ; X64-NEXT:    jne .LBB6_1
 ; X64-NEXT:    # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    je .LBB6_3
 ; X64-NEXT:    # %bb.4: # %if.then
@@ -538,7 +530,6 @@ define zeroext i8 @atomic_shl1_mask01_and_8_gpr_brnz(ptr %v, i8 zeroext %c) noun
 ; X64-NEXT:    lock cmpxchgb %r8b, (%rdi)
 ; X64-NEXT:    jne .LBB8_1
 ; X64-NEXT:    # %bb.2: # %atomicrmw.end
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    testl %eax, %edx
 ; X64-NEXT:    je .LBB8_3
 ; X64-NEXT:    # %bb.4: # %if.then
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 1267fe9033454..a3d28a7fcba24 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -224,7 +224,6 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-NOCMOV-NEXT:    testb %al, %al
 ; X86-NOCMOV-NEXT:    je .LBB4_1
 ; X86-NOCMOV-NEXT:    # %bb.2: # %cond.false
-; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
 ; X86-NOCMOV-NEXT:    xorl $7, %eax
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
@@ -961,7 +960,6 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-NOCMOV-NEXT:    testb %al, %al
 ; X86-NOCMOV-NEXT:    je .LBB16_1
 ; X86-NOCMOV-NEXT:    # %bb.2: # %cond.false
-; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
 ; X86-NOCMOV-NEXT:    xorl $7, %eax
 ; X86-NOCMOV-NEXT:    xorb $7, %al
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
new file mode 100644
index 0000000000000..294a6e7f780e3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+; Test that redundant MOVZX instructions are eliminated when the source
+; register is the low sub-register of the destination and the destination's
+; upper bits are already known to be zero.
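+; The interesting check below is the loop body (.LBB0_2): it reloads the
+; next byte with movzbl but no longer re-extends %cl into %ecx before the
+; table load.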
+ +; This is the original countholes test case from GitHub issue #160710 that demonstrates +; the redundant movzbl %cl, %ecx in the loop +define i32 @countholes(ptr %s) { +; CHECK-LABEL: countholes: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl (%rdi), %ecx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpb $48, %cl +; CHECK-NEXT: jb .LBB0_3 +; CHECK-NEXT: # %bb.1: # %while.body.preheader +; CHECK-NEXT: incq %rdi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # %while.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: addl pre_table-192(,%rcx,4), %eax +; CHECK-NEXT: movzbl (%rdi), %ecx +; CHECK-NEXT: incq %rdi +; CHECK-NEXT: cmpb $47, %cl +; CHECK-NEXT: ja .LBB0_2 +; CHECK-NEXT: .LBB0_3: # %cleanup +; CHECK-NEXT: retq +entry: + %c.0 = load i8, ptr %s, align 1 + %conv = zext i8 %c.0 to i32 + %cmp = icmp ult i8 %c.0, 48 + br i1 %cmp, label %cleanup, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %s.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %s, %while.body.preheader ] + %c.010 = phi i8 [ %c.1, %while.body ], [ %c.0, %while.body.preheader ] + %tot.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ] + %conv3 = zext i8 %c.010 to i64 + %sub = add nsw i64 %conv3, -48 + %arrayidx = getelementptr inbounds [10 x i32], ptr @pre_table, i64 0, i64 %sub + %0 = load i32, ptr %arrayidx, align 4 + %add = add i32 %0, %tot.09 + %incdec.ptr = getelementptr inbounds i8, ptr %s.addr.011, i64 1 + %c.1 = load i8, ptr %incdec.ptr, align 1 + %cmp1 = icmp ult i8 %c.1, 48 + br i1 %cmp1, label %cleanup.loopexit, label %while.body + +cleanup.loopexit: + br label %cleanup + +cleanup: + %retval.0 = phi i32 [ 0, %entry ], [ %add, %cleanup.loopexit ] + ret i32 %retval.0 +} + +@pre_table = internal constant [10 x i32] [i32 1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 2, i32 1], align 4 diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll index d013ad2c7fbff..783db3487e2bd 100644 --- a/llvm/test/CodeGen/X86/isel-select-cmov.ll +++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll @@ -73,11 +73,9 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b ; FAST-X86-NEXT: jne LBB0_1 ; FAST-X86-NEXT: ## %bb.2: ; FAST-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FAST-X86-NEXT: movzbl %al, %eax ; FAST-X86-NEXT: retl ; FAST-X86-NEXT: LBB0_1: ; FAST-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FAST-X86-NEXT: movzbl %al, %eax ; FAST-X86-NEXT: retl ; ; FAST-X86-CMOV-LABEL: select_cmov_i8: @@ -86,11 +84,9 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b ; FAST-X86-CMOV-NEXT: jne LBB0_1 ; FAST-X86-CMOV-NEXT: ## %bb.2: ; FAST-X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FAST-X86-CMOV-NEXT: movzbl %al, %eax ; FAST-X86-CMOV-NEXT: retl ; FAST-X86-CMOV-NEXT: LBB0_1: ; FAST-X86-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; FAST-X86-CMOV-NEXT: movzbl %al, %eax ; FAST-X86-CMOV-NEXT: retl ; ; GISEL-X86-LABEL: select_cmov_i8: diff --git a/llvm/test/CodeGen/X86/isel-udiv.ll b/llvm/test/CodeGen/X86/isel-udiv.ll index b123b3c7780fa..f96a12c2fafd0 100644 --- a/llvm/test/CodeGen/X86/isel-udiv.ll +++ b/llvm/test/CodeGen/X86/isel-udiv.ll @@ -22,7 +22,6 @@ define i8 @test_udiv_i8(i8 %arg1, i8 %arg2) nounwind { ; GISEL-X86-LABEL: test_udiv_i8: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; GISEL-X86-NEXT: movzbl %al, %eax ; GISEL-X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; GISEL-X86-NEXT: divb %cl ; 
GISEL-X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/isel-urem.ll b/llvm/test/CodeGen/X86/isel-urem.ll
index 386f08151ad9c..5dd901fe8daa6 100644
--- a/llvm/test/CodeGen/X86/isel-urem.ll
+++ b/llvm/test/CodeGen/X86/isel-urem.ll
@@ -49,7 +49,6 @@ define i8 @test_urem_i8(i8 %arg1, i8 %arg2) nounwind {
 ; GISEL-X86-LABEL: test_urem_i8:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; GISEL-X86-NEXT:    movzbl %al, %eax
 ; GISEL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; GISEL-X86-NEXT:    divb %cl
 ; GISEL-X86-NEXT:    movb %ah, %al
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 81390e59d0d0a..01385fb63d6e1 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -202,6 +202,7 @@
 ; CHECK-NEXT:       X86 vzeroupper inserter
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Byte/Word Instruction Fixup
+; CHECK-NEXT:       X86 Eliminate Redundant Zero Extension
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Atom pad short functions
 ; CHECK-NEXT:       X86 LEA Fixup
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 412455384e937..147abcdbff0b9 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -28,7 +28,6 @@ define void @f() nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    movzbl (%eax), %ecx
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    divb %cl
 ; X86-NEXT:    movl %edi, %eax
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 39cbee54737c3..b0c92831124bf 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -89,7 +89,6 @@ define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs,
 ; X64-NEXT:    jne .LBB2_2
 ; X64-NEXT:    # %bb.1:
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB2_2: # %compare
 ; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -222,7 +221,6 @@ define i32 @pcmpestri_mem_diff_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
 ; X64-NEXT:    jne .LBB5_2
 ; X64-NEXT:    # %bb.1:
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB5_2: # %compare
 ; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
@@ -552,7 +550,6 @@ define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
 ; X86-NEXT:    jne .LBB14_2
 ; X86-NEXT:    # %bb.1:
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB14_2: # %compare
 ; X86-NEXT:    pushl %ebp
@@ -577,7 +574,6 @@ define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
 ; X64-NEXT:    jne .LBB14_2
 ; X64-NEXT:    # %bb.1:
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    retq
 ; 
X64-NEXT: .LBB14_2: # %compare ; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) @@ -690,7 +686,6 @@ define i32 @pcmpistri_mem_diff_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind { ; X64-NEXT: jne .LBB17_2 ; X64-NEXT: # %bb.1: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: movzbl %al, %eax ; X64-NEXT: retq ; X64-NEXT: .LBB17_2: # %compare ; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index ac932d51017ae..53e6e49268789 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -2012,7 +2012,6 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX2-NEXT: andl $63, %ecx ; AVX2-NEXT: vpextrb $10, %xmm0, (%rsp,%rcx) ; AVX2-NEXT: movzbl 56(%rbp), %ecx -; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addq %rax, %rcx ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax @@ -3348,7 +3347,6 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX2-NEXT: addl %r8d, %r9d ; AVX2-NEXT: movzbl 16(%rbp), %ecx ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%r9,4) -; AVX2-NEXT: movzbl %cl, %ecx ; AVX2-NEXT: andl $1, %ecx ; AVX2-NEXT: addl %r9d, %ecx ; AVX2-NEXT: movzbl 24(%rbp), %edx