Skip to content

Conversation

FranciscoThiesen
Copy link
Contributor

This pass eliminates redundant MOVZX32rr8 instructions when the source register is a sub-register of the destination and the destination's upper bits are already known to be zero.

For example, in loops processing byte values:

  movzbl (%rdi), %ecx  ; ECX upper 24 bits are zero
  ...
  movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0

The optimization:

  • Runs post-register allocation in the X86 backend pipeline
  • Analyzes backward through basic blocks to verify upper bits are zero
  • Handles cross-block analysis by checking predecessor definitions
  • Only eliminates when provably safe (not heuristic)

This commonly occurs in loops that process byte values, saving one instruction per loop iteration and reducing code size by 3 bytes.

This is to solve the problem outlined in #160710

This pass eliminates redundant MOVZX32rr8 instructions when the source
register is a sub-register of the destination and the destination's upper
bits are already known to be zero.

For example, in loops processing byte values:
```
  movzbl (%rdi), %ecx  ; ECX upper 24 bits are zero
  ...
  movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0
```

The optimization:
- Runs post-register allocation in the X86 backend pipeline
- Analyzes backward through basic blocks to verify upper bits are zero
- Handles cross-block analysis by checking predecessor definitions
- Only eliminates when provably safe (not heuristic)

This commonly occurs in loops that process byte values, saving one
instruction per loop iteration and reducing code size by 3 bytes.
@llvmbot
Copy link
Member

llvmbot commented Sep 30, 2025

@llvm/pr-subscribers-backend-x86

Author: Francisco Geiman Thiesen (FranciscoThiesen)

Changes

This pass eliminates redundant MOVZX32rr8 instructions when the source register is a sub-register of the destination and the destination's upper bits are already known to be zero.

For example, in loops processing byte values:

  movzbl (%rdi), %ecx  ; ECX upper 24 bits are zero
  ...
  movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0

The optimization:

  • Runs post-register allocation in the X86 backend pipeline
  • Analyzes backward through basic blocks to verify upper bits are zero
  • Handles cross-block analysis by checking predecessor definitions
  • Only eliminates when provably safe (not heuristic)

This commonly occurs in loops that process byte values, saving one instruction per loop iteration and reducing code size by 3 bytes.

This is to solve the problem outlined in #160710


Full diff: https://github.com/llvm/llvm-project/pull/161401.diff

5 Files Affected:

  • (modified) llvm/lib/Target/X86/CMakeLists.txt (+1)
  • (modified) llvm/lib/Target/X86/X86.h (+4)
  • (added) llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp (+292)
  • (modified) llvm/lib/Target/X86/X86TargetMachine.cpp (+1)
  • (added) llvm/test/CodeGen/X86/eliminate-redundant-zext.ll (+63)
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index f9bd233cf8ecf..351ba623e2b6d 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -47,6 +47,7 @@ set(sources
   X86FixupVectorConstants.cpp
   X86AvoidStoreForwardingBlocks.cpp
   X86DynAllocaExpander.cpp
+  X86EliminateRedundantZeroExtend.cpp
   X86FixupSetCC.cpp
   X86FlagsCopyLowering.cpp
   X86FloatingPoint.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 6261fadf10a7a..cd59eb5c80149 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -127,6 +127,10 @@ FunctionPass *createX86CmovConverterPass();
 /// the upper portions of registers, and to save code size.
 FunctionPass *createX86FixupBWInsts();
 
+/// Return a Machine IR pass that eliminates redundant zero-extension
+/// instructions where the upper bits are already known to be zero.
+FunctionPass *createX86EliminateRedundantZeroExtend();
+
 /// Return a Machine IR pass that reassigns instruction chains from one domain
 /// to another, when profitable.
 FunctionPass *createX86DomainReassignmentPass();
diff --git a/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
new file mode 100644
index 0000000000000..72717b1c64794
--- /dev/null
+++ b/llvm/lib/Target/X86/X86EliminateRedundantZeroExtend.cpp
@@ -0,0 +1,292 @@
+//===-- X86EliminateRedundantZeroExtend.cpp - Eliminate Redundant ZExt ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass eliminates redundant zero-extension instructions where the source
+/// register is a sub-register of the destination and the destination's upper
+/// bits are known to be zero.
+///
+/// For example:
+///   movzbl (%rdi), %ecx  ; ECX = zero-extend byte, upper 24 bits are zero
+///   ...
+///   movzbl %cl, %ecx     ; Redundant! CL is part of ECX, upper bits already 0
+///
+/// This pattern commonly occurs in loops processing byte values.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-eliminate-zext"
+#define PASS_NAME "X86 Eliminate Redundant Zero Extension"
+
+namespace {
+class EliminateRedundantZeroExtend : public MachineFunctionPass {
+public:
+  static char ID;
+  EliminateRedundantZeroExtend() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return PASS_NAME; }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().setNoVRegs();
+  }
+
+private:
+  const X86InstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+
+  /// Check if the register's upper bits are known to be zero at this point.
+  /// This checks backward from MI to find the most recent definition of Reg.
+  bool hasZeroUpperBits(Register Reg, const MachineInstr &MI,
+                        const MachineBasicBlock &MBB) const;
+
+  /// Try to eliminate a redundant MOVZX instruction.
+  bool tryEliminateRedundantZeroExtend(MachineInstr &MI,
+                                       MachineBasicBlock &MBB) const;
+};
+
+char EliminateRedundantZeroExtend::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86EliminateRedundantZeroExtend() {
+  return new EliminateRedundantZeroExtend();
+}
+
+bool EliminateRedundantZeroExtend::hasZeroUpperBits(
+    Register Reg, const MachineInstr &MI, const MachineBasicBlock &MBB) const {
+  // Walk backward from MI to find the most recent definition of Reg
+  MachineBasicBlock::const_reverse_iterator I = ++MI.getReverseIterator();
+  MachineBasicBlock::const_reverse_iterator E = MBB.rend();
+  for (; I != E; ++I) {
+    const MachineInstr &Inst = *I;
+
+    // Check if this instruction defines Reg
+    for (const MachineOperand &MO : Inst.operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+
+      Register DefReg = MO.getReg();
+      if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+        // Found a definition - check if it zeros upper bits
+        unsigned Opc = Inst.getOpcode();
+        switch (Opc) {
+        // These instructions zero-extend to 32 bits
+        case X86::MOVZX32rm8:
+        case X86::MOVZX32rr8:
+        case X86::MOVZX32rm16:
+        case X86::MOVZX32rr16:
+          return true;
+        // XOR with self zeros the register
+        case X86::XOR32rr:
+          if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+            return true;
+          return false;
+        // MOV32r0 explicitly zeros
+        case X86::MOV32r0:
+          return true;
+        // ADD, SUB on 32-bit register (implicitly zero-extends to 64-bit)
+        case X86::ADD32rr:
+        case X86::ADD32ri:
+        case X86::ADD32rm:
+        case X86::SUB32rr:
+        case X86::SUB32ri:
+        case X86::SUB32rm:
+        case X86::LEA32r:
+          return true;
+        default:
+          // Any other definition might set upper bits, so not safe
+          return false;
+        }
+      }
+
+      // Check if this instruction modifies Reg (partial write or implicit use)
+      if (TRI->regsOverlap(DefReg, Reg)) {
+        // Partial register update - upper bits are unknown
+        return false;
+      }
+    }
+
+    // Check for implicit defs
+    for (const MachineOperand &MO : Inst.implicit_operands()) {
+      if (MO.isReg() && MO.isDef() && TRI->regsOverlap(MO.getReg(), Reg)) {
+        return false;
+      }
+    }
+  }
+
+  // Didn't find a definition in this block - check predecessors
+  // If all predecessors define Reg with zero upper bits, it's safe
+  if (MBB.pred_empty())
+    return false;
+
+  // Check all predecessor blocks
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    bool FoundZeroExtend = false;
+
+    // SAFETY CHECK: If the sub-register is live-in to the predecessor,
+    // we make the CONSERVATIVE assumption that the parent register was
+    // zero-extended in an earlier block.
+    //
+    // This is safe because:
+    // 1. After register allocation, if $cl is live-in but $ecx is not,
+    //    it means only the low 8 bits are meaningful
+    // 2. The register allocator ensures no other code modifies $ecx between
+    //    the zero-extension and this point (otherwise $ecx would be live)
+    // 3. Any write to $ch or upper bits would show as a def of $ecx, which
+    //    would be found in our backward scan below and handled correctly
+    //
+    // However, this is still conservative - we should verify the actual
+    // definition to be completely safe.
+    Register SubReg8 = TRI->getSubReg(Reg, X86::sub_8bit);
+    Register SubReg16 = TRI->getSubReg(Reg, X86::sub_16bit);
+    bool SubRegLiveIn = (SubReg8 && Pred->isLiveIn(SubReg8)) ||
+                        (SubReg16 && Pred->isLiveIn(SubReg16));
+
+    if (SubRegLiveIn) {
+      // Sub-register is live-in. We'll verify this is safe by checking
+      // that no instructions in this block modify the parent register
+      // before we reach the end (where control flows to our block).
+      // If we find any such modification, we'll conservatively bail out.
+      bool SafeToAssume = true;
+      for (const MachineInstr &Inst : *Pred) {
+        for (const MachineOperand &MO : Inst.operands()) {
+          if (MO.isReg() && MO.isDef()) {
+            Register DefReg = MO.getReg();
+            // Check if this modifies Reg or overlaps with it (partial write)
+            if ((DefReg == Reg || TRI->regsOverlap(DefReg, Reg)) &&
+                DefReg != SubReg8 && DefReg != SubReg16) {
+              // Found a write to the parent register or overlapping register
+              // that's not just the sub-register we expect
+              SafeToAssume = false;
+              break;
+            }
+          }
+        }
+        if (!SafeToAssume)
+          break;
+      }
+
+      if (SafeToAssume) {
+        FoundZeroExtend = true;
+        goto next_predecessor;
+      }
+    }
+
+    // Walk backward through predecessor to find last definition of Reg
+    for (const MachineInstr &Inst : llvm::reverse(*Pred)) {
+      // Check if this instruction defines Reg
+      for (const MachineOperand &MO : Inst.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+
+        Register DefReg = MO.getReg();
+        if (DefReg == Reg || TRI->isSuperRegister(Reg, DefReg)) {
+          // Found a definition - check if it zeros upper bits
+          unsigned Opc = Inst.getOpcode();
+          switch (Opc) {
+          case X86::MOVZX32rm8:
+          case X86::MOVZX32rr8:
+          case X86::MOVZX32rm16:
+          case X86::MOVZX32rr16:
+          case X86::MOV32r0:
+          case X86::ADD32rr:
+          case X86::ADD32ri:
+          case X86::ADD32rm:
+          case X86::SUB32rr:
+          case X86::SUB32ri:
+          case X86::SUB32rm:
+          case X86::LEA32r:
+            FoundZeroExtend = true;
+            break;
+          case X86::XOR32rr:
+            if (Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg())
+              FoundZeroExtend = true;
+            break;
+          default:
+            // Found a definition that doesn't zero upper bits
+            return false;
+          }
+          // Found the definition in this predecessor
+          goto next_predecessor;
+        }
+
+        // Check for partial register updates
+        if (TRI->regsOverlap(DefReg, Reg)) {
+          return false;
+        }
+      }
+    }
+
+  next_predecessor:
+    // If we didn't find a zero-extending definition in this predecessor, fail
+    if (!FoundZeroExtend)
+      return false;
+  }
+
+  // All predecessors have zero-extending definitions
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::tryEliminateRedundantZeroExtend(
+    MachineInstr &MI, MachineBasicBlock &MBB) const {
+  unsigned Opc = MI.getOpcode();
+
+  // Only handle MOVZX32rr8 for now (can extend to MOVZX32rr16 later)
+  if (Opc != X86::MOVZX32rr8)
+    return false;
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  // Check if source is a sub-register of destination
+  // e.g., CL is sub-register of ECX
+  if (!TRI->isSubRegister(DstReg, SrcReg))
+    return false;
+
+  // Check if destination's upper bits are already zero
+  if (!hasZeroUpperBits(DstReg, MI, MBB))
+    return false;
+
+  // The MOVZX is redundant! Since SrcReg is part of DstReg and DstReg's
+  // upper bits are already zero, this instruction does nothing.
+  LLVM_DEBUG(dbgs() << "Eliminating redundant zero-extend: " << MI);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool EliminateRedundantZeroExtend::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Iterate through instructions - use a worklist to handle erasures
+    SmallVector<MachineInstr *, 4> ToErase;
+
+    for (MachineInstr &MI : MBB) {
+      if (tryEliminateRedundantZeroExtend(MI, MBB)) {
+        Changed = true;
+        // Note: MI is already erased in tryEliminateRedundantZeroExtend
+        break; // Restart iteration for this block
+      }
+    }
+  }
+
+  return Changed;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d97ccea..72835150e8277 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -558,6 +558,7 @@ void X86PassConfig::addPreEmitPass() {
 
   if (getOptLevel() != CodeGenOptLevel::None) {
     addPass(createX86FixupBWInsts());
+    addPass(createX86EliminateRedundantZeroExtend());
     addPass(createX86PadShortFunctions());
     addPass(createX86FixupLEAs());
     addPass(createX86FixupInstTuning());
diff --git a/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
new file mode 100644
index 0000000000000..2c9e46e043187
--- /dev/null
+++ b/llvm/test/CodeGen/X86/eliminate-redundant-zext.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s
+
+; Test that redundant MOVZX instructions are eliminated when the source
+; register is a sub-register of the destination and the destination's upper
+; bits are already known to be zero.
+
+; This is the original countholes test case from GitHub issue that demonstrates
+; the redundant movzbl %cl, %ecx in the loop
+define i32 @countholes(ptr %s) {
+; CHECK-LABEL: countholes:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl (%rdi), %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpb $48, %cl
+; CHECK-NEXT:    jb .LBB0_3
+; CHECK-NEXT:  # %bb.1: # %while.body.preheader
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    leaq pre_table(%rip), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %while.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    addl $-48, %ecx
+; CHECK-NEXT:    addl (%rdx,%rcx,4), %eax
+; CHECK-NEXT:    movzbl (%rdi), %ecx
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    cmpb $47, %cl
+; CHECK-NEXT:    ja .LBB0_2
+; CHECK-NEXT:  .LBB0_3: # %cleanup
+; CHECK-NEXT:    retq
+entry:
+  %c.0 = load i8, ptr %s, align 1
+  %conv = zext i8 %c.0 to i32
+  %cmp = icmp ult i8 %c.0, 48
+  br i1 %cmp, label %cleanup, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %s.addr.011 = phi ptr [ %incdec.ptr, %while.body ], [ %s, %while.body.preheader ]
+  %c.010 = phi i8 [ %c.1, %while.body ], [ %c.0, %while.body.preheader ]
+  %tot.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %conv3 = zext i8 %c.010 to i64
+  %sub = add nsw i64 %conv3, -48
+  %arrayidx = getelementptr inbounds [10 x i32], ptr @pre_table, i64 0, i64 %sub
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add i32 %0, %tot.09
+  %incdec.ptr = getelementptr inbounds i8, ptr %s.addr.011, i64 1
+  %c.1 = load i8, ptr %incdec.ptr, align 1
+  %cmp1 = icmp ult i8 %c.1, 48
+  br i1 %cmp1, label %cleanup.loopexit, label %while.body
+
+cleanup.loopexit:
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ 0, %entry ], [ %add, %cleanup.loopexit ]
+  ret i32 %retval.0
+}
+
+@pre_table = internal constant [10 x i32] [i32 1, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 2, i32 1], align 4

@FranciscoThiesen
Copy link
Contributor Author

@RKSimon can I get a re-review?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants