[GlobalISel] combine G_TRUNC with G_MERGE_VALUES

Summary:
Truncating the result of a merge most likely means that we could have done without the merge in the first place and used the merge inputs directly. This can be done in three cases:
1. If the truncation result is smaller than the merge source, we can use the source in the trunc directly.
2. If the sizes are the same, we can replace the register or use a copy.
3. If the truncation size is a multiple of the merge source size, we can build a smaller merge.
This gets rid of most of the larger, hard-to-legalize merges.
Reviewers: qcolombet, aditya_nandakumar, aemerson, paquette, arsenm, Petar.Avramovic
Reviewed By: arsenm
Subscribers: sdardis, jvesely, wdng, nhaehnle, rovka, jrtc27, atanasyan, kerbowa, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D75915
llvm · Mar 16, 2020 · c0241f1 · c0241f1
1 parent 7aecf23
commit c0241f1
Show file tree

Hide file tree

Showing 15 changed files with 1,156 additions and 1,154 deletions.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -167,7 +167,8 @@ class LegalizationArtifactCombiner {
 
   bool tryCombineTrunc(MachineInstr &MI,
                        SmallVectorImpl<MachineInstr *> &DeadInsts,
-                       SmallVectorImpl<Register> &UpdatedDefs) {
+                       SmallVectorImpl<Register> &UpdatedDefs,
+                       GISelObserverWrapper &Observer) {
     assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
 
     Builder.setInstr(MI);
@@ -189,6 +190,66 @@ class LegalizationArtifactCombiner {
       }
     }
 
+    // Try to fold trunc(merge) to directly use the source of the merge.
+    // This gets rid of large, difficult to legalize, merges
+    if (SrcMI->getOpcode() == TargetOpcode::G_MERGE_VALUES) {
+      const Register MergeSrcReg = SrcMI->getOperand(1).getReg();
+      const LLT MergeSrcTy = MRI.getType(MergeSrcReg);
+      const LLT DstTy = MRI.getType(DstReg);
+
+      // We can only fold if the types are scalar
+      const unsigned DstSize = DstTy.getSizeInBits();
+      const unsigned MergeSrcSize = MergeSrcTy.getSizeInBits();
+      if (!DstTy.isScalar() || !MergeSrcTy.isScalar())
+        return false;
+
+      if (DstSize < MergeSrcSize) {
+        // When the merge source is larger than the destination, we can just
+        // truncate the merge source directly
+        if (isInstUnsupported({TargetOpcode::G_TRUNC, {DstTy, MergeSrcTy}}))
+          return false;
+
+        LLVM_DEBUG(dbgs() << "Combining G_TRUNC(G_MERGE_VALUES) to G_TRUNC: "
+                          << MI);
+
+        Builder.buildTrunc(DstReg, MergeSrcReg);
+        UpdatedDefs.push_back(DstReg);
+      } else if (DstSize == MergeSrcSize) {
+        // If the sizes match we can simply try to replace the register
+        LLVM_DEBUG(
+            dbgs() << "Replacing G_TRUNC(G_MERGE_VALUES) with merge input: "
+                   << MI);
+        replaceRegOrBuildCopy(DstReg, MergeSrcReg, MRI, Builder, UpdatedDefs,
+                              Observer);
+      } else if (DstSize % MergeSrcSize == 0) {
+        // If the trunc size is a multiple of the merge source size we can use
+        // a smaller merge instead
+        if (isInstUnsupported(
+                {TargetOpcode::G_MERGE_VALUES, {DstTy, MergeSrcTy}}))
+          return false;
+
+        LLVM_DEBUG(
+            dbgs() << "Combining G_TRUNC(G_MERGE_VALUES) to G_MERGE_VALUES: "
+                   << MI);
+
+        const unsigned NumSrcs = DstSize / MergeSrcSize;
+        assert(NumSrcs < SrcMI->getNumOperands() - 1 &&
+               "trunc(merge) should require less inputs than merge");
+        SmallVector<Register, 2> SrcRegs(NumSrcs);
+        for (unsigned i = 0; i < NumSrcs; ++i)
+          SrcRegs[i] = SrcMI->getOperand(i + 1).getReg();
+
+        Builder.buildMerge(DstReg, SrcRegs);
+        UpdatedDefs.push_back(DstReg);
+      } else {
+        // Unable to combine
+        return false;
+      }
+
+      markInstAndDefDead(MI, *SrcMI, DeadInsts);
+      return true;
+    }
+
     return false;
   }
 
@@ -533,7 +594,7 @@ class LegalizationArtifactCombiner {
       Changed = tryCombineExtract(MI, DeadInsts, UpdatedDefs);
       break;
     case TargetOpcode::G_TRUNC:
-      Changed = tryCombineTrunc(MI, DeadInsts, UpdatedDefs);
+      Changed = tryCombineTrunc(MI, DeadInsts, UpdatedDefs, WrapperObserver);
       if (!Changed) {
         // Try to combine truncates away even if they are legal. As all artifact
         // combines at the moment look only "up" the def-use chains, we achieve

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
@@ -8,9 +8,7 @@ body: |
 
     ; CHECK-LABEL: name: test_implicit_def
     ; CHECK: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[DEF]](s64), [[DEF]](s64)
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[MV]](s128)
-    ; CHECK: $x0 = COPY [[TRUNC]](s64)
+    ; CHECK: $x0 = COPY [[DEF]](s64)
     %0:_(s128) = G_IMPLICIT_DEF
     %1:_(s64) = G_TRUNC %0(s128)
     $x0 = COPY %1(s64)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-trunc.mir
@@ -0,0 +1,136 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck %s
+
+---
+name: trunc_s16_merge_s64_s32
+
+body: |
+  bb.0:
+    ; Test that trunc(merge) with trunc-size < merge-source-size creates a trunc
+    ; of the merge source
+    ; CHECK-LABEL: name: trunc_s16_merge_s64_s32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: $vgpr0 = COPY [[C]](s32)
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(s64) = G_MERGE_VALUES %0, %1
+    %3:_(s16) = G_TRUNC %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: trunc_s32_merge_s64_s32
+
+body: |
+  bb.0:
+    ; Test that trunc(merge) with trunc-size == merge-source-size is eliminated
+    ; CHECK-LABEL: name: trunc_s32_merge_s64_s32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: $vgpr0 = COPY [[C]](s32)
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(s64) = G_MERGE_VALUES %0, %1
+    %3:_(s32) = G_TRUNC %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: trunc_s64_merge_s128_s32
+
+body: |
+  bb.0:
+    ; Test that trunc(merge) with trunc-size > merge-source-size combines to a
+    ; smaller merge
+    ; CHECK-LABEL: name: trunc_s64_merge_s128_s32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(s128) = G_MERGE_VALUES %0, %1, %0, %1
+    %3:_(s64) = G_TRUNC %2
+    $vgpr0_vgpr1 = COPY %3
+...
+
+---
+name: trunc_s32_merge_s128_p0
+
+body: |
+  bb.0:
+    ; Test that trunc(merge) with a non-scalar merge source is not combined
+    ; CHECK-LABEL: name: trunc_s32_merge_s128_p0
+    ; CHECK: [[C:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; CHECK: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 1
+    ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[C]](p0), [[C1]](p0)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[MV]](s128)
+    ; CHECK: $vgpr0 = COPY [[TRUNC]](s32)
+    %0:_(p0) = G_CONSTANT i64 0
+    %1:_(p0) = G_CONSTANT i64 1
+    %2:_(s128) = G_MERGE_VALUES %0, %1
+    %3:_(s32) = G_TRUNC %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: trunc_s64_merge_s128_p0
+
+body: |
+  bb.0:
+    ; Test that trunc(merge) with a non-scalar merge source is not combined
+    ; CHECK-LABEL: name: trunc_s64_merge_s128_p0
+    ; CHECK: [[C:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; CHECK: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 1
+    ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[C]](p0), [[C1]](p0)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[MV]](s128)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[TRUNC]](s64)
+    %0:_(p0) = G_CONSTANT i64 0
+    %1:_(p0) = G_CONSTANT i64 1
+    %2:_(s128) = G_MERGE_VALUES %0, %1
+    %3:_(s64) = G_TRUNC %2
+    $vgpr0_vgpr1 = COPY %3
+...
+
+---
+name: trunc_s128_merge_s192_p0
+
+body: |
+  bb.0:
+    ; Test that trunc(merge) with a non-scalar merge source is not combined
+    ; CHECK-LABEL: name: trunc_s128_merge_s192_p0
+    ; CHECK: [[C:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+    ; CHECK: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 1
+    ; CHECK: [[MV:%[0-9]+]]:_(s192) = G_MERGE_VALUES [[C]](p0), [[C1]](p0), [[C]](p0)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s128) = G_TRUNC [[MV]](s192)
+    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[TRUNC]](s128)
+    %0:_(p0) = G_CONSTANT i64 0
+    %1:_(p0) = G_CONSTANT i64 1
+    %2:_(s192) = G_MERGE_VALUES %0, %1, %0
+    %3:_(s128) = G_TRUNC %2
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+
+---
+name: trunc_s68_merge_s128_s32
+
+body: |
+  bb.0:
+    ; Test that trunc(merge) with trunc-size > merge-source-size is not combined
+    ; if trunc-size % merge-source-size != 0
+    ; CHECK-LABEL: name: trunc_s68_merge_s128_s32
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[C]](s32), [[C1]](s32), [[C]](s32), [[C1]](s32)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s68) = G_TRUNC [[MV]](s128)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[TRUNC]](s68)
+    ; CHECK: $vgpr0 = COPY [[TRUNC1]](s32)
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(s32) = G_CONSTANT i32 1
+    %2:_(s128) = G_MERGE_VALUES %0, %1, %0, %1
+    %3:_(s68) = G_TRUNC %2
+    %4:_(s32) = G_TRUNC %3
+    $vgpr0 = COPY %4
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
@@ -292,19 +292,17 @@ body: |
     ; CHECK-LABEL: name: test_bitcast_s24_to_v3s8
     ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[DEF]](s32)
     ; CHECK: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[MV]](s64)
-    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[DEF1]](s64)
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC1]](s32)
-    ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[TRUNC]](s32)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF1]](s64)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s32)
-    ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16)
-    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16)
-    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16)
+    ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
     ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16)
     ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
     ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
@@ -328,23 +326,21 @@ body: |
     ; CHECK-LABEL: name: test_bitcast_s48_to_v3s16
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
     ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
-    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV]](s32), [[UV1]](s32)
     ; CHECK: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[MV]](s64)
-    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s64)
-    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC1]](s32)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s64)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
     ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s32)
-    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[C]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32)
+    ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s32)
     ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
     ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
     ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
     ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
     ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
     ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
-    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[TRUNC1]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
     ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
     ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -207,13 +207,8 @@ body: |
     ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C1]](s64)
     ; CHECK: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
     ; CHECK: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
-    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
-    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-    ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
-    ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[C2]](s64)
-    ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[COPY3]]
-    ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY [[AND1]](s64)
-    ; CHECK: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; CHECK: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[USUBO]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[ZEXT1]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s33) = G_TRUNC %0
     %2:_(s33) = G_CTLZ_ZERO_UNDEF %1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz.mir
@@ -253,13 +253,8 @@ body: |
     ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C3]](s64)
     ; CHECK: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
     ; CHECK: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
-    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
-    ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-    ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
-    ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[C4]](s64)
-    ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[COPY3]]
-    ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY [[AND1]](s64)
-    ; CHECK: $vgpr0_vgpr1 = COPY [[COPY4]](s64)
+    ; CHECK: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[USUBO]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[ZEXT1]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s33) = G_TRUNC %0
     %2:_(s33) = G_CTLZ %1