[GlobalISel] Handle non-multiples of the base type in narrowScalarInsert
When narrowing G_INSERT, handle types that aren't a multiple of the
type we're narrowing to. This comes up when narrowing something like
an s96 to fit in 64-bit registers, and also for non-byte-multiple
packed types, should they arise.

These cases are handled by extending the leftover bits to the narrow
size, performing the insert at that width, and then truncating the
result back to the destination size.

Differential Revision: https://reviews.llvm.org/D97791
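
As a rough illustration of the approach (a simplified sketch, not the literal legalizer output; register names and the extract sequence are adapted from the new legalize-inserts.mir checks below), narrowing an s96 G_INSERT to s64 proceeds like this: the full-width low part is kept as-is, the leftover high 32 bits are any-extended to s64 so the insert can happen at the narrow width, and the merged result is truncated back down.

  ; Sketch: narrowing  %dst:_(s96) = G_INSERT %src, %val(s32), 64  with NarrowTy = s64
  %lo:_(s64) = G_EXTRACT %src(s96), 0          ; full-width part, untouched by the insert
  %hi:_(s32) = G_EXTRACT %src(s96), 64         ; leftover part, smaller than NarrowTy
  %hi64:_(s64) = G_ANYEXT %hi(s32)             ; extend the leftover to the narrow type
  %ins:_(s64) = G_INSERT %hi64, %val(s32), 0   ; do the insert at the narrow width
  %wide:_(s128) = G_MERGE_VALUES %lo(s64), %ins(s64)
  %dst:_(s96) = G_TRUNC %wide(s128)            ; truncate back to the destination size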
bogner committed Jun 8, 2021
1 parent 30bb5dc commit 2a7e759
Showing 3 changed files with 98 additions and 55 deletions.
57 changes: 33 additions & 24 deletions llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5100,37 +5100,43 @@ LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
   if (TypeIdx != 0)
     return UnableToLegalize;

-  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-  uint64_t NarrowSize = NarrowTy.getSizeInBits();
-
-  // FIXME: add support for when SizeOp0 isn't an exact multiple of
-  // NarrowSize.
-  if (SizeOp0 % NarrowSize != 0)
-    return UnableToLegalize;
-
-  int NumParts = SizeOp0 / NarrowSize;
-
-  SmallVector<Register, 2> SrcRegs, DstRegs;
+  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
   SmallVector<uint64_t, 2> Indexes;
-  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
+  LLT LeftoverTy;
+  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
+               LeftoverRegs);
+
+  for (Register Reg : LeftoverRegs)
+    SrcRegs.push_back(Reg);

+  uint64_t NarrowSize = NarrowTy.getSizeInBits();
   Register OpReg = MI.getOperand(2).getReg();
   uint64_t OpStart = MI.getOperand(3).getImm();
   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
-  for (int i = 0; i < NumParts; ++i) {
-    unsigned DstStart = i * NarrowSize;
+  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
+    unsigned DstStart = I * NarrowSize;

-    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
-      // No part of the insert affects this subregister, forward the original.
-      DstRegs.push_back(SrcRegs[i]);
-      continue;
-    } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
+    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
       // The entire subregister is defined by this insert, forward the new
       // value.
       DstRegs.push_back(OpReg);
       continue;
     }

+    Register SrcReg = SrcRegs[I];
+    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
+      // The leftover reg is smaller than NarrowTy, so we need to extend it.
+      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
+      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
+    }
+
+    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
+      // No part of the insert affects this subregister, forward the original.
+      DstRegs.push_back(SrcReg);
+      continue;
+    }
+
     // OpSegStart is where this destination segment would start in OpReg if it
     // extended infinitely in both directions.
     int64_t ExtractOffset, InsertOffset;
@@ -5154,16 +5160,19 @@ LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
     }

     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
-    MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset);
+    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
     DstRegs.push_back(DstReg);
   }

-  assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered");
+  uint64_t WideSize = DstRegs.size() * NarrowSize;
   Register DstReg = MI.getOperand(0).getReg();
-  if(MRI.getType(DstReg).isVector())
-    MIRBuilder.buildBuildVector(DstReg, DstRegs);
-  else
+  if (WideSize > RegTy.getSizeInBits()) {
+    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
+    MIRBuilder.buildMerge(MergeReg, DstRegs);
+    MIRBuilder.buildTrunc(DstReg, MergeReg);
+  } else
     MIRBuilder.buildMerge(DstReg, DstRegs);
+
   MI.eraseFromParent();
   return Legalized;
 }
22 changes: 0 additions & 22 deletions llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -91,28 +91,6 @@ define void @nonpow2_add_narrowing(i128 %x, i128 %y) {
   ret void
 }

-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %{{[0-9]+}}:_(s96) = G_INSERT %{{[0-9]+}}:_, %{{[0-9]+}}:_(s32), 64 (in function: nonpow2_or_narrowing)
-; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_or_narrowing
-; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_or_narrowing:
-define void @nonpow2_or_narrowing() {
-  %a = add i128 undef, undef
-  %b = trunc i128 %a to i96
-  %a2 = add i128 undef, undef
-  %b2 = trunc i128 %a2 to i96
-  %dummy = or i96 %b, %b2
-  store i96 %dummy, i96* undef
-  ret void
-}
-
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s96) = G_INSERT %10:_, %8:_(s32), 64 (in function: nonpow2_load_narrowing)
-; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_load_narrowing
-; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_load_narrowing:
-define void @nonpow2_load_narrowing() {
-  %dummy = load i96, i96* undef
-  store i96 %dummy, i96* undef
-  ret void
-}
-
 ; Currently can't handle vector lengths that aren't an exact multiple of
 ; natively supported vector lengths. Test that the fall-back works for those.
 ; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %1:_(<7 x s64>) = G_ADD %0, %0 (in function: nonpow2_vector_add_fewerelements
74 changes: 65 additions & 9 deletions llvm/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
@@ -1,11 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_inserts_nonpow2() { ret void }
-...
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=legalizer %s -o - | FileCheck %s

 ---
 name: test_inserts_nonpow2
@@ -15,8 +9,12 @@ body: |
     ; CHECK-LABEL: name: test_inserts_nonpow2
-    ; CHECK: [[C:%[0-9]+]]:_(s64) = COPY $x3
-    ; CHECK: $x0 = COPY [[C]]
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK: $x0 = COPY [[COPY3]](s64)
+    ; CHECK: RET_ReallyLR
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
     %2:_(s64) = COPY $x2
@@ -27,3 +25,61 @@ body: |
     $x0 = COPY %6
     RET_ReallyLR
 ...
+---
+name: test_inserts_s96
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: test_inserts_s96
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+    ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[COPY1]](s64), 0
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[EXTRACT]](s32)
+    ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[ANYEXT]], [[TRUNC]](s32), 0
+    ; CHECK: $x0 = COPY [[COPY3]](s64)
+    ; CHECK: $x1 = COPY [[INSERT]](s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s128) = G_MERGE_VALUES %0:_(s64), %1:_(s64)
+    %4:_(s96) = G_TRUNC %3(s128)
+    %5:_(s32) = G_TRUNC %2(s64)
+    %6:_(s96) = G_INSERT %4, %5(s32), 64
+    %7:_(s128) = G_ANYEXT %6(s96)
+    %8:_(s64), %9:_(s64) = G_UNMERGE_VALUES %7
+    $x0 = COPY %8
+    $x1 = COPY %9
+...
+---
+name: test_inserts_s65
+body: |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: test_inserts_s65
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s64)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
+    ; CHECK: [[EXTRACT:%[0-9]+]]:_(s1) = G_EXTRACT [[COPY1]](s64), 0
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[EXTRACT]](s1)
+    ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[ANYEXT]], [[TRUNC]](s1), 0
+    ; CHECK: $x0 = COPY [[COPY3]](s64)
+    ; CHECK: $x1 = COPY [[INSERT]](s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s128) = G_MERGE_VALUES %0:_(s64), %1:_(s64)
+    %4:_(s65) = G_TRUNC %3(s128)
+    %5:_(s1) = G_TRUNC %2(s64)
+    %6:_(s65) = G_INSERT %4, %5(s1), 64
+    %7:_(s128) = G_ANYEXT %6(s65)
+    %8:_(s64), %9:_(s64) = G_UNMERGE_VALUES %7
+    $x0 = COPY %8
+    $x1 = COPY %9
+...
