Skip to content

Conversation

david-arm
Copy link
Contributor

There is no easy way to materialise some fixed-width vector constants with 64-bit elements. This is because NEON's movi instruction is restricted to setting all bits in a byte to the same value, e.g. 0xFF can be encoded as an immediate but not 0x1F. However, if SVE is available we can use the dup instruction to cover more cases.

Rather than lower the immediate directly using the dup instruction, I've instead used the generic SPLAT_VECTOR node in combination with an EXTRACT_SUBVECTOR. This is because we already have SVE splat_vector patterns that can match directly to dup.

@llvmbot
Copy link
Member

llvmbot commented Sep 16, 2025

@llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

Changes

There is no easy way to materialise some fixed-width vector constants with 64-bit elements. This is because NEON's movi instruction is restricted to setting all bits in a byte to the same value, e.g. 0xFF can be encoded as an immediate but not 0x1F. However, if SVE is available we can use the dup instruction to cover more cases.

Rather than lower the immediate directly using the dup instruction, I've instead used the generic SPLAT_VECTOR node in combination with an EXTRACT_SUBVECTOR. This is because we already have SVE splat_vector patterns that can match directly to dup.


Full diff: https://github.com/llvm/llvm-project/pull/159101.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+8-27)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+26)
  • (modified) llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h (+30)
  • (modified) llvm/test/CodeGen/AArch64/extract-vector-cmp.ll (+2-3)
  • (added) llvm/test/CodeGen/AArch64/movi64_sve.ll (+165)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 6fdc981fc21a5..2b78955b5a5b3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4328,34 +4328,15 @@ bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm,
                     ->getAPIntValue()
                     .trunc(VT.getFixedSizeInBits())
                     .getSExtValue();
+  int32_t ImmVal, ShiftVal;
+  if (!AArch64_AM::isSVECpyDupImm(VT.getScalarSizeInBits(), Val, ImmVal,
+                                  ShiftVal))
+    return false;
 
-  switch (VT.SimpleTy) {
-  case MVT::i8:
-    // All immediates are supported.
-    Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
-    Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
-    return true;
-  case MVT::i16:
-  case MVT::i32:
-  case MVT::i64:
-    // Support 8bit signed immediates.
-    if (Val >= -128 && Val <= 127) {
-      Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
-      return true;
-    }
-    // Support 16bit signed immediates that are a multiple of 256.
-    if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
-      Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
-      Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32);
-      return true;
-    }
-    break;
-  default:
-    break;
-  }
-
-  return false;
+  // The value is encodable; emit its immediate and shift as target constants.
+  Shift = CurDAG->getTargetConstant(ShiftVal, DL, MVT::i32);
+  Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+  return true;
 }
 
 bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a194147d09396..d137ac68cb6e1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15197,6 +15197,27 @@ static SDValue NormalizeBuildVector(SDValue Op,
   return DAG.getBuildVector(VT, DL, Ops);
 }
 
+static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
+                             const AArch64Subtarget *ST, APInt &DefBits) {
+  EVT VT = Op.getValueType();
+  // TODO: We should be able to support 64-bit destinations too
+  if (!ST->hasSVE() || DefBits.getHiBits(64) != DefBits.getLoBits(64) ||
+      VT.getFixedSizeInBits() != 128)
+    return SDValue();
+
+  // See if we can make use of the SVE dup instruction.
+  APInt Val64 = DefBits.sextOrTrunc(64);
+  int32_t ImmVal, ShiftVal;
+  if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
+                                        DAG.getConstant(Val64, DL, MVT::i64));
+  SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
+  return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
+}
+
 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
                                    const AArch64Subtarget *ST) {
   EVT VT = Op.getValueType();
@@ -15236,6 +15257,11 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
     if (SDValue R = TryMOVIWithBits(UndefBits))
       return R;
 
+    // NEON doesn't have a nice way of materialising 64-bit values, but if SVE
+    // is available we have more options.
+    if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
+      return R;
+
     // See if a fneg of the constant can be materialized with a MOVI, etc
     auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
       // FNegate each sub-element of the constant
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index f542592d22c5f..4ae5d040d5e8a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -871,6 +871,36 @@ inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) {
   return isAnyMOVZMovAlias(Value, RegWidth);
 }
 
+static inline bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm,
+                                  int32_t &Shift) {
+  switch (SizeInBits) {
+  case 8:
+    // All immediates are supported.
+    Shift = 0;
+    Imm = Val & 0xFF;
+    return true;
+  case 16:
+  case 32:
+  case 64:
+    // Support 8bit signed immediates.
+    if (Val >= -128 && Val <= 127) {
+      Shift = 0;
+      Imm = Val & 0xFF;
+      return true;
+    }
+    // Support 16bit signed immediates that are a multiple of 256.
+    if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
+      Shift = 8;
+      Imm = (Val >> 8) & 0xFF;
+      return true;
+    }
+    break;
+  default:
+    break;
+  }
+  return false;
+}
+
 } // end namespace AArch64_AM
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
index 832e34b664fbe..f5cf629b2a4a4 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -75,10 +75,9 @@ define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
 ; CHECK-LABEL: vector_loop_with_icmp:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEXT:    mov w9, #16 // =0x10
-; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    mov z1.d, #2 // =0x2
 ; CHECK-NEXT:    add x8, x0, #4
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    mov w10, #1 // =0x1
 ; CHECK-NEXT:    b .LBB5_2
 ; CHECK-NEXT:  .LBB5_1: // %pred.store.continue6
diff --git a/llvm/test/CodeGen/AArch64/movi64_sve.ll b/llvm/test/CodeGen/AArch64/movi64_sve.ll
new file mode 100644
index 0000000000000..da1a21532ac79
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movi64_sve.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+define <2 x i64> @movi_1_v2i64() {
+; CHECK-LABEL: movi_1_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #1 // =0x1
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 1)
+}
+
+define <2 x i64> @movi_127_v2i64() {
+; CHECK-LABEL: movi_127_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #127 // =0x7f
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 127)
+}
+
+define <2 x i64> @movi_m128_v2i64() {
+; CHECK-LABEL: movi_m128_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #-128 // =0xffffffffffffff80
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 -128)
+}
+
+define <2 x i64> @movi_256_v2i64() {
+; CHECK-LABEL: movi_256_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #256 // =0x100
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 256)
+}
+
+define <2 x i64> @movi_32512_v2i64() {
+; CHECK-LABEL: movi_32512_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #32512 // =0x7f00
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 32512)
+}
+
+define <2 x i64> @movi_m32768_v2i64() {
+; CHECK-LABEL: movi_m32768_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 -32768)
+}
+
+; Special cases where the destination vector does not have 64-bit elements
+
+define <4 x i32> @movi_v4i32_1() {
+; CHECK-LABEL: movi_v4i32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #127 // =0x7f
+; CHECK-NEXT:    ret
+  ret <4 x i32> <i32 127, i32 0, i32 127, i32 0>
+}
+
+define <4 x i32> @movi_v4i32_2() {
+; CHECK-LABEL: movi_v4i32_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #32512 // =0x7f00
+; CHECK-NEXT:    ret
+  ret <4 x i32> <i32 32512, i32 0, i32 32512, i32 0>
+}
+
+define <8 x i16> @movi_v8i16_1() {
+; CHECK-LABEL: movi_v8i16_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #127 // =0x7f
+; CHECK-NEXT:    ret
+  ret <8 x i16> <i16 127, i16 0, i16 0, i16 0, i16 127, i16 0, i16 0, i16 0>
+}
+
+define <8 x i16> @movi_v8i16_2() {
+; CHECK-LABEL: movi_v8i16_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #32512 // =0x7f00
+; CHECK-NEXT:    ret
+  ret <8 x i16> <i16 32512, i16 0, i16 0, i16 0, i16 32512, i16 0, i16 0, i16 0>
+}
+
+define <16 x i8> @movi_v16i8_1() {
+; CHECK-LABEL: movi_v16i8_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #127 // =0x7f
+; CHECK-NEXT:    ret
+  ret <16 x i8> <i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+define <16 x i8> @movi_v16i8_2() {
+; CHECK-LABEL: movi_v16i8_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #32512 // =0x7f00
+; CHECK-NEXT:    ret
+  ret <16 x i8> <i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+; Negative cases
+
+define <2 x i64> @movi_128_v2i64() {
+; CHECK-LABEL: movi_128_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #128 // =0x80
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 128)
+}
+
+define <2 x i64> @movi_m127_v2i64() {
+; CHECK-LABEL: movi_m127_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #-129 // =0xffffffffffffff7f
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 -129)
+}
+
+define <2 x i64> @movi_32513_v2i64() {
+; CHECK-LABEL: movi_32513_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #32513 // =0x7f01
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 32513)
+}
+
+define <2 x i64> @movi_m32769_v2i64() {
+; CHECK-LABEL: movi_m32769_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #-32769 // =0xffffffffffff7fff
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 -32769)
+}
+
+define <2 x i64> @movi_257_v2i64() {
+; CHECK-LABEL: movi_257_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #257 // =0x101
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    ret
+  ret <2 x i64> splat (i64 257)
+}
+
+define <4 x i32> @movi_v4i32_3() {
+; CHECK-LABEL: movi_v4i32_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI17_0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    ret
+  ret <4 x i32> <i32 -128, i32 0, i32 -128, i32 0>
+}
+
+define <16 x i8> @movi_v16i8_3() {
+; CHECK-LABEL: movi_v16i8_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI18_0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    ret
+  ret <16 x i8> <i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0>
+}

There is no easy way to materialise some fixed-width vector
constants with 64-bit elements. This is because NEON's movi
instruction is restricted to setting all bits in a byte to
the same value, e.g. 0xFF can be encoded as an immediate but
not 0x1F. However, if SVE is available we can use the dup
instruction to cover more cases.

Rather than lower the immediate directly using the dup
instruction, I've instead used the generic SPLAT_VECTOR node
in combination with an EXTRACT_SUBVECTOR. This is because we
already have SVE splat_vector patterns that can match directly
to dup.
@david-arm david-arm merged commit 1034bb5 into llvm:main Sep 25, 2025
9 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 25, 2025

LLVM Buildbot has detected a new failure on builder llvm-clang-aarch64-darwin running on doug-worker-5 while building llvm at step 6 "test-build-unified-tree-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/28169

Here is the relevant piece of the build log for the reference
Step 6 (test-build-unified-tree-check-all) failure: test (failure)
******************** TEST 'LLVM :: ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll' FAILED ********************
Exit Code: 2

Command Output (stdout):
--
# RUN: at line 1
/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli -jit-kind=orc-lazy -compile-threads=2 -thread-entry hello /Users/buildbot/buildbot-root2/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll | /Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root2/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
# executed command: /Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli -jit-kind=orc-lazy -compile-threads=2 -thread-entry hello /Users/buildbot/buildbot-root2/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
# .---command stderr------------
# | PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace and instructions to reproduce the bug.
# | Stack dump:
# | 0.	Program arguments: /Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli -jit-kind=orc-lazy -compile-threads=2 -thread-entry hello /Users/buildbot/buildbot-root2/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
# |  #0 0x0000000101eb08a8 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100f188a8)
# |  #1 0x0000000101eae658 llvm::sys::RunSignalHandlers() (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100f16658)
# |  #2 0x0000000101eb13a8 SignalHandler(int, __siginfo*, void*) (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100f193a8)
# |  #3 0x0000000182f63584 (/usr/lib/system/libsystem_platform.dylib+0x18047b584)
# |  #4 0x0000010101a0b25c
# |  #5 0x0000000101a166ac llvm::orc::ExecutionSession::removeJITDylibs(std::__1::vector<llvm::IntrusiveRefCntPtr<llvm::orc::JITDylib>, std::__1::allocator<llvm::IntrusiveRefCntPtr<llvm::orc::JITDylib>>>) (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100a7e6ac)
# |  #6 0x0000000101a1645c llvm::orc::ExecutionSession::endSession() (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100a7e45c)
# |  #7 0x0000000101aa1f54 llvm::orc::LLJIT::~LLJIT() (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100b09f54)
# |  #8 0x0000000101aa68e0 llvm::orc::LLLazyJIT::~LLLazyJIT() (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100b0e8e0)
# |  #9 0x0000000100f9f824 runOrcJIT(char const*) (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100007824)
# | #10 0x0000000100f9af54 main (/Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/lli+0x100002f54)
# | #11 0x0000000182ba7154
# `-----------------------------
# error: command failed with exit status: -11
# executed command: /Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root2/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
# .---command stderr------------
# | FileCheck error: '<stdin>' is empty.
# | FileCheck command line:  /Volumes/ExternalSSD/buildbot-root/aarch64-darwin/build/bin/FileCheck /Users/buildbot/buildbot-root2/aarch64-darwin/llvm-project/llvm/test/ExecutionEngine/OrcLazy/multiple-compile-threads-basic.ll
# `-----------------------------
# error: command failed with exit status: 2

--

********************


mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Oct 3, 2025
…159101)

There is no easy way to materialise some fixed-width vector constants
with 64-bit elements. This is because NEON's movi instruction is
restricted to setting all bits in a byte to the same value, e.g. 0xFF
can be encoded as an immediate but not 0x1F. However, if SVE is
available we can use the dup instruction to cover more cases.

Rather than lower the immediate directly using the dup instruction, I've
instead used the generic SPLAT_VECTOR node in combination with an
EXTRACT_SUBVECTOR. This is because we already have SVE splat_vector
patterns that can match directly to dup.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants