From b90315b5093091097faaefd513278f5b6134e489 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 20 Nov 2025 15:44:31 +0000 Subject: [PATCH 1/5] Precommit test --- .../streaming-compatible-memory-ops.ll | 319 +++++++++++++++++- 1 file changed, 315 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index 9c66b38c46973..b87a81856056d 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -153,6 +153,316 @@ entry: ret void } +define ptr @se_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: se_memchr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Spill +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: mov x2, x1 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov w1, #5 // =0x5 +; CHECK-NEXT: bl memchr +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; CHECK-NO-SME-ROUTINES-LABEL: se_memchr: +; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NO-SME-ROUTINES-NEXT: cntd x9 +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: str x9, [sp, #80] // 8-byte Spill +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset vg, -16 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset w30, -24 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset w29, -32 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b8, -40 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b9, -48 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b10, -56 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b11, -64 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b12, -72 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b13, -80 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b14, -88 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b15, -96 +; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x1 +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #5 // =0x5 +; CHECK-NO-SME-ROUTINES-NEXT: bl memchr +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore vg +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore w30 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore w29 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b8 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b9 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b10 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b11 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b12 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b13 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b14 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b15 +; CHECK-NO-SME-ROUTINES-NEXT: ret +; +; CHECK-MOPS-LABEL: se_memchr: +; CHECK-MOPS: // %bb.0: // %entry +; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 96 +; CHECK-MOPS-NEXT: cntd x9 +; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: str x9, [sp, #80] // 8-byte Spill +; CHECK-MOPS-NEXT: .cfi_offset vg, -16 +; CHECK-MOPS-NEXT: .cfi_offset w30, -24 +; CHECK-MOPS-NEXT: .cfi_offset w29, -32 +; CHECK-MOPS-NEXT: .cfi_offset b8, -40 +; CHECK-MOPS-NEXT: .cfi_offset b9, -48 +; CHECK-MOPS-NEXT: .cfi_offset b10, -56 +; CHECK-MOPS-NEXT: .cfi_offset b11, -64 +; CHECK-MOPS-NEXT: .cfi_offset b12, -72 +; CHECK-MOPS-NEXT: .cfi_offset b13, -80 +; CHECK-MOPS-NEXT: .cfi_offset b14, -88 +; CHECK-MOPS-NEXT: .cfi_offset b15, -96 +; CHECK-MOPS-NEXT: mov x2, x1 +; CHECK-MOPS-NEXT: smstop sm +; CHECK-MOPS-NEXT: mov w1, #5 // =0x5 +; CHECK-MOPS-NEXT: bl memchr +; CHECK-MOPS-NEXT: smstart sm +; CHECK-MOPS-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 0 +; CHECK-MOPS-NEXT: .cfi_restore vg +; CHECK-MOPS-NEXT: .cfi_restore w30 +; CHECK-MOPS-NEXT: .cfi_restore w29 +; CHECK-MOPS-NEXT: .cfi_restore b8 +; CHECK-MOPS-NEXT: .cfi_restore b9 +; CHECK-MOPS-NEXT: .cfi_restore b10 +; CHECK-MOPS-NEXT: .cfi_restore b11 +; CHECK-MOPS-NEXT: .cfi_restore b12 +; CHECK-MOPS-NEXT: .cfi_restore b13 +; CHECK-MOPS-NEXT: .cfi_restore b14 +; CHECK-MOPS-NEXT: .cfi_restore b15 +; CHECK-MOPS-NEXT: ret +entry: + %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n) + ret ptr %res +} + +define ptr @sc_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: sc_memchr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset vg, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_offset b8, -40 +; CHECK-NEXT: .cfi_offset b9, -48 +; CHECK-NEXT: .cfi_offset b10, -56 +; CHECK-NEXT: .cfi_offset b11, -64 +; CHECK-NEXT: .cfi_offset b12, -72 +; CHECK-NEXT: .cfi_offset b13, -80 +; CHECK-NEXT: .cfi_offset b14, -88 +; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: mov x2, x1 +; CHECK-NEXT: mrs x19, SVCR +; CHECK-NEXT: tbz w19, #0, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB4_2: // %entry +; CHECK-NEXT: mov w1, #5 // =0x5 +; CHECK-NEXT: bl memchr +; CHECK-NEXT: tbz w19, #0, .LBB4_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB4_4: // %entry +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: .cfi_restore b8 +; CHECK-NEXT: .cfi_restore b9 +; CHECK-NEXT: .cfi_restore b10 +; CHECK-NEXT: .cfi_restore b11 +; CHECK-NEXT: .cfi_restore b12 +; CHECK-NEXT: .cfi_restore b13 +; CHECK-NEXT: .cfi_restore b14 +; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: ret +; +; CHECK-NO-SME-ROUTINES-LABEL: sc_memchr: +; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NO-SME-ROUTINES-NEXT: cntd x9 +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset w19, -8 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset vg, -16 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset w30, -24 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset w29, -32 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b8, -40 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b9, -48 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b10, -56 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b11, -64 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b12, -72 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b13, -80 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b14, -88 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_offset b15, -96 +; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x1 +; CHECK-NO-SME-ROUTINES-NEXT: mrs x19, SVCR +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB4_2 +; CHECK-NO-SME-ROUTINES-NEXT: // %bb.1: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: smstop sm +; CHECK-NO-SME-ROUTINES-NEXT: .LBB4_2: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #5 // =0x5 +; CHECK-NO-SME-ROUTINES-NEXT: bl memchr +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB4_4 +; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: smstart sm +; CHECK-NO-SME-ROUTINES-NEXT: .LBB4_4: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldr x19, [sp, #88] // 8-byte Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore w19 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore vg +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore w30 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore w29 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b8 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b9 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b10 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b11 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b12 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b13 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b14 +; CHECK-NO-SME-ROUTINES-NEXT: .cfi_restore b15 +; CHECK-NO-SME-ROUTINES-NEXT: ret +; +; CHECK-MOPS-LABEL: sc_memchr: +; CHECK-MOPS: // %bb.0: // %entry +; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 96 +; CHECK-MOPS-NEXT: cntd x9 +; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-MOPS-NEXT: .cfi_offset w19, -8 +; CHECK-MOPS-NEXT: .cfi_offset vg, -16 +; CHECK-MOPS-NEXT: .cfi_offset w30, -24 +; CHECK-MOPS-NEXT: .cfi_offset w29, -32 +; CHECK-MOPS-NEXT: .cfi_offset b8, -40 +; CHECK-MOPS-NEXT: .cfi_offset b9, -48 +; CHECK-MOPS-NEXT: .cfi_offset b10, -56 +; CHECK-MOPS-NEXT: .cfi_offset b11, -64 +; CHECK-MOPS-NEXT: .cfi_offset b12, -72 +; CHECK-MOPS-NEXT: .cfi_offset b13, -80 +; CHECK-MOPS-NEXT: .cfi_offset b14, -88 +; CHECK-MOPS-NEXT: .cfi_offset b15, -96 +; CHECK-MOPS-NEXT: mov x2, x1 +; CHECK-MOPS-NEXT: mrs x19, SVCR +; CHECK-MOPS-NEXT: tbz w19, #0, .LBB4_2 +; CHECK-MOPS-NEXT: // %bb.1: // %entry +; CHECK-MOPS-NEXT: smstop sm +; CHECK-MOPS-NEXT: .LBB4_2: // %entry +; CHECK-MOPS-NEXT: mov w1, #5 // =0x5 +; CHECK-MOPS-NEXT: bl memchr +; CHECK-MOPS-NEXT: tbz w19, #0, .LBB4_4 +; CHECK-MOPS-NEXT: // %bb.3: // %entry +; CHECK-MOPS-NEXT: smstart sm +; CHECK-MOPS-NEXT: .LBB4_4: // %entry +; CHECK-MOPS-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldr x19, [sp, #88] // 8-byte Reload +; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 0 +; CHECK-MOPS-NEXT: .cfi_restore w19 +; CHECK-MOPS-NEXT: .cfi_restore vg +; CHECK-MOPS-NEXT: .cfi_restore w30 +; CHECK-MOPS-NEXT: .cfi_restore w29 +; CHECK-MOPS-NEXT: .cfi_restore b8 +; CHECK-MOPS-NEXT: .cfi_restore b9 +; CHECK-MOPS-NEXT: .cfi_restore b10 +; CHECK-MOPS-NEXT: .cfi_restore b11 +; CHECK-MOPS-NEXT: .cfi_restore b12 +; CHECK-MOPS-NEXT: .cfi_restore b13 +; CHECK-MOPS-NEXT: .cfi_restore b14 +; CHECK-MOPS-NEXT: .cfi_restore b15 +; CHECK-MOPS-NEXT: ret +entry: + %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n) + ret ptr %res +} + define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: sc_memcpy: ; CHECK: // %bb.0: // %entry @@ -179,15 +489,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NO-SME-ROUTINES-NEXT: mrs x19, SVCR ; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] -; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2 +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB5_2 ; CHECK-NO-SME-ROUTINES-NEXT: // %bb.1: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: smstop sm -; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_2: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: .LBB5_2: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy -; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_4 +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB5_4 ; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: smstart sm -; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_4: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: .LBB5_4: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -283,3 +593,4 @@ entry: declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) +declare ptr @memchr(ptr, i32, i64) From 4b834bcc9daa7c4bac44553cb775227124b94379 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 20 Nov 2025 15:45:48 +0000 Subject: [PATCH 2/5] [AArch64][SME] Lower memchr to __arm_sc_memchr in streaming[-compatible] functions This allows us to avoid some streaming-mode switches. --- llvm/include/llvm/IR/RuntimeLibcalls.td | 2 + .../AArch64/AArch64SelectionDAGInfo.cpp | 35 +++- .../Target/AArch64/AArch64SelectionDAGInfo.h | 9 +- .../streaming-compatible-memory-ops.ll | 196 ++---------------- 4 files changed, 58 insertions(+), 184 deletions(-) diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index ce7e836f66446..71e0edf03a16d 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -315,6 +315,7 @@ def MEMCMP : RuntimeLibcall; def MEMCPY : RuntimeLibcall; def MEMMOVE : RuntimeLibcall; def MEMSET : RuntimeLibcall; +def MEMCHR : RuntimeLibcall; def CALLOC : RuntimeLibcall; def BZERO : RuntimeLibcall; def STRLEN : RuntimeLibcall; @@ -997,6 +998,7 @@ def fesetmode : RuntimeLibcallImpl; def memcpy : RuntimeLibcallImpl; def memmove : RuntimeLibcallImpl; def memset : RuntimeLibcallImpl; +def memchr : RuntimeLibcallImpl; // DSEPass can emit calloc if it finds a pair of malloc/memset def calloc : RuntimeLibcallImpl; diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 48e03ad853d26..38c7a3d55f856 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -156,29 +156,35 @@ SDValue AArch64SelectionDAGInfo::EmitMOPS(unsigned Opcode, SelectionDAG &DAG, } SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall( - SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src, + SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Op0, SDValue Op1, SDValue Size, RTLIB::Libcall LC) const { const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget(); const AArch64TargetLowering *TLI = STI.getTargetLowering(); TargetLowering::ArgListTy Args; - Args.emplace_back(Dst, PointerType::getUnqual(*DAG.getContext())); + Args.emplace_back(Op0, PointerType::getUnqual(*DAG.getContext())); RTLIB::Libcall NewLC; switch (LC) { case RTLIB::MEMCPY: { NewLC = RTLIB::SC_MEMCPY; - Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext())); + Args.emplace_back(Op1, PointerType::getUnqual(*DAG.getContext())); break; } case RTLIB::MEMMOVE: { NewLC = RTLIB::SC_MEMMOVE; - Args.emplace_back(Src, PointerType::getUnqual(*DAG.getContext())); + Args.emplace_back(Op1, PointerType::getUnqual(*DAG.getContext())); break; } case RTLIB::MEMSET: { NewLC = RTLIB::SC_MEMSET; - Args.emplace_back(DAG.getZExtOrTrunc(Src, DL, MVT::i32), + Args.emplace_back(DAG.getZExtOrTrunc(Op1, DL, MVT::i32), + Type::getInt32Ty(*DAG.getContext())); + break; + } + case RTLIB::MEMCHR: { + NewLC = RTLIB::SC_MEMCHR; + Args.emplace_back(DAG.getZExtOrTrunc(Op1, DL, MVT::i32), Type::getInt32Ty(*DAG.getContext())); break; } @@ -194,7 +200,11 @@ SDValue AArch64SelectionDAGInfo::EmitStreamingCompatibleMemLibCall( PointerType *RetTy = PointerType::getUnqual(*DAG.getContext()); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( TLI->getLibcallCallingConv(NewLC), RetTy, Symbol, std::move(Args)); - return TLI->LowerCallTo(CLI).second; + + auto [Result, ChainOut] = TLI->LowerCallTo(CLI); + if (LC == RTLIB::MEMCHR) + return DAG.getMergeValues({Result, ChainOut}, DL); + return ChainOut; } SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy( @@ -255,6 +265,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove( return SDValue(); } +std::pair AArch64SelectionDAGInfo::EmitTargetCodeForMemchr( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Src, + SDValue Char, SDValue Length, MachinePointerInfo SrcPtrInfo) const { + auto *AFI = DAG.getMachineFunction().getInfo(); + SMEAttrs Attrs = AFI->getSMEFnAttrs(); + if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody()) { + SDValue Result = EmitStreamingCompatibleMemLibCall( + DAG, dl, Chain, Src, Char, Length, RTLIB::MEMCHR); + return std::make_pair(Result.getValue(0), Result.getValue(1)); + } + return std::make_pair(SDValue(), SDValue()); +} + static const int kSetTagLoopThreshold = 176; static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl, diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 42c2797ebdd17..656a58c1dc1bf 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -53,14 +53,19 @@ class AArch64SelectionDAGInfo : public SelectionDAGGenTargetInfo { MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const override; + std::pair + EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, + SDValue Src, SDValue Char, SDValue Length, + MachinePointerInfo SrcPtrInfo) const override; + SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, MachinePointerInfo DstPtrInfo, bool ZeroData) const override; SDValue EmitStreamingCompatibleMemLibCall(SelectionDAG &DAG, const SDLoc &DL, - SDValue Chain, SDValue Dst, - SDValue Src, SDValue Size, + SDValue Chain, SDValue Op0, + SDValue Op1, SDValue Size, RTLIB::Libcall LC) const; }; } // namespace llvm diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index b87a81856056d..fc4ae272046a0 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -156,47 +156,13 @@ entry: define ptr @se_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_enabled" { ; CHECK-LABEL: se_memchr: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: str x9, [sp, #80] // 8-byte Spill -; CHECK-NEXT: .cfi_offset vg, -16 -; CHECK-NEXT: .cfi_offset w30, -24 -; CHECK-NEXT: .cfi_offset w29, -32 -; CHECK-NEXT: .cfi_offset b8, -40 -; CHECK-NEXT: .cfi_offset b9, -48 -; CHECK-NEXT: .cfi_offset b10, -56 -; CHECK-NEXT: .cfi_offset b11, -64 -; CHECK-NEXT: .cfi_offset b12, -72 -; CHECK-NEXT: .cfi_offset b13, -80 -; CHECK-NEXT: .cfi_offset b14, -88 -; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: mov x2, x1 -; CHECK-NEXT: smstop sm ; CHECK-NEXT: mov w1, #5 // =0x5 -; CHECK-NEXT: bl memchr -; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore vg -; CHECK-NEXT: .cfi_restore w30 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: .cfi_restore b9 -; CHECK-NEXT: .cfi_restore b10 -; CHECK-NEXT: .cfi_restore b11 -; CHECK-NEXT: .cfi_restore b12 -; CHECK-NEXT: .cfi_restore b13 -; CHECK-NEXT: .cfi_restore b14 -; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: bl __arm_sc_memchr +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NO-SME-ROUTINES-LABEL: se_memchr: @@ -246,47 +212,13 @@ define ptr @se_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_enabled" { ; ; CHECK-MOPS-LABEL: se_memchr: ; CHECK-MOPS: // %bb.0: // %entry -; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 96 -; CHECK-MOPS-NEXT: cntd x9 -; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: str x9, [sp, #80] // 8-byte Spill -; CHECK-MOPS-NEXT: .cfi_offset vg, -16 -; CHECK-MOPS-NEXT: .cfi_offset w30, -24 -; CHECK-MOPS-NEXT: .cfi_offset w29, -32 -; CHECK-MOPS-NEXT: .cfi_offset b8, -40 -; CHECK-MOPS-NEXT: .cfi_offset b9, -48 -; CHECK-MOPS-NEXT: .cfi_offset b10, -56 -; CHECK-MOPS-NEXT: .cfi_offset b11, -64 -; CHECK-MOPS-NEXT: .cfi_offset b12, -72 -; CHECK-MOPS-NEXT: .cfi_offset b13, -80 -; CHECK-MOPS-NEXT: .cfi_offset b14, -88 -; CHECK-MOPS-NEXT: .cfi_offset b15, -96 +; CHECK-MOPS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MOPS-NEXT: .cfi_offset w30, -16 ; CHECK-MOPS-NEXT: mov x2, x1 -; CHECK-MOPS-NEXT: smstop sm ; CHECK-MOPS-NEXT: mov w1, #5 // =0x5 -; CHECK-MOPS-NEXT: bl memchr -; CHECK-MOPS-NEXT: smstart sm -; CHECK-MOPS-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 0 -; CHECK-MOPS-NEXT: .cfi_restore vg -; CHECK-MOPS-NEXT: .cfi_restore w30 -; CHECK-MOPS-NEXT: .cfi_restore w29 -; CHECK-MOPS-NEXT: .cfi_restore b8 -; CHECK-MOPS-NEXT: .cfi_restore b9 -; CHECK-MOPS-NEXT: .cfi_restore b10 -; CHECK-MOPS-NEXT: .cfi_restore b11 -; CHECK-MOPS-NEXT: .cfi_restore b12 -; CHECK-MOPS-NEXT: .cfi_restore b13 -; CHECK-MOPS-NEXT: .cfi_restore b14 -; CHECK-MOPS-NEXT: .cfi_restore b15 +; CHECK-MOPS-NEXT: bl __arm_sc_memchr +; CHECK-MOPS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-MOPS-NEXT: ret entry: %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n) @@ -296,57 +228,13 @@ entry: define ptr @sc_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: sc_memchr: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset vg, -16 -; CHECK-NEXT: .cfi_offset w30, -24 -; CHECK-NEXT: .cfi_offset w29, -32 -; CHECK-NEXT: .cfi_offset b8, -40 -; CHECK-NEXT: .cfi_offset b9, -48 -; CHECK-NEXT: .cfi_offset b10, -56 -; CHECK-NEXT: .cfi_offset b11, -64 -; CHECK-NEXT: .cfi_offset b12, -72 -; CHECK-NEXT: .cfi_offset b13, -80 -; CHECK-NEXT: .cfi_offset b14, -88 -; CHECK-NEXT: .cfi_offset b15, -96 +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: mov x2, x1 -; CHECK-NEXT: mrs x19, SVCR -; CHECK-NEXT: tbz w19, #0, .LBB4_2 -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: smstop sm -; CHECK-NEXT: .LBB4_2: // %entry ; CHECK-NEXT: mov w1, #5 // =0x5 -; CHECK-NEXT: bl memchr -; CHECK-NEXT: tbz w19, #0, .LBB4_4 -; CHECK-NEXT: // %bb.3: // %entry -; CHECK-NEXT: smstart sm -; CHECK-NEXT: .LBB4_4: // %entry -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Reload -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w19 -; CHECK-NEXT: .cfi_restore vg -; CHECK-NEXT: .cfi_restore w30 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: .cfi_restore b8 -; CHECK-NEXT: .cfi_restore b9 -; CHECK-NEXT: .cfi_restore b10 -; CHECK-NEXT: .cfi_restore b11 -; CHECK-NEXT: .cfi_restore b12 -; CHECK-NEXT: .cfi_restore b13 -; CHECK-NEXT: .cfi_restore b14 -; CHECK-NEXT: .cfi_restore b15 +; CHECK-NEXT: bl __arm_sc_memchr +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NO-SME-ROUTINES-LABEL: sc_memchr: @@ -406,57 +294,13 @@ define ptr @sc_memchr(ptr %src, i64 %n) "aarch64_pstate_sm_compatible" { ; ; CHECK-MOPS-LABEL: sc_memchr: ; CHECK-MOPS: // %bb.0: // %entry -; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 96 -; CHECK-MOPS-NEXT: cntd x9 -; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-MOPS-NEXT: .cfi_offset w19, -8 -; CHECK-MOPS-NEXT: .cfi_offset vg, -16 -; CHECK-MOPS-NEXT: .cfi_offset w30, -24 -; CHECK-MOPS-NEXT: .cfi_offset w29, -32 -; CHECK-MOPS-NEXT: .cfi_offset b8, -40 -; CHECK-MOPS-NEXT: .cfi_offset b9, -48 -; CHECK-MOPS-NEXT: .cfi_offset b10, -56 -; CHECK-MOPS-NEXT: .cfi_offset b11, -64 -; CHECK-MOPS-NEXT: .cfi_offset b12, -72 -; CHECK-MOPS-NEXT: .cfi_offset b13, -80 -; CHECK-MOPS-NEXT: .cfi_offset b14, -88 -; CHECK-MOPS-NEXT: .cfi_offset b15, -96 +; CHECK-MOPS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MOPS-NEXT: .cfi_offset w30, -16 ; CHECK-MOPS-NEXT: mov x2, x1 -; CHECK-MOPS-NEXT: mrs x19, SVCR -; CHECK-MOPS-NEXT: tbz w19, #0, .LBB4_2 -; CHECK-MOPS-NEXT: // %bb.1: // %entry -; CHECK-MOPS-NEXT: smstop sm -; CHECK-MOPS-NEXT: .LBB4_2: // %entry ; CHECK-MOPS-NEXT: mov w1, #5 // =0x5 -; CHECK-MOPS-NEXT: bl memchr -; CHECK-MOPS-NEXT: tbz w19, #0, .LBB4_4 -; CHECK-MOPS-NEXT: // %bb.3: // %entry -; CHECK-MOPS-NEXT: smstart sm -; CHECK-MOPS-NEXT: .LBB4_4: // %entry -; CHECK-MOPS-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldr x19, [sp, #88] // 8-byte Reload -; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-MOPS-NEXT: .cfi_def_cfa_offset 0 -; CHECK-MOPS-NEXT: .cfi_restore w19 -; CHECK-MOPS-NEXT: .cfi_restore vg -; CHECK-MOPS-NEXT: .cfi_restore w30 -; CHECK-MOPS-NEXT: .cfi_restore w29 -; CHECK-MOPS-NEXT: .cfi_restore b8 -; CHECK-MOPS-NEXT: .cfi_restore b9 -; CHECK-MOPS-NEXT: .cfi_restore b10 -; CHECK-MOPS-NEXT: .cfi_restore b11 -; CHECK-MOPS-NEXT: .cfi_restore b12 -; CHECK-MOPS-NEXT: .cfi_restore b13 -; CHECK-MOPS-NEXT: .cfi_restore b14 -; CHECK-MOPS-NEXT: .cfi_restore b15 +; CHECK-MOPS-NEXT: bl __arm_sc_memchr +; CHECK-MOPS-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-MOPS-NEXT: ret entry: %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n) From f1f20f8e01bcb4055713934f2d7a3e740961aa2a Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 20 Nov 2025 16:50:36 +0000 Subject: [PATCH 3/5] Add test of non-streaming[-compatible] memchr --- .../streaming-compatible-memory-ops.ll | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index fc4ae272046a0..b000854f5948e 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-COMMON,CHECK-NO-SME-ROUTINES +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-COMMON,CHECK-MOPS @dst = global [512 x i8] zeroinitializer, align 1 @src = global [512 x i8] zeroinitializer, align 1 @@ -307,6 +307,18 @@ entry: ret ptr %res } +; Non-streaming[-compatible] call to memchr. +define ptr @ns_memcpy(ptr %src, i64 %n) { +; CHECK-COMMON-LABEL: ns_memcpy: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: mov x2, x1 +; CHECK-COMMON-NEXT: mov w1, #5 // =0x5 +; CHECK-COMMON-NEXT: b memchr +entry: + %res = tail call ptr @memchr(ptr %src, i32 5, i64 %n) + ret ptr %res +} + define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-LABEL: sc_memcpy: ; CHECK: // %bb.0: // %entry @@ -333,15 +345,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NO-SME-ROUTINES-NEXT: mrs x19, SVCR ; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] -; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB5_2 +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB6_2 ; CHECK-NO-SME-ROUTINES-NEXT: // %bb.1: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: smstop sm -; CHECK-NO-SME-ROUTINES-NEXT: .LBB5_2: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: .LBB6_2: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy -; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB5_4 +; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB6_4 ; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: smstart sm -; CHECK-NO-SME-ROUTINES-NEXT: .LBB5_4: // %entry +; CHECK-NO-SME-ROUTINES-NEXT: .LBB6_4: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -438,3 +450,5 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) declare ptr @memchr(ptr, i32, i64) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; : {{.*}} From e43acbf763c5c58d1990b1026ddfd5dffec3e530 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 20 Nov 2025 16:53:09 +0000 Subject: [PATCH 4/5] Remove error line --- llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index b000854f5948e..895271d8bfdc8 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -450,5 +450,3 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) declare ptr @memchr(ptr, i32, i64) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; : {{.*}} From 014d8df12707f0a7a0fe053a1fb4a6c42ec0be28 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 20 Nov 2025 16:56:29 +0000 Subject: [PATCH 5/5] Rename ns_memcpy to ns_memchr in LLVM test --- llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index 895271d8bfdc8..2fbb7b63a7564 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -308,8 +308,8 @@ entry: } ; Non-streaming[-compatible] call to memchr. -define ptr @ns_memcpy(ptr %src, i64 %n) { -; CHECK-COMMON-LABEL: ns_memcpy: +define ptr @ns_memchr(ptr %src, i64 %n) { +; CHECK-COMMON-LABEL: ns_memchr: ; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: mov x2, x1 ; CHECK-COMMON-NEXT: mov w1, #5 // =0x5