Skip to content

Conversation

@sunshaoce
Copy link
Contributor

According to the SpacemiT documentation, the following settings are recommended for the SpacemiT-X60:

-mllvm -cache-line-size=64
-mllvm -prefetch-distance=128

Documentation: https://developer.spacemit.com/documentation?token=HPVlwPdGGiDsurkuHm9cbtarnk0#3.2-llvm-toolchain

@llvmbot
Copy link
Member

llvmbot commented Nov 14, 2025

@llvm/pr-subscribers-backend-risc-v

Author: Shaoce SUN (sunshaoce)

Changes

According to the SpacemiT documentation, the following settings are recommended for the SpacemiT-X60:

-mllvm -cache-line-size=64
-mllvm -prefetch-distance=128

Documentation: https://developer.spacemit.com/documentation?token=HPVlwPdGGiDsurkuHm9cbtarnk0#3.2-llvm-toolchain


Full diff: https://github.com/llvm/llvm-project/pull/168071.diff

2 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVProcessors.td (+7-1)
  • (modified) llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll (+95-80)
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index e86431f78f1ba..6ba953977274f 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -68,6 +68,11 @@ def getRISCVTuneInfo : SearchIndex {
 
 class GenericTuneInfo: RISCVTuneInfo;
 
+class SpacemitX60TuneInfo : GenericTuneInfo {
+  let CacheLineSize = 64;
+  let PrefetchDistance = 128;
+}
+
 class RISCVProcessorModel<string n,
                           SchedMachineModel m,
                           list<SubtargetFeature> f,
@@ -748,7 +753,8 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
                                         TuneOptimizedNF2SegmentLoadStore,
                                         TuneOptimizedNF3SegmentLoadStore,
                                         TuneOptimizedNF4SegmentLoadStore,
-                                        TuneVXRMPipelineFlush]> {
+                                        TuneVXRMPipelineFlush]>,
+                   SpacemitX60TuneInfo {
   let MVendorID = 0x710;
   let MArchID = 0x8000000058000001;
   let MImpID = 0x1000000049772200;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index af3b0852a6461..21070981e1958 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -287,133 +287,148 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
 ; RV64X60-LABEL: test1:
 ; RV64X60:       # %bb.0: # %entry
 ; RV64X60-NEXT:    csrwi vxrm, 0
-; RV64X60-NEXT:    blez a7, .LBB0_12
+; RV64X60-NEXT:    blez a7, .LBB0_11
 ; RV64X60-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
-; RV64X60-NEXT:    blez a6, .LBB0_12
+; RV64X60-NEXT:    blez a6, .LBB0_11
 ; RV64X60-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
-; RV64X60-NEXT:    addi sp, sp, -48
-; RV64X60-NEXT:    .cfi_def_cfa_offset 48
-; RV64X60-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
-; RV64X60-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
-; RV64X60-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
-; RV64X60-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
-; RV64X60-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
+; RV64X60-NEXT:    addi sp, sp, -64
+; RV64X60-NEXT:    .cfi_def_cfa_offset 64
+; RV64X60-NEXT:    sd s0, 56(sp) # 8-byte Folded Spill
+; RV64X60-NEXT:    sd s1, 48(sp) # 8-byte Folded Spill
+; RV64X60-NEXT:    sd s2, 40(sp) # 8-byte Folded Spill
+; RV64X60-NEXT:    sd s3, 32(sp) # 8-byte Folded Spill
+; RV64X60-NEXT:    sd s4, 24(sp) # 8-byte Folded Spill
+; RV64X60-NEXT:    sd s5, 16(sp) # 8-byte Folded Spill
+; RV64X60-NEXT:    sd s6, 8(sp) # 8-byte Folded Spill
 ; RV64X60-NEXT:    .cfi_offset s0, -8
 ; RV64X60-NEXT:    .cfi_offset s1, -16
 ; RV64X60-NEXT:    .cfi_offset s2, -24
 ; RV64X60-NEXT:    .cfi_offset s3, -32
 ; RV64X60-NEXT:    .cfi_offset s4, -40
+; RV64X60-NEXT:    .cfi_offset s5, -48
+; RV64X60-NEXT:    .cfi_offset s6, -56
 ; RV64X60-NEXT:    li t0, 0
-; RV64X60-NEXT:    li t1, 0
 ; RV64X60-NEXT:    addi s1, a7, -1
 ; RV64X60-NEXT:    zext.w s1, s1
-; RV64X60-NEXT:    mul t3, a1, s1
-; RV64X60-NEXT:    mul t4, a3, s1
-; RV64X60-NEXT:    mul t5, a5, s1
+; RV64X60-NEXT:    mul t2, a1, s1
+; RV64X60-NEXT:    mul t3, a3, s1
+; RV64X60-NEXT:    mul t4, a5, s1
 ; RV64X60-NEXT:    add s0, a0, a6
-; RV64X60-NEXT:    csrr t2, vlenb
+; RV64X60-NEXT:    csrr t1, vlenb
 ; RV64X60-NEXT:    add s1, a2, a6
-; RV64X60-NEXT:    add t3, t3, s0
+; RV64X60-NEXT:    add t2, t2, s0
 ; RV64X60-NEXT:    add s0, a4, a6
-; RV64X60-NEXT:    add t4, t4, s1
-; RV64X60-NEXT:    li t6, 32
-; RV64X60-NEXT:    add t5, t5, s0
+; RV64X60-NEXT:    add t3, t3, s1
+; RV64X60-NEXT:    li t5, 32
+; RV64X60-NEXT:    add t4, t4, s0
+; RV64X60-NEXT:    sltu s0, a0, t3
+; RV64X60-NEXT:    sltu s1, a2, t2
+; RV64X60-NEXT:    and t3, s0, s1
+; RV64X60-NEXT:    or t6, a1, a3
 ; RV64X60-NEXT:    sltu s0, a0, t4
-; RV64X60-NEXT:    sltu s1, a2, t3
+; RV64X60-NEXT:    sltu s1, a4, t2
+; RV64X60-NEXT:    srli t2, t6, 63
 ; RV64X60-NEXT:    and t4, s0, s1
-; RV64X60-NEXT:    or s2, a1, a3
-; RV64X60-NEXT:    sltu s0, a0, t5
-; RV64X60-NEXT:    sltu s1, a4, t3
-; RV64X60-NEXT:    srli t3, s2, 63
-; RV64X60-NEXT:    and s0, s0, s1
 ; RV64X60-NEXT:    or s1, a1, a5
-; RV64X60-NEXT:    or t4, t4, t3
-; RV64X60-NEXT:    slli t3, t2, 1
+; RV64X60-NEXT:    or s0, t3, t2
+; RV64X60-NEXT:    slli t2, t1, 1
 ; RV64X60-NEXT:    srli s1, s1, 63
-; RV64X60-NEXT:    or s0, s0, s1
-; RV64X60-NEXT:    maxu s1, t3, t6
-; RV64X60-NEXT:    or s0, t4, s0
-; RV64X60-NEXT:    sltu s1, a6, s1
-; RV64X60-NEXT:    or s0, s0, s1
-; RV64X60-NEXT:    add t4, a0, a6
-; RV64X60-NEXT:    andi t5, s0, 1
+; RV64X60-NEXT:    maxu t5, t2, t5
+; RV64X60-NEXT:    or s1, t4, s1
+; RV64X60-NEXT:    or t4, s0, s1
+; RV64X60-NEXT:    slli s1, t1, 4
+; RV64X60-NEXT:    add t3, a0, s1
+; RV64X60-NEXT:    sltu s0, a6, t5
+; RV64X60-NEXT:    or s0, s0, t4
+; RV64X60-NEXT:    add t4, a4, s1
+; RV64X60-NEXT:    add t5, a2, s1
+; RV64X60-NEXT:    andi t6, s0, 1
 ; RV64X60-NEXT:    j .LBB0_4
 ; RV64X60-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
 ; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
 ; RV64X60-NEXT:    add a0, a0, a1
 ; RV64X60-NEXT:    add a2, a2, a3
-; RV64X60-NEXT:    addiw t1, t1, 1
 ; RV64X60-NEXT:    add a4, a4, a5
-; RV64X60-NEXT:    addi t0, t0, 1
-; RV64X60-NEXT:    beq t1, a7, .LBB0_11
+; RV64X60-NEXT:    addiw t0, t0, 1
+; RV64X60-NEXT:    add t3, t3, a1
+; RV64X60-NEXT:    add t4, t4, a5
+; RV64X60-NEXT:    add t5, t5, a3
+; RV64X60-NEXT:    beq t0, a7, .LBB0_10
 ; RV64X60-NEXT:  .LBB0_4: # %for.cond1.preheader.us
 ; RV64X60-NEXT:    # =>This Loop Header: Depth=1
 ; RV64X60-NEXT:    # Child Loop BB0_7 Depth 2
-; RV64X60-NEXT:    # Child Loop BB0_10 Depth 2
-; RV64X60-NEXT:    beqz t5, .LBB0_6
+; RV64X60-NEXT:    # Child Loop BB0_9 Depth 2
+; RV64X60-NEXT:    beqz t6, .LBB0_6
 ; RV64X60-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
-; RV64X60-NEXT:    li t6, 0
+; RV64X60-NEXT:    li s2, 0
 ; RV64X60-NEXT:    j .LBB0_9
 ; RV64X60-NEXT:  .LBB0_6: # %vector.ph
 ; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
-; RV64X60-NEXT:    slli s1, t2, 28
-; RV64X60-NEXT:    sub s1, s1, t3
-; RV64X60-NEXT:    and t6, s1, a6
-; RV64X60-NEXT:    mv s2, a2
-; RV64X60-NEXT:    mv s3, a4
-; RV64X60-NEXT:    mv s4, a0
-; RV64X60-NEXT:    mv s1, t6
+; RV64X60-NEXT:    li s1, 0
+; RV64X60-NEXT:    slli s0, t1, 28
+; RV64X60-NEXT:    sub s0, s0, t2
+; RV64X60-NEXT:    and s2, s0, a6
 ; RV64X60-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
 ; RV64X60-NEXT:  .LBB0_7: # %vector.body
 ; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
 ; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64X60-NEXT:    vl2r.v v8, (s2)
+; RV64X60-NEXT:    add s0, a2, s1
+; RV64X60-NEXT:    add s3, a4, s1
+; RV64X60-NEXT:    vl2r.v v8, (s0)
+; RV64X60-NEXT:    add s4, t3, s1
 ; RV64X60-NEXT:    vl2r.v v10, (s3)
+; RV64X60-NEXT:    add s3, t5, s1
 ; RV64X60-NEXT:    vaaddu.vv v8, v8, v10
-; RV64X60-NEXT:    sub s1, s1, t3
-; RV64X60-NEXT:    vs2r.v v8, (s4)
-; RV64X60-NEXT:    add s4, s4, t3
-; RV64X60-NEXT:    add s3, s3, t3
-; RV64X60-NEXT:    add s2, s2, t3
-; RV64X60-NEXT:    bnez s1, .LBB0_7
+; RV64X60-NEXT:    add s0, t4, s1
+; RV64X60-NEXT:    prefetch.r 0(s3)
+; RV64X60-NEXT:    prefetch.r 0(s0)
+; RV64X60-NEXT:    prefetch.w 0(s4)
+; RV64X60-NEXT:    add s0, a0, s1
+; RV64X60-NEXT:    add s1, s1, t2
+; RV64X60-NEXT:    vs2r.v v8, (s0)
+; RV64X60-NEXT:    bne s2, s1, .LBB0_7
 ; RV64X60-NEXT:  # %bb.8: # %middle.block
 ; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
-; RV64X60-NEXT:    beq t6, a6, .LBB0_3
-; RV64X60-NEXT:  .LBB0_9: # %for.body4.us.preheader
-; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
-; RV64X60-NEXT:    mul s2, a1, t0
-; RV64X60-NEXT:    add s0, a0, t6
-; RV64X60-NEXT:    add s2, s2, t4
-; RV64X60-NEXT:    add s4, a4, t6
-; RV64X60-NEXT:    add t6, t6, a2
-; RV64X60-NEXT:  .LBB0_10: # %for.body4.us
+; RV64X60-NEXT:    beq s2, a6, .LBB0_3
+; RV64X60-NEXT:  .LBB0_9: # %for.body4.us
 ; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
 ; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
-; RV64X60-NEXT:    lbu s3, 0(t6)
-; RV64X60-NEXT:    lbu s1, 0(s4)
-; RV64X60-NEXT:    add s1, s1, s3
-; RV64X60-NEXT:    addi s1, s1, 1
-; RV64X60-NEXT:    srli s1, s1, 1
-; RV64X60-NEXT:    sb s1, 0(s0)
-; RV64X60-NEXT:    addi s0, s0, 1
-; RV64X60-NEXT:    addi s4, s4, 1
-; RV64X60-NEXT:    addi t6, t6, 1
-; RV64X60-NEXT:    bne s0, s2, .LBB0_10
+; RV64X60-NEXT:    add s0, a4, s2
+; RV64X60-NEXT:    add s1, a2, s2
+; RV64X60-NEXT:    lbu s3, 0(s1)
+; RV64X60-NEXT:    lbu s4, 0(s0)
+; RV64X60-NEXT:    add s5, a0, s2
+; RV64X60-NEXT:    addi s6, s5, 10
+; RV64X60-NEXT:    addi s1, s1, 10
+; RV64X60-NEXT:    addi s0, s0, 10
+; RV64X60-NEXT:    prefetch.r 0(s1)
+; RV64X60-NEXT:    prefetch.r 0(s0)
+; RV64X60-NEXT:    add s3, s3, s4
+; RV64X60-NEXT:    addi s3, s3, 1
+; RV64X60-NEXT:    prefetch.w 0(s6)
+; RV64X60-NEXT:    srli s0, s3, 1
+; RV64X60-NEXT:    addi s2, s2, 1
+; RV64X60-NEXT:    sb s0, 0(s5)
+; RV64X60-NEXT:    bne a6, s2, .LBB0_9
 ; RV64X60-NEXT:    j .LBB0_3
-; RV64X60-NEXT:  .LBB0_11:
-; RV64X60-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
-; RV64X60-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
-; RV64X60-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
-; RV64X60-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
-; RV64X60-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
+; RV64X60-NEXT:  .LBB0_10:
+; RV64X60-NEXT:    ld s0, 56(sp) # 8-byte Folded Reload
+; RV64X60-NEXT:    ld s1, 48(sp) # 8-byte Folded Reload
+; RV64X60-NEXT:    ld s2, 40(sp) # 8-byte Folded Reload
+; RV64X60-NEXT:    ld s3, 32(sp) # 8-byte Folded Reload
+; RV64X60-NEXT:    ld s4, 24(sp) # 8-byte Folded Reload
+; RV64X60-NEXT:    ld s5, 16(sp) # 8-byte Folded Reload
+; RV64X60-NEXT:    ld s6, 8(sp) # 8-byte Folded Reload
 ; RV64X60-NEXT:    .cfi_restore s0
 ; RV64X60-NEXT:    .cfi_restore s1
 ; RV64X60-NEXT:    .cfi_restore s2
 ; RV64X60-NEXT:    .cfi_restore s3
 ; RV64X60-NEXT:    .cfi_restore s4
-; RV64X60-NEXT:    addi sp, sp, 48
+; RV64X60-NEXT:    .cfi_restore s5
+; RV64X60-NEXT:    .cfi_restore s6
+; RV64X60-NEXT:    addi sp, sp, 64
 ; RV64X60-NEXT:    .cfi_def_cfa_offset 0
-; RV64X60-NEXT:  .LBB0_12: # %for.cond.cleanup
+; RV64X60-NEXT:  .LBB0_11: # %for.cond.cleanup
 ; RV64X60-NEXT:    ret
 ;
 ; RV64-LABEL: test1:

@topperc topperc requested a review from lukel97 November 14, 2025 17:43
@wangpc-pp
Copy link
Contributor

The code looks great, but I am not sure if these values are fine-tuned or they are just examples.

@sunshaoce
Copy link
Contributor Author

> The code looks great, but I am not sure if these values are fine-tuned or they are just examples.

You’re right. I ran the benchmark and the scores actually dropped. I’ll close this PR.

@sunshaoce sunshaoce closed this Nov 19, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants