[RISCV] When enabling `-force-vector-width`, RVV vectorization has produced wrong results #77044

sunshaoce · 2024-01-05T04:28:01Z

clang foo.c -w -lm -mllvm -force-vector-width=16 -mllvm -force-vector-interleave=2 --target=riscv64 -march=rv64gcv -O3 --gcc-toolchain=$HOME/riscv --sysroot=$HOME/riscv/riscv64-unknown-elf

#include <malloc.h>
#include <stdio.h>

/*
 * Reproducer kernel: stores 1..LEN into a[] and accumulates the same values
 * into a single-precision sum, which is returned.
 *
 * The exact integer sum is LEN * (LEN + 1) / 2 = 50005000. Because the
 * accumulator is a float, the printed value depends on the order in which the
 * additions are performed (assuming float is IEEE-754 binary32, partial sums
 * above 2^24 are no longer exactly representable).
 */
float foo() {
/* Element count; note the macro stays defined after this function. */
#define LEN 10000

  float a[LEN];   /* written and immediately read back in the loop */
  float ret = 0.; /* running sum, accumulated in source order */
  for (int i = 0; i < LEN; i++) {
    a[i] = 1 + i;  /* integer 1 + i converted to float on store */
    ret += a[i];
  }
  return ret;
}

int main(int argc, char **argv) { printf("%f\n", foo()); }

I got the wrong result.

50005000.000000

Then I tested several other compilers (gcc, and clang on x86); they all agreed with each other, but on a different value:

50002896.000000

Disabling -force-vector-width can avoid this issue.

The text was updated successfully, but these errors were encountered:

llvmbot · 2024-01-05T04:28:15Z

@llvm/issue-subscribers-backend-risc-v

Author: Shao-Ce SUN (sunshaoce)

```shell
clang foo.c -w -lm -mllvm -force-vector-width=16 -mllvm -force-vector-interleave=2 --target=riscv64 -march=rv64gcv -O3 --gcc-toolchain=$HOME/riscv --sysroot=$HOME/riscv/riscv64-unknown-elf
```

#include <malloc.h>
#include <stdio.h>

/*
 * Reproducer kernel: stores 1..LEN into a[] and accumulates the same values
 * into a single-precision sum, which is returned.
 *
 * The exact integer sum is LEN * (LEN + 1) / 2 = 50005000. Because the
 * accumulator is a float, the printed value depends on the order in which the
 * additions are performed (assuming float is IEEE-754 binary32, partial sums
 * above 2^24 are no longer exactly representable).
 */
float foo() {
/* Element count; note the macro stays defined after this function. */
#define LEN 10000

  float a[LEN];   /* written and immediately read back in the loop */
  float ret = 0.; /* running sum, accumulated in source order */
  /* The comparison operator was HTML-escaped ("&lt;") in this copy of the
     snippet, which made it uncompilable; restored to a plain '<'. */
  for (int i = 0; i < LEN; i++) {
    a[i] = 1 + i;  /* integer 1 + i converted to float on store */
    ret += a[i];
  }
  return ret;
}

int main(int argc, char **argv) { printf("%f\n", foo()); }

I got the wrong result.

50005000.000000

Then I tested several other compilers (gcc, and clang on x86); they all agreed with each other, but on a different value:

50002896.000000

Disabling -force-vector-width can avoid this issue.

llvmbot · 2024-01-05T04:28:16Z

@llvm/issue-subscribers-bug

Author: Shao-Ce SUN (sunshaoce)

```shell
clang foo.c -w -lm -mllvm -force-vector-width=16 -mllvm -force-vector-interleave=2 --target=riscv64 -march=rv64gcv -O3 --gcc-toolchain=$HOME/riscv --sysroot=$HOME/riscv/riscv64-unknown-elf
```

#include <malloc.h>
#include <stdio.h>

/*
 * Reproducer kernel: stores 1..LEN into a[] and accumulates the same values
 * into a single-precision sum, which is returned.
 *
 * The exact integer sum is LEN * (LEN + 1) / 2 = 50005000. Because the
 * accumulator is a float, the printed value depends on the order in which the
 * additions are performed (assuming float is IEEE-754 binary32, partial sums
 * above 2^24 are no longer exactly representable).
 */
float foo() {
/* Element count; note the macro stays defined after this function. */
#define LEN 10000

  float a[LEN];   /* written and immediately read back in the loop */
  float ret = 0.; /* running sum, accumulated in source order */
  /* The comparison operator was HTML-escaped ("&lt;") in this copy of the
     snippet, which made it uncompilable; restored to a plain '<'. */
  for (int i = 0; i < LEN; i++) {
    a[i] = 1 + i;  /* integer 1 + i converted to float on store */
    ret += a[i];
  }
  return ret;
}

int main(int argc, char **argv) { printf("%f\n", foo()); }

I got the wrong result.

50005000.000000

Then I tested several other compilers (gcc, and clang on x86); they all agreed with each other, but on a different value:

50002896.000000

Disabling -force-vector-width can avoid this issue.

topperc · 2024-01-05T05:07:07Z

Forcing vectorization enables some FP reassociation even without -fast-math. X86 with -O0 gives 50002896.000000, but with forced vectorization I got 50004992.000000. So I'm not sure what the right answer is.

dtcxzyw · 2024-01-05T06:07:18Z

I got 50004992.000000 with qemu-riscv64 -cpu rv64,v=true,vlen=256,vext_spec=v1.0.

dtcxzyw · 2024-01-05T06:18:30Z

Bisected to LoopVectorizePass:
clang -O3 -mllvm -force-vector-width=16 -mllvm -opt-bisect-limit=179 foo.c -mllvm -print-changed -mllvm -print-module-scope
before (50002896.000000):

; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1

; Function Attrs: nofree norecurse nosync nounwind memory(none) uwtable
; @foo: standalone copy of foo(), already in vectorized form in this dump:
; 16 float lanes, with the loop body split into two halves (%vector.body and
; %vector.body.1) that each consume 16 elements.
; NOTE(review): @main in this module contains its own copy of the loop and no
; call to @foo, so this function does not appear to produce the printed value.
define dso_local float @foo() local_unnamed_addr #0 {
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body.1, %entry
  ; %index: scalar element counter, advanced by 32 per full round trip.
  %index = phi i32 [ 0, %entry ], [ %index.next.1, %vector.body.1 ]
  ; %vec.ind: per-lane values of i for the first half (index+0 .. index+15).
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %entry ], [ %vec.ind.next.1, %vector.body.1 ]
  ; %vec.phi: 16 per-lane partial sums; lane 0 starts at +0.0, the rest at -0.0.
  %vec.phi = phi <16 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %entry ], [ %6, %vector.body.1 ]
  ; i + 1, converted to float and added lane-wise into the partial sums.
  %0 = add nuw nsw <16 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = sitofp <16 x i32> %0 to <16 x float>
  %2 = fadd <16 x float> %vec.phi, %1
  ; Exit after the first half once index == 9984 (9984 + 16 = 10000 elements).
  %3 = icmp eq i32 %index, 9984
  br i1 %3, label %middle.block, label %vector.body.1, !llvm.loop !5

vector.body.1:                                    ; preds = %vector.body
  ; Second half: lane values i + 17, i.e. (i + 16) + 1 for elements index+16 .. index+31.
  %4 = add <16 x i32> %vec.ind, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %5 = sitofp <16 x i32> %4 to <16 x float>
  %6 = fadd <16 x float> %2, %5
  %index.next.1 = add nuw nsw i32 %index, 32
  %vec.ind.next.1 = add <16 x i32> %vec.ind, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  br label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Collapse the 16 per-lane partial sums into the scalar return value.
  %7 = tail call float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %2)
  ret float %7
}

; Function Attrs: nofree nounwind uwtable
; @main ("before" dump): contains a scalar copy of the summation loop (block
; names carry an ".i" suffix and there is no call to @foo). One element is
; added per iteration, in source order.
define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #1 {
entry:
  br label %for.body.i

for.body.i:                                       ; preds = %for.body.i, %entry
  ; %i.010.i: loop counter i; %ret.09.i: running float sum.
  %i.010.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
  %ret.09.i = phi float [ 0.000000e+00, %entry ], [ %add3.i, %for.body.i ]
  ; sum += (float)(i + 1); loop ends when i + 1 reaches 10000.
  %add.i = add nuw nsw i32 %i.010.i, 1
  %conv.i = sitofp i32 %add.i to float
  %add3.i = fadd float %ret.09.i, %conv.i
  %exitcond.not.i = icmp eq i32 %add.i, 10000
  br i1 %exitcond.not.i, label %foo.exit, label %for.body.i, !llvm.loop !9

foo.exit:                                         ; preds = %for.body.i
  ; Widen the float sum to double and pass it to printf with the "%f\n" format.
  %add3.i.lcssa = phi float [ %add3.i, %for.body.i ]
  %conv = fpext float %add3.i.lcssa to double
  %call1 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, double noundef %conv)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>) #3

attributes #0 = { nofree norecurse nosync nounwind memory(none) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { nofree nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { nofree nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0, !1, !2, !3}
!llvm.ident = !{!4}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{!"clang version 18.0.0git"}
!5 = distinct !{!5, !6, !7, !8}
!6 = !{!"llvm.loop.mustprogress"}
!7 = !{!"llvm.loop.isvectorized", i32 1}
!8 = !{!"llvm.loop.unroll.runtime.disable"}
!9 = distinct !{!9, !6}

after (50005000.000000):

; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1

; Function Attrs: nofree norecurse nosync nounwind memory(none) uwtable
; @foo ("after" dump): standalone copy of foo() in vectorized form: 16 float
; lanes, with the loop body split into two halves (%vector.body and
; %vector.body.1) that each consume 16 elements.
; NOTE(review): this appears identical to @foo in the "before" dump, and @main
; in this module has no call to @foo — confirm it is not the function of interest.
define dso_local float @foo() local_unnamed_addr #0 {
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body.1, %entry
  ; %index: scalar element counter, advanced by 32 per full round trip.
  %index = phi i32 [ 0, %entry ], [ %index.next.1, %vector.body.1 ]
  ; %vec.ind: per-lane values of i for the first half (index+0 .. index+15).
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %entry ], [ %vec.ind.next.1, %vector.body.1 ]
  ; %vec.phi: 16 per-lane partial sums; lane 0 starts at +0.0, the rest at -0.0.
  %vec.phi = phi <16 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %entry ], [ %6, %vector.body.1 ]
  ; i + 1, converted to float and added lane-wise into the partial sums.
  %0 = add nuw nsw <16 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = sitofp <16 x i32> %0 to <16 x float>
  %2 = fadd <16 x float> %vec.phi, %1
  ; Exit after the first half once index == 9984 (9984 + 16 = 10000 elements).
  %3 = icmp eq i32 %index, 9984
  br i1 %3, label %middle.block, label %vector.body.1, !llvm.loop !5

vector.body.1:                                    ; preds = %vector.body
  ; Second half: lane values i + 17, i.e. (i + 16) + 1 for elements index+16 .. index+31.
  %4 = add <16 x i32> %vec.ind, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %5 = sitofp <16 x i32> %4 to <16 x float>
  %6 = fadd <16 x float> %2, %5
  %index.next.1 = add nuw nsw i32 %index, 32
  %vec.ind.next.1 = add <16 x i32> %vec.ind, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  br label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Collapse the 16 per-lane partial sums into the scalar return value.
  %7 = tail call float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %2)
  ret float %7
}

; Function Attrs: nofree nounwind uwtable
; @main ("after" dump): the summation loop now has a 16-lane vector version.
; Each lane accumulates every 16th element into its own partial sum, and the
; lanes are combined at the end, so the float additions happen in a different
; order from the scalar loop in the "before" dump.
define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #1 {
entry:
  ; Constant-false condition: control always goes to %vector.ph.
  br i1 false, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %entry
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %index: scalar element counter; %vec.ind: per-lane values of i;
  ; %vec.phi: 16 per-lane partial sums (lane 0 starts at +0.0, the rest at -0.0).
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %vec.phi = phi <16 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %2, %vector.body ]
  ; i + 1, converted to float and added lane-wise into the partial sums.
  %0 = add nuw nsw <16 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = sitofp <16 x i32> %0 to <16 x float>
  %2 = fadd <16 x float> %vec.phi, %1
  ; Advance by 16 elements; exit when 10000 elements are done (10000 / 16 = 625 iterations).
  %index.next = add nuw i32 %index, 16
  %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %3 = icmp eq i32 %index.next, 10000
  br i1 %3, label %middle.block, label %vector.body, !llvm.loop !9

middle.block:                                     ; preds = %vector.body
  ; Collapse the 16 per-lane partial sums into one scalar.
  %4 = call float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %2)
  ; Constant-true condition: control always goes straight to %foo.exit.
  br i1 true, label %foo.exit, label %scalar.ph

scalar.ph:                                        ; preds = %entry, %middle.block
  ; Entry to the leftover scalar loop; both branches that target it are constant
  ; and never taken in this dump.
  %bc.resume.val = phi i32 [ 10000, %middle.block ], [ 0, %entry ]
  %bc.merge.rdx = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ]
  br label %for.body.i

for.body.i:                                       ; preds = %for.body.i, %scalar.ph
  ; Original scalar loop body: sum += (float)(i + 1), one element per iteration.
  %i.010.i = phi i32 [ %bc.resume.val, %scalar.ph ], [ %add.i, %for.body.i ]
  %ret.09.i = phi float [ %bc.merge.rdx, %scalar.ph ], [ %add3.i, %for.body.i ]
  %add.i = add nuw nsw i32 %i.010.i, 1
  %conv.i = sitofp i32 %add.i to float
  %add3.i = fadd float %ret.09.i, %conv.i
  %exitcond.not.i = icmp eq i32 %add.i, 10000
  br i1 %exitcond.not.i, label %foo.exit, label %for.body.i, !llvm.loop !10

foo.exit:                                         ; preds = %middle.block, %for.body.i
  ; Select the sum from whichever loop ran, widen to double, and print it.
  %add3.i.lcssa = phi float [ %add3.i, %for.body.i ], [ %4, %middle.block ]
  %conv = fpext float %add3.i.lcssa to double
  %call1 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, double noundef %conv)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>) #3

attributes #0 = { nofree norecurse nosync nounwind memory(none) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { nofree nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { nofree nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0, !1, !2, !3}
!llvm.ident = !{!4}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{!"clang version 18.0.0git"}
!5 = distinct !{!5, !6, !7, !8}
!6 = !{!"llvm.loop.mustprogress"}
!7 = !{!"llvm.loop.isvectorized", i32 1}
!8 = !{!"llvm.loop.unroll.runtime.disable"}
!9 = distinct !{!9, !6, !7, !8}
!10 = distinct !{!10, !6, !8, !7}

cc @fhahn

topperc · 2024-01-05T06:40:13Z

I'm pretty sure this is known vectorizer behavior. You can disable it with -mllvm -hints-allow-reordering=false

sunshaoce · 2024-01-05T06:52:49Z

Using -mllvm -hints-allow-reordering=false indeed resolved this issue. Thank you very much!

sunshaoce added bug Indicates an unexpected problem or unintended behavior backend:RISC-V vectorization labels Jan 5, 2024

dtcxzyw added the floating-point Floating-point math label Jan 5, 2024

dtcxzyw added miscompilation and removed bug Indicates an unexpected problem or unintended behavior labels Jan 5, 2024

sunshaoce closed this as completed Jan 5, 2024

EugeneZelenko added the question A question, not bug report. Check out https://llvm.org/docs/GettingInvolved.html instead! label Jan 5, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[RISCV] When enabling `-force-vector-width`, RVV vectorization has produced wrong results #77044

[RISCV] When enabling `-force-vector-width`, RVV vectorization has produced wrong results #77044

sunshaoce commented Jan 5, 2024

llvmbot commented Jan 5, 2024

llvmbot commented Jan 5, 2024

topperc commented Jan 5, 2024

dtcxzyw commented Jan 5, 2024

dtcxzyw commented Jan 5, 2024

topperc commented Jan 5, 2024 •

edited

Loading

sunshaoce commented Jan 5, 2024

[RISCV] When enabling -force-vector-width, RVV vectorization has produced wrong results #77044

[RISCV] When enabling -force-vector-width, RVV vectorization has produced wrong results #77044

Comments

sunshaoce commented Jan 5, 2024

llvmbot commented Jan 5, 2024

llvmbot commented Jan 5, 2024

topperc commented Jan 5, 2024

dtcxzyw commented Jan 5, 2024

dtcxzyw commented Jan 5, 2024

topperc commented Jan 5, 2024 • edited Loading

sunshaoce commented Jan 5, 2024

[RISCV] When enabling `-force-vector-width`, RVV vectorization has produced wrong results #77044

[RISCV] When enabling `-force-vector-width`, RVV vectorization has produced wrong results #77044

topperc commented Jan 5, 2024 •

edited

Loading