After r307529, sinl() test failures in FreeBSD's libm #33427

DimitryAndric · 2017-08-05T12:02:21Z


Bugzilla Link	34080
Resolution	FIXED
Resolved on	Jan 16, 2018 06:40
Version	trunk
OS	All
Blocks	#33196
CC	@topperc,@emaste,@zmodem,@RKSimon,@rotateright
Fixed by commit(s)	321424

Extended Description

I recently imported a clang 5.0.0 snapshot (from just before the release_50 branch) into FreeBSD 12.0-CURRENT, and soon after we noticed that one of our libm regression tests was failing, in particular the sinl(3) test.

It turns out that this failure was introduced by r307529 ("This patch completely replaces the scheduling information for the SandyBridge architecture target by modifying the file X86SchedSandyBridge.td located under the X86 Target"), see also https://reviews.llvm.org/D35019.

I did a bit of investigation, and it turned out that r307529 changes the x87 instruction order, but I do not yet know if this is due to some sort of undefined behavior in the sinl code, or a bug in LLVM.

The code in question can be found here: https://github.com/freebsd/freebsd/blob/master/lib/msun/ld80/e_rem_pio2l.h

The part that seems to be miscompiled is the for loop from line 137 onwards, looking like:

43  static const double
44  zero =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
45  two24 =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */

...
77 long double z,w,t,r,fn;
78 double tx[3],ty[2];
79 int e0,ex,i,j,nx,n;
...
136 z = u1.e;
137 for(i=0;i<2;i++) {
138 tx[i] = (double)((int32_t)(z));
139 z = (z-tx[i])*two24;
140 }
141 tx[2] = z;

Before r307529, this loop is unrolled to:

    #DEBUG_VALUE: sinl:z <- [DW_OP_LLVM_fragment 0 80] %FP1
    #DEBUG_VALUE: sinl:x <- %FP0
    .loc    1 0 6                   # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:0:6
    fstp    %st(0)
    .loc    1 84 7 is_stmt 1        # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:84:7
    shll    $16, %ebx
    .loc    1 135 19                # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:135:19
    movq    %rax, -96(%rbp)
    movw    $16406, -88(%rbp)       # imm = 0x4016
    fldt    -96(%rbp)

.Ltmp44:
#DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0
#DEBUG_VALUE: __ieee754_rem_pio2l:u1 <- [DW_OP_LLVM_fragment 0 80] %FP0
.loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20
fnstcw -68(%rbp)
movzwl -68(%rbp), %eax
movw $3199, -68(%rbp) # imm = 0xC7F
fldcw -68(%rbp)
movw %ax, -68(%rbp)
fistl -72(%rbp)
fldcw -68(%rbp)
.loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11
cvtsi2sdl -72(%rbp), %xmm0
.loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9
movsd %xmm0, -48(%rbp)
.loc 1 139 10 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10
movsd %xmm0, -152(%rbp)
.loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9
fsubl -152(%rbp)
.loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16
flds .LCPI0_1(%rip)
fmul %st(0), %st(1)
#DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0
.loc 1 138 20 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20
fnstcw -66(%rbp)
movzwl -66(%rbp), %eax
movw $3199, -66(%rbp) # imm = 0xC7F
fldcw -66(%rbp)
movw %ax, -66(%rbp)
fxch %st(1)
fistl -76(%rbp)
fldcw -66(%rbp)
.loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11
xorps %xmm0, %xmm0
cvtsi2sdl -76(%rbp), %xmm0
.loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9
movsd %xmm0, -40(%rbp)
.loc 1 139 10 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10
movsd %xmm0, -144(%rbp)
.loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9
fsubl -144(%rbp)
.loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16
fmulp %st(1)
.Ltmp45:
.loc 1 141 8 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:141:8
fstpl -32(%rbp)

After r307529, it becomes:

    #DEBUG_VALUE: sinl:z <- [DW_OP_LLVM_fragment 0 80] %FP1
    #DEBUG_VALUE: sinl:x <- %FP0
    .loc    1 0 6                   # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:0:6
    fstp    %st(0)
    .loc    1 135 19 is_stmt 1      # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:135:19
    movq    %rax, -96(%rbp)
    movw    $16406, -88(%rbp)       # imm = 0x4016
    fldt    -96(%rbp)

.Ltmp44:
#DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0
#DEBUG_VALUE: __ieee754_rem_pio2l:u1 <- [DW_OP_LLVM_fragment 0 80] %FP0
.loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20
fnstcw -68(%rbp)
movzwl -68(%rbp), %eax
movw $3199, -68(%rbp) # imm = 0xC7F
fldcw -68(%rbp)
movw %ax, -68(%rbp)
fistl -72(%rbp)
fldcw -68(%rbp)
.loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11
cvtsi2sdl -72(%rbp), %xmm0
.loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9
movsd %xmm0, -48(%rbp)
.loc 1 139 10 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10
movsd %xmm0, -152(%rbp)
.loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9
fsubl -152(%rbp)
.loc 1 138 20 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20
fnstcw -66(%rbp)
.loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16
flds .LCPI0_1(%rip)
.loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20
movzwl -66(%rbp), %eax
movw $3199, -66(%rbp) # imm = 0xC7F
fldcw -66(%rbp)
.loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16
fmul %st(0), %st(1)
#DEBUG_VALUE: __ieee754_rem_pio2l:z <- %FP0
.loc 1 138 20 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:20
movw %ax, -66(%rbp)
fxch %st(1)
fistl -76(%rbp)
fldcw -66(%rbp)
.loc 1 138 11 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:11
xorps %xmm0, %xmm0
cvtsi2sdl -76(%rbp), %xmm0
.Ltmp45:
.loc 1 84 7 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:84:7
shll $16, %ebx
.Ltmp46:
.loc 1 138 9 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:138:9
movsd %xmm0, -40(%rbp)
.loc 1 139 10 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:10
movsd %xmm0, -144(%rbp)
.loc 1 139 9 is_stmt 0 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:9
fsubl -144(%rbp)
.loc 1 139 16 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:139:16
fmulp %st(1)
.Ltmp47:
.loc 1 141 8 is_stmt 1 # /usr/src/lib/msun/src/../ld80/e_rem_pio2l.h:141:8
fstpl -32(%rbp)

Viewed as a context diff this looks as follows:

@@ -179,7 +179,6 @@
jbe .LBB0_9
.LBB0_14: # %if.end80.i
fstp %st(0)

  shll    $16, %ebx
  movq    %rax, -96(%rbp)
  movw    $16406, -88(%rbp)       # imm = 0x4016
  fldt    -96(%rbp)

@@ -194,18 +193,19 @@
movsd %xmm0, -48(%rbp)
movsd %xmm0, -152(%rbp)
fsubl -152(%rbp)

```
  flds    .LCPI0_1(%rip)
```

  fmul    %st(0), %st(1)
  fnstcw  -66(%rbp)

  flds    .LCPI0_1(%rip)
  movzwl  -66(%rbp), %eax
  movw    $3199, -66(%rbp)        # imm = 0xC7F
  fldcw   -66(%rbp)

  fmul    %st(0), %st(1)
  movw    %ax, -66(%rbp)
  fxch    %st(1)
  fistl   -76(%rbp)
  fldcw   -66(%rbp)
  xorps   %xmm0, %xmm0
  cvtsi2sdl       -76(%rbp), %xmm0

  shll    $16, %ebx
  movsd   %xmm0, -40(%rbp)
  movsd   %xmm0, -144(%rbp)
  fsubl   -144(%rbp)

E.g., apart from the moving around of 'shll $16, %ebx', which does not influence the end result, the loading of the LCPI0_1 constant and multiplying it is moved further into the code, and that definitely does influence the result.

I would like to get to the bottom of this before 5.0.0 is released, if possible, otherwise I would propose to revert r307529, at least in the release_50 branch. But it should still be fixed somehow.

Currently I have a small test case which is derived from the libm code in e_rem_pio2l.h, but it does not seem to exhibit the same problem, for some reason that I do not know yet:

#include <stdio.h>

attribute((noinline)) void g(double *tx) {
printf("tx[0]=%.20a tx[1]=%.20a tx[2]=%.20a\n", tx[0], tx[1], tx[2]);
}

static const double two24 = 1.67772160000000000000e+07;

attribute((noinline)) void f(long double z) {
int i;
double tx[3];
for (i = 0; i < 2; i++) {
tx[i] = (double)((int)(z));
z = (z - tx[i]) * two24;
}
tx[2] = z;
g(tx);
}

int main(void)
{
f(0x1.b2f3ee96e76003260000p+23);
return 0;
}

The correct values this should print are:

tx[0]=0x1.b2f3ee00000000000000p+23, tx[1]=0x1.2dcec000000000000000p+22, tx[2]=0x1.93000000000000000000p+16

The text was updated successfully, but these errors were encountered:

RKSimon · 2017-08-05T12:52:31Z

Out of interest, did you see this by setting your target-cpu to sandybridge or x86-64? The generic x86-64 cpu current uses the sandy bridge model by default which will expose this much wider.....

DimitryAndric · 2017-08-05T13:42:32Z

Out of interest, did you see this by setting your target-cpu to sandybridge
or x86-64? The generic x86-64 cpu current uses the sandy bridge model by
default which will expose this much wider.....

The default is to compile for generic x86-64. (Athough the end-user can override this using a CPUTYPE setting, this is not often done in practice.)

DimitryAndric · 2017-08-05T18:56:30Z

Interestingly, compiling with an explicit -march=sandybridge appears to make the code work correctly! So it seems there is still some difference between the 'generic' x86-64 CPU and sandybridge?

DimitryAndric · 2017-08-05T19:23:31Z

Here's a working small test case:

#include <stdio.h>

attribute((noinline)) int g(double *tx) {
int res = (tx[2] != 0x1.93p+16);
printf("tx[0]=%.20a tx[1]=%.20a tx[2]=%.20a: %s\n", tx[0], tx[1], tx[2], res ? "Bad!" : "OK");
return res;
}

static const double two24 = 1.67772160000000000000e+07;

attribute((noinline)) int f(long double z) {
int i;
double tx[3];
for (i = 0; i < 2; i++) {
tx[i] = (double)((int)(z));
z = (z - tx[i]) * two24;
//printf("z=%.20La\n", z);
}
tx[2] = z;
return g(tx);
}

int main(void)
{
return f(0x1.b2f3ee96e76003260000p+23L);
}

The correct result (when compiling with trunk before r307529) should be:

tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

The bad result (when compiling with trunk on or after r307529) is:

tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x0.00000000000000000000p+0: Bad!

Again, the difference between produced assembly looks similar:

--- pio2m-r307528.s
+++ pio2m-r307529.s
@@ -68,11 +68,11 @@
.Lcfi6:
.cfi_def_cfa_register %rbp
subq $64, %rsp

  fldt    16(%rbp)
  fnstcw  -4(%rbp)
  movzwl  -4(%rbp), %eax
  movw    $3199, -4(%rbp)         # imm = 0xC7F
  fldcw   -4(%rbp)

  fldt    16(%rbp)
  movw    %ax, -4(%rbp)
  fistl   -8(%rbp)
  fldcw   -4(%rbp)

@@ -80,12 +80,12 @@
movsd %xmm0, -64(%rbp)
movsd %xmm0, -32(%rbp)
fsubl -32(%rbp)

```
  flds    .LCPI1_0(%rip)
```

  fmul    %st(0), %st(1)
  fnstcw  -2(%rbp)

  flds    .LCPI1_0(%rip)
  movzwl  -2(%rbp), %eax
  movw    $3199, -2(%rbp)         # imm = 0xC7F
  fldcw   -2(%rbp)

  fmul    %st(0), %st(1)
  movw    %ax, -2(%rbp)
  fxch    %st(1)
  fistl   -12(%rbp)

E.g. the flds and fmul are moved apart somehow, and that influences the end result.

Note that uncommenting the printf() call at the end of the for loop 'fixes' the problem, and then it shows the correct end result. But that is pretty much papering over the problem. :(

RKSimon · 2017-08-05T19:25:18Z

Generic x86-64 has the following features:

def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureX87,
FeatureMMX,
FeatureSSE2,
FeatureFXSR,
Feature64Bit,
FeatureSlowBTMem
]>;

It's just using the scheduling model for the instructions that it supports.

But sandybridge/corei7-avx supports:

def SNBFeatures : ProcessorFeatures<[], [
FeatureX87,
FeatureMMX,
FeatureAVX,
FeatureFXSR,
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
class SandyBridgeProc : ProcModel<Name, SandyBridgeModel,
SNBFeatures.Value, [
FeatureSlowBTMem,
FeatureSlowUAMem32
]>;

Any of those feature diffs can cause changes in isel which could affect final codegen + order, and there are plenty of other CPUs in between that use the model as well.

Please can you try with yonah, core2, penryn and corei7 and westmere to see if any of those expose it as well? My current guess is a SSE vs AVX discrepancy.

RKSimon · 2017-08-05T19:26:59Z

Collision! Thanks for the test case, I'll compare the codegen

DimitryAndric · 2017-08-05T19:31:06Z

Btw, at least -O2 is needed to reproduce, I forgot to mention that, sorry.

DimitryAndric · 2017-08-05T20:45:24Z

Please can you try with yonah, core2, penryn and corei7 and westmere to see
if any of those expose it as well? My current guess is a SSE vs AVX
discrepancy.

clang r307529 doesn't accept -march=yonah, but the others all seem to result in working code:

core2:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

penryn:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

nehalem:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

westmere:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

sandybridge:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

ivybridge:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

haswell:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

broadwell:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

skylake:
tx[0]=0x1.b2f3ee00000000000000p+23 tx[1]=0x1.2dcec000000000000000p+22 tx[2]=0x1.93000000000000000000p+16: OK

Apparently, only when you use plain x86-64, it starts failing...

RKSimon · 2017-08-05T20:52:46Z

The bug disappears when SSE3 is enabled which allows use of FISTTP to avoid control word manipulation for truncated stores.

RKSimon · 2017-08-05T22:05:39Z

I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a simpler test case and put it up for review tomorrow.

RKSimon · 2017-08-06T14:48:28Z

I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a
simpler test case and put it up for review tomorrow.

That fix fails on EXPENSIVE_CHECKS builds, so I don't have a solution right now.

We can make a minor tweak to the sandy bridge model to hide the problem - disabling the FLDCW16m/FNSTCW16m entries is enough.

I've been looking at X86TargetLowering::EmitInstrWithCustomInserter in case the FNSTCW insertion is wrong, but I haven't noticed anything.

I've added the test case at rL310198

Note: X86InstrFPStack.td is missing Defs = [FPSW] for various x87 instructions, as they leave the status flags reg in an undefined state - that needs fixing but creating tests will be tricky.

Did I mention I hate x87 programming?

DimitryAndric · 2017-08-08T10:58:58Z

I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a
simpler test case and put it up for review tomorrow.

That fix fails on EXPENSIVE_CHECKS builds, so I don't have a solution right
now.

We can make a minor tweak to the sandy bridge model to hide the problem -
disabling the FLDCW16m/FNSTCW16m entries is enough.

What will be the side effects of such a change?

I've been looking at X86TargetLowering::EmitInstrWithCustomInserter in case
the FNSTCW insertion is wrong, but I haven't noticed anything.

I've added the test case at rL310198

Note: X86InstrFPStack.td is missing Defs = [FPSW] for various x87
instructions, as they leave the status flags reg in an undefined state -
that needs fixing but creating tests will be tricky.

Did I mention I hate x87 programming?

Heh, it's a can of worms, indeed.

RKSimon · 2017-08-09T09:16:14Z

I have a fix (add missing DEFS/USES for the FPSW reg) - I'll create a
simpler test case and put it up for review tomorrow.

That fix fails on EXPENSIVE_CHECKS builds, so I don't have a solution right
now.

We can make a minor tweak to the sandy bridge model to hide the problem -
disabling the FLDCW16m/FNSTCW16m entries is enough.

What will be the side effects of such a change?

Side effect is that the problem (instructions being moved across a control word instruction) is still there and likely to go off at some other point now that teams are starting to improve their scheduler models, affecting x87 instruction ordering.

DimitryAndric · 2017-08-13T18:29:48Z

Ok, so to fix this, https://reviews.llvm.org/rL310792 should be merged into 5.0.0. It is pretty big, but it's a continuation of r307529 which is already in 5.0.0, so I'd really like to have it. Gadi, do you think this is OK? Or do we need to let r310792 bake for a while, still?

llvmbot · 2017-08-14T06:28:02Z

I guess it should be ok.
Note that I ran various tests before submitting the revision. I will re-run them with your change.
Note that we are still missing the scheduling of some SNB instructions - which I hope to finish gathering after committing the main HSW and SKL scheduling.

zmodem · 2017-08-15T00:18:52Z

I'll let r310792 sit in tree a bit more before merging it.

zmodem · 2017-08-21T23:22:27Z

I'll let r310792 sit in tree a bit more before merging it.

I'm waiting for follow-up comments on https://reviews.llvm.org/D36388 to be resolved.

zmodem · 2017-08-22T21:58:07Z

Perhaps we should try to revert r307529 on the branch instead?

DimitryAndric · 2017-08-23T20:32:45Z

Perhaps we should try to revert r307529 on the branch instead?

Yeah, maybe that is a safer solution. Before r307529, the scheduler seems to have worked okay enough... :)

zmodem · 2017-08-23T21:18:18Z

Perhaps we should try to revert r307529 on the branch instead?

Yeah, maybe that is a safer solution. Before r307529, the scheduler seems
to have worked okay enough... :)

I've reverted it in r311600.

DimitryAndric · 2017-08-24T14:36:21Z

Perhaps we should try to revert r307529 on the branch instead?

Yeah, maybe that is a safer solution. Before r307529, the scheduler seems
to have worked okay enough... :)

I've reverted it in r311600.

Thanks, I have verified that both my simplified test cases, and the full libm regression tests now all succeed again.

llvmbot · 2017-12-21T16:17:58Z

Compiling for i386, thus no SSE on OpenBSD, I see that fmuls is moved over an fldcw and causes the same issue. I guess this wasn‘t fixed with the sandy bridge changes?

RKSimon · 2017-12-21T16:29:29Z

Compiling for i386, thus no SSE on OpenBSD, I see that fmuls is moved over
an fldcw and causes the same issue. I guess this wasn‘t fixed with the sandy
bridge changes?

No the underlying problem is still there and I don't think anyone has found the root cause - I think Bug #21160 is the same but I have no proof. If you have a different testcase it'd be useful for comparison.

llvmbot · 2017-12-21T16:40:08Z

Sorry, to elaborate. I have an OpenBSD/i386 machine with clang 4.0, clang 5.0 and the latest clang 6.0 git checkout. All seem the have the same behaviour.

With -O2, compiling sqlite3 (or a smaller C code of that), some fmul is moved over:

Bad:
.loc 4 35 41 # sqlite3.c:35:41
fnstcw (%esp)
movw (%esp), %ax
movw $3199, (%esp) # imm = 0xC7F
fldcw (%esp)
movw %ax, (%esp)
.loc 4 35 13 # sqlite3.c:35:13
movl %ecx, %eax
sarl $31, %eax
.loc 4 35 61 # sqlite3.c:35:61
fmuls .LCPI0_2@GOTOFF(%ebx)
.loc 4 35 41 # sqlite3.c:35:41
fistpll 16(%esp)
fldcw (%esp)

Good:
.loc 4 35 60 # sqlite3.c:35:60
fldl 28(%ecx)
.loc 4 35 61 # sqlite3.c:35:61
flds .LCPI0_2@GOTOFF(%eax)
fmulp %st(1)
.loc 4 35 41 # sqlite3.c:35:41
fnstcw 28(%esp)
movw 28(%esp), %si
movw $3199, 28(%esp) # imm = 0xC7F
fldcw 28(%esp)
movw %si, 28(%esp)
fistpll 32(%esp)
fldcw 28(%esp)
movl 32(%esp), %ebx
movl 36(%esp), %eax
.loc 4 35 39 # sqlite3.c:35:39

This is part of -O0 vs -O1. In the good example, flds/fmulp is happening before fldcw/fistpll/flcdw. In the bad example, it becomes flcdw/fmuls/fistpll/fldcw.

/var/tmp/pwildt/w-llvm/fake-i386/usr/local/bin/clang -O2 -g -Wall -v -c sqlite3.c
clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 0604b824fdd2e3b0c6705a4b7aabc532ebb4613e) (https://github.com/llvm-mirror/llvm.git eb585b8)
Target: i386-unknown-openbsd6.2
Thread model: posix
InstalledDir: /var/tmp/pwildt/w-llvm/fake-i386/usr/local/bin
"/tmp/pwildt/w-llvm/fake-i386/usr/local/bin/clang-6.0" -cc1 -triple i386-unknown-openbsd6.2 -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name sqlite3.c -mrelocation-model pic -pic-level 1 -pic-is-pie -mthread-model posix -mdisable-fp-elim -relaxed-aliasing -masm-verbose -mconstructor-aliases -target-cpu i486 -dwarf-column-info -debug-info-kind=limited -dwarf-version=2 -debugger-tuning=gdb -v -coverage-notes-file /home/pwildt/sqlite3/sqlite3.gcno -resource-dir /tmp/pwildt/w-llvm/fake-i386/usr/local/lib/clang/6.0.0 -O2 -Wall -fdebug-compilation-dir /home/pwildt/sqlite3 -ferror-limit 19 -fmessage-length 136 -femulated-tls -fwrapv -stack-protector 2 -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -fobjc-runtime=gnustep -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o sqlite3.o -x c sqlite3.c
clang -cc1 version 6.0.0 based upon LLVM 6.0.0svn default target i386-unknown-openbsd6.2

llvmbot · 2017-12-21T18:00:45Z

Sqlite3-based Test-Case

llvmbot · 2017-12-21T18:04:16Z

Compiling for i386, thus no SSE on OpenBSD, I see that fmuls is moved over
an fldcw and causes the same issue. I guess this wasn‘t fixed with the sandy
bridge changes?

No the underlying problem is still there and I don't think anyone has found
the root cause - I think Bug #21160 is the same but I have no proof. If you
have a different testcase it'd be useful for comparison.

Thank you for taking the time to reply. I have attached our test case,
which is an (in comparison) small part of sqlite3. The issue arises in
the function computeJD(), p->JD becomes incorrect.

Good output:
$ TZ=GMT ./test
x = 0000-00-00 00:00:0.000000 (210866760000000)
p = 1970-01-01 00:00:0.000000 (210866760000000)
x = 1970-01-01 00:00:0.000000 (210866760000000)
t = 0
tm = { 70/01/01 00:00:00+0 }
p = 1970-01-01 00:00:0.000000 (210866760000000)
y = 1970-01-01 00:00:0.000000 (210866760000000)
yJD = 210866760000000, xJD=210866760000000, offset = 0
Result: 1970-01-01 00:00:00

Bad output:
$ TZ=GMT ./test
x = 0000-00-00 00:00:0.000000 (210866760000000)
p = 1970-01-01 00:00:0.000000 (210866760000000)
x = 1970-01-01 00:00:0.000000 (210866760000000)
t = 0
tm = { 70/01/01 00:00:00+0 }
p = 1970-01-01 00:00:0.000000 (210866754551808)
y = 1970-01-01 00:00:0.000000 (210866754551808)
yJD = 210866754551808, xJD=210866760000000, offset = -5448192
Result: 1969-12-31 22:29:11

RKSimon · 2017-12-24T12:28:26Z

rL321424 should finally fix this, it marks the pseudo x87 memory folded instructions as having side effects, the same as we do for the actual x87 opcodes.

This should prevent the scheduler from moving these instructions across the fldcw sequences that has been causing the rounding issues.

llvmbot · 2017-12-24T12:57:35Z

Yup, that fixed my test case, thank you and merry christmas! :)

zmodem · 2018-01-16T14:40:51Z

Thanks everyone! This landed before 6.0 branch, so will be part of 6.0.0.

zmodem mentioned this issue Jul 19, 2017

[meta] 5.0.0 Release Blockers #33196

Closed

llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 10, 2021

This issue was closed.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

After r307529, sinl() test failures in FreeBSD's libm #33427

After r307529, sinl() test failures in FreeBSD's libm #33427

DimitryAndric commented Aug 5, 2017

RKSimon commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

RKSimon commented Aug 5, 2017

RKSimon commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

RKSimon commented Aug 5, 2017

RKSimon commented Aug 5, 2017

RKSimon commented Aug 6, 2017

DimitryAndric commented Aug 8, 2017

RKSimon commented Aug 9, 2017

DimitryAndric commented Aug 13, 2017

llvmbot commented Aug 14, 2017

zmodem commented Aug 15, 2017

zmodem commented Aug 21, 2017

zmodem commented Aug 22, 2017

DimitryAndric commented Aug 23, 2017

zmodem commented Aug 23, 2017

DimitryAndric commented Aug 24, 2017

llvmbot commented Dec 21, 2017

RKSimon commented Dec 21, 2017

llvmbot commented Dec 21, 2017

llvmbot commented Dec 21, 2017

llvmbot commented Dec 21, 2017

RKSimon commented Dec 24, 2017

llvmbot commented Dec 24, 2017

zmodem commented Jan 16, 2018

After r307529, sinl() test failures in FreeBSD's libm #33427

After r307529, sinl() test failures in FreeBSD's libm #33427

Comments

DimitryAndric commented Aug 5, 2017

Extended Description

RKSimon commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

RKSimon commented Aug 5, 2017

RKSimon commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

DimitryAndric commented Aug 5, 2017

RKSimon commented Aug 5, 2017

RKSimon commented Aug 5, 2017

RKSimon commented Aug 6, 2017

DimitryAndric commented Aug 8, 2017

RKSimon commented Aug 9, 2017

DimitryAndric commented Aug 13, 2017

llvmbot commented Aug 14, 2017

zmodem commented Aug 15, 2017

zmodem commented Aug 21, 2017

zmodem commented Aug 22, 2017

DimitryAndric commented Aug 23, 2017

zmodem commented Aug 23, 2017

DimitryAndric commented Aug 24, 2017

llvmbot commented Dec 21, 2017

RKSimon commented Dec 21, 2017

llvmbot commented Dec 21, 2017

llvmbot commented Dec 21, 2017

llvmbot commented Dec 21, 2017

RKSimon commented Dec 24, 2017

llvmbot commented Dec 24, 2017

zmodem commented Jan 16, 2018