[RFC] Dynamic check for naked pointers #9534

kayceesrk · 2020-05-04T12:20:35Z

Note: This PR is not for merging, but to aid discussion

This PR adds the ability to dynamically identify naked pointers to 4.10.0 compiler. A naked pointer is identified during a GC if the pointer is either

not in heap and reading the head value triggers a segfault or
not in heap, points to a non-zero-sized block and is not black

If the block is zero-sized or marked black, then the GC would not have to scan this object and can be safely skipped. Of course, this technique will have false negatives since the bit pattern in the field where the header is expected to be may look like a zero-sized block or marked black.

When a naked pointer is found, a warning is output to stderr and the program continues.

OPAM remote

You can install this compiler using this opam repo:

opam switch create 4.10.0+nnp+check --repositories=nnp=git+https://github.com/kayceesrk/opam-repo.git,default

Example

/* cstub.c */
#include "caml/mlvalues.h"

value make_block (value header, value size) {
  int64_t* p = (int64_t*)malloc(sizeof(int64_t) * (Int64_val(size) + 1));
  p[0] = Int64_val(header);
  return (value)&p[1];
}

value get_raw_pointer (value v) {
  return (value)Int64_val(v);
}

(* test.ml *)
external make_block : int64 -> int64 -> Obj.t = "make_block"                        
external get_raw_pointer : int64 -> Obj.t = "get_raw_pointer"                       
                                                                                    
(* See runtime/caml/gc.h *)                                                         
let white = 0L                                                                      
let gray = 1L                                                                       
let blue = 2L                                                                       
let black = 3L                                                                      
                                                                                    
(* See runtime/caml/mlvalues.h *)                                                   
let mk_header tag colour size =                                                     
  let open Int64 in                                                                 
  assert (colour >= 0L && colour <= 3L);                                            
  assert (tag >= 0L && tag <= 255L);        
  assert (size >=0L);
  logor (shift_left size 10) (logor (shift_left colour 8) tag)

let do_gc () =
  print_endline "**** Begin full major GC ****";
  Gc.full_major ();
  print_endline "**** End full major GC ****"

(* External object with black header is accepted. GC doesn't scan black
 * objects. *)
let ex1 =
  let h = mk_header 0L black 1000L in
  let e = make_block h 1000L in
  let o = Obj.new_block 0 1 in
  Obj.set_field o 0 e;
  o

let _ = do_gc ()

(* External object with size 0 is accepted. GC doesn't scan 0 sized objects. *)
let ex2 =
  (* The header may be non-black for 0 sized object *)
  let h = mk_header 0L white 0L in
  (* The actual size of the object in memory may be different *)
  let e = make_block h 1000L in
  let o = Obj.new_block 0 1 in
  Obj.set_field o 0 e;
  o

let _ = do_gc ()

(* A non-zero-sized external object cannot be non-black. *)
let ex3 =
  let h = mk_header 0L white 1000L in
  let e = make_block h 1000L in
  let o = Obj.new_block 0 1 in
  Obj.set_field o 0 e;
  o

let _ = do_gc ()

(* If external pointers point to unallocated memory, a warning is generated *)
let ex3 =
  let o = Obj.new_block 0 1 in
  Obj.set_field o 0 (get_raw_pointer 42L);
  o

let _ = do_gc ()

let _ = [ex1; ex2; ex3]

$ ocamlopt.opt -g test.ml cstub.c
$ ./a.out 
**** Begin full major GC ****
**** End full major GC ****
**** Begin full major GC ****
**** End full major GC ****
**** Begin full major GC ****
Out-of-heap pointer at 0x7feaf8083ff8 of value 0x55e0ff10aa38 has non-black head (tag=0)
**** End full major GC ****
**** Begin full major GC ****
Out-of-heap pointer at 0x7feaf800c2e0 of value 0x55e0ff10aa38 has non-black head (tag=0)
Out-of-heap pointer at 0x7feaf800c2e0 of value 0x55e0ff10aa38 has non-black head (tag=0)
Out-of-heap pointer at 0x7feaf8083ff8 of value 0x2a. Cannot read head.
**** End full major GC ****
Out-of-heap pointer at 0x7feaf800c2e0 of value 0x55e0ff10aa38 has non-black head (tag=0)
Out-of-heap pointer at 0x7feaf800c2f0 of value 0x2a. Cannot read head.

Finding the source of the naked pointers

Given that the naked pointers are discovered during the GC, how does one find the source of the problem? rr makes it quite easy. Let's look at a debugging a naked pointer on a real project:

$ opam install frama-c
$ rr frama-c
rr: Saving execution to trace directory `/home/kc/.local/share/rr/frama-c-5'.
Out-of-heap pointer at 0x55fc1e2754d8 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e275600 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e2754d8 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e275600 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e2754d8 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e275600 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e2754d8 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e275600 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e2754d8 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e275600 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e2754d8 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
Out-of-heap pointer at 0x55fc1e275600 of value 0x55fc1e3a0cc0 has non-black head (tag=144)
$ rr replay
(rr) watch *(value*)0x55fc1e2754d8
Hardware watchpoint 1: *(value*)0x55fc1e2754d8
(rr) c
Continuing.

Hardware watchpoint 1: *(value*)0x55fc1e2754d8

Old value = 1
New value = 94541327240384
0x000055fc1dab48f8 in camlUnmarshal__entry () at src/libraries/datatype/unmarshal.ml:72
72      src/libraries/datatype/unmarshal.ml: No such file or directory.

This corresponds to the naked pointer at https://github.com/Frama-C/Frama-C-snapshot/blob/master/src/libraries/datatype/unmarshal.ml#L72.

Limitations

Tested only on amd64 on Linux.
Uses rr for locating the allocation point. There may be clever ways to use gdb to do the same.

stedolan · 2020-05-04T13:01:19Z

This looks great!

One change that might reduce false positives: I think it's invalid to create out-of-heap pointers that aren't black, regardless of whether they have size 0. (One reason for this is because the GC will mutate non-black headers, which probably isn't what you want on out-of-heap data, and might segfault if the out-of-heap data is mapped readonly)

This should make the test a bit more robust: not that many words of memory are black headers (you need a 1 bit at a specific location), but an awful lot of memory happens to be zero.

One other check you should consider is checking that the size is valid. Sizes of above, say, 2^40 words are definitely not valid headers.

kayceesrk · 2020-05-04T14:08:26Z

I think it's invalid to create out-of-heap pointers that aren't black, regardless of whether they have size 0. (One reason for this is because the GC will mutate non-black headers, which probably isn't what you want on out-of-heap data, and might segfault if the out-of-heap data is mapped readonly)

Should these be Caml_black then? IINM, GC doesn't mutate zero-sized headers.
For example, both caml_darken and mark_slice_darken check for non-zero size. These size checks can be removed if objects in atom table were black, I suspect.

stedolan · 2020-05-04T15:00:43Z

Huh, right. I think it would make more sense to have them be black, as you say.

xavierleroy · 2020-05-04T17:00:43Z

By all means, let's create atoms in the atom table with black color, if that's more convenient. There's no special reason atoms are white today: it makes no difference in "naked pointers" mode.

I see that ocamlopt already uses the black color for statically-allocated blocks containing structured constants. It makes complete sense to do the same for the atom table.

xavierleroy · 2020-05-04T17:03:27Z

Concerning sanity checks on possible headers: I agree with putting an upper bound on the size field. We could also try to check the size against the tag, e.g. a Double_tag block has size 1 (64 bits) or 2 (32 bits). Not sure this is really useful, though.

kayceesrk · 2020-05-04T17:55:28Z

@garrigue I ran lablgtk/examples/about.exe under this branch and I observed several naked pointers:

https://github.com/garrigue/lablgtk/blob/lablgtk3/src/gpointer.ml#L38
https://github.com/garrigue/lablgtk/blob/lablgtk3/src/pango.ml#L81
A few more calls to _get_tables() in src/gdkEnums.ml

The output is a bit noisy with the same warning repeated for several major GC rounds.

kayceesrk · 2020-05-05T06:29:50Z

We could also try to check the size against the tag, e.g. a Double_tag block has size 1 (64 bits) or 2 (32 bits). Not sure this is really useful, though.

Given that this requires the last byte to have Double_tag, I suspect that this might not be very effective.

garrigue · 2020-05-05T07:01:20Z

@kayceesrk Thank you, I'll investigate those.
Great tool, and even better if it could be available on opam for library developers.

kayceesrk · 2020-05-05T07:07:37Z

Great tool, and even better if it could be available on opam for library developers.

Thanks! We should make this compiler switch available in the official opam repository. @dra27 @avsm.

see ocaml/ocaml#9534 for more discussion on the subject.

avsm · 2020-05-05T13:46:01Z

Done; ocaml/opam-repository#16357 -- I'll post a note to discuss.ocaml.org to encourage more testing when it's merged.

When merged, you can use it simply by:

opam update
opam switch create 4.10.0+nnpcheck
eval $(opam env)

garrigue · 2020-05-05T13:57:11Z

Is NULL a naked pointer, or is there a special case for it ?
In the case you refer to in gpointer.ml, the value is either NULL or a valid pointer in the ocaml heap.

garrigue · 2020-05-05T14:07:26Z

Also, is there some kind of dummy header I could use for creating a static tuple on the C side ?
Creating a dynamic one seems stupid in the case of *_get_tables.

mshinwell · 2020-05-05T16:16:39Z

Regarding NULL: @lpw25 and I were discussing that we should probably add Obj.null, which would be the same as NULL in the C world (with an assertion that it's actually zero). Then the GC should have a special case to allow this.

kayceesrk · 2020-05-05T16:20:13Z

In the case you refer to in gpointer.ml, the value is either NULL or a valid pointer in the ocaml heap.

In this case, you could use Obj.magic 0 (least significant bit is 1) as the value for raw_null since you always know that the non-null value will be an object on the heap (least significant bit is 0).

kayceesrk · 2020-05-05T16:25:52Z

Also, is there some kind of dummy header I could use for creating a static tuple on the C side ?
Creating a dynamic one seems stupid in the case of *_get_tables.

You can use Caml_black as a valid header. This would correspond to 0 sized object, with tag 0 and gc color black. Unfortunately, this is defined only in caml/gc.h and its definition will change when multicore is upstreamed as multicore uses different color scheme.

@mshinwell it might be useful to expose a stable macro for a static header.

gasche · 2020-05-05T16:30:02Z

I would naive assume that it is fairly common in C bindings to sometimes store NULL instead of a valid pointer into the OCaml heap (can we know?). Wouldn't it make sense for the GC to support (and ignore) NULL pointers? This may fit nicely with the current Classify_addr approach, and it probably is rather cheap. (Is this also what you are suggesting, @mshinwell?). @kayceesrk is proposing to port the code to use the tagged-integer representation of 0 instead.

lpw25 · 2020-05-05T16:38:33Z

I think we should separate out two items here

I think that we should add Obj.null but I do not think it needs to be part of the work to switch to no-naked-pointers. All the places where I would like to use it currently use something worse, but something that will still work in no-naked-pointers mode
The support for ignoring null pointers in OCaml values is clearly part of the no-naked-pointers work. I think that specifically allowing null in OCaml values will make the transition easier, we can add Obj.null later to give easy access to it on the OCaml side.

Using a tagged integer instead doesn't fill the same role. Currently you can store either an OCaml value or null into a block. You know that those two things are disjoint and allowed by the runtime.

mshinwell · 2020-05-05T16:38:56Z

@gasche That is what I'm suggesting. Effectively it would amount to using an unboxed sum type NULL | Valid of value for all places where values can occur. Hence the representation of NULL needing to be apart from any valid value. Zero is a convenient choice since it may help with C bindings as you suggest.

@kayceesrk Yeah, we should expose a macro for a static header and recommend that in the manual, most likely.

kayceesrk · 2020-05-05T18:29:37Z

I don't have strong opinions against Obj.null. Turns out it is only a few lines of change to add it to trunk. I can make a PR if there is interest.

kayceesrk · 2020-05-06T05:26:00Z

I've created #9535 for Obj.null.

xavierleroy · 2020-05-06T08:11:02Z

This looks pretty good, thanks! Are you interested in a proper review or not?

kayceesrk · 2020-05-06T08:18:36Z

@xavierleroy Happy to get a proper review!

xavierleroy

This looks good to me, and should be very useful to find uses of "naked pointers" in the wild. Thanks!

Two minor comments below. No changes required.

xavierleroy · 2020-05-06T16:37:27Z

runtime/major_gc.c

+  tag_t t;
+
+  Caml_state->checking_pointer_pc = &&on_segfault;
+  h = Hd_val(v);


This, Sir, is a bold use of GCC's first-class labels, but I kind of like it.

The more conservative approach would sigsetjmp here and siglongjmp out of the signal handler for SIGSEGV, but maybe we found out previously that it doesn't work.

I'm just hoping the optimizer understands that unpredictable control flow can occur.

The optimizer doesn't if the declaration header_t h isn't marked volatile. I haven't tried sigsetjmp/siglongjmp. I shall give that a try shortly.

I'm afraid you'll find sigsetjmp and siglongjmp too costly (they work with signal masks via system calls?). The current code is very efficient; all we need it to get it to work reliably despite GCC and Clang optimizations...

You are right. sigsetjmp and siglongjmp is quite slow. I tried opam install frama-c -y on a fresh switch. With GCC first-class labels, it takes 3m30.726s. With sigsetjmp/siglongjmp it takes 16m2.556s.

xavierleroy · 2020-05-06T16:39:08Z

runtime/major_gc.c

+    return 1;
+
+on_segfault:
+  if (Caml_state->checking_pointer_pc == NULL) {


Maybe I'm missing something obvious, but it's unclear to me why on_segfault is here and not on line 186 below. On a related note, why are we re-testing checking_ponter_pc?

I needed this to get the optimizer to generate the correct code. I agree that this is quite brittle.

checking_pointer_pc is reset at line 160 if there was no segfault. The segfault handler doesn't reset it. I'll give sigsetjmp/siglongjmp a try as it might be the better way to do this.

OK, I suspected an issue with optimizations.

There may be a way to turn optimizations off for this function, using GCC/Clang attributes.

xavierleroy · 2020-05-06T16:39:35Z

runtime/major_gc.c

+    Caml_state->checking_pointer_pc = NULL;
+    fprintf (stderr, "Out-of-heap pointer at %p of value %p. "
+                     "Cannot read head.\n", p, (void*)v);


I was naively expecting on_segfault to jump right here.

Same as above.

runtime/signals_nat.c

kayceesrk · 2020-05-07T12:27:53Z

I have disabled optimizations for is_pointer_safe function under GCC and Clang. GCC compiles the code correctly. Clang miscompiled the code even without optimizations turned on. I had to insert a never-taken-branch to generate the correct code. I've confirmed that it correctly returns from segfault handler on Apple clang version 11.0.0 (clang-1100.0.33.12) and clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final).

xavierleroy · 2020-05-07T13:20:17Z

I played a bit with the approach and, yes, it is hard to make GCC and Clang produce the expected assembly code, even in -O0 mode!

One possibility is to use inline assembly for the critical bits... Since this is goign to run under x86_64 only, it might be acceptable. If you're interested I have some asm code that I could adapt.

kayceesrk · 2020-05-07T13:49:16Z

That would be useful. Please do share.

xavierleroy · 2020-05-08T08:41:47Z

Here is what I had in mind. First, the "load and trap segv" part, written in assembly:

static inline int safe_load (header_t * addr, /*out*/ header_t * contents)
{
  int ok;
  header_t h;
  intnat tmp;

  asm volatile(
      "leaq 1f(%%rip), %[tmp] \n\t"
      "movq %[tmp], 0(%[handler]) \n\t"
      "xorl %[ok], %[ok] \n\t"
      "movq 0(%[addr]), %[h] \n\t"
      "movl $1, %[ok] \n\t"
  "1: \n\t"
      "xorq %[tmp], %[tmp] \n\t"
      "movq %[tmp], 0(%[handler])"
      : [tmp] "=&r" (tmp), [ok] "=&r" (ok), [h] "=&r" (h)
      : [addr] "r" (addr),
        [handler] "r" (&(Caml_state->checking_pointer_pc)));
  *contents = h;
  return ok;
}

This could also be written in true asm syntax in runtime/amd64.S, but then it would not be inline.

Now we can use this in is_pointer_safe:

static int is_pointer_safe (value v, value *p)
{
  header_t h;
  tag_t t;

  if (! safe_load(&Hd_val(v), &h)) goto on_segfault;

  t = Tag_hd(h);
  if (t == Infix_tag) {
    v -= Infix_offset_hd(h);
    if (! safe_load(&Hd_val(v), &h)) goto on_segfault;
  }

  /* For the pointer to be considered safe, either the given pointer is in heap
   * or the (out of heap) pointer has a black header and its size is < 2 ** 40
   * words (128 GB). If not, we report a warning. */
  if (Is_in_heap (v) || (Is_black_hd(h) && Wosize_hd(h) < (1ul << 40)))
    return 1;

  if (!Is_black_hd(h)) {
    fprintf (stderr, "Out-of-heap pointer at %p of value %p has "
                    "non-black head (tag=%d)\n", p, (void*)v, t);
  } else {
    fprintf (stderr, "Out-of-heap pointer at %p of value %p has "
                      "suspiciously large size: %lu words\n",
              p, (void*)v, Wosize_hd(h));
  }
  return 0;

 on_segfault:
  fprintf (stderr, "Out-of-heap pointer at %p of value %p. "
           "Cannot read head.\n", p, (void*)v);
  return 0;
}

kayceesrk · 2020-05-09T06:17:25Z

I've incorporated the inline assembly suggestion. Tested on GCC and Clang and confirmed that they work as expected.

As a next step, I suppose we should announce this more widely. @avsm volunteered to post it on discuss.ocaml.org so that there is wider testing. We should also close this PR as it is not meant for merging.

xavierleroy · 2020-05-13T16:38:57Z

Closing as suggested by @kayceesrk . Looking forwards to having this as an OPAM switch!

kayceesrk · 2020-05-19T11:49:31Z

Posted on discuss.ocaml.org for wider visibility https://discuss.ocaml.org/t/ann-a-dynamic-checker-for-detecting-naked-pointers/5805

jberdine · 2020-05-27T20:38:36Z

Thank you very much for this! This sort of thing is a significant help to know what the situation is between existing code and upcoming changes, which is an immensely better situation to be in than just guessing how much code needs to be reworked and retuned.

It isn't useful so far, but I just wanted to mention that I have been doing some testing with this. Unfortunately, so far I have failed to get it to trigger where, as far as I understand, it should, which leaves me unsure of the other non-reports of problems. I will work on narrowing things down.

kayceesrk · 2020-05-27T23:35:50Z

Hi @jberdine, I am happy to look at this if you have example.

stedolan · 2020-09-03T13:58:54Z

@nojb @kayceesrk @xavierleroy @dra27

Here's an implementation of safe_load using Windows SEH for MSVC:

#include <stdio.h>
#include <windows.h>

typedef unsigned int value;

int safe_load(volatile value* p, value* result) {
    value v;

    __try {
      v = *p;
    }
    __except(GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION ? 
          EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) {
      *result = 0xdeadbeef;
      return 0;
    }

    *result = v;
    return 1;
}

int main() {
    value foo = 42;
    value* ps[] = { &foo, NULL, (value*)438294732 };
    for (int i = 0; i < 3; i++) {
        value res;
        int r = safe_load(ps[i], &res);
        printf("%p: %d %08lx\n", ps[i], r, res);
    }
}

nojb · 2020-09-04T14:40:28Z

@stedolan Thank you! I confirm that with this function the dynamic check seems to works under msvc.

(It would be nice to have this as a configure option.)

kayceesrk · 2020-09-05T05:11:06Z

Thanks to @dra27, the support for mscv64 has been merged into kayceesrk/ocaml/4.10.0+nnp+check branch.

Initial commit of naked pointer checker

c1fc38f

avsm marked this pull request as draft May 4, 2020 12:45

Atoms have black header. Objects must be reasonable sized.

deb67af

avsm added a commit to avsm/opam-repository that referenced this pull request May 5, 2020

add ocaml.4.10.0+nnpcheck to dynamically check for naked pointers

fb3970b

see ocaml/ocaml#9534 for more discussion on the subject.

avsm mentioned this pull request May 5, 2020

add ocaml.4.10.0+nnpcheck to dynamically check for naked pointers ocaml/opam-repository#16357

Merged

kayceesrk mentioned this pull request May 6, 2020

Add Obj.null #9535

Closed

xavierleroy approved these changes May 6, 2020

View reviewed changes

Disable optimisations for is_pointer_safe

76d3c60

Convince clang to generate the correct code

909d34b

Use inline assembly to circumvent obstinate clang

3bd8081

xavierleroy closed this May 13, 2020

kayceesrk mentioned this pull request May 14, 2020

Add a macro for out-of-heap block header #9564

Merged

kayceesrk mentioned this pull request Sep 5, 2020

Extend support for 4.10.0 nnp check to msvc64 kayceesrk/ocaml#5

Merged

This was referenced Sep 29, 2020

Add a naked pointers dynamic checker #9947

Closed

Naked pointers crash on trunk #9950

Closed

yallop mentioned this pull request Jun 26, 2021

Remove naked pointers from the Dl library. yallop/ocaml-ctypes#681

Merged

[RFC] Dynamic check for naked pointers #9534

[RFC] Dynamic check for naked pointers #9534

Conversation

kayceesrk commented May 4, 2020

OPAM remote

Example

Finding the source of the naked pointers

Limitations

stedolan commented May 4, 2020

kayceesrk commented May 4, 2020 • edited Loading

stedolan commented May 4, 2020

xavierleroy commented May 4, 2020

xavierleroy commented May 4, 2020

kayceesrk commented May 4, 2020 • edited Loading

kayceesrk commented May 5, 2020

garrigue commented May 5, 2020

kayceesrk commented May 5, 2020

avsm commented May 5, 2020

garrigue commented May 5, 2020 • edited Loading

garrigue commented May 5, 2020 • edited Loading

mshinwell commented May 5, 2020

kayceesrk commented May 5, 2020

kayceesrk commented May 5, 2020

gasche commented May 5, 2020

lpw25 commented May 5, 2020

mshinwell commented May 5, 2020

kayceesrk commented May 5, 2020

kayceesrk commented May 6, 2020

xavierleroy commented May 6, 2020

kayceesrk commented May 6, 2020

xavierleroy left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

kayceesrk commented May 7, 2020 • edited Loading

xavierleroy commented May 7, 2020

kayceesrk commented May 7, 2020

xavierleroy commented May 8, 2020

kayceesrk commented May 9, 2020

xavierleroy commented May 13, 2020

kayceesrk commented May 19, 2020

jberdine commented May 27, 2020

kayceesrk commented May 27, 2020

stedolan commented Sep 3, 2020

nojb commented Sep 4, 2020

kayceesrk commented Sep 5, 2020

kayceesrk commented May 4, 2020 •

edited

Loading

kayceesrk commented May 4, 2020 •

edited

Loading

garrigue commented May 5, 2020 •

edited

Loading

garrigue commented May 5, 2020 •

edited

Loading

kayceesrk commented May 7, 2020 •

edited

Loading