From 03f37e581b8245ec8755a90ee4c99ddcc4a6792f Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 10:44:36 +0100 Subject: [PATCH 1/8] Bytes.unsafe_to_string: comments after pass through irmin-pack code For each occurrence of Bytes.unsafe_to_string, I have commented on whether or not the use is safe. Some are, some are not, and for some it is not clear whether they are safe or not. --- src/irmin-pack/unix/atomic_write.ml | 2 +- src/irmin-pack/unix/gc.ml | 6 +++--- src/irmin-pack/unix/io.ml | 2 +- src/irmin-pack/unix/mapping_file.ml | 4 ++-- src/irmin-pack/unix/pack_index.ml | 4 ++-- src/irmin-pack/unix/pack_store.ml | 8 ++++---- src/irmin-pack/unix/snapshot.ml | 4 ++-- src/irmin-pack/unix/traverse_pack_file.ml | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/irmin-pack/unix/atomic_write.ml b/src/irmin-pack/unix/atomic_write.ml index f9d6444947..7afa019c48 100644 --- a/src/irmin-pack/unix/atomic_write.ml +++ b/src/irmin-pack/unix/atomic_write.ml @@ -33,7 +33,7 @@ module Make_persistent (K : Irmin.Type.S) (V : Value.S) = struct assert (n = 4); (file_pos := Int63.Syntax.(!file_pos + Int63.of_int 4)); let pos_ref = ref 0 in - let v = decode_bin (Bytes.unsafe_to_string buf) pos_ref in + let v = decode_bin (Bytes.unsafe_to_string buf (* safe: buf is local, not leaked, not modified after call to decode_bin *)) pos_ref in assert (!pos_ref = 4); Int32.to_int v diff --git a/src/irmin-pack/unix/gc.ml b/src/irmin-pack/unix/gc.ml index 6de26f245c..9b54553e39 100644 --- a/src/irmin-pack/unix/gc.ml +++ b/src/irmin-pack/unix/gc.ml @@ -117,8 +117,8 @@ module Make (Args : Args) : S with module Args := Args = struct let len' = Int63.to_int len in read_exn ~off ~len:len' buffer; let () = - if len = buffer_size then append_exn (Bytes.unsafe_to_string buffer) - else append_exn (String.sub (Bytes.unsafe_to_string buffer) 0 len') + if len = buffer_size then append_exn (Bytes.unsafe_to_string buffer) (* likely unsafe: buffer passed in, subject to mutations; append_exn retains ref to string; buffer can then be mutated, altering string value in append_exn cache *) + else append_exn (String.sub (Bytes.unsafe_to_string buffer) 0 len') (* likely unsafe, as previous line *) in let len_remaining = len_remaining - len in if len_remaining > Int63.zero then aux (off + len) len_remaining @@ -179,7 +179,7 @@ module Make (Args : Args) : S with module Args := Args = struct read_exn ~off ~len buffer; let poff = Dispatcher.poff_of_entry_exn ~off ~len mapping in Bytes.set buffer Hash.hash_size magic_parent; - write_exn ~off:poff ~len (Bytes.unsafe_to_string buffer) + write_exn ~off:poff ~len (Bytes.unsafe_to_string buffer) (* safe: buffer created locally, not leaked, not modified after call to write_exn *) let create_new_suffix ~root ~generation = let open Result_syntax in diff --git a/src/irmin-pack/unix/io.ml b/src/irmin-pack/unix/io.ml index 723694c344..6e54b08745 100644 --- a/src/irmin-pack/unix/io.ml +++ b/src/irmin-pack/unix/io.ml @@ -199,7 +199,7 @@ module Unix = struct let buf = Bytes.create len in try read_exn t ~off ~len buf; - Ok (Bytes.unsafe_to_string buf) + Ok (Bytes.unsafe_to_string buf) (* safe: buf local, not leaked, not modified after return *) with | Errors.Pack_error ((`Invalid_argument | `Read_out_of_bounds) as e) -> Error e diff --git a/src/irmin-pack/unix/mapping_file.ml b/src/irmin-pack/unix/mapping_file.ml index ad13cae871..a51ba0747a 100644 --- a/src/irmin-pack/unix/mapping_file.ml +++ b/src/irmin-pack/unix/mapping_file.ml @@ -346,7 +346,7 @@ module Make (Errs : Io_errors.S with module Io = Io.Unix) = struct * Fmt.epr "register_entry < 500: %d %d\n%!" (Int63.to_int off) len; *) Bytes.set_int64_ne buffer 0 (Int63.to_int64 off); Bytes.set_int64_ne buffer 8 (Int64.of_int len); - Ao.append_exn file0 (Bytes.unsafe_to_string buffer) + Ao.append_exn file0 (Bytes.unsafe_to_string buffer) (* clearly unsafe!!! *) in let* () = Errs.catch (fun () -> register_entries ~register_entry) in let* () = Ao.flush file0 in @@ -436,7 +436,7 @@ module Make (Errs : Io_errors.S with module Io = Io.Unix) = struct else let off, len = (* Decoding a pair of int can't fail *) - decode_bin_pair (Bytes.unsafe_to_string buffer) buffer_off + decode_bin_pair (Bytes.unsafe_to_string buffer) buffer_off (* possibly safe; depends on implementation of decode_bin_pair and all other libraries involved *) in let () = if off < last_yielded_end_offset then diff --git a/src/irmin-pack/unix/pack_index.ml b/src/irmin-pack/unix/pack_index.ml index 9eb01ee0bb..bbe5bf98a9 100644 --- a/src/irmin-pack/unix/pack_index.ml +++ b/src/irmin-pack/unix/pack_index.ml @@ -39,10 +39,10 @@ module Make (K : Irmin.Hash.S) = struct Bytes.set_int64_be buf 0 (Int63.to_int64 off); Bytes.set_int32_be buf 8 (Int32.of_int len); Bytes.set buf 12 (Pack_value.Kind.to_magic kind); - Bytes.unsafe_to_string buf + Bytes.unsafe_to_string buf (* safe: buf created locally, mutated, not leaked outside function *) let decode s pos : t = - let buf = Bytes.unsafe_of_string s in + let buf = Bytes.unsafe_of_string s in (* safe: buf only used for reading locally within this function *) let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in let len = Bytes.get_int32_be buf (pos + 8) |> Int32.to_int in let kind = Bytes.get buf (pos + 12) |> Pack_value.Kind.of_magic_exn in diff --git a/src/irmin-pack/unix/pack_store.ml b/src/irmin-pack/unix/pack_store.ml index 7b19e7d03c..0dcb045806 100644 --- a/src/irmin-pack/unix/pack_store.ml +++ b/src/irmin-pack/unix/pack_store.ml @@ -161,7 +161,7 @@ struct "Attempted to read an entry at offset %a in the pack file, but got \ only %d bytes" Int63.pp off bytes_read; - let hash = decode_bin_hash (Bytes.unsafe_to_string buf) (ref 0) in + let hash = decode_bin_hash (Bytes.unsafe_to_string buf (* possibly unsafe; but if we assume buf is not mutated after "bytes_read" above, and given buf is created at the beginning of this function, this might actually be safe; TODO provide a clear reason that this is safe *)) (ref 0) in let kind = Pack_value.Kind.of_magic_exn (Bytes.get buf Hash.hash_size) in let size_of_value_and_length_header = match Val.length_header kind with @@ -173,7 +173,7 @@ struct correctly): *) let pos_ref = ref length_header_start in let length_header = - Varint.decode_bin (Bytes.unsafe_to_string buf) pos_ref + Varint.decode_bin (Bytes.unsafe_to_string buf) (* as above, this is potentially safe, but need clear reasoning *) pos_ref in let length_header_length = !pos_ref - length_header_start in Some (length_header_length + length_header) @@ -195,7 +195,7 @@ struct let found = Dispatcher.read_if_not_gced t.dispatcher ~off ~len buf in if (not found) || gced buf then None else - let hash = decode_bin_hash (Bytes.unsafe_to_string buf) (ref 0) in + let hash = decode_bin_hash (Bytes.unsafe_to_string buf (* safe: buf local to function, not mutated after initially filled *) ) (ref 0) in Some hash let pack_file_contains_key t k = @@ -296,7 +296,7 @@ struct let dict = Dict.find t.dict in let v = Val.decode_bin ~key_of_offset ~key_of_hash ~dict - (Bytes.unsafe_to_string buf) + (Bytes.unsafe_to_string buf) (* safe? buf created, filled, not changed subsequently, not leaked outside this function so no subsequent mutations *) (ref 0) in Some v diff --git a/src/irmin-pack/unix/snapshot.ml b/src/irmin-pack/unix/snapshot.ml index 005f419dc4..4d29cfc0bf 100644 --- a/src/irmin-pack/unix/snapshot.ml +++ b/src/irmin-pack/unix/snapshot.ml @@ -105,7 +105,7 @@ module Make (Args : Args) = struct in let entry_of_hash hash = key_of_hash hash t.inode_pack in Inode.Raw.decode_children_offsets ~entry_of_offset ~entry_of_hash - (Bytes.unsafe_to_string buf) + (Bytes.unsafe_to_string buf) (* safe - buf local to this function, ownership handed to Inode.Raw.decode_children_offsets; nothing can mutate buf subsequently *) (ref 0) type visit = { visited : Hash.t -> bool; set_visit : Hash.t -> unit } @@ -226,7 +226,7 @@ module Make (Args : Args) = struct let buf = Bytes.create encoded_size in Bytes.set_int64_be buf 0 (Int63.to_int64 off); Bytes.set_int32_be buf 8 (Int32.of_int len); - Bytes.unsafe_to_string buf + Bytes.unsafe_to_string buf (* safe: buf local to this function; buf not mutated after return *) let decode s pos : t = let buf = Bytes.unsafe_of_string s in diff --git a/src/irmin-pack/unix/traverse_pack_file.ml b/src/irmin-pack/unix/traverse_pack_file.ml index 93b0e0efe8..44de315092 100644 --- a/src/irmin-pack/unix/traverse_pack_file.ml +++ b/src/irmin-pack/unix/traverse_pack_file.ml @@ -254,7 +254,7 @@ end = struct let buffer_off, off, missing_hash = match decode_entry_exn ~off - ~buffer:(Bytes.unsafe_to_string !buffer) + ~buffer:(Bytes.unsafe_to_string !buffer) (* could be safe or unsafe; depends on implementation of decode_entry_exn *) ~buffer_off with | { key; data } -> From eb1af2e01b144d9e8b6db06d3d1c9d409f5a0190 Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 10:56:27 +0100 Subject: [PATCH 2/8] irmin-pack: Bytes.unsafe_to_string: comments on occurrences Each occurrence of unsafe_to_string in irmin-pack has been marked with a comment as to whether the use is safe or not. --- src/irmin-pack/inode.ml | 4 ++-- src/irmin-pack/unix/io.ml | 2 +- src/irmin-pack/unix/snapshot.ml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/irmin-pack/inode.ml b/src/irmin-pack/inode.ml index 618b572953..78e4212e8e 100644 --- a/src/irmin-pack/inode.ml +++ b/src/irmin-pack/inode.ml @@ -104,9 +104,9 @@ struct let key = match Conf.inode_child_order with | `Hash_bits -> - fun s -> Bytes.unsafe_of_string (hash_to_bin_string (Step.hash s)) + fun s -> Bytes.unsafe_of_string (hash_to_bin_string (Step.hash s)) (* possibly unsafe use of Bytes.unsafe_of_string? depends on what the rest of the code does; if we don't leak the bytes, and don't mutate them, maybe this is safe? not sure *) | `Seeded_hash | `Custom _ -> - fun s -> Bytes.unsafe_of_string (step_to_bin_string s) + fun s -> Bytes.unsafe_of_string (step_to_bin_string s) (* as previous line - not sure *) (* Assume [k = cryto_hash(step)] (see {!key}) and [Conf.entry] can can represented with [n] bits. Then, [hash_bits ~depth k] is diff --git a/src/irmin-pack/unix/io.ml b/src/irmin-pack/unix/io.ml index 6e54b08745..8a76c0b4cc 100644 --- a/src/irmin-pack/unix/io.ml +++ b/src/irmin-pack/unix/io.ml @@ -160,7 +160,7 @@ module Unix = struct | _ -> (* really_write and following do not mutate the given buffer, so Bytes.unsafe_of_string is actually safe *) - let buf = Bytes.unsafe_of_string s in + let buf = Bytes.unsafe_of_string s (* safe - see comment 1 line above *) in let () = Util.really_write t.fd off buf 0 len in Index.Stats.add_write len; () diff --git a/src/irmin-pack/unix/snapshot.ml b/src/irmin-pack/unix/snapshot.ml index 4d29cfc0bf..fab1b650f0 100644 --- a/src/irmin-pack/unix/snapshot.ml +++ b/src/irmin-pack/unix/snapshot.ml @@ -229,7 +229,7 @@ module Make (Args : Args) = struct Bytes.unsafe_to_string buf (* safe: buf local to this function; buf not mutated after return *) let decode s pos : t = - let buf = Bytes.unsafe_of_string s in + let buf = Bytes.unsafe_of_string s in (* safe: buf is created locally, not mutated, not leaked *) let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in let len = Bytes.get_int32_be buf (pos + 8) |> Int32.to_int in (off, len) From 1ab14044737893e562a0b0751f210df3ddfc5a53 Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 12:12:46 +0100 Subject: [PATCH 3/8] irmin-pack: add comments on safe uses of Bytes.unsafe_to_string Comments use the terminology from the manual ("unique ownership" and "shared ownership"). --- src/irmin-pack/inode.ml | 6 ++++-- src/irmin-pack/unix/atomic_write.ml | 11 +++++++++- src/irmin-pack/unix/gc.ml | 13 +++++++++--- src/irmin-pack/unix/io.ml | 11 ++++++++-- src/irmin-pack/unix/mapping_file.ml | 6 ++++-- src/irmin-pack/unix/pack_index.ml | 9 ++++++-- src/irmin-pack/unix/pack_store.ml | 26 +++++++++++++++++++---- src/irmin-pack/unix/snapshot.ml | 13 +++++++++--- src/irmin-pack/unix/traverse_pack_file.ml | 3 ++- 9 files changed, 78 insertions(+), 20 deletions(-) diff --git a/src/irmin-pack/inode.ml b/src/irmin-pack/inode.ml index 78e4212e8e..ddb73403f2 100644 --- a/src/irmin-pack/inode.ml +++ b/src/irmin-pack/inode.ml @@ -104,9 +104,11 @@ struct let key = match Conf.inode_child_order with | `Hash_bits -> - fun s -> Bytes.unsafe_of_string (hash_to_bin_string (Step.hash s)) (* possibly unsafe use of Bytes.unsafe_of_string? depends on what the rest of the code does; if we don't leak the bytes, and don't mutate them, maybe this is safe? not sure *) + fun s -> Bytes.unsafe_of_string (hash_to_bin_string (Step.hash s)) + (* possibly unsafe use of Bytes.unsafe_of_string? depends on what the rest of the code does; if we don't leak the bytes, and don't mutate them, maybe this is safe? not sure *) | `Seeded_hash | `Custom _ -> - fun s -> Bytes.unsafe_of_string (step_to_bin_string s) (* as previous line - not sure *) + fun s -> Bytes.unsafe_of_string (step_to_bin_string s) + (* as previous line - not sure *) (* Assume [k = cryto_hash(step)] (see {!key}) and [Conf.entry] can can represented with [n] bits. Then, [hash_bits ~depth k] is diff --git a/src/irmin-pack/unix/atomic_write.ml b/src/irmin-pack/unix/atomic_write.ml index 7afa019c48..8dd5cc790c 100644 --- a/src/irmin-pack/unix/atomic_write.ml +++ b/src/irmin-pack/unix/atomic_write.ml @@ -33,7 +33,16 @@ module Make_persistent (K : Irmin.Type.S) (V : Value.S) = struct assert (n = 4); (file_pos := Int63.Syntax.(!file_pos + Int63.of_int 4)); let pos_ref = ref 0 in - let v = decode_bin (Bytes.unsafe_to_string buf (* safe: buf is local, not leaked, not modified after call to decode_bin *)) pos_ref in + (* Bytes.unsafe_to_string usage: We assume Io_legacy.read_block returns unique + ownership of buf back to this function (this assumption holds currently; subsequent + modifications of that code need to ensure this remains the case); then in call to + Bytes.unsafe_to_string we give up ownership of buf (we do not modify the buffer + afterwards) and get ownership of resulting string; so this use is safe. *) + let v = + decode_bin + (Bytes.unsafe_to_string buf (* safe: see comment above *)) + pos_ref + in assert (!pos_ref = 4); Int32.to_int v diff --git a/src/irmin-pack/unix/gc.ml b/src/irmin-pack/unix/gc.ml index 9b54553e39..d4a448343a 100644 --- a/src/irmin-pack/unix/gc.ml +++ b/src/irmin-pack/unix/gc.ml @@ -117,8 +117,10 @@ module Make (Args : Args) : S with module Args := Args = struct let len' = Int63.to_int len in read_exn ~off ~len:len' buffer; let () = - if len = buffer_size then append_exn (Bytes.unsafe_to_string buffer) (* likely unsafe: buffer passed in, subject to mutations; append_exn retains ref to string; buffer can then be mutated, altering string value in append_exn cache *) - else append_exn (String.sub (Bytes.unsafe_to_string buffer) 0 len') (* likely unsafe, as previous line *) + if len = buffer_size then append_exn (Bytes.unsafe_to_string buffer) + (* likely unsafe: buffer passed in, subject to mutations; append_exn retains ref to string; buffer can then be mutated, altering string value in append_exn cache *) + else append_exn (String.sub (Bytes.unsafe_to_string buffer) 0 len') + (* likely unsafe, as previous line *) in let len_remaining = len_remaining - len in if len_remaining > Int63.zero then aux (off + len) len_remaining @@ -179,7 +181,12 @@ module Make (Args : Args) : S with module Args := Args = struct read_exn ~off ~len buffer; let poff = Dispatcher.poff_of_entry_exn ~off ~len mapping in Bytes.set buffer Hash.hash_size magic_parent; - write_exn ~off:poff ~len (Bytes.unsafe_to_string buffer) (* safe: buffer created locally, not leaked, not modified after call to write_exn *) + (* Bytes.unsafe_to_string usage: We assume read_exn returns unique ownership of buffer + to this function. Then at the call to Bytes.unsafe_to_string we give up unique + ownership to buffer (we do not modify it thereafter) in return for ownership of the + resulting string, which we pass to write_exn. This usage is safe. *) + write_exn ~off:poff ~len (Bytes.unsafe_to_string buffer) + (* safe: see comment above *) let create_new_suffix ~root ~generation = let open Result_syntax in diff --git a/src/irmin-pack/unix/io.ml b/src/irmin-pack/unix/io.ml index 8a76c0b4cc..70f018e70d 100644 --- a/src/irmin-pack/unix/io.ml +++ b/src/irmin-pack/unix/io.ml @@ -160,7 +160,9 @@ module Unix = struct | _ -> (* really_write and following do not mutate the given buffer, so Bytes.unsafe_of_string is actually safe *) - let buf = Bytes.unsafe_of_string s (* safe - see comment 1 line above *) in + let buf = + Bytes.unsafe_of_string s (* safe - see comment 1 line above *) + in let () = Util.really_write t.fd off buf 0 len in Index.Stats.add_write len; () @@ -199,7 +201,12 @@ module Unix = struct let buf = Bytes.create len in try read_exn t ~off ~len buf; - Ok (Bytes.unsafe_to_string buf) (* safe: buf local, not leaked, not modified after return *) + (* Bytes.unsafe_to_string usage: buf is local to this function, so uniquely + owned. We assume read_exn returns unique ownership of buf to this function. Then + at the call to Bytes.unsafe_to_string we give up unique ownership of buf for + ownership of the string. This is safe. *) + Ok (Bytes.unsafe_to_string buf) + (* safe: see comment above *) with | Errors.Pack_error ((`Invalid_argument | `Read_out_of_bounds) as e) -> Error e diff --git a/src/irmin-pack/unix/mapping_file.ml b/src/irmin-pack/unix/mapping_file.ml index a51ba0747a..e6c9c1ee36 100644 --- a/src/irmin-pack/unix/mapping_file.ml +++ b/src/irmin-pack/unix/mapping_file.ml @@ -346,7 +346,8 @@ module Make (Errs : Io_errors.S with module Io = Io.Unix) = struct * Fmt.epr "register_entry < 500: %d %d\n%!" (Int63.to_int off) len; *) Bytes.set_int64_ne buffer 0 (Int63.to_int64 off); Bytes.set_int64_ne buffer 8 (Int64.of_int len); - Ao.append_exn file0 (Bytes.unsafe_to_string buffer) (* clearly unsafe!!! *) + Ao.append_exn file0 (Bytes.unsafe_to_string buffer) + (* clearly unsafe!!! *) in let* () = Errs.catch (fun () -> register_entries ~register_entry) in let* () = Ao.flush file0 in @@ -436,7 +437,8 @@ module Make (Errs : Io_errors.S with module Io = Io.Unix) = struct else let off, len = (* Decoding a pair of int can't fail *) - decode_bin_pair (Bytes.unsafe_to_string buffer) buffer_off (* possibly safe; depends on implementation of decode_bin_pair and all other libraries involved *) + decode_bin_pair (Bytes.unsafe_to_string buffer) buffer_off + (* possibly safe; depends on implementation of decode_bin_pair and all other libraries involved *) in let () = if off < last_yielded_end_offset then diff --git a/src/irmin-pack/unix/pack_index.ml b/src/irmin-pack/unix/pack_index.ml index bbe5bf98a9..b876f06698 100644 --- a/src/irmin-pack/unix/pack_index.ml +++ b/src/irmin-pack/unix/pack_index.ml @@ -39,10 +39,15 @@ module Make (K : Irmin.Hash.S) = struct Bytes.set_int64_be buf 0 (Int63.to_int64 off); Bytes.set_int32_be buf 8 (Int32.of_int len); Bytes.set buf 12 (Pack_value.Kind.to_magic kind); - Bytes.unsafe_to_string buf (* safe: buf created locally, mutated, not leaked outside function *) + (* Bytes.unsafe_to_string usage: buf is local, uniquely owned. We assume the various + functions above return unique ownership of buf. Then in the call to + Bytes.unsafe_to_string we give up unique ownership of buf for ownership of the + resulting string. This is safe. *) + Bytes.unsafe_to_string buf (* safe: see comment above *) let decode s pos : t = - let buf = Bytes.unsafe_of_string s in (* safe: buf only used for reading locally within this function *) + let buf = Bytes.unsafe_of_string s in + (* safe: buf only used for reading locally within this function *) let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in let len = Bytes.get_int32_be buf (pos + 8) |> Int32.to_int in let kind = Bytes.get buf (pos + 12) |> Pack_value.Kind.of_magic_exn in diff --git a/src/irmin-pack/unix/pack_store.ml b/src/irmin-pack/unix/pack_store.ml index 0dcb045806..13d813f65e 100644 --- a/src/irmin-pack/unix/pack_store.ml +++ b/src/irmin-pack/unix/pack_store.ml @@ -161,7 +161,13 @@ struct "Attempted to read an entry at offset %a in the pack file, but got \ only %d bytes" Int63.pp off bytes_read; - let hash = decode_bin_hash (Bytes.unsafe_to_string buf (* possibly unsafe; but if we assume buf is not mutated after "bytes_read" above, and given buf is created at the beginning of this function, this might actually be safe; TODO provide a clear reason that this is safe *)) (ref 0) in + let hash = + decode_bin_hash + (Bytes.unsafe_to_string + buf + (* possibly unsafe; but if we assume buf is not mutated after "bytes_read" above, and given buf is created at the beginning of this function, this might actually be safe; TODO provide a clear reason that this is safe *)) + (ref 0) + in let kind = Pack_value.Kind.of_magic_exn (Bytes.get buf Hash.hash_size) in let size_of_value_and_length_header = match Val.length_header kind with @@ -173,7 +179,10 @@ struct correctly): *) let pos_ref = ref length_header_start in let length_header = - Varint.decode_bin (Bytes.unsafe_to_string buf) (* as above, this is potentially safe, but need clear reasoning *) pos_ref + Varint.decode_bin + (Bytes.unsafe_to_string buf) + (* as above, this is potentially safe, but need clear reasoning *) + pos_ref in let length_header_length = !pos_ref - length_header_start in Some (length_header_length + length_header) @@ -195,7 +204,15 @@ struct let found = Dispatcher.read_if_not_gced t.dispatcher ~off ~len buf in if (not found) || gced buf then None else - let hash = decode_bin_hash (Bytes.unsafe_to_string buf (* safe: buf local to function, not mutated after initially filled *) ) (ref 0) in + (* Bytes.unsafe_to_string usafe: buf is create in this function, uniquely owned; we + assume Dispatcher.read_if_not_gced returns unique ownership; then call to + Bytes.unsafe_to_string gives up ownerhsip of buf for ownership of resulting + string. This is safe. *) + let hash = + decode_bin_hash + (Bytes.unsafe_to_string buf (* safe: see comment above *)) + (ref 0) + in Some hash let pack_file_contains_key t k = @@ -296,7 +313,8 @@ struct let dict = Dict.find t.dict in let v = Val.decode_bin ~key_of_offset ~key_of_hash ~dict - (Bytes.unsafe_to_string buf) (* safe? buf created, filled, not changed subsequently, not leaked outside this function so no subsequent mutations *) + (Bytes.unsafe_to_string buf) + (* safe? buf created, filled, not changed subsequently, not leaked outside this function so no subsequent mutations *) (ref 0) in Some v diff --git a/src/irmin-pack/unix/snapshot.ml b/src/irmin-pack/unix/snapshot.ml index fab1b650f0..7d9af21128 100644 --- a/src/irmin-pack/unix/snapshot.ml +++ b/src/irmin-pack/unix/snapshot.ml @@ -104,8 +104,11 @@ module Make (Args : Args) = struct io_read_and_decode_entry_prefix ~off:offset t in let entry_of_hash hash = key_of_hash hash t.inode_pack in + (* Bytes.unsafe_to_string usage: buf is created locally, uniquely owned; we assume + Dispatcher.read_exn returns unique ownership; then call to Bytes.unsafe_to_string + gives up unique ownership of buf. This is safe. *) Inode.Raw.decode_children_offsets ~entry_of_offset ~entry_of_hash - (Bytes.unsafe_to_string buf) (* safe - buf local to this function, ownership handed to Inode.Raw.decode_children_offsets; nothing can mutate buf subsequently *) + (Bytes.unsafe_to_string buf) (* safe: see comment above *) (ref 0) type visit = { visited : Hash.t -> bool; set_visit : Hash.t -> unit } @@ -226,10 +229,14 @@ module Make (Args : Args) = struct let buf = Bytes.create encoded_size in Bytes.set_int64_be buf 0 (Int63.to_int64 off); Bytes.set_int32_be buf 8 (Int32.of_int len); - Bytes.unsafe_to_string buf (* safe: buf local to this function; buf not mutated after return *) + (* Bytes.unsafe_to_string usage: buf is local, uniquely owned; we assume the + Bytes.set... functions return unique ownership; then Bytes.unsafe_to_string + gives up unique ownership of buf. This is safe. *) + Bytes.unsafe_to_string buf (* safe: see comment above *) let decode s pos : t = - let buf = Bytes.unsafe_of_string s in (* safe: buf is created locally, not mutated, not leaked *) + let buf = Bytes.unsafe_of_string s in + (* safe: buf is created locally, not mutated, not leaked *) let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in let len = Bytes.get_int32_be buf (pos + 8) |> Int32.to_int in (off, len) diff --git a/src/irmin-pack/unix/traverse_pack_file.ml b/src/irmin-pack/unix/traverse_pack_file.ml index 44de315092..9f5ab5492d 100644 --- a/src/irmin-pack/unix/traverse_pack_file.ml +++ b/src/irmin-pack/unix/traverse_pack_file.ml @@ -254,7 +254,8 @@ end = struct let buffer_off, off, missing_hash = match decode_entry_exn ~off - ~buffer:(Bytes.unsafe_to_string !buffer) (* could be safe or unsafe; depends on implementation of decode_entry_exn *) + ~buffer:(Bytes.unsafe_to_string !buffer) + (* could be safe or unsafe; depends on implementation of decode_entry_exn *) ~buffer_off with | { key; data } -> From 5b0716d21ee32bd9d9de3e7587ae124d018e5c9c Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 12:30:46 +0100 Subject: [PATCH 4/8] irmin-pack: add comments on safe usages of Bytes.unsafe_of_string --- src/irmin-pack/unix/io.ml | 9 ++++----- src/irmin-pack/unix/pack_index.ml | 4 +++- src/irmin-pack/unix/snapshot.ml | 3 +++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/irmin-pack/unix/io.ml b/src/irmin-pack/unix/io.ml index 70f018e70d..015f51edff 100644 --- a/src/irmin-pack/unix/io.ml +++ b/src/irmin-pack/unix/io.ml @@ -158,11 +158,10 @@ module Unix = struct | true, _ -> raise Errors.Closed | _, true -> raise Errors.RO_not_allowed | _ -> - (* really_write and following do not mutate the given buffer, so - Bytes.unsafe_of_string is actually safe *) - let buf = - Bytes.unsafe_of_string s (* safe - see comment 1 line above *) - in + (* Bytes.unsafe_of_string usage: s has shared ownership; we assume that + Util.really_write does not mutate buf (i.e., only needs shared ownership). This + usage is safe. *) + let buf = Bytes.unsafe_of_string s (* safe: see comment above *) in let () = Util.really_write t.fd off buf 0 len in Index.Stats.add_write len; () diff --git a/src/irmin-pack/unix/pack_index.ml b/src/irmin-pack/unix/pack_index.ml index b876f06698..f73a8be247 100644 --- a/src/irmin-pack/unix/pack_index.ml +++ b/src/irmin-pack/unix/pack_index.ml @@ -46,8 +46,10 @@ module Make (K : Irmin.Hash.S) = struct Bytes.unsafe_to_string buf (* safe: see comment above *) let decode s pos : t = + (* Bytes.unsafe_of_string usage: s is shared ownership; buf is shared ownership (we + cannot mutate buf); we assume the Bytes.get... functions require only shared + ownership. This usage is safe. *) let buf = Bytes.unsafe_of_string s in - (* safe: buf only used for reading locally within this function *) let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in let len = Bytes.get_int32_be buf (pos + 8) |> Int32.to_int in let kind = Bytes.get buf (pos + 12) |> Pack_value.Kind.of_magic_exn in diff --git a/src/irmin-pack/unix/snapshot.ml b/src/irmin-pack/unix/snapshot.ml index 7d9af21128..ca5d5c5f15 100644 --- a/src/irmin-pack/unix/snapshot.ml +++ b/src/irmin-pack/unix/snapshot.ml @@ -235,6 +235,9 @@ module Make (Args : Args) = struct Bytes.unsafe_to_string buf (* safe: see comment above *) let decode s pos : t = + (* Bytes.unsafe_of_string usage: s is shared; buf is shared (we cannot mutate it); + we assume Bytes.get_... functions need shared ownership only. This usage is + safe. *) let buf = Bytes.unsafe_of_string s in (* safe: buf is created locally, not mutated, not leaked *) let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in From 12a01cdab89765f071efd61d320ed7a57f6bd3c3 Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 13:57:16 +0100 Subject: [PATCH 5/8] irmin-pack: further clarify uses of Bytes.unsafe_{to,of}_string --- src/irmin-pack/inode.ml | 6 ++++-- src/irmin-pack/unix/gc.ml | 6 ++---- src/irmin-pack/unix/mapping_file.ml | 9 ++++++--- src/irmin-pack/unix/pack_store.ml | 18 +++++++++--------- src/irmin-pack/unix/snapshot.ml | 3 +-- src/irmin-pack/unix/traverse_pack_file.ml | 4 +++- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/irmin-pack/inode.ml b/src/irmin-pack/inode.ml index ddb73403f2..f2232effeb 100644 --- a/src/irmin-pack/inode.ml +++ b/src/irmin-pack/inode.ml @@ -104,11 +104,13 @@ struct let key = match Conf.inode_child_order with | `Hash_bits -> + (* Bytes.unsafe_of_string usage: possibly safe TODO justify safety, or switch to + use the safe Bytes.of_string *) fun s -> Bytes.unsafe_of_string (hash_to_bin_string (Step.hash s)) - (* possibly unsafe use of Bytes.unsafe_of_string? depends on what the rest of the code does; if we don't leak the bytes, and don't mutate them, maybe this is safe? not sure *) | `Seeded_hash | `Custom _ -> + (* Bytes.unsafe_of_string usage: possibly safe TODO justify safety, or switch to + use the safe Bytes.of_string *) fun s -> Bytes.unsafe_of_string (step_to_bin_string s) - (* as previous line - not sure *) (* Assume [k = cryto_hash(step)] (see {!key}) and [Conf.entry] can can represented with [n] bits. Then, [hash_bits ~depth k] is diff --git a/src/irmin-pack/unix/gc.ml b/src/irmin-pack/unix/gc.ml index d4a448343a..a85694d709 100644 --- a/src/irmin-pack/unix/gc.ml +++ b/src/irmin-pack/unix/gc.ml @@ -117,10 +117,8 @@ module Make (Args : Args) : S with module Args := Args = struct let len' = Int63.to_int len in read_exn ~off ~len:len' buffer; let () = - if len = buffer_size then append_exn (Bytes.unsafe_to_string buffer) - (* likely unsafe: buffer passed in, subject to mutations; append_exn retains ref to string; buffer can then be mutated, altering string value in append_exn cache *) - else append_exn (String.sub (Bytes.unsafe_to_string buffer) 0 len') - (* likely unsafe, as previous line *) + if len = buffer_size then append_exn (Bytes.to_string buffer) + else append_exn (String.sub (Bytes.to_string buffer) 0 len') in let len_remaining = len_remaining - len in if len_remaining > Int63.zero then aux (off + len) len_remaining diff --git a/src/irmin-pack/unix/mapping_file.ml b/src/irmin-pack/unix/mapping_file.ml index e6c9c1ee36..7dd51e13b3 100644 --- a/src/irmin-pack/unix/mapping_file.ml +++ b/src/irmin-pack/unix/mapping_file.ml @@ -338,16 +338,18 @@ module Make (Errs : Io_errors.S with module Io = Io.Unix) = struct file0_ref := Some file0; (* Fill and close [file0] *) - let buffer = Bytes.create 16 in let register_entry ~off ~len = (* Write [off, len] in native-endian encoding because it will be read with mmap. *) (* if Int63.to_int off < 500 then * Fmt.epr "register_entry < 500: %d %d\n%!" (Int63.to_int off) len; *) + let buffer = Bytes.create 16 in Bytes.set_int64_ne buffer 0 (Int63.to_int64 off); Bytes.set_int64_ne buffer 8 (Int64.of_int len); + (* Bytes.unsafe_to_string usage: buffer is uniquely owned; we assume + Bytes.set_int64_ne returns unique ownership; we give up ownership of buffer in + conversion to string. This is safe. *) Ao.append_exn file0 (Bytes.unsafe_to_string buffer) - (* clearly unsafe!!! *) in let* () = Errs.catch (fun () -> register_entries ~register_entry) in let* () = Ao.flush file0 in @@ -437,8 +439,9 @@ module Make (Errs : Io_errors.S with module Io = Io.Unix) = struct else let off, len = (* Decoding a pair of int can't fail *) + (* Bytes.unsafe_to_string usage: possibly safe TODO justify safety, or convert + to Bytes.to_string *) decode_bin_pair (Bytes.unsafe_to_string buffer) buffer_off - (* possibly safe; depends on implementation of decode_bin_pair and all other libraries involved *) in let () = if off < last_yielded_end_offset then diff --git a/src/irmin-pack/unix/pack_store.ml b/src/irmin-pack/unix/pack_store.ml index 13d813f65e..d092364428 100644 --- a/src/irmin-pack/unix/pack_store.ml +++ b/src/irmin-pack/unix/pack_store.ml @@ -162,11 +162,12 @@ struct only %d bytes" Int63.pp off bytes_read; let hash = - decode_bin_hash - (Bytes.unsafe_to_string - buf - (* possibly unsafe; but if we assume buf is not mutated after "bytes_read" above, and given buf is created at the beginning of this function, this might actually be safe; TODO provide a clear reason that this is safe *)) - (ref 0) + (* Bytes.unsafe_to_string usage: buf is created locally, so we have unique + ownership; we assume Dispatcher.read_at_most_exn returns unique ownership; use of + Bytes.unsafe_to_string converts buffer to shared ownership; the rest of the code + seems to require only shared ownership (buffer is read, but not mutated). This is + safe. *) + decode_bin_hash (Bytes.unsafe_to_string buf) (ref 0) in let kind = Pack_value.Kind.of_magic_exn (Bytes.get buf Hash.hash_size) in let size_of_value_and_length_header = @@ -178,11 +179,10 @@ struct variable-length length field (if they exist / were read correctly): *) let pos_ref = ref length_header_start in + (* Bytes.unsafe_to_string usage: buf is shared at this point; we assume + Varint.decode_bin requires only shared ownership. This usage is safe. *) let length_header = - Varint.decode_bin - (Bytes.unsafe_to_string buf) - (* as above, this is potentially safe, but need clear reasoning *) - pos_ref + Varint.decode_bin (Bytes.unsafe_to_string buf) pos_ref in let length_header_length = !pos_ref - length_header_start in Some (length_header_length + length_header) diff --git a/src/irmin-pack/unix/snapshot.ml b/src/irmin-pack/unix/snapshot.ml index ca5d5c5f15..afc0c48727 100644 --- a/src/irmin-pack/unix/snapshot.ml +++ b/src/irmin-pack/unix/snapshot.ml @@ -232,14 +232,13 @@ module Make (Args : Args) = struct (* Bytes.unsafe_to_string usage: buf is local, uniquely owned; we assume the Bytes.set... functions return unique ownership; then Bytes.unsafe_to_string gives up unique ownership of buf. This is safe. *) - Bytes.unsafe_to_string buf (* safe: see comment above *) + Bytes.unsafe_to_string buf let decode s pos : t = (* Bytes.unsafe_of_string usage: s is shared; buf is shared (we cannot mutate it); we assume Bytes.get_... functions need shared ownership only. This usage is safe. *) let buf = Bytes.unsafe_of_string s in - (* safe: buf is created locally, not mutated, not leaked *) let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in let len = Bytes.get_int32_be buf (pos + 8) |> Int32.to_int in (off, len) diff --git a/src/irmin-pack/unix/traverse_pack_file.ml b/src/irmin-pack/unix/traverse_pack_file.ml index 9f5ab5492d..760120d37a 100644 --- a/src/irmin-pack/unix/traverse_pack_file.ml +++ b/src/irmin-pack/unix/traverse_pack_file.ml @@ -253,9 +253,11 @@ end = struct else let buffer_off, off, missing_hash = match + (* Bytes.unsafe_to_string usage: possibly safe, depending on details of + implementation of decode_entry_exn TODO either justify clearly that this is + safe, or change to use safe Bytes.to_string *) decode_entry_exn ~off ~buffer:(Bytes.unsafe_to_string !buffer) - (* could be safe or unsafe; depends on implementation of decode_entry_exn *) ~buffer_off with | { key; data } -> From 266ba0218638c02e87947c444950c21eefb2f891 Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 14:13:15 +0100 Subject: [PATCH 6/8] irmin-pack: further clarify uses of Bytes.unsafe_to_string --- src/irmin-pack/unix/gc.ml | 1 - src/irmin-pack/unix/io.ml | 3 +-- src/irmin-pack/unix/pack_store.ml | 10 ++++------ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/irmin-pack/unix/gc.ml b/src/irmin-pack/unix/gc.ml index a85694d709..7089878cc3 100644 --- a/src/irmin-pack/unix/gc.ml +++ b/src/irmin-pack/unix/gc.ml @@ -184,7 +184,6 @@ module Make (Args : Args) : S with module Args := Args = struct ownership to buffer (we do not modify it thereafter) in return for ownership of the resulting string, which we pass to write_exn. This usage is safe. *) write_exn ~off:poff ~len (Bytes.unsafe_to_string buffer) - (* safe: see comment above *) let create_new_suffix ~root ~generation = let open Result_syntax in diff --git a/src/irmin-pack/unix/io.ml b/src/irmin-pack/unix/io.ml index 015f51edff..27093d55e7 100644 --- a/src/irmin-pack/unix/io.ml +++ b/src/irmin-pack/unix/io.ml @@ -161,7 +161,7 @@ module Unix = struct (* Bytes.unsafe_of_string usage: s has shared ownership; we assume that Util.really_write does not mutate buf (i.e., only needs shared ownership). This usage is safe. *) - let buf = Bytes.unsafe_of_string s (* safe: see comment above *) in + let buf = Bytes.unsafe_of_string s in let () = Util.really_write t.fd off buf 0 len in Index.Stats.add_write len; () @@ -205,7 +205,6 @@ module Unix = struct at the call to Bytes.unsafe_to_string we give up unique ownership of buf for ownership of the string. This is safe. *) Ok (Bytes.unsafe_to_string buf) - (* safe: see comment above *) with | Errors.Pack_error ((`Invalid_argument | `Read_out_of_bounds) as e) -> Error e diff --git a/src/irmin-pack/unix/pack_store.ml b/src/irmin-pack/unix/pack_store.ml index d092364428..96d9e5a292 100644 --- a/src/irmin-pack/unix/pack_store.ml +++ b/src/irmin-pack/unix/pack_store.ml @@ -208,11 +208,7 @@ struct assume Dispatcher.read_if_not_gced returns unique ownership; then call to Bytes.unsafe_to_string gives up ownerhsip of buf for ownership of resulting string. This is safe. *) - let hash = - decode_bin_hash - (Bytes.unsafe_to_string buf (* safe: see comment above *)) - (ref 0) - in + let hash = decode_bin_hash (Bytes.unsafe_to_string buf) (ref 0) in Some hash let pack_file_contains_key t k = @@ -312,9 +308,11 @@ struct let key_of_hash = Pack_key.v_indexed in let dict = Dict.find t.dict in let v = + (* Bytes.unsafe_to_string usage: buf created, uniquely owned; after creation, we + assume Dispatcher.read_if_not_gced returns unique ownership; we give up unique + ownership in call to Bytes.unsafe_to_string. This is safe. *) Val.decode_bin ~key_of_offset ~key_of_hash ~dict (Bytes.unsafe_to_string buf) - (* safe? buf created, filled, not changed subsequently, not leaked outside this function so no subsequent mutations *) (ref 0) in Some v From 15207cfaa6828691e48fe60c98aa147e1b65d6b7 Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 14:16:33 +0100 Subject: [PATCH 7/8] irmin-pack: remove unnecessary comment --- src/irmin-pack/unix/atomic_write.ml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/irmin-pack/unix/atomic_write.ml b/src/irmin-pack/unix/atomic_write.ml index 8dd5cc790c..ec71b30822 100644 --- a/src/irmin-pack/unix/atomic_write.ml +++ b/src/irmin-pack/unix/atomic_write.ml @@ -38,11 +38,7 @@ module Make_persistent (K : Irmin.Type.S) (V : Value.S) = struct modifications of that code need to ensure this remains the case); then in call to Bytes.unsafe_to_string we give up ownership of buf (we do not modify the buffer afterwards) and get ownership of resulting string; so this use is safe. *) - let v = - decode_bin - (Bytes.unsafe_to_string buf (* safe: see comment above *)) - pos_ref - in + let v = decode_bin (Bytes.unsafe_to_string buf) pos_ref in assert (!pos_ref = 4); Int32.to_int v From 97340dbd8d7b1894daf9bf9ee080f49e9b4a14c3 Mon Sep 17 00:00:00 2001 From: Tom Ridge Date: Thu, 7 Jul 2022 18:19:19 +0100 Subject: [PATCH 8/8] irmin-pack: further tweaks and clarifications after initial review --- src/irmin-pack/unix/pack_index.ml | 7 ++++--- src/irmin-pack/unix/snapshot.ml | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/irmin-pack/unix/pack_index.ml b/src/irmin-pack/unix/pack_index.ml index f73a8be247..bf113efafe 100644 --- a/src/irmin-pack/unix/pack_index.ml +++ b/src/irmin-pack/unix/pack_index.ml @@ -43,12 +43,13 @@ module Make (K : Irmin.Hash.S) = struct functions above return unique ownership of buf. Then in the call to Bytes.unsafe_to_string we give up unique ownership of buf for ownership of the resulting string. This is safe. *) - Bytes.unsafe_to_string buf (* safe: see comment above *) + Bytes.unsafe_to_string buf let decode s pos : t = (* Bytes.unsafe_of_string usage: s is shared ownership; buf is shared ownership (we - cannot mutate buf); we assume the Bytes.get... functions require only shared - ownership. This usage is safe. *) + cannot mutate buf) and the lifetime of buf ends on return from this function; we + assume the Bytes.get... functions require only shared ownership. This usage is + safe. *) let buf = Bytes.unsafe_of_string s in let off = Bytes.get_int64_be buf pos |> Int63.of_int64 in let len = Bytes.get_int32_be buf (pos + 8) |> Int32.to_int in diff --git a/src/irmin-pack/unix/snapshot.ml b/src/irmin-pack/unix/snapshot.ml index afc0c48727..c9e3e880ea 100644 --- a/src/irmin-pack/unix/snapshot.ml +++ b/src/irmin-pack/unix/snapshot.ml @@ -231,7 +231,8 @@ module Make (Args : Args) = struct Bytes.set_int32_be buf 8 (Int32.of_int len); (* Bytes.unsafe_to_string usage: buf is local, uniquely owned; we assume the Bytes.set... functions return unique ownership; then Bytes.unsafe_to_string - gives up unique ownership of buf. This is safe. *) + gives up unique ownership of buf to get shared ownership of the resulting + string, which is then returned. buf is no longer accessible. This is safe. *) Bytes.unsafe_to_string buf let decode s pos : t =