Skip to content

Commit

Permalink
RUBY-1410 Implement ObjectID spec (#311)
Browse files Browse the repository at this point in the history
* C implementation of the ObjectId spec

* java implementation of ObjectId generation

* add tests per object-id spec

* add fallback to rand() when arc4random isn't available (*sadface*)

* handle variable length buffers properly

* for safety's sake, let's not hardcode the size of an integer

* target/source version need to be optional

* ProcessHandle isn't available on our testing platforms

Also, since Java can't fork, it might not even be necessary to test for the pid changing
  • Loading branch information
jamis committed Jul 3, 2023
1 parent f7ffa3f commit 9484d2a
Show file tree
Hide file tree
Showing 10 changed files with 339 additions and 142 deletions.
2 changes: 2 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ if jruby?
ext.name = "bson-ruby"
ext.ext_dir = "src"
ext.lib_dir = "lib"
ext.target_version = ENV['TARGET_VERSION'] if ENV['TARGET_VERSION']
ext.source_version = ENV['SOURCE_VERSION'] if ENV['SOURCE_VERSION']
end
else
require "rake/extensiontask"
Expand Down
9 changes: 9 additions & 0 deletions ext/bson/bson-native.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <ruby.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include "bson-endian.h"
Expand Down Expand Up @@ -98,7 +99,9 @@ VALUE rb_bson_byte_buffer_replace_int32(VALUE self, VALUE index, VALUE i);
VALUE rb_bson_byte_buffer_rewind(VALUE self);
VALUE rb_bson_byte_buffer_write_position(VALUE self);
VALUE rb_bson_byte_buffer_to_s(VALUE self);

VALUE rb_bson_object_id_generator_next(int argc, VALUE* args, VALUE self);
VALUE rb_bson_object_id_generator_reset_counter(int argc, VALUE* args, VALUE self);

size_t rb_bson_byte_buffer_memsize(const void *ptr);
void rb_bson_byte_buffer_free(void *ptr);
Expand All @@ -113,6 +116,12 @@ VALUE pvt_const_get_3(const char *c1, const char *c2, const char *c3);

int pvt_get_mode_option(int argc, VALUE *argv);

#define BSON_OBJECT_ID_RANDOM_VALUE_LENGTH ( 5 )

uint8_t* pvt_get_object_id_random_value();
void pvt_rand_buf(uint8_t* bytes, int len, int pid);
int pvt_rand();

/**
* The counter for incrementing object ids.
*/
Expand Down
10 changes: 6 additions & 4 deletions ext/bson/extconf.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# rubocop:todo all
require "mkmf"
$CFLAGS << " -Wall -g -std=c99"
create_makefile("bson_native")
require 'mkmf'

$CFLAGS << ' -Wall -g -std=c99'
have_func 'arc4random'

create_makefile('bson_native')
5 changes: 3 additions & 2 deletions ext/bson/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -344,15 +344,16 @@ void Init_bson_native()
rb_define_method(rb_byte_buffer_class, "to_s", rb_bson_byte_buffer_to_s, 0);

rb_define_method(rb_bson_object_id_generator_class, "next_object_id", rb_bson_object_id_generator_next, -1);
rb_define_method(rb_bson_object_id_generator_class, "reset_counter", rb_bson_object_id_generator_reset_counter, -1);

// Get the object id machine id and hash it.
rb_require("digest/md5");
gethostname(rb_bson_machine_id, sizeof(rb_bson_machine_id));
rb_bson_machine_id[255] = '\0';
rb_bson_generate_machine_id(rb_md5_class, rb_bson_machine_id);

// Set the object id counter to a random number
rb_bson_object_id_counter = FIX2INT(rb_funcall(rb_mKernel, rb_intern("rand"), 1, INT2FIX(0x1000000)));
// Set the object id counter to a random 3-byte integer
rb_bson_object_id_counter = pvt_rand() % 0x1000000;

rb_bson_registry = rb_const_get(rb_bson_module, rb_intern("Registry"));
rb_gc_register_mark_object(rb_bson_registry);
Expand Down
130 changes: 114 additions & 16 deletions ext/bson/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,69 @@ void rb_bson_generate_machine_id(VALUE rb_md5_class, char *rb_bson_machine_id)

/**
* Generate the next object id.
*
* Specification:
* https://github.com/mongodb/specifications/blob/master/source/objectid.rst
*
* The ObjectID BSON type is a 12-byte value consisting of three different portions (fields):
* * a 4-byte value representing the seconds since the Unix epoch in the highest order bytes,
* * a 5-byte random number unique to a machine and process,
* * a 3-byte counter, starting with a random value.
*/
VALUE rb_bson_object_id_generator_next(int argc, VALUE* args, VALUE self)
{
char bytes[12];
uint32_t t;
uint32_t c;
uint16_t pid = BSON_UINT16_TO_BE(getpid());
uint32_t time_component;
uint8_t* random_component;
uint32_t counter_component;
VALUE timestamp;
VALUE rb_bson_object_id_class;

if (argc == 0 || (argc == 1 && *args == Qnil)) {
t = BSON_UINT32_TO_BE((int) time(NULL));
}
else {
t = BSON_UINT32_TO_BE(NUM2ULONG(rb_funcall(*args, rb_intern("to_i"), 0)));
}
rb_bson_object_id_class = pvt_const_get_2("BSON", "ObjectId");

/* "Drivers SHOULD have an accessor method on an ObjectID class for
* obtaining the timestamp value." */

timestamp = rb_funcall(rb_bson_object_id_class, rb_intern("timestamp"), 0);
time_component = BSON_UINT32_TO_BE(NUM2INT(timestamp));

/* "A 5-byte field consisting of a random value generated once per process.
* This random value is unique to the machine and process.
*
* "Drivers MUST NOT have an accessor method on an ObjectID class for
* obtaining this value."
*/

random_component = pvt_get_object_id_random_value();

/* shift left 8 bits, so that the first three bytes of the result are
* the meaningful ones */
counter_component = BSON_UINT32_TO_BE(rb_bson_object_id_counter << 8);

memcpy(&bytes, &time_component, 4);
memcpy(&bytes[4], random_component, 5);
memcpy(&bytes[9], &counter_component, 3);

c = BSON_UINT32_TO_BE(rb_bson_object_id_counter << 8);
rb_bson_object_id_counter = (rb_bson_object_id_counter + 1) % 0x1000000;

memcpy(&bytes, &t, 4);
memcpy(&bytes[4], rb_bson_machine_id_hash, 3);
memcpy(&bytes[7], &pid, 2);
memcpy(&bytes[9], &c, 3);
rb_bson_object_id_counter++;
return rb_str_new(bytes, 12);
}

/**
 * Reset the object id counter. This is purely an aid for testing.
 *
 * @param [ Integer ] i the value to set the counter to (default is 0)
 *
 * @return [ nil ] always nil.
 *
 * @raise [ ArgumentError ] if more than one argument is given.
 */
VALUE rb_bson_object_id_generator_reset_counter(int argc, VALUE* args, VALUE self) {
  switch(argc) {
    case 0: rb_bson_object_id_counter = 0; break;
    /* NUM2INT type-checks the argument and raises for non-numeric values;
     * FIX2INT would blindly reinterpret the bits of whatever VALUE it is
     * handed. */
    case 1: rb_bson_object_id_counter = NUM2INT(args[0]); break;
    default: rb_raise(rb_eArgError, "Expected 0 or 1 arguments, got %d", argc);
  }

  /* Qnil is Ruby's nil object. T_NIL is only a type tag (an enum value), and
   * returning it as a VALUE hands the caller a bogus Integer instead of nil. */
  return Qnil;
}

/**
* Returns a Ruby constant nested one level, e.g. BSON::Document.
*/
Expand All @@ -77,7 +115,7 @@ VALUE pvt_const_get_3(const char *c1, const char *c2, const char *c3) {
int pvt_get_mode_option(int argc, VALUE *argv) {
VALUE opts;
VALUE mode;

rb_scan_args(argc, argv, ":", &opts);
if (NIL_P(opts)) {
return BSON_MODE_DEFAULT;
Expand All @@ -93,3 +131,63 @@ int pvt_get_mode_option(int argc, VALUE *argv) {
}
}
}

/**
 * Returns the per-process random bytes used in the middle of every object id.
 *
 * The bytes are cached together with the pid that produced them. When the
 * current pid no longer matches the cached one (e.g. after a fork), a fresh
 * set of BSON_OBJECT_ID_RANDOM_VALUE_LENGTH bytes is generated before
 * returning.
 *
 * @return pointer to a static buffer of BSON_OBJECT_ID_RANDOM_VALUE_LENGTH
 *   bytes; the caller must not free it.
 */
uint8_t* pvt_get_object_id_random_value() {
  static pid_t owner_pid = 0;
  static uint8_t cached_bytes[BSON_OBJECT_ID_RANDOM_VALUE_LENGTH] = {0};
  pid_t current_pid = getpid();

  /* Same process as last time: the cached bytes are still valid. */
  if (owner_pid == current_pid) {
    return cached_bytes;
  }

  /* First call, or the pid changed: remember the new owner and regenerate. */
  owner_pid = current_pid;
  pvt_rand_buf(cached_bytes, BSON_OBJECT_ID_RANDOM_VALUE_LENGTH, current_pid);

  return cached_bytes;
}

/**
* Fills the buffer with random bytes. If arc4random is available, it is used,
* otherwise a less-ideal fallback is used.
*/
void pvt_rand_buf(uint8_t* bytes, int len, int pid) {
#if HAVE_ARC4RANDOM
arc4random_buf(bytes, len);

This comment has been minimized.

Copy link
@jraby

jraby Jul 26, 2023

Comment from the sidelines: I think using getentropy here would yield the same result, but would expand the covered user base.

from what I could gather, arc4random_buf has been added to glibc 2.36 (2022), while getentropy has been available since 2.25 (2017)

Context: I was in the process of temporarily backporting this commit locally so we could stop having so many id collisions, when I noticed that our ruby image is based on debian bullseye, which ships glibc 2.31, so it would have fallen back to rand seeded with time and pid.
getentropy is available in that libc, so I ended up using that.

(In our case it is pretty bad, since pid is always 1 -- container fun! -- meaning that 2 containers started in the same second have a high chance of generating collisions.)

cc @jamis

This comment has been minimized.

Copy link
@jamis

jamis Jul 27, 2023

Author Contributor

Excellent suggestion. We've got a ticket in the queue for revisiting this and improving the situation; it's definitely something we'd like to address before we release BSON 5. I'll add your notes to the ticket so we can be sure to consider all our options.

This comment has been minimized.

Copy link
@jraby

jraby Jul 27, 2023

I was reflecting on this, and it might be more portable (BSD) to keep the arc4random_buf support, but allow falling back to getentropy too, before falling back to the "timeofday + pid" implementation

#else
time_t t;
uint32_t seed;
int ofs = 0;

/* TODO: spec says to include hostname as part of the seed */
t = time(NULL);
seed = ((uint32_t)t << 16) + ((uint32_t)pid % 0xFFFF);
srand(seed);

while (ofs < len) {
int n = rand();
unsigned remaining = len - ofs;

if (remaining > sizeof(n)) remaining = sizeof(n);
memcpy(bytes+ofs, &n, remaining);

ofs += remaining;
}
#endif
}

/**
 * Returns a non-negative random integer. If arc4random is available, it is
 * used (yielding 31 random bits); otherwise a less-ideal fallback based on
 * rand() seeded with the current time is used.
 *
 * @return a random int that is always >= 0.
 */
int pvt_rand() {
#if HAVE_ARC4RANDOM
  /* arc4random() produces a full 32-bit unsigned value. Converting that
   * straight to int yields a negative number about half of the time, which
   * would then survive a later `% 0x1000000`. Dropping one bit keeps the
   * result within the non-negative range of int. */
  return (int)(arc4random() >> 1);
#else
  /* NOTE: seeding with one-second resolution means two calls within the same
   * second return the same value. */
  srand((unsigned)time(NULL));
  return rand();
#endif
}
37 changes: 35 additions & 2 deletions lib/bson/object_id.rb
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,26 @@ def to_s
end
alias :to_str :to_s

# Extract the process-specific part of the object id. This is used only
# internally, for testing, and should not be used elsewhere.
#
# @return [ String ] The process portion of the id.
#
# @api private
def _process_part
  # Hex characters 8..17 of the string form, i.e. the 10 characters that
  # follow the first 8.
  to_s.slice(8, 10)
end

# Extract the counter-specific part of the object id. This is used only
# internally, for testing, and should not be used elsewhere.
#
# @return [ String ] The counter portion of the id.
#
# @api private
def _counter_part
  # The 6 hex characters starting at offset 18 of the string form.
  to_s.slice(18, 6)
end

private

def initialize_copy(other)
Expand Down Expand Up @@ -325,6 +345,14 @@ def repair(object)
raise Error::InvalidObjectId.new("#{object.inspect} is not a valid object id.")
end
end

# Returns an integer timestamp (seconds since the Epoch). Primarily used
# by the generator to produce object ids.
#
# @return [ Integer ] the number of seconds since the Epoch.
def timestamp
  # Whole seconds of the current wall-clock time.
  current_time = ::Time.now
  current_time.to_i
end
end

# Extended by native code (see init.c, util.c, GeneratorExtension.java)
Expand All @@ -334,10 +362,15 @@ class Generator
end

# We keep one global generator for object ids.
#
# @since 2.0.0
@@generator = Generator.new

# Accessor for querying the generator directly; used in testing.
#
# @return [ Generator ] the single generator shared by all object ids.
#
# @api private
def self._generator
@@generator
end

# Register this type when the module is loaded.
#
# @since 2.0.0
Expand Down
27 changes: 27 additions & 0 deletions spec/bson/object_id_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -607,4 +607,31 @@
}.to raise_error(BSON::Error::InvalidKey)
end
end

# Verifies that the counter portion of the id rolls over to zero after
# reaching 0xFFFFFF.
context 'when the counter wraps' do
before do
# Seed the shared generator's counter with 0xFFFFFF so that the next
# increment has to wrap.
BSON::ObjectId._generator.reset_counter(0xFFFFFF)
end

# Both ids are created lazily, so the order in which the example references
# them decides which one consumes the seeded counter value.
let(:before) { BSON::ObjectId.new }
let(:after) { BSON::ObjectId.new }

it 'resets the counter portion to 0' do
# `before` is referenced first and takes the seeded value; `after` is
# generated next and must carry the wrapped counter.
expect(before._counter_part).to be == "ffffff"
expect(after._counter_part).to be == "000000"
end
end

# Verifies that an id generated in a forked child carries a different
# process-specific portion than an id generated in the parent.
context 'when fork changes the pid' do
before do
# Skipped when Process does not respond to :fork.
skip 'requires Process.fork' unless Process.respond_to?(:fork)
end

let(:parent_id) { BSON::ObjectId.new }
# Generated inside a forked child; Utils.perform_in_child hands the block's
# return value back to this process.
let(:child_id) { Utils.perform_in_child { BSON::ObjectId.new } }

it 'changes the process portion of the object-id' do
expect(child_id._process_part).not_to be == parent_id._process_part
end
end
end
49 changes: 48 additions & 1 deletion spec/support/utils.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,58 @@
# rubocop:todo all
module Utils
extend self

# JRuby chokes when strings like "\xfe\x00\xff", which are not valid UTF-8,
# appear in the source. Use this method to build such strings.
# char_array is an array of byte values to use for the string.
module_function def make_byte_string(char_array, encoding = 'BINARY')
def make_byte_string(char_array, encoding = 'BINARY')
  # Turn every byte value into a one-character binary string, then glue the
  # pieces together and tag the result with the requested encoding.
  binary_chars = char_array.map { |byte| byte.chr.force_encoding('BINARY') }
  binary_chars.join.force_encoding(encoding)
end

# Forks the current process and executes the given block in the child.
# The value returned by the block is then returned in the parent process
# by this method.
#
# @return [ Object ] the value returned by the block
def perform_in_child(&block)
  reader, writer = IO.pipe

  # fork returns a truthy pid in the parent and nil in the child.
  return parent_worker(reader, writer) if fork

  child_worker(reader, writer, &block)
end

private

# A utility method for #perform_in_child, to handle tasks for the parent
# side of the fork.
#
# @param [ IO ] reader The reader IO for the pipe
# @param [ IO ] writer The writer IO for the pipe
#
# @return [ Object ] the value returned by the child process
def parent_worker(reader, writer)
  # The parent only reads; closing its copy of the write end lets the read
  # below see EOF once the child has closed its own copy.
  writer.close
  payload = reader.read.tap { reader.close }
  Process.wait
  Marshal.load(payload)
end

# A utility method for #perform_in_child, to handle tasks for the child
# side of the fork.
#
# @param [ IO ] reader The reader IO for the pipe
# @param [ IO ] writer The writer IO for the pipe
def child_worker(reader, writer, &block)
  # Assume failure until the result has been written successfully.
  status = 1
  reader.close
  writer.write Marshal.dump(block.call)
  status = 0
rescue StandardError => e
  # Report the failure; the parent will additionally fail when it tries to
  # load the (empty) payload.
  warn "perform_in_child: #{e.class}: #{e.message}"
ensure
  # Always terminate the child here, even when the block raised. Otherwise
  # the forked process would return into the caller's code and keep running
  # alongside the parent.
  writer.close unless writer.closed?
  exit! status
end
end
Loading

0 comments on commit 9484d2a

Please sign in to comment.