Skip to content

Commit

Permalink
buffer: add Buffer.prototype.lastIndexOf()
Browse files Browse the repository at this point in the history
* Remove unnecessary templating from SearchString

  SearchString used to have separate PatternChar and SubjectChar template type
  arguments, apparently to support things like searching for an 8-bit string
  inside a 16-bit string or vice versa. However, SearchString is only used from
  node_buffer.cc, where PatternChar and SubjectChar are always the same. Since
  this is extra complexity that's unused and untested (simplifying to a single
  Char template argument still compiles and didn't break any unit tests), I
  removed it.

* Use Boyer-Hoore[-Horspool] for both indexOf and lastIndexOf

  Add test cases for lastIndexOf. Test the fallback from BMH to
  Boyer-Moore, which looks like it was totally untested before.

* Extra bounds checks in node_buffer.cc

* Extra asserts in string_search.h

* Buffer.lastIndexOf: clean up, enforce consistency w/ String.lastIndexOf

* Polyfill memrchr(3) for non-GNU systems

PR-URL: #4846
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Trevor Norris <trev.norris@gmail.com>
  • Loading branch information
dcposch authored and jasnell committed Apr 26, 2016
1 parent d18daaa commit 3350b47
Show file tree
Hide file tree
Showing 5 changed files with 471 additions and 252 deletions.
40 changes: 40 additions & 0 deletions doc/api/buffer.md
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,46 @@ for (var key of buf.keys()) {
// 5
```

### buf.lastIndexOf(value[, byteOffset][, encoding])

* `value` {String|Buffer|Number}
* `byteOffset` {Number} Default: `buf.length`
* `encoding` {String} Default: `'utf8'`
* Return: {Number}

Identical to [`Buffer#indexOf()`][], but searches the Buffer from back to front
instead of front to back. Returns the starting index position of `value` in
Buffer or `-1` if the Buffer does not contain `value`. The `value` can be a
String, Buffer or Number. Strings are by default interpreted as UTF8. If
`byteOffset` is provided, will return the last match that begins at or before
`byteOffset`.

```js
const buf = new Buffer('this buffer is a buffer');

buf.lastIndexOf('this');
// returns 0
buf.lastIndexOf('buffer');
// returns 17
buf.lastIndexOf(new Buffer('buffer'));
// returns 17
buf.lastIndexOf(97); // ascii for 'a'
// returns 15
buf.lastIndexOf(new Buffer('yolo'));
// returns -1
buf.lastIndexOf('buffer', 5)
// returns 5
buf.lastIndexOf('buffer', 4)
// returns -1

const utf16Buffer = new Buffer('\u039a\u0391\u03a3\u03a3\u0395', 'ucs2');

utf16Buffer.lastIndexOf('\u03a3', null, 'ucs2');
// returns 6
utf16Buffer.lastIndexOf('\u03a3', -5, 'ucs2');
// returns 4
```

### buf.length

* {Number}
Expand Down
72 changes: 49 additions & 23 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,48 @@ Buffer.prototype.compare = function compare(target,
return binding.compareOffset(this, target, start, thisStart, end, thisEnd);
};

function slowIndexOf(buffer, val, byteOffset, encoding) {

// Finds either the first index of `val` in `buffer` at offset >= `byteOffset`,
// OR the last index of `val` in `buffer` at offset <= `byteOffset`.
//
// Arguments:
// - buffer - a Buffer to search
// - val - a string, Buffer, or number
// - byteOffset - an index into `buffer`; will be clamped to an int32
// - encoding - an optional encoding, relevant is val is a string
// - dir - true for indexOf, false for lastIndexOf
function bidirectionalIndexOf(buffer, val, byteOffset, encoding, dir) {
if (typeof byteOffset === 'string') {
encoding = byteOffset;
byteOffset = undefined;
} else if (byteOffset > 0x7fffffff) {
byteOffset = 0x7fffffff;
} else if (byteOffset < -0x80000000) {
byteOffset = -0x80000000;
}
byteOffset = +byteOffset; // Coerce to Number.
if (isNaN(byteOffset)) {
// If the offset is undefined, null, NaN, "foo", etc, search whole buffer.
byteOffset = dir ? 0 : (buffer.length - 1);
}
dir = !!dir; // Cast to bool.

if (typeof val === 'string') {
if (encoding === undefined) {
return binding.indexOfString(buffer, val, byteOffset, encoding, dir);
}
return slowIndexOf(buffer, val, byteOffset, encoding, dir);
} else if (val instanceof Buffer) {
return binding.indexOfBuffer(buffer, val, byteOffset, encoding, dir);
} else if (typeof val === 'number') {
return binding.indexOfNumber(buffer, val, byteOffset, dir);
}

throw new TypeError('"val" argument must be string, number or Buffer');
}


function slowIndexOf(buffer, val, byteOffset, encoding, dir) {
var loweredCase = false;
for (;;) {
switch (encoding) {
Expand All @@ -609,13 +650,13 @@ function slowIndexOf(buffer, val, byteOffset, encoding) {
case 'utf16le':
case 'utf-16le':
case 'binary':
return binding.indexOfString(buffer, val, byteOffset, encoding);
return binding.indexOfString(buffer, val, byteOffset, encoding, dir);

case 'base64':
case 'ascii':
case 'hex':
return binding.indexOfBuffer(
buffer, Buffer.from(val, encoding), byteOffset, encoding);
buffer, Buffer.from(val, encoding), byteOffset, encoding, dir);

default:
if (loweredCase) {
Expand All @@ -628,29 +669,14 @@ function slowIndexOf(buffer, val, byteOffset, encoding) {
}
}


Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
if (typeof byteOffset === 'string') {
encoding = byteOffset;
byteOffset = 0;
} else if (byteOffset > 0x7fffffff) {
byteOffset = 0x7fffffff;
} else if (byteOffset < -0x80000000) {
byteOffset = -0x80000000;
}
byteOffset >>= 0;
return bidirectionalIndexOf(this, val, byteOffset, encoding, true);
};

if (typeof val === 'string') {
if (encoding === undefined) {
return binding.indexOfString(this, val, byteOffset, encoding);
}
return slowIndexOf(this, val, byteOffset, encoding);
} else if (val instanceof Buffer) {
return binding.indexOfBuffer(this, val, byteOffset, encoding);
} else if (typeof val === 'number') {
return binding.indexOfNumber(this, val, byteOffset);
}

throw new TypeError('"val" argument must be string, number or Buffer');
Buffer.prototype.lastIndexOf = function lastIndexOf(val, byteOffset, encoding) {
return bidirectionalIndexOf(this, val, byteOffset, encoding, false);
};


Expand Down
128 changes: 80 additions & 48 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -943,9 +943,44 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
}


// Computes the offset for starting an indexOf or lastIndexOf search.
// Returns either a valid offset in [0...<length - 1>], ie inside the Buffer,
// or -1 to signal that there is no possible match.
int64_t IndexOfOffset(size_t length, int64_t offset_i64, bool is_forward) {
int64_t length_i64 = static_cast<int64_t>(length);
if (length_i64 == 0) {
// Empty buffer, no match.
return -1;
}
if (offset_i64 < 0) {
if (offset_i64 + length_i64 >= 0) {
// Negative offsets count backwards from the end of the buffer.
return length_i64 + offset_i64;
} else if (is_forward) {
// indexOf from before the start of the buffer: search the whole buffer.
return 0;
} else {
// lastIndexOf from before the start of the buffer: no match.
return -1;
}
} else {
if (offset_i64 < length_i64) {
// Valid positive offset.
return offset_i64;
} else if (is_forward) {
// indexOf from past the end of the buffer: no match.
return -1;
} else {
// lastIndexOf from past the end of the buffer: search the whole buffer.
return length_i64 - 1;
}
}
}

void IndexOfString(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsString());
ASSERT(args[2]->IsNumber());
ASSERT(args[4]->IsBoolean());

enum encoding enc = ParseEncoding(args.GetIsolate(),
args[3],
Expand All @@ -955,31 +990,26 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
SPREAD_ARG(args[0], ts_obj);

Local<String> needle = args[1].As<String>();
int64_t offset_i64 = args[2]->IntegerValue();
bool is_forward = args[4]->IsTrue();

const char* haystack = ts_obj_data;
const size_t haystack_length = ts_obj_length;
// Extended latin-1 characters are 2 bytes in Utf8.
const size_t needle_length =
enc == BINARY ? needle->Length() : needle->Utf8Length();


if (needle_length == 0 || haystack_length == 0) {
return args.GetReturnValue().Set(-1);
}

int64_t offset_i64 = args[2]->IntegerValue();
size_t offset = 0;

if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
offset = 0;
} else {
offset = static_cast<size_t>(haystack_length + offset_i64);
}
} else {
offset = static_cast<size_t>(offset_i64);
int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward);
if (opt_offset <= -1) {
return args.GetReturnValue().Set(-1);
}

if (haystack_length < offset || needle_length + offset > haystack_length) {
size_t offset = static_cast<size_t>(opt_offset);
CHECK_LT(offset, haystack_length);
if (is_forward && needle_length + offset > haystack_length) {
return args.GetReturnValue().Set(-1);
}

Expand Down Expand Up @@ -1007,13 +1037,15 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
haystack_length / 2,
decoded_string,
decoder.size() / 2,
offset / 2);
offset / 2,
is_forward);
} else {
result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
haystack_length / 2,
reinterpret_cast<const uint16_t*>(*needle_value),
needle_value.length(),
offset / 2);
offset / 2,
is_forward);
}
result *= 2;
} else if (enc == UTF8) {
Expand All @@ -1025,7 +1057,8 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
haystack_length,
reinterpret_cast<const uint8_t*>(*needle_value),
needle_length,
offset);
offset,
is_forward);
} else if (enc == BINARY) {
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
if (needle_data == nullptr) {
Expand All @@ -1038,7 +1071,8 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
haystack_length,
needle_data,
needle_length,
offset);
offset,
is_forward);
free(needle_data);
}

Expand All @@ -1049,17 +1083,18 @@ void IndexOfString(const FunctionCallbackInfo<Value>& args) {
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsObject());
ASSERT(args[2]->IsNumber());
ASSERT(args[4]->IsBoolean());

enum encoding enc = ParseEncoding(args.GetIsolate(),
args[3],
UTF8);

THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[1]);
SPREAD_ARG(args[0], ts_obj);
SPREAD_ARG(args[1], buf);

if (buf_length > 0)
CHECK_NE(buf_data, nullptr);
int64_t offset_i64 = args[2]->IntegerValue();
bool is_forward = args[4]->IsTrue();

const char* haystack = ts_obj_data;
const size_t haystack_length = ts_obj_length;
Expand All @@ -1070,19 +1105,13 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
return args.GetReturnValue().Set(-1);
}

int64_t offset_i64 = args[2]->IntegerValue();
size_t offset = 0;

if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
offset = 0;
else
offset = static_cast<size_t>(haystack_length + offset_i64);
} else {
offset = static_cast<size_t>(offset_i64);
int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward);
if (opt_offset <= -1) {
return args.GetReturnValue().Set(-1);
}

if (haystack_length < offset || needle_length + offset > haystack_length) {
size_t offset = static_cast<size_t>(opt_offset);
CHECK_LT(offset, haystack_length);
if (is_forward && needle_length + offset > haystack_length) {
return args.GetReturnValue().Set(-1);
}

Expand All @@ -1097,15 +1126,17 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
haystack_length / 2,
reinterpret_cast<const uint16_t*>(needle),
needle_length / 2,
offset / 2);
offset / 2,
is_forward);
result *= 2;
} else {
result = SearchString(
reinterpret_cast<const uint8_t*>(haystack),
haystack_length,
reinterpret_cast<const uint8_t*>(needle),
needle_length,
offset);
offset,
is_forward);
}

args.GetReturnValue().Set(
Expand All @@ -1115,28 +1146,29 @@ void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsNumber());
ASSERT(args[2]->IsNumber());
ASSERT(args[3]->IsBoolean());

THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
SPREAD_ARG(args[0], ts_obj);

uint32_t needle = args[1]->Uint32Value();
int64_t offset_i64 = args[2]->IntegerValue();
size_t offset;

if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
offset = 0;
else
offset = static_cast<size_t>(ts_obj_length + offset_i64);
} else {
offset = static_cast<size_t>(offset_i64);
}
bool is_forward = args[3]->IsTrue();

if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
int64_t opt_offset = IndexOfOffset(ts_obj_length, offset_i64, is_forward);
if (opt_offset <= -1) {
return args.GetReturnValue().Set(-1);
}
size_t offset = static_cast<size_t>(opt_offset);
CHECK_LT(offset, ts_obj_length);

void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
char* ptr_char = static_cast<char*>(ptr);
const void* ptr;
if (is_forward) {
ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
} else {
ptr = node::stringsearch::MemrchrFill(ts_obj_data, needle, offset + 1);
}
const char* ptr_char = static_cast<const char*>(ptr);
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
: -1);
}
Expand Down
Loading

0 comments on commit 3350b47

Please sign in to comment.