Skip to content

Commit aa72915

Browse files
committed
Merge: Stringify Bytes
This PR adds some String-like functions to Bytes, along with an abstraction for Byte-based patterns. Also, a bug was found within `Bytes::is_empty` and `FlatText` is now public since there was no real rationale to keep it private Pull-Request: #1860 Reviewed-by: Jean Privat <jean@pryen.org> Reviewed-by: Alexis Laferrière <alexis.laf@xymus.net>
2 parents 4a2d2f1 + fd67956 commit aa72915

File tree

3 files changed

+316
-4
lines changed

3 files changed

+316
-4
lines changed

lib/core/bytes.nit

Lines changed: 311 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,39 @@ import kernel
1919
import collection::array
2020
intrude import text::flat
2121

22+
# Any kind of entity which can be searched for in a Sequence of Byte
23+
interface BytePattern
24+
# Return the first occurrence of `self` in `b`, or -1 if not found
25+
fun first_index_in(b: SequenceRead[Byte]): Int do return first_index_in_from(b, 0)
26+
27+
# Return the first occurrence of `self` in `b` starting at `from`, or -1 if not found
28+
fun first_index_in_from(b: SequenceRead[Byte], from: Int): Int is abstract
29+
30+
# Return the last occurrence of `self` in `b`, or -1 if not found
31+
fun last_index_in(b: SequenceRead[Byte]): Int do return last_index_in_from(b, b.length - 1)
32+
33+
# Return the last occurrence of `self` in `b` searching backwards from `from`, or -1 if not found
34+
fun last_index_in_from(b: SequenceRead[Byte], from: Int): Int is abstract
35+
36+
# Returns the indexes of all the occurrences of `self` in `b`
37+
fun search_all_in(b: SequenceRead[Byte]): SequenceRead[Int] is abstract
38+
39+
# Length of the pattern
40+
fun pattern_length: Int is abstract
41+
42+
# Appends `self` to `b`
43+
fun append_to(b: Sequence[Byte]) is abstract
44+
45+
# Is `self` a prefix for `b` ?
46+
fun is_prefix(b: SequenceRead[Byte]): Bool is abstract
47+
48+
# Is `self` a suffix for `b` ?
49+
fun is_suffix(b: SequenceRead[Byte]): Bool is abstract
50+
end
51+
2252
redef class Byte
53+
super BytePattern
54+
2355
# Write self as a string into `ns` at position `pos`
2456
private fun add_digest_at(ns: NativeString, pos: Int) do
2557
var tmp = (0xF0u8 & self) >> 4
@@ -72,13 +104,47 @@ redef class Byte
72104
# i.e. this abort is here to please the compiler
73105
abort
74106
end
107+
108+
# Scans `b` forward from index `from` and returns the index of the
# first byte equal to `self`, or -1 when no such byte is found.
redef fun first_index_in_from(b, from) do
	var len = b.length
	var i = from
	while i < len do
		if b[i] == self then return i
		i += 1
	end
	return -1
end
112+
113+
# Scans `b` backwards, starting at index `from` and going down to 0,
# and returns the index of the first byte found equal to `self`,
# or -1 when there is none.
#
# A `from` beyond the end of `b` is clamped to the last valid index,
# and a negative `from` yields -1.
#
#     assert 'b'.ascii.last_index_in_from("abcb".to_bytes, 3) == 3
#     assert 'b'.ascii.last_index_in_from("abcb".to_bytes, 2) == 1
#     assert 'b'.ascii.last_index_in_from("abcb".to_bytes, 42) == 3
#     assert 'z'.ascii.last_index_in_from("abcb".to_bytes, 3) == -1
redef fun last_index_in_from(b, from) do
	var i = from
	# Never start past the last valid index of `b`
	if i >= b.length then i = b.length - 1
	# Explicit descending loop: no dependency on how a range handles a negative step
	while i >= 0 do
		if b[i] == self then return i
		i -= 1
	end
	return -1
end
117+
118+
# Collects, in increasing order, every index of `b` holding a byte equal to `self`.
redef fun search_all_in(b) do
	var hits = new Array[Int]
	var cursor = first_index_in_from(b, 0)
	while cursor != -1 do
		hits.add cursor
		# Resume the search right after the byte just found
		cursor = first_index_in_from(b, cursor + 1)
	end
	return hits
end
128+
129+
redef fun pattern_length do return 1
130+
131+
redef fun append_to(b) do b.push self
132+
133+
# assert 'b'.ascii.is_suffix("baqsdb".to_bytes)
134+
# assert not 'b'.ascii.is_suffix("baqsd".to_bytes)
135+
redef fun is_suffix(b) do return b.length != 0 and b.last == self
136+
137+
# assert 'b'.ascii.is_prefix("baqsdb".to_bytes)
138+
# assert not 'b'.ascii.is_prefix("aqsdb".to_bytes)
139+
redef fun is_prefix(b) do return b.length != 0 and b.first == self
75140
end
76141

77142
# A buffer containing Byte-manipulation facilities
78143
#
79144
# Uses Copy-On-Write when persisted
80145
class Bytes
81146
super AbstractArray[Byte]
147+
super BytePattern
82148

83149
# A NativeString being a char*, it can be used as underlying representation here.
84150
var items: NativeString
@@ -107,7 +173,9 @@ class Bytes
107173
init(ns, 0, cap)
108174
end
109175

110-
redef fun is_empty do return length != 0
176+
redef fun pattern_length do return length
177+
178+
redef fun is_empty do return length == 0
111179

112180
# var b = new Bytes.empty
113181
# b.add 101u8
@@ -118,6 +186,71 @@ class Bytes
118186
return items[i]
119187
end
120188

189+
# Builds a new `Bytes` sized for `length` bytes and appends
# the content of `self` to it.
#
#     var b = "abcd".to_bytes
#     assert b.clone == b
fun clone: Bytes do
	var copy = new Bytes.with_capacity(length)
	copy.append self
	return copy
end
195+
196+
# Returns a copy of `self` without its leading and trailing whitespaces
#
#     var b = "102041426E6F1020".hexdigest_to_bytes
#     assert b.trim.hexdigest == "41426E6F"
#
# NOTE: A whitespace is defined here as a byte whose value is <= 0x20
fun trim: Bytes do
	var len = length
	# Skip the leading whitespaces
	var first = 0
	while first < len and self[first] <= 0x20u8 do first += 1
	# Only whitespaces (or nothing at all): the result is empty
	if first >= len then return new Bytes.empty
	# Skip the trailing whitespaces
	var last = len - 1
	while last > 0 and self[last] <= 0x20u8 do last -= 1
	return slice(first, last - first + 1)
end
216+
217+
# Copies `count` bytes of `self`, starting at index `from`, into a new `Bytes`
#
# The requested window is clipped to the bounds of `self`: the part
# located before index 0 or after the last byte is dropped.
#
#     var b = "abcd".to_bytes
#     assert b.slice(1, 2).hexdigest == "6263"
#     assert b.slice(-1, 2).hexdigest == "61"
#     assert b.slice(1, 0).hexdigest == ""
#     assert b.slice(2, 5).hexdigest == "6364"
fun slice(from, count: Int): Bytes do
	if count <= 0 then return new Bytes.empty

	var start = from
	var len = count
	if start < 0 then
		# The bytes requested before index 0 do not exist: shorten the window
		len += start
		start = 0
	end

	# Do not read past the end of `self`
	var avail = length - start
	if len > avail then len = avail
	if len <= 0 then return new Bytes.empty

	var ret = new Bytes.with_capacity(len)
	ret.append_ns(items.fast_cstring(start), len)
	return ret
end
241+
242+
# Copies every byte of `self` from index `from` up to the end
#
# A negative `from` is treated as 0, and a `from` at or past the end
# gives an empty result.
#
#     var b = "abcd".to_bytes
#     assert b.slice_from(1).hexdigest == "626364"
#     assert b.slice_from(-1).hexdigest == "61626364"
#     assert b.slice_from(2).hexdigest == "6364"
fun slice_from(from: Int): Bytes do
	var len = length
	if from >= len then return new Bytes.empty
	var start = from
	if start < 0 then start = 0
	# `slice` clips the count to what is actually available
	return slice(start, len)
end
253+
121254
# Returns self as a hexadecimal digest
122255
fun hexdigest: String do
123256
var elen = length * 2
@@ -218,6 +351,15 @@ class Bytes
218351
length += ln
219352
end
220353

354+
# Appends the bytes of `s` to `self`
355+
fun append_text(s: Text) do
356+
for i in s.substrings do
357+
append_ns(i.fast_cstring, i.bytelen)
358+
end
359+
end
360+
361+
redef fun append_to(b) do b.append self
362+
221363
redef fun enlarge(sz) do
222364
if capacity >= sz then return
223365
persisted = false
@@ -237,6 +379,157 @@ class Bytes
237379

238380
redef fun iterator do return new BytesIterator.with_buffer(self)
239381

382+
# Returns the index in `b` where the first occurrence of `self` located
# at or after `from` starts, or -1 if there is none.
#
# An empty pattern never matches. A negative `from` is treated as 0.
#
#     var pat = "ab".to_bytes
#     assert pat.first_index_in_from("xxabxab".to_bytes, 0) == 2
#     assert pat.first_index_in_from("xxabxab".to_bytes, 3) == 5
#     assert pat.first_index_in_from("aab".to_bytes, 0) == 1
#     assert pat.first_index_in_from("xxa".to_bytes, 0) == -1
redef fun first_index_in_from(b, from) do
	if is_empty then return -1
	var plen = length
	var start = from
	if start < 0 then start = 0
	# Highest start index at which the whole pattern still fits in `b`
	var last = b.length - plen
	while start <= last do
		# Count how many leading bytes of the pattern match at `start`
		var i = 0
		while i < plen and self[i] == b[start + i] do i += 1
		if i == plen then return start
		# On a mismatch, retry from the very next position so that
		# overlapping candidates (e.g. "ab" in "aab") are not skipped
		start += 1
	end
	return -1
end
392+
393+
# Returns the index in `b` where the last occurrence of `self` that ends
# at or before index `from` starts, or -1 if there is none.
#
# An empty pattern never matches. A `from` beyond the end of `b` is
# clamped to the last valid index.
#
#     var pat = "ab".to_bytes
#     assert pat.last_index_in_from("abxab".to_bytes, 4) == 3
#     assert pat.last_index_in_from("abxab".to_bytes, 3) == 0
#     assert pat.last_index_in_from("abxab".to_bytes, 0) == -1
#     assert pat.last_index_in_from("xxxx".to_bytes, 3) == -1
redef fun last_index_in_from(b, from) do
	if is_empty then return -1
	var plen = length
	# Index of the last byte a match is allowed to cover
	var last = from
	if last >= b.length then last = b.length - 1
	# Start index of the candidate whose final byte sits at `last`
	var start = last - plen + 1
	while start >= 0 do
		var i = 0
		while i < plen and self[i] == b[start + i] do i += 1
		if i == plen then return start
		start -= 1
	end
	return -1
end
403+
404+
# Returns the start index of every non-overlapping occurrence of `self`
# in `b`, in increasing order.
#
# The positions are the ones reported by `first_index_in_from`; after
# each hit the search resumes `length` bytes further.
redef fun search_all_in(b) do
	var ret = new Array[Int]
	var pos = first_index_in_from(b, 0)
	while pos != -1 do
		# Record the hit itself, not the position following it
		ret.add pos
		# Occurrences do not overlap: skip the whole match
		pos = first_index_in_from(b, pos + length)
	end
	return ret
end
417+
418+
# Cuts `self` at every occurrence of `b` and returns the pieces in order
#
# The occurrences of `b` themselves are not part of the result.
# When `b` is not found, the result holds a single copy of `self`.
#
#     var a = "String is string".to_bytes.split_with('s'.ascii)
#     assert a.length == 3
#     assert a[0].hexdigest == "537472696E672069"
#     assert a[1].hexdigest == "20"
#     assert a[2].hexdigest == "7472696E67"
fun split_with(b: BytePattern): Array[Bytes] do
	var hits = b.search_all_in(self)
	if hits.is_empty then return [clone]
	var parts = new Array[Bytes]
	var cursor = 0
	for at in hits do
		# Piece located between the previous separator and this one
		parts.add slice(cursor, at - cursor)
		cursor = at + b.pattern_length
	end
	# Whatever follows the last separator
	parts.add slice_from(cursor)
	return parts
end
437+
438+
# Cuts `self` around the first occurrence of `b`
#
# The result holds what precedes the occurrence, then what follows it.
# When `b` is not found, the result holds a single copy of `self`.
#
#     var a = "String is string".to_bytes.split_once_on('s'.ascii)
#     assert a[0].hexdigest == "537472696E672069"
#     assert a[1].hexdigest == "20737472696E67"
fun split_once_on(b: BytePattern): Array[Bytes] do
	var at = b.first_index_in(self)
	if at == -1 then return [clone]
	var halves = new Array[Bytes].with_capacity(2)
	var head = slice(0, at)
	halves.add head
	var tail = slice_from(at + b.pattern_length)
	halves.add tail
	return halves
end
451+
452+
# Returns a copy of `self` where each occurrence of `pattern` is replaced by `bytes`
#
# An empty `self` gives an empty result, and when `pattern` is not
# found the result is a plain copy of `self`.
#
#     var b = "String is string".to_bytes.replace(0x20u8, 0x41u8)
#     assert b.hexdigest == "537472696E6741697341737472696E67"
fun replace(pattern: BytePattern, bytes: BytePattern): Bytes do
	if is_empty then return new Bytes.empty
	var hits = pattern.search_all_in(self)
	if hits.is_empty then return clone
	var out = new Bytes.with_capacity(length)
	var cursor = 0
	for at in hits do
		# Untouched bytes between the previous occurrence and this one
		out.append_ns(items.fast_cstring(cursor), at - cursor)
		bytes.append_to out
		cursor = at + pattern.pattern_length
	end
	# After the loop, `cursor` sits right after the last occurrence
	out.append slice_from(cursor)
	return out
end
470+
471+
# Decode `self` from percent (or URL) encoding to a clear string
#
# Each `%` followed by two hexadecimal digits is replaced by the byte
# these digits encode. Every other byte is copied as is.
# Invalid uses of '%' (not followed by two hexadecimal digits, or too
# close to the end of `self`) are replaced with '?'.
#
#     assert "aBc09-._~".to_bytes.from_percent_encoding == "aBc09-._~".to_bytes
#     assert "%25%28%29%3c%20%3e".to_bytes.from_percent_encoding == "%()< >".to_bytes
#     assert ".com%2fpost%3fe%3dasdf%26f%3d123".to_bytes.from_percent_encoding == ".com/post?e=asdf&f=123".to_bytes
#     assert "%25%28%29%3C%20%3E".to_bytes.from_percent_encoding == "%()< >".to_bytes
#     assert "incomplete %".to_bytes.from_percent_encoding == "incomplete ?".to_bytes
#     assert "truncated %4".to_bytes.from_percent_encoding == "truncated ?4".to_bytes
#     assert "invalid % usage".to_bytes.from_percent_encoding == "invalid ? usage".to_bytes
#     assert "%c3%a9%e3%81%82%e3%81%84%e3%81%86".to_bytes.from_percent_encoding == "éあいう".to_bytes
fun from_percent_encoding: Bytes do
	var tmp = new Bytes.with_capacity(length)
	var pos = 0
	while pos < length do
		var b = self[pos]
		if b != '%'.ascii then
			tmp.add b
			pos += 1
			continue
		end
		# An escape is '%' plus two digits: at least 3 bytes must remain,
		# otherwise `self[pos + 2]` below would be read out of bounds
		if length - pos < 3 then
			tmp.add '?'.ascii
			pos += 1
			continue
		end
		var bn = self[pos + 1]
		var bnn = self[pos + 2]
		if not bn.is_valid_hexdigit or not bnn.is_valid_hexdigit then
			# Only the '%' is replaced; the following bytes are handled normally
			tmp.add '?'.ascii
			pos += 1
			continue
		end
		# High nibble from the first digit, low nibble from the second
		tmp.add((bn.hexdigit_to_byteval << 4) + bnn.hexdigit_to_byteval)
		pos += 3
	end
	return tmp
end
509+
510+
# Is `b` a prefix of `self` ?
511+
fun has_prefix(b: BytePattern): Bool do return b.is_prefix(self)
512+
513+
# Is `b` a suffix of `self` ?
514+
fun has_suffix(b: BytePattern): Bool do return b.is_suffix(self)
515+
516+
# Does `b` end with the bytes of `self` ?
#
# An empty `self` is a suffix of any `b`.
#
#     assert "cd".to_bytes.is_suffix("abcd".to_bytes)
#     assert not "xd".to_bytes.is_suffix("abcd".to_bytes)
#     assert not "abcde".to_bytes.is_suffix("abcd".to_bytes)
redef fun is_suffix(b) do
	if length > b.length then return false
	var j = b.length - 1
	var i = length - 1
	# Walk both sequences backwards, index 0 of `self` included
	while i >= 0 do
		if self[i] != b[j] then return false
		i -= 1
		j -= 1
	end
	return true
end
527+
528+
# Does `b` start with the bytes of `self` ?
#
#     assert "ab".to_bytes.is_prefix("abcd".to_bytes)
#     assert not "ax".to_bytes.is_prefix("abcd".to_bytes)
redef fun is_prefix(b) do
	var plen = length
	# A pattern longer than `b` cannot be one of its prefixes
	if plen > b.length then return false
	var i = 0
	while i < plen do
		if self[i] != b[i] then return false
		i += 1
	end
	return true
end
240533
end
241534

242535
private class BytesIterator
@@ -397,3 +690,20 @@ redef class NativeString
397690
return new Bytes(nns, len, len)
398691
end
399692
end
693+
694+
# Concatenates the elements of `arr`, inserting `sep` between consecutive elements
#
# Without `sep` (null), the elements are simply appended one after the other.
# An empty `arr` gives an empty result.
#
#     assert join_bytes(["String".to_bytes, "is".to_bytes, "string".to_bytes], ' '.ascii).hexdigest == "537472696E6720697320737472696E67"
fun join_bytes(arr: Array[Bytes], sep: nullable BytePattern): Bytes do
	if arr.is_empty then return new Bytes.empty
	var glue: BytePattern = sep or else new Bytes.empty

	# Size the result once: all the elements plus one separator per gap
	var total = 0
	for chunk in arr do total += chunk.length
	total += glue.pattern_length * (arr.length - 1)

	var ret = new Bytes.with_capacity(total)
	var first = true
	for chunk in arr do
		if first then
			first = false
		else
			glue.append_to ret
		end
		ret.append chunk
	end
	return ret
end

lib/core/text/abstract_text.nit

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -998,7 +998,7 @@ abstract class Text
998998
end
999999

10001000
# All kinds of array-based text representations.
1001-
private abstract class FlatText
1001+
abstract class FlatText
10021002
super Text
10031003

10041004
# Underlying C-String (`char*`)

lib/core/text/flat.nit

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,11 @@ end
3636

3737
redef class FlatText
3838

39-
fun first_byte: Int do return 0
39+
# First byte of the NativeString
40+
protected fun first_byte: Int do return 0
4041

41-
fun last_byte: Int do return _bytelen - 1
42+
# Last byte of the NativeString
43+
protected fun last_byte: Int do return _bytelen - 1
4244

4345
# Cache of the latest position (char) explored in the string
4446
var position: Int = 0

0 commit comments

Comments
 (0)