Skip to content

Commit ac7f21a

Browse files
committed
Implement line reading in streaming decoder on JVM.
1 parent f9f6667 commit ac7f21a

File tree

3 files changed

+160
-38
lines changed

3 files changed

+160
-38
lines changed

src/vm/jvm/runtime/org/perl6/nqp/runtime/Ops.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4441,6 +4441,25 @@ public static String decodertakeallchars(SixModelObject decoder, ThreadContext t
44414441
throw ExceptionHandling.dieInternal(tc, "decodertakeallchars requires an instance with the Decoder REPR");
44424442
}
44434443

4444+
public static String decodertakeline(SixModelObject decoder, long chomp, long eof,
4445+
ThreadContext tc) {
4446+
if (decoder instanceof DecoderInstance)
4447+
return ((DecoderInstance)decoder).takeLine(tc, chomp != 0, eof != 0);
4448+
else
4449+
throw ExceptionHandling.dieInternal(tc, "decodertakeline requires an instance with the Decoder REPR");
4450+
}
4451+
4452+
public static SixModelObject decodersetlineseps(SixModelObject decoder, SixModelObject seps,
4453+
ThreadContext tc) {
4454+
if (decoder instanceof DecoderInstance) {
4455+
((DecoderInstance)decoder).setLineSeps(tc, seps);
4456+
return decoder;
4457+
}
4458+
else {
4459+
throw ExceptionHandling.dieInternal(tc, "decodersetlineseps requires an instance with the Decoder REPR");
4460+
}
4461+
}
4462+
44444463
private static final int CCLASS_ANY = 65535;
44454464
private static final int CCLASS_UPPERCASE = 1;
44464465
private static final int CCLASS_LOWERCASE = 2;

src/vm/jvm/runtime/org/perl6/nqp/sixmodel/reprs/DecoderInstance.java

Lines changed: 112 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,37 @@
1010
import org.perl6.nqp.runtime.ExceptionHandling;
1111
import org.perl6.nqp.runtime.ThreadContext;
1212
import org.perl6.nqp.sixmodel.SixModelObject;
13+
import org.perl6.nqp.sixmodel.StorageSpec;
1314

1415
public class DecoderInstance extends SixModelObject {
1516
private CharsetDecoder decoder;
1617
private List<ByteBuffer> toDecode;
1718
private List<CharBuffer> decoded;
19+
private List<String> lineSeps;
1820

1921
public void configure(ThreadContext tc, String encoding, SixModelObject config) {
20-
if (decoder == null)
22+
if (decoder == null) {
2123
decoder = Charset.forName(encoding).newDecoder();
22-
else
24+
lineSeps = new ArrayList<String>();
25+
lineSeps.add("\n");
26+
lineSeps.add("\r\n");
27+
}
28+
else {
2329
throw ExceptionHandling.dieInternal(tc, "Decoder already configured");
30+
}
31+
}
32+
33+
public void setLineSeps(ThreadContext tc, SixModelObject seps) {
34+
final int prim = seps.st.REPR.get_value_storage_spec(tc, seps.st).boxed_primitive;
35+
if (prim != StorageSpec.BP_STR)
36+
ExceptionHandling.dieInternal(tc,
37+
"Line separators must be provided as an array of native strings");
38+
lineSeps.clear();
39+
long numSeps = seps.elems(tc);
40+
for (long i = 0; i < numSeps; i++) {
41+
seps.at_pos_native(tc, i);
42+
lineSeps.add(tc.native_s);
43+
}
2444
}
2545

2646
public void addBytes(ThreadContext tc, ByteBuffer bytes) {
@@ -37,7 +57,7 @@ public String takeChars(ThreadContext tc, long chars) {
3757
return "";
3858

3959
CharBuffer target = CharBuffer.allocate((int)chars + 1);
40-
eatDecodedChars(target);
60+
eatAllDecodedChars(target);
4161
if (target.position() != chars)
4262
eatUndecodedBytes(target, false);
4363

@@ -68,7 +88,7 @@ public String takeAvailableChars(ThreadContext tc) {
6888

6989
int maxChars = availableDecodedChars() + availableUndecodedBytes();
7090
CharBuffer target = CharBuffer.allocate(maxChars);
71-
eatDecodedChars(target);
91+
eatAllDecodedChars(target);
7292
eatUndecodedBytes(target, true);
7393

7494
String normalized = Normalizer.normalize(
@@ -86,7 +106,7 @@ public String takeAllChars(ThreadContext tc) {
86106
ensureConfigured(tc);
87107
int maxChars = availableDecodedChars() + availableUndecodedBytes();
88108
CharBuffer target = CharBuffer.allocate(maxChars);
89-
eatDecodedChars(target);
109+
eatAllDecodedChars(target);
90110
if (toDecode != null) {
91111
if (toDecode.size() == 0)
92112
toDecode.add(ByteBuffer.allocate(0));
@@ -97,6 +117,72 @@ public String takeAllChars(ThreadContext tc) {
97117
return Normalizer.normalize(decodedBuffer(target), Normalizer.Form.NFC);
98118
}
99119

120+
public String takeLine(ThreadContext tc, boolean chomp, boolean eof) {
121+
ensureConfigured(tc);
122+
while (true) {
123+
/* See if we can find the separator in any of the decoded chars. */
124+
int charsToTake = 0;
125+
for (int i = 0; i < (decoded == null ? 0 : decoded.size()); i++) {
126+
CharBuffer search = decoded.get(i);
127+
for (int j = 0; j < search.remaining(); j++) {
128+
char c = search.charAt(j);
129+
for (int k = 0; k < lineSeps.size(); k++) {
130+
String sep = lineSeps.get(k);
131+
if (sep.charAt(0) == c) {
132+
if (sep.length() == 1 || sepMatchAt(i, j, sep)) {
133+
return takeCharsSkipChars(
134+
chomp ? charsToTake : charsToTake + sep.length(),
135+
chomp ? sep.length() : 0);
136+
}
137+
}
138+
}
139+
charsToTake++;
140+
}
141+
}
142+
143+
/* If there are no more buffers to decode then we're done. */
144+
if (toDecode == null || toDecode.size() == 0)
145+
break;
146+
147+
/* Otherwise decode one of them. */
148+
ByteBuffer decodee = toDecode.get(0);
149+
CharBuffer target = CharBuffer.allocate(decodee.limit());
150+
decoder.decode(decodee, target, eof && toDecode.size() == 1);
151+
target.rewind();
152+
if (decoded == null)
153+
decoded = new ArrayList<CharBuffer>();
154+
decoded.add(target);
155+
toDecode.remove(0);
156+
}
157+
158+
return eof ? takeAllChars(tc) : null;
159+
}
160+
161+
private boolean sepMatchAt(int decStart, int charStart, String sep) {
162+
int sepIndex = 0;
163+
boolean firstBuffer = true;
164+
for (int i = decStart; i < decoded.size(); i++) {
165+
CharBuffer search = decoded.get(i);
166+
for (int j = firstBuffer ? charStart : 0; j < search.remaining(); j++) {
167+
if (search.charAt(j) != sep.charAt(sepIndex++))
168+
return false;
169+
if (sepIndex == sep.length())
170+
return true;
171+
}
172+
firstBuffer = false;
173+
}
174+
return false;
175+
}
176+
177+
private String takeCharsSkipChars(int take, int skip) {
178+
CharBuffer target = CharBuffer.allocate(take);
179+
eatDecodedChars(target, take);
180+
if (skip > 0)
181+
eatDecodedChars(CharBuffer.allocate(skip), skip);
182+
target.rewind();
183+
return Normalizer.normalize(target, Normalizer.Form.NFC);
184+
}
185+
100186
private int availableDecodedChars() {
101187
int available = 0;
102188
if (decoded != null)
@@ -113,14 +199,32 @@ private int availableUndecodedBytes() {
113199
return available;
114200
}
115201

116-
private void eatDecodedChars(CharBuffer target) {
202+
private void eatAllDecodedChars(CharBuffer target) {
117203
if (decoded != null) {
118-
for (int i = 0; i < decoded.size(); i++)
204+
for (int i = 0; i < decoded.size(); i++) {
119205
target.append(decoded.get(i));
206+
}
120207
decoded.clear();
121208
}
122209
}
123210

211+
private void eatDecodedChars(CharBuffer target, int n) {
212+
int remaining = n;
213+
while (remaining > 0 && decoded.size() > 0) {
214+
CharBuffer source = decoded.get(0);
215+
if (source.remaining() <= remaining) {
216+
target.append(source);
217+
remaining -= source.remaining();
218+
decoded.remove(0);
219+
}
220+
else {
221+
target.append(source.subSequence(0, remaining));
222+
decoded.set(0, source.subSequence(remaining, source.remaining()));
223+
remaining = 0;
224+
}
225+
}
226+
}
227+
124228
private void eatUndecodedBytes(CharBuffer target, boolean eof) {
125229
if (toDecode != null) {
126230
while (toDecode.size() > 0) {
@@ -156,6 +260,6 @@ public long isEmpty(ThreadContext tc) {
156260

157261
private void ensureConfigured(ThreadContext tc) {
158262
if (decoder == null)
159-
throw ExceptionHandling.dieInternal(tc, "Docder not yet configured");
263+
throw ExceptionHandling.dieInternal(tc, "Decoder not yet configured");
160264
}
161265
}

t/jvm/05-decoder.t

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -76,35 +76,35 @@ nqp::composetype($buf_type, nqp::hash('array', nqp::hash('type', uint8)));
7676
ok(nqp::decoderempty($dec), 'Empty after taking all chars');
7777
}
7878

79-
#{
80-
# my $testbuf1 := nqp::encode("line 1\nli", 'utf8', nqp::create($buf_type));
81-
# my $testbuf2 := nqp::encode("ne 2\nline 3", 'utf8', nqp::create($buf_type));
82-
# my $dec := nqp::create(VMDecoder);
83-
# nqp::decoderconfigure($dec, 'utf8', nqp::hash());
84-
# nqp::decoderaddbytes($dec, $testbuf1);
85-
# ok(nqp::decodertakeline($dec, 1, 0) eq 'line 1', 'read 1 line, chomped');
86-
# ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now');
87-
# nqp::decoderaddbytes($dec, $testbuf2);
88-
# ok(nqp::decodertakeline($dec, 0, 0) eq "line 2\n", 'read line 2, not chomped');
89-
# ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now');
90-
# ok(nqp::decodertakeline($dec, 1, 1) eq "line 3", 'with incomplete flag, read final line');
91-
#}
92-
#
93-
#{
94-
# my $testbuf1 := nqp::encode("line 1AAli", 'utf8', nqp::create($buf_type));
95-
# my $testbuf2 := nqp::encode("ne 2BBline 3", 'utf8', nqp::create($buf_type));
96-
# my $dec := nqp::create(VMDecoder);
97-
# nqp::decoderconfigure($dec, 'utf8', nqp::hash());
98-
# nqp::decodersetlineseps($dec, nqp::list_s('AA', 'BB'));
99-
# nqp::decoderaddbytes($dec, $testbuf1);
100-
# ok(nqp::decodertakeline($dec, 1, 0) eq 'line 1', 'read 1 line, chomped (custom seps)');
101-
# ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now (custom seps)');
102-
# nqp::decoderaddbytes($dec, $testbuf2);
103-
# ok(nqp::decodertakeline($dec, 0, 0) eq "line 2BB", 'read line 2, not chomped (custom seps)');
104-
# ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now (custom seps)');
105-
# ok(nqp::decodertakeline($dec, 1, 1) eq "line 3", 'with incomplete flag, read final line (custom seps)');
106-
#}
107-
#
79+
{
80+
my $testbuf1 := nqp::encode("line 1\nli", 'utf8', nqp::create($buf_type));
81+
my $testbuf2 := nqp::encode("ne 2\nline 3", 'utf8', nqp::create($buf_type));
82+
my $dec := nqp::create(VMDecoder);
83+
nqp::decoderconfigure($dec, 'utf8', nqp::hash());
84+
nqp::decoderaddbytes($dec, $testbuf1);
85+
ok(nqp::decodertakeline($dec, 1, 0) eq 'line 1', 'read 1 line, chomped');
86+
ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now');
87+
nqp::decoderaddbytes($dec, $testbuf2);
88+
ok(nqp::decodertakeline($dec, 0, 0) eq "line 2\n", 'read line 2, not chomped');
89+
ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now');
90+
ok(nqp::decodertakeline($dec, 1, 1) eq "line 3", 'with incomplete flag, read final line');
91+
}
92+
93+
{
94+
my $testbuf1 := nqp::encode("line 1AAli", 'utf8', nqp::create($buf_type));
95+
my $testbuf2 := nqp::encode("ne 2BBline 3", 'utf8', nqp::create($buf_type));
96+
my $dec := nqp::create(VMDecoder);
97+
nqp::decoderconfigure($dec, 'utf8', nqp::hash());
98+
nqp::decodersetlineseps($dec, nqp::list_s('AA', 'BB'));
99+
nqp::decoderaddbytes($dec, $testbuf1);
100+
ok(nqp::decodertakeline($dec, 1, 0) eq 'line 1', 'read 1 line, chomped (custom seps)');
101+
ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now (custom seps)');
102+
nqp::decoderaddbytes($dec, $testbuf2);
103+
ok(nqp::decodertakeline($dec, 0, 0) eq "line 2BB", 'read line 2, not chomped (custom seps)');
104+
ok(nqp::isnull_s(nqp::decodertakeline($dec, 1, 0)), 'cannot lead a complete line now (custom seps)');
105+
ok(nqp::decodertakeline($dec, 1, 1) eq "line 3", 'with incomplete flag, read final line (custom seps)');
106+
}
107+
108108
#{
109109
# my $testbuf1 := nqp::encode("над\nп", 'utf8', nqp::create($buf_type));
110110
# my $testbuf2 := nqp::encode('од', 'utf8', nqp::create($buf_type));
@@ -127,4 +127,3 @@ nqp::composetype($buf_type, nqp::hash('array', nqp::hash('type', uint8)));
127127
# ok(nqp::decoderbytesavailable($dec) == 0, 'Now no bytes available');
128128
# ok(nqp::decoderempty($dec), 'And so the decoder is empty');
129129
#}
130-
#

0 commit comments

Comments
 (0)