-
Notifications
You must be signed in to change notification settings - Fork 5
/
CustomUtf8Encoder.java
178 lines (162 loc) · 5.87 KB
/
CustomUtf8Encoder.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
package psy.lob.saw;
import java.nio.ByteBuffer;
import java.nio.charset.CoderResult;
import sun.nio.cs.Surrogate;
/**
 * Customized version of the JDK7 UTF-8 encoder targeting the use-case of
 * encoding whole strings that should fit into a byte buffer.
 * <p>
 * Instances keep mutable scratch state between calls (the surrogate parser and
 * the last source/destination positions), so a single instance must not be
 * used by several threads at once without external synchronization.
 *
 * @author nitsan
 */
public class CustomUtf8Encoder {
    // as opposed to the JDK version where this is allocated lazily if required
    private final Surrogate.Parser sgp = new Surrogate.Parser();
    // taking these off the stack seems to make it go faster
    private int lastSp;
    private int lastDp;

    /**
     * Encodes a string into the byte buffer using the UTF-8 encoding. Like the
     * JDK encoder this returns UNDERFLOW on success and a malformed-input
     * result or OVERFLOW otherwise, but unlike the JDK encoder it does not
     * allow resuming the operation and will not move the byte buffer position
     * should the string not fit in it.
     * <p>
     * On a malformed-input result the buffer position is advanced past the
     * bytes that were encoded before the offending char.
     *
     * @param src the string to encode
     * @param dst the buffer to encode into, starting at its current position
     * @return UNDERFLOW if the whole string was encoded, OVERFLOW if it does
     *         not fit between the position and the limit, or a malformed-input
     *         result for an unpaired surrogate
     * @throws java.nio.ReadOnlyBufferException if {@code dst} is read-only
     */
    public final CoderResult encodeString(String src, ByteBuffer dst) {
        if (dst.hasArray()) {
            return encodeStringToHeap(src, dst);
        }
        return encodeStringToDirect(src, dst);
    }

    /**
     * Encodes a string into a direct byte buffer by writing through its native
     * address. Same result contract as {@link #encodeString(String, ByteBuffer)}.
     *
     * @param src the string to encode
     * @param dst a writable direct buffer
     * @return UNDERFLOW on success, OVERFLOW or a malformed-input result otherwise
     * @throws java.nio.ReadOnlyBufferException if {@code dst} is read-only
     * @throws IllegalArgumentException if {@code dst} is not a direct buffer
     */
    public final CoderResult encodeStringToDirect(String src, ByteBuffer dst) {
        // hasArray() is also false for read-only heap buffers, so callers can
        // end up here with a buffer that must not (or cannot) be written
        // through a native address; reject those before touching memory.
        if (dst.isReadOnly()) {
            throw new java.nio.ReadOnlyBufferException();
        }
        if (!dst.isDirect()) {
            throw new IllegalArgumentException("dst must be a direct buffer");
        }
        int dp = dst.position();
        int dl = dst.limit();
        // in JDK7 offset is always 0, but earlier versions accommodated substrings
        // pointing back to the original array and having a separate offset and length.
        int spCurr = UnsafeString.getOffset(src);
        // the encode loop treats sl as an exclusive end index into the char
        // array, so it has to include the offset
        int sl = spCurr + src.length();
        // pluck the chars array out of the String, saving us an array copy
        CoderResult result = encode(UnsafeString.getChars(src), spCurr, sl,
                UnsafeDirectByteBuffer.getAddress(dst), dp, dl);
        // only move the position if we fit the whole thing in.
        if (!result.isOverflow()) {
            dst.position(lastDp);
        }
        return result;
    }

    /**
     * The parameter naming is from the JDK source and I kept it to make diffing easier.
     * The s stands for source, the d for destination. It actually grew on me as I played
     * with the code, but I agree longer names are more readable.
     *
     * @param sa source char array
     * @param spCurr the source position starting point
     * @param sl source limit (exclusive end index into {@code sa})
     * @param dAddress destination address (plucked out of the buffer using Unsafe)
     * @param dp destination position
     * @param dl destination limit
     * @return UNDERFLOW if successful, OVERFLOW or a malformed-input result otherwise
     */
    private final CoderResult encode(char[] sa, int spCurr, int sl, long dAddress, int dp,
            int dl) {
        lastSp = spCurr;
        // the fast loop may run until either the source or the destination is
        // exhausted; the bound is an absolute destination index, hence "dp +"
        int dlASCII = dp + Math.min(sl - lastSp, dl - dp);
        // handle ascii encoded strings in an optimised loop
        while (dp < dlASCII && sa[lastSp] < 128) {
            // TODO: could arguably skip this utility and compute the target address
            // directly...
            UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) sa[lastSp++]);
        }
        while (lastSp < sl) {
            int c = sa[lastSp];
            if (c < 128) {
                // 1 byte: 0xxxxxxx
                if (dp >= dl) {
                    return CoderResult.OVERFLOW;
                }
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) c);
            } else if (c < 2048) {
                // 2 bytes: 110xxxxx 10xxxxxx
                if (dl - dp < 2) {
                    return CoderResult.OVERFLOW;
                }
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0xC0 | (c >> 6)));
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0x80 | (c & 0x3F)));
            } else if (Surrogate.is(c)) {
                // 4 bytes built from a surrogate pair
                int uc = sgp.parse((char) c, sa, lastSp, sl);
                if (uc < 0) {
                    lastDp = dp;
                    return surrogateError();
                }
                if (dl - dp < 4) {
                    return CoderResult.OVERFLOW;
                }
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0xF0 | (uc >> 18)));
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0x80 | ((uc >> 12) & 0x3F)));
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0x80 | ((uc >> 6) & 0x3F)));
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0x80 | (uc & 0x3F)));
                // the pair consumed two chars; the second increment is below
                ++lastSp;
            } else {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                if (dl - dp < 3) {
                    return CoderResult.OVERFLOW;
                }
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0xE0 | (c >> 12)));
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0x80 | ((c >> 6) & 0x3F)));
                UnsafeDirectByteBuffer.putByte(dAddress, dp++, (byte) (0x80 | (c & 0x3F)));
            }
            ++lastSp;
        }
        lastDp = dp;
        return CoderResult.UNDERFLOW;
    }

    /**
     * Encodes a string into an array-backed byte buffer by writing straight
     * into its backing array. Same result contract as
     * {@link #encodeString(String, ByteBuffer)}; nothing is ever written at or
     * beyond the buffer's limit.
     *
     * @param src the string to encode
     * @param dst a writable buffer for which {@code hasArray()} is true
     * @return UNDERFLOW on success, OVERFLOW or a malformed-input result otherwise
     */
    public CoderResult encodeStringToHeap(String src, ByteBuffer dst) {
        int arrayOffset = dst.arrayOffset();
        // dp/dl are indices into the backing array, not buffer-relative
        int dp = arrayOffset + dst.position();
        int dl = arrayOffset + dst.limit();
        int spCurr = UnsafeString.getOffset(src);
        // exclusive end index into the char array, so it includes the offset
        int sl = spCurr + src.length();
        CoderResult result = encode(UnsafeString.getChars(src), spCurr, sl,
                dst.array(), dp, dl);
        // only move the position if we fit the whole thing in.
        if (!result.isOverflow()) {
            dst.position(lastDp - arrayOffset);
        }
        return result;
    }

    /**
     * Heap flavour of the encode loop; see the direct flavour for the naming.
     *
     * @param sa source char array
     * @param spCurr the source position starting point
     * @param sl source limit (exclusive end index into {@code sa})
     * @param da destination backing array
     * @param dp destination position (index into {@code da})
     * @param dl destination limit (index into {@code da})
     * @return UNDERFLOW if successful, OVERFLOW or a malformed-input result otherwise
     */
    private CoderResult encode(char[] sa, int spCurr, int sl, byte[] da,
            int dp, int dl) {
        lastSp = spCurr;
        int dlASCII = dp + Math.min(sl - lastSp, dl - dp);
        // handle ascii encoded strings in an optimised loop
        while (dp < dlASCII && sa[lastSp] < 128) {
            da[dp++] = (byte) sa[lastSp++];
        }
        // The limit is checked explicitly rather than counting on the JVM array
        // boundary checks: the backing array can be larger than the buffer's
        // limit (limit below capacity, or a slice), and writing up to the end
        // of the array would clobber bytes the buffer does not own.
        while (lastSp < sl) {
            int c = sa[lastSp];
            if (c < 128) {
                if (dp >= dl) {
                    return CoderResult.OVERFLOW;
                }
                da[dp++] = (byte) c;
            } else if (c < 2048) {
                if (dl - dp < 2) {
                    return CoderResult.OVERFLOW;
                }
                da[dp++] = (byte) (0xC0 | (c >> 6));
                da[dp++] = (byte) (0x80 | (c & 0x3F));
            } else if (Surrogate.is(c)) {
                int uc = sgp.parse((char) c, sa, lastSp, sl);
                if (uc < 0) {
                    lastDp = dp;
                    return surrogateError();
                }
                if (dl - dp < 4) {
                    return CoderResult.OVERFLOW;
                }
                da[dp++] = (byte) (0xF0 | (uc >> 18));
                da[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3F));
                da[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3F));
                da[dp++] = (byte) (0x80 | (uc & 0x3F));
                // the pair consumed two chars; the second increment is below
                ++lastSp;
            } else {
                if (dl - dp < 3) {
                    return CoderResult.OVERFLOW;
                }
                da[dp++] = (byte) (0xE0 | (c >> 12));
                da[dp++] = (byte) (0x80 | ((c >> 6) & 0x3F));
                da[dp++] = (byte) (0x80 | (c & 0x3F));
            }
            ++lastSp;
        }
        lastDp = dp;
        return CoderResult.UNDERFLOW;
    }

    /**
     * Translates the surrogate parser's failure into this encoder's result.
     * The JDK parser reports a high surrogate sitting at the very end of the
     * input as UNDERFLOW ("more input needed"). This encoder never resumes and
     * uses UNDERFLOW to mean success, so that case is reported as one
     * malformed char instead of silently truncating the string.
     */
    private CoderResult surrogateError() {
        CoderResult error = sgp.error();
        return error.isUnderflow() ? CoderResult.malformedForLength(1) : error;
    }
}