Skip to content

Commit

Permalink
Implement trim on UTFStringValue
Browse files Browse the repository at this point in the history
  • Loading branch information
pontusmelke committed Nov 2, 2017
1 parent 920c77e commit c027b1e
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 36 deletions.
Expand Up @@ -23,15 +23,17 @@
import java.util.Arrays;

/*
 * Just like a normal StringValue, except that it is backed by a byte array
 * and does string serialization lazily.
 *
 * TODO in this implementation most operations, such as hashCode, will
 * actually load the string. These could be implemented using
 * the byte array directly in later optimizations.
 */
public final class UTF8StringValue extends StringValue
{
//0111 1111, used for removing HIGH BIT from byte
private static final int HIGH_BIT_MASK = 127;
//0100 0000, used for detecting non-continuation bytes, i.e. bytes that are not of the form 10xx xxxx
private static final int NON_CONTINUATION_BIT_MASK = 64;

private volatile String value;
private final byte[] bytes;
private final int offset;
Expand Down Expand Up @@ -110,20 +112,20 @@ public int length()
return count;
}

private static final int HIGH_BIT_MASK = 127;

@Override
public int computeHash()
{
if ( bytes.length == 0 )
byte[] values = bytes;

if ( values.length == 0 || length == 0 )
{
return 0;
}

int hash = 1, i = offset, len = offset + length;
while ( i < len )
{
byte b = bytes[i];
byte b = values[i];
//If high bit is zero (equivalent to the byte being positive in two's complement)
//we are dealing with an ascii value and use a single byte for storing the value.
if ( b >= 0 )
Expand All @@ -145,25 +147,9 @@ public int computeHash()
bytesNeeded++;
b = (byte) (b << 1);
}
int codePoint;
switch ( bytesNeeded )
{
case 2:
codePoint = (b << 4) | (bytes[i + 1] & HIGH_BIT_MASK);
i += 2;
break;
case 3:
codePoint = (b << 9) | ((bytes[i + 1] & HIGH_BIT_MASK) << 6) | (bytes[i + 2] & HIGH_BIT_MASK);
i += 3;
break;
case 4:
codePoint = (b << 14) | ((bytes[i + 1] & HIGH_BIT_MASK) << 12) | ((bytes[i + 2] & HIGH_BIT_MASK) << 6)
| (bytes[i + 3] & HIGH_BIT_MASK);
i += 4;
break;
default:
throw new IllegalArgumentException( "Malformed UTF8 value" );
}
int codePoint = codePoint( b, i, bytesNeeded );
i += bytesNeeded;

hash = 31 * hash + codePoint;
}

Expand All @@ -175,6 +161,7 @@ public TextValue substring( int start, int end )
{
assert start > 0;
assert end > start && end < length();
byte[] values = bytes;

int count = 0, byteStart = -1, byteEnd = -1, i = offset, len = offset + length;
while ( i < len )
Expand All @@ -188,7 +175,7 @@ public TextValue substring( int start, int end )
byteEnd = i;
break;
}
byte b = bytes[i];
byte b = values[i];
//If high bit is zero (equivalent to the byte being positive in two's complement)
//we are dealing with an ascii value and use a single byte for storing the value.
if ( b >= 0 )
Expand All @@ -210,11 +197,135 @@ public TextValue substring( int start, int end )

assert byteStart >= 0;
assert byteEnd >= byteStart;
return Values.utf8Value( bytes, byteStart, byteEnd - byteStart );
return new UTF8StringValue( values, byteStart, byteEnd - byteStart );
}

/**
 * Produces a value with leading and trailing whitespace removed.
 * <p>
 * An empty value is returned unchanged. Otherwise the result is a new {@code UTF8StringValue}
 * constructed with the same array reference and a narrowed start index and byte count; no copy
 * is made in this method. If the right boundary ends up in front of the left boundary the
 * resulting byte count is clamped to zero.
 *
 * @return this instance when it is empty, otherwise a new value covering the trimmed byte range
 */
@Override
public TextValue trim()
{
    byte[] backing = bytes;
    boolean nothingToTrim = backing.length == 0 || length == 0;
    if ( nothingToTrim )
    {
        return this;
    }

    int firstKept = trimLeftIndex();
    int lastKept = trimRightIndex();

    // lastKept is inclusive, hence the + 1 when converting to a byte count
    int keptBytes = lastKept + 1 - firstKept;
    if ( keptBytes < 0 )
    {
        keptBytes = 0;
    }
    return new UTF8StringValue( backing, firstKept, keptBytes );
}

/**
 * Scans forward from {@code offset} and returns the index, into the underlying byte array, of the
 * first byte of the first code point that is not considered whitespace.
 * <p>
 * Single-byte (ascii) values count as whitespace when they are {@code <= 32}; multi-byte code points
 * are checked with {@link Character#isWhitespace(int)}. If nothing but whitespace is found the
 * position where the scan stopped is returned.
 */
private int trimLeftIndex()
{
    final int end = offset + length;
    int position = offset;
    while ( position < end )
    {
        byte current = bytes[position];
        if ( current >= 0 )
        {
            //High bit is zero, i.e. a plain ascii value stored in a single byte
            if ( current > 32 )
            {
                return position;
            }
            position++;
        }
        else
        {
            //Lead byte of a multi-byte sequence, one of
            //110xxxxx 10xxxxxx
            //1110xxxx 10xxxxxx 10xxxxxx
            //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //The number of leading one-bits tells how many bytes the sequence occupies,
            //shift them out one by one while counting.
            int width = 0;
            for ( ; current < 0; current = (byte) (current << 1) )
            {
                width++;
            }

            if ( !Character.isWhitespace( codePoint( current, position, width ) ) )
            {
                return position;
            }
            position += width;
        }
    }
    return position;
}

/**
 * Scans backwards from the last byte of this value and returns the index, into the underlying byte
 * array, of the last byte of the right-most code point that is not whitespace.
 * <p>
 * Single-byte (ascii) values count as whitespace when they are {@code <= 32}; multi-byte code points
 * are checked with {@link Character#isWhitespace(int)}.
 *
 * @return the inclusive index of the last non-whitespace byte, or {@code offset - 1} when this value
 * consists of whitespace only
 * @throws IllegalArgumentException if a multi-byte sequence has no lead byte inside this value
 */
private int trimRightIndex()
{
    int index = offset + length - 1;
    //Stop at offset, bytes in front of it are not part of this value
    while ( index >= offset )
    {
        byte b = bytes[index];
        //If high bit is zero (equivalent to the byte being positive in two's complement)
        //we are dealing with an ascii value and use a single byte for storing the value.
        if ( b >= 0 )
        {
            if ( b > 32 )
            {
                return index;
            }
            index--;
            continue;
        }

        //We can now have one of three situations.
        //Byte1 Byte2 Byte3 Byte4
        //110xxxxx 10xxxxxx
        //1110xxxx 10xxxxxx 10xxxxxx
        //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        //We are standing on the last byte of the sequence, remember it and walk
        //backwards over the continuation bytes until the lead byte is found.
        int lastByteOfCodePoint = index;
        int bytesNeeded = 1;
        while ( (b & NON_CONTINUATION_BIT_MASK) == 0 )
        {
            if ( index <= offset )
            {
                //Ran out of bytes belonging to this value before finding a lead byte
                throw new IllegalArgumentException( "Malformed UTF8 value" );
            }
            bytesNeeded++;
            b = bytes[--index];
        }

        //Shifting out the length marker bits leaves the lead byte in the form codePoint expects
        int codePoint = codePoint( (byte) (b << bytesNeeded), index, bytesNeeded );
        if ( !Character.isWhitespace( codePoint ) )
        {
            return lastByteOfCodePoint;
        }
        //The whole sequence was whitespace, continue with the byte in front of its lead byte
        index--;
    }
    return index;
}


/**
 * Returns the byte array backing this value.
 * <p>
 * The array reference is handed out as-is, no copy is made here. Other methods of this class only
 * read the range that starts at {@code offset} and spans {@code length} bytes, so the array may
 * contain more than this value's own content.
 *
 * @return the underlying byte array
 */
public byte[] bytes()
{
return bytes;
}

/**
 * Assembles the code point of a multi-byte sequence stored in the underlying byte array.
 *
 * @param currentByte the lead byte of the sequence after its length marker bits have been shifted
 * out to the left, which is the form the callers in this class produce
 * @param i index of the lead byte in the underlying byte array, the following bytes are read from
 * {@code i + 1} and onwards
 * @param bytesNeeded total number of bytes in the sequence, must be 2, 3 or 4
 * @return the assembled code point
 * @throws IllegalArgumentException if {@code bytesNeeded} is not 2, 3 or 4
 */
private int codePoint( byte currentByte, int i, int bytesNeeded )
{
    byte[] values = bytes;
    switch ( bytesNeeded )
    {
    case 2:
    {
        int second = values[i + 1] & HIGH_BIT_MASK;
        return (currentByte << 4) | second;
    }
    case 3:
    {
        int second = values[i + 1] & HIGH_BIT_MASK;
        int third = values[i + 2] & HIGH_BIT_MASK;
        return (currentByte << 9) | (second << 6) | third;
    }
    case 4:
    {
        int second = values[i + 1] & HIGH_BIT_MASK;
        int third = values[i + 2] & HIGH_BIT_MASK;
        int fourth = values[i + 3] & HIGH_BIT_MASK;
        return (currentByte << 14) | (second << 12) | (third << 6) | fourth;
    }
    default:
        throw new IllegalArgumentException( "Malformed UTF8 value" );
    }
}
}
Expand Up @@ -23,14 +23,16 @@

import java.nio.charset.StandardCharsets;

import static java.lang.String.format;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.neo4j.values.storable.Values.stringValue;
import static org.neo4j.values.storable.Values.utf8Value;

public class UTF8StringValueTest
{
private String[] strings = {"", "1337", " ", "普通话/普通話", "\uD83D\uDE21"};
private String[] strings = {"", "1337", " ", "普通话/普通話", "\uD83D\uDE21", " a b c ", "䤹᳽", "熨", "ۼ",
"ⲹ楡톜ഷۢ⼈늉₭샺ጚ砧攡跿家䯶鲏⬖돛犽ۼ"};

@Test
public void shouldHandleDifferentTypesOfStrings()
Expand All @@ -44,6 +46,29 @@ public void shouldHandleDifferentTypesOfStrings()
}
}

@Test
public void shouldTrimDifferentTypesOfStrings()
{
    // Every sample must trim to the same result whether it is held as a String or as UTF-8 bytes
    for ( int i = 0; i < strings.length; i++ )
    {
        String sample = strings[i];
        TextValue fromString = stringValue( sample );
        TextValue fromBytes = utf8Value( sample.getBytes( StandardCharsets.UTF_8 ) );
        assertSame( fromString.trim(), fromBytes.trim() );
    }
}

@Test
public void shouldFoo()
{
    // Single-sample version of the trim comparison; "ۼ" was noted as an alternative input
    final String sample = "熨";

    TextValue fromString = stringValue( sample );
    TextValue fromBytes = utf8Value( sample.getBytes( StandardCharsets.UTF_8 ) );

    TextValue expected = fromString.trim();
    TextValue actual = fromBytes.trim();
    assertSame( expected, actual );
}

@Test
public void shouldHandleOffset()
{
Expand All @@ -59,9 +84,10 @@ public void shouldHandleOffset()

/**
 * Asserts that two text values are interchangeable: they report the same length, are equal to each
 * other in both directions and have the same hash code.
 * <p>
 * Each check is made once and carries a message naming the two values, so that a failure
 * shows which values were compared.
 *
 * @param lhs the first value
 * @param rhs the second value
 */
private void assertSame( TextValue lhs, TextValue rhs )
{
    assertThat( format( "%s.length != %s.length", lhs, rhs ), lhs.length(), equalTo( rhs.length() ) );
    // equals must hold in both directions
    assertThat( format( "%s != %s", lhs, rhs ), lhs, equalTo( rhs ) );
    assertThat( format( "%s != %s", rhs, lhs ), rhs, equalTo( lhs ) );
    // message arguments follow the same order as the compared values
    assertThat( format( "%s.hashCode != %s.hashCode", lhs, rhs ), lhs.hashCode(), equalTo( rhs.hashCode() ) );
}
}

0 comments on commit c027b1e

Please sign in to comment.