Implement trim on UTF8StringValue

neo4j · Nov 2, 2017 · c027b1e · c027b1e
1 parent 920c77e
commit c027b1e
Show file tree

Hide file tree

Showing 2 changed files with 173 additions and 36 deletions.
diff --git a/community/values/src/main/java/org/neo4j/values/storable/UTF8StringValue.java b/community/values/src/main/java/org/neo4j/values/storable/UTF8StringValue.java
@@ -23,15 +23,17 @@
 import java.util.Arrays;
 
 /*
-* Just as a normal StringValue but is backed by a byte array and does string
-* serialization lazily.
+ * Just as a normal StringValue but is backed by a byte array and does string
+ * serialization lazily.
  *
- * TODO in this implementation most operations will actually load the string
- * such as hashCode. These could be implemented using
- * the byte array directly in later optimizations
-*/
+ */
 public final class UTF8StringValue extends StringValue
 {
+    //0111 1111, used for removing HIGH BIT from byte
+    private static final int HIGH_BIT_MASK = 127;
+    //0100 0000, set in every non-continuation (lead) byte 11xx xxxx and clear in continuation bytes 10xx xxxx
+    private static final int NON_CONTINUATION_BIT_MASK = 64;
+
     private volatile String value;
     private final byte[] bytes;
     private final int offset;
@@ -110,20 +112,20 @@ public int length()
         return count;
     }
 
-    private static final int HIGH_BIT_MASK = 127;
-
     @Override
     public int computeHash()
     {
-        if ( bytes.length == 0 )
+        byte[] values = bytes;
+
+        if ( values.length == 0 || length == 0 )
         {
             return 0;
         }
 
         int hash = 1, i = offset, len = offset + length;
         while ( i < len )
         {
-            byte b = bytes[i];
+            byte b = values[i];
             //If high bit is zero (equivalent to the byte being positive in two's complement)
             //we are dealing with an ascii value and use a single byte for storing the value.
             if ( b >= 0 )
@@ -145,25 +147,9 @@ public int computeHash()
                 bytesNeeded++;
                 b = (byte) (b << 1);
             }
-            int codePoint;
-            switch ( bytesNeeded )
-            {
-            case 2:
-                codePoint = (b << 4) | (bytes[i + 1] & HIGH_BIT_MASK);
-                i += 2;
-                break;
-            case 3:
-                codePoint = (b << 9) | ((bytes[i + 1] & HIGH_BIT_MASK) << 6) | (bytes[i + 2] & HIGH_BIT_MASK);
-                i += 3;
-                break;
-            case 4:
-                codePoint = (b << 14) | ((bytes[i + 1] & HIGH_BIT_MASK) << 12) | ((bytes[i + 2] & HIGH_BIT_MASK) << 6)
-                            | (bytes[i + 3] & HIGH_BIT_MASK);
-                i += 4;
-                break;
-            default:
-                throw new IllegalArgumentException( "Malformed UTF8 value" );
-            }
+            int codePoint = codePoint( b, i, bytesNeeded );
+            i += bytesNeeded;
+
             hash = 31 * hash + codePoint;
         }
 
@@ -175,6 +161,7 @@ public TextValue substring( int start, int end )
     {
         assert start > 0;
         assert end > start && end < length();
+        byte[] values = bytes;
 
         int count = 0, byteStart = -1, byteEnd = -1, i = offset, len = offset + length;
         while ( i < len )
@@ -188,7 +175,7 @@ public TextValue substring( int start, int end )
                 byteEnd = i;
                 break;
             }
-            byte b = bytes[i];
+            byte b = values[i];
             //If high bit is zero (equivalent to the byte being positive in two's complement)
             //we are dealing with an ascii value and use a single byte for storing the value.
             if ( b >= 0 )
@@ -210,11 +197,135 @@ public TextValue substring( int start, int end )
 
         assert byteStart >= 0;
         assert byteEnd >= byteStart;
-        return Values.utf8Value( bytes, byteStart, byteEnd - byteStart );
+        return new UTF8StringValue( values, byteStart, byteEnd - byteStart );
+    }
+
+    /**
+     * Strips leading and trailing whitespace by working directly on the UTF-8 bytes.
+     *
+     * @return this instance when the backing array or the value is empty, otherwise a new value that is handed
+     * the same backing array together with the trimmed start index and byte length
+     */
+    @Override
+    public TextValue trim()
+    {
+        final byte[] backing = bytes;
+        final boolean nothingToTrim = backing.length == 0 || length == 0;
+        if ( nothingToTrim )
+        {
+            return this;
+        }
+
+        final int first = trimLeftIndex();
+        final int last = trimRightIndex();
+        //When everything is whitespace the right index ends up left of the left index, clamp to an empty value
+        final int trimmedLength = Math.max( last + 1 - first, 0 );
+        return new UTF8StringValue( backing, first, trimmedLength );
+    }
+
+    /**
+     * Scans forward from {@code offset} and returns the index, into the underlying byte array, of the first byte
+     * of the first code point that is not whitespace. Single-byte (ascii) values are treated as whitespace when
+     * they are {@code <= 32}, multi-byte code points are checked with {@link Character#isWhitespace(int)}.
+     * If the scan runs off the end of the value the index reached by then is returned.
+     */
+    private int trimLeftIndex()
+    {
+        final int end = offset + length;
+        int position = offset;
+        while ( position < end )
+        {
+            byte current = bytes[position];
+            if ( current >= 0 )
+            {
+                //High bit is zero (a non-negative byte in two's complement), i.e. a single-byte ascii value
+                if ( current > 32 )
+                {
+                    return position;
+                }
+                position++;
+            }
+            else
+            {
+                //Lead byte of a multi-byte sequence, one of
+                //110xxxxx 10xxxxxx
+                //1110xxxx 10xxxxxx 10xxxxxx
+                //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                //The number of leading one bits is the number of bytes in the sequence; shifting them out
+                //also leaves the byte in the form codePoint expects.
+                int width = 0;
+                do
+                {
+                    width++;
+                    current = (byte) (current << 1);
+                }
+                while ( current < 0 );
+
+                if ( !Character.isWhitespace( codePoint( current, position, width ) ) )
+                {
+                    return position;
+                }
+                position += width;
+            }
+        }
+        return position;
+    }
+
+    /**
+     * Scans backwards from the last byte of this value and returns the index, into the underlying byte array,
+     * of the last byte of the right-most code point that is not whitespace. Single-byte (ascii) values are
+     * treated as whitespace when they are {@code <= 32}, multi-byte code points are checked with
+     * {@link Character#isWhitespace(int)}.
+     *
+     * @return the index of the last byte to keep, or {@code offset - 1} if the value holds only whitespace
+     * @throws IllegalArgumentException if a multi-byte sequence is not two, three or four bytes long
+     */
+    private int trimRightIndex()
+    {
+        int index = offset + length - 1;
+        //Stop at offset rather than 0, bytes in front of offset do not belong to this value
+        while ( index >= offset )
+        {
+            byte b = bytes[index];
+            //If high bit is zero (equivalent to the byte being positive in two's complement)
+            //we are dealing with an ascii value and use a single byte for storing the value.
+            if ( b >= 0 )
+            {
+                if ( b > 32 )
+                {
+                    return index;
+                }
+                index--;
+                continue;
+            }
+
+            //We are on the last byte of one of these sequences
+            //Byte1    Byte2    Byte3    Byte4
+            //110xxxxx 10xxxxxx
+            //1110xxxx 10xxxxxx 10xxxxxx
+            //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            //Walk back over the continuation bytes (10xxxxxx) until we stand on the lead byte
+            int bytesNeeded = 1;
+            while ( (b & NON_CONTINUATION_BIT_MASK) == 0 )
+            {
+                bytesNeeded++;
+                b = bytes[--index];
+            }
+
+            //Shifting out the length marker bits gives the lead byte the form codePoint expects
+            int codePoint = codePoint( (byte) (b << bytesNeeded), index, bytesNeeded );
+            if ( !Character.isWhitespace( codePoint ) )
+            {
+                //index is on the lead byte, the code point ends bytesNeeded - 1 bytes further to the right
+                return index + bytesNeeded - 1;
+            }
+            //Whitespace, step off the lead byte so the next iteration starts on the preceding code point
+            index--;
+        }
+        return index;
     }
 
+
     /**
      * Returns the byte array backing this value. The array itself is returned, not a copy, and it is returned
      * whole, without applying {@code offset} or {@code length}.
      */
     public byte[] bytes()
     {
         return bytes;
     }
+
+    /**
+     * Assembles the code point of a multi-byte sequence that starts at index {@code i} of the underlying array.
+     *
+     * @param currentByte the lead byte of the sequence, already shifted left by {@code bytesNeeded} bits so
+     * that the length marker bits are gone (this is how the callers in this class pass it)
+     * @param i index of the lead byte in the underlying byte array
+     * @param bytesNeeded total number of bytes in the sequence, including the lead byte
+     * @return the bits of the lead byte combined with the low seven bits of each following byte
+     * @throws IllegalArgumentException if {@code bytesNeeded} is not 2, 3 or 4
+     */
+    private int codePoint( byte currentByte, int i, int bytesNeeded )
+    {
+        final byte[] data = bytes;
+        switch ( bytesNeeded )
+        {
+        case 2:
+            return (currentByte << 4) | (data[i + 1] & HIGH_BIT_MASK);
+        case 3:
+            return (currentByte << 9)
+                   | ((data[i + 1] & HIGH_BIT_MASK) << 6)
+                   | (data[i + 2] & HIGH_BIT_MASK);
+        case 4:
+            return (currentByte << 14)
+                   | ((data[i + 1] & HIGH_BIT_MASK) << 12)
+                   | ((data[i + 2] & HIGH_BIT_MASK) << 6)
+                   | (data[i + 3] & HIGH_BIT_MASK);
+        default:
+            throw new IllegalArgumentException( "Malformed UTF8 value" );
+        }
+    }
 }
diff --git a/community/values/src/test/java/org/neo4j/values/storable/UTF8StringValueTest.java b/community/values/src/test/java/org/neo4j/values/storable/UTF8StringValueTest.java
@@ -23,14 +23,16 @@
 
 import java.nio.charset.StandardCharsets;
 
+import static java.lang.String.format;
 import static org.hamcrest.CoreMatchers.equalTo;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.neo4j.values.storable.Values.stringValue;
 import static org.neo4j.values.storable.Values.utf8Value;
 
 public class UTF8StringValueTest
 {
-    private String[] strings = {"", "1337", " ", "普通话/普通話", "\uD83D\uDE21"};
+    private String[] strings = {"", "1337", " ", "普通话/普通話", "\uD83D\uDE21", " a b c ", "䤹᳽", "熨", "ۼ",
+            "ⲹ楡톜ഷۢ⼈늉₭샺ጚ砧攡跿家䯶鲏⬖돛犽ۼ"};
 
     @Test
     public void shouldHandleDifferentTypesOfStrings()
@@ -44,6 +46,29 @@ public void shouldHandleDifferentTypesOfStrings()
         }
     }
 
+    //Trimming the byte backed value must give the same result as trimming the value created from the String
+    @Test
+    public void shouldTrimDifferentTypesOfStrings()
+    {
+        for ( int i = 0; i < strings.length; i++ )
+        {
+            String candidate = strings[i];
+            TextValue expected = stringValue( candidate ).trim();
+            TextValue actual = utf8Value( candidate.getBytes( StandardCharsets.UTF_8 ) ).trim();
+            assertSame( expected, actual );
+        }
+    }
+
+    //Single-value variant of shouldTrimDifferentTypesOfStrings, useful for stepping through one case in isolation.
+    //NOTE(review): the name says nothing about what is verified and the value is also part of the strings
+    //array above, so this looks like a debugging leftover - consider renaming or removing.
+    @Test
+    public void shouldFoo()
+    {
+        //A CJK character (three bytes in UTF-8); the alternative in the trailing comment is an Arabic
+        //character (two bytes in UTF-8)
+        String string = "熨"; // alternative value: "ۼ"
+
+        TextValue stringValue = stringValue( string );
+        byte[] bytes = string.getBytes( StandardCharsets.UTF_8 );
+        TextValue utf8 = utf8Value( bytes );
+        assertSame( stringValue.trim(), utf8.trim() );
+    }
+
     @Test
     public void shouldHandleOffset()
     {
@@ -59,9 +84,10 @@ public void shouldHandleOffset()
 
     //Asserts that the two values agree on length, are equal in both directions and share the same hash code.
     //Every assertion carries a message naming the two values so a failure shows which pair disagreed.
     private void assertSame( TextValue lhs, TextValue rhs )
     {
-        assertThat( lhs.length(), equalTo( rhs.length() ) );
-        assertThat( lhs, equalTo( rhs ) );
-        assertThat( rhs, equalTo( lhs ) );
-        assertThat( lhs.hashCode(), equalTo( rhs.hashCode() ) );
+        assertThat( format( "%s.length != %s.length", lhs, rhs ), lhs.length(),
+                equalTo( rhs.length() ) );
+        assertThat( format( "%s != %s", lhs, rhs ), lhs, equalTo( rhs ) );
+        assertThat( format( "%s != %s", rhs, lhs ), rhs, equalTo( lhs ) );
+        assertThat( format( "%s.hashCode != %s.hashCode", lhs, rhs ), lhs.hashCode(), equalTo( rhs.hashCode() ) );
     }
 }