Skip to content

Commit

Permalink
revert to use String::compareTo
Browse files Browse the repository at this point in the history
  • Loading branch information
pontusmelke committed Sep 7, 2018
1 parent 7307115 commit 538750b
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 87 deletions.
Expand Up @@ -20,6 +20,7 @@
package org.neo4j.values.storable;


import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;

Expand All @@ -37,8 +38,11 @@ public class TextValueFuzzTest

private static final int ITERATIONS = 1000;

@Disabled(
"we have decided to stick with String::compareTo under the hood which doesn't respect code point order " +
"whenever the code point doesn't fit 16bits" )
@Test
void shouldCompareTo()
void shouldCompareToForAllValidStrings()
{
for ( int i = 0; i < ITERATIONS; i++ )
{
Expand All @@ -47,6 +51,16 @@ void shouldCompareTo()
}
}

@Test
void shouldCompareToForAllStringsInBasicMultilingualPlane()
{
for ( int i = 0; i < ITERATIONS; i++ )
{
assertConsistent( random.nextBasicMultilingualPlaneString(), random.nextBasicMultilingualPlaneString(),
( t1, t2 ) -> Math.signum( t1.compareTo( t2 ) ) );
}
}

@Test
void shouldAdd()
{
Expand Down
Expand Up @@ -30,7 +30,6 @@
import org.neo4j.values.virtual.ListValue;
import org.neo4j.values.virtual.VirtualValues;

import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.String.format;

public abstract class StringValue extends TextValue
Expand Down Expand Up @@ -169,94 +168,13 @@ public <T> T map( ValueMapper<T> mapper )
return mapper.mapString( this );
}

//NOTE: this doesn't respect code point order for code points that doesn't fit 16bits
@Override
public int compareTo( TextValue other )
{
/*
* The normal String::compareTo contrary to what documentation claims sort in code point order. It does not
* properly handle chars in the surrogate range. This leads to inconsistent sort-orders when mixing
* with UT8StringValue.
*
* This is based on https://ssl.icu-project.org/docs/papers/utf16_code_point_order.html. The basic idea
* is to first check for identical string prefixes and only when we find two different chars we perform
* a fix-up if necessary before comparing.
*/

if ( this == other )
{
return 0;
}

final String thisString = value();
final String thatString = other.stringValue();
final int l1 = thisString.length();
final int l2 = thatString.length();
char c1, c2;
int pos = 0;

//First compare identical substrings, here we need no fix-up
while ( true )
{
//if we are at the end any of the strings they are the same
if ( pos >= l1 || pos >= l2 )
{
return l1 - l2;
}
c1 = thisString.charAt( pos );
c2 = thatString.charAt( pos );
if ( c1 == c2 )
{
pos++;
}
else
{
break;
}
}

//We found c1, and c2 where c1 != c2, before comparing we need
//to perform fix-up if they are in surrogate range before comparing.
return orderCharsByCodePoint( c1, c2 );
}

/*
* In UTF-16 when a code point doesn't fit in a single char - two chars (also known as surrogate pairs) are used
* following the schema:
*
* - Subtract `0x010000` -> now we have a 20bit number in range ` 0x000000..0x0FFFFF`
* - Take the top ten bits and add `0xD800`. This number is used as the first char and is referred to as the high
* surrogate, it will be in the range ` 0xD800..0xDBFF`
* - Take the low ten bits and add ` 0xDC00`. This number will be the second char or the low surrogate, it will be
* in range `0xDC00..0xDFFF`.
* - Note that the higher surrogate range is lower that the low surrogate range, meaning that it is enough to
* to check if a character is bigger than `0xD800`(MIN_HIGH_SURROGATE) in order to check if the character is a
* supplementary character.
*
* This algorithm uses the following characteristics of UTF-16:
* - If any of the characters are not in surrogate range the characters are already ordered by code points.
* - If a char is in the surrogate range it needs a fixup in order to preserve order.
*/
private int orderCharsByCodePoint( char c1, char c2 )
{
if ( c1 >= MIN_HIGH_SURROGATE && c2 >= MIN_HIGH_SURROGATE )
{
return fixUp( c1 ) - fixUp( c2 );
}

return c1 - c2;
}

private char fixUp( char c1 )
{
if ( c1 >= '\ue000' )
{
c1 -= '\u0800';
}
else
{
c1 += '\u2000';
}
return c1;
String thisString = value();
String thatString = other.stringValue();
return thisString.compareTo( thatString );
}

static TextValue EMPTY = new StringValue()
Expand Down

0 comments on commit 538750b

Please sign in to comment.