Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
More space-efficient data structures for detecting duplicates in IdMa…
…pper Big import data sets may have a large number of collisions (accidental or actual duplicates). Detecting duplicate input ids within the same group was previously done using a combination of maps, although that could quickly run out of heap memory. This commit introduces another way of doing this detection. Basically it works by copying the subset of collisions into a new cache (a NumberArray, so it can live off-heap) with its own tracker cache associated with it. This pair of arrays is then sorted with ParallelSort, just like the whole data set was sorted just previously. Given the now-sorted tracker cache over the collisions and the kept input ids for these collisions, all potential duplicates are next to each other and can be compared in isolation. co-author: @alexaverbuch
- Loading branch information
Showing
7 changed files
with
368 additions
and
106 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
303 changes: 214 additions & 89 deletions
303
.../main/java/org/neo4j/unsafe/impl/batchimport/cache/idmapping/string/EncodingIdMapper.java
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
62 changes: 62 additions & 0 deletions
62
...main/java/org/neo4j/unsafe/impl/batchimport/cache/idmapping/string/SourceInformation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
/** | ||
* Copyright (c) 2002-2015 "Neo Technology," | ||
* Network Engine for Objects in Lund AB [http://neotechnology.com] | ||
* | ||
* This file is part of Neo4j. | ||
* | ||
* Neo4j is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
*/ | ||
package org.neo4j.unsafe.impl.batchimport.cache.idmapping.string; | ||
|
||
import java.util.List; | ||
|
||
import org.neo4j.csv.reader.SourceTraceability; | ||
|
||
/**
 * Codec for a "source information" value packed into a single {@code long}: a source id
 * (effectively an id referring to a file name or similar, i.e.
 * {@link SourceTraceability#sourceDescription()}) together with a line number.
 *
 * Bit layout of the packed value: the upper 16 bits hold the source id and the
 * lower 48 bits hold the line number.
 *
 * An instance doubles as a mutable holder for the most recently decoded value,
 * see {@link #decode(long)}.
 */
class SourceInformation implements Cloneable
{
    /** Number of low-order bits reserved for the line number. */
    private static final int LINE_NUMBER_BITS = 48;

    /** Mask selecting the line number part (lower 48 bits) of a packed value. */
    static final long LINE_NUMBER_MASK = (1L << LINE_NUMBER_BITS) - 1;

    /** Largest source id that fits in the remaining upper 16 bits. */
    private static final int MAX_SOURCE_ID = 0xFFFF;

    int sourceId;
    long lineNumber;

    /**
     * Unpacks the given value into the {@link #sourceId} and {@link #lineNumber} fields
     * of this instance, overwriting whatever was there before.
     *
     * @param sourceInformation packed value as produced by {@link #encodeSourceInformation(int, long)}.
     * @return this instance, now carrying the decoded values.
     */
    SourceInformation decode( long sourceInformation )
    {
        this.lineNumber = sourceInformation & LINE_NUMBER_MASK;
        // Unsigned shift: the top bit belongs to the source id and must not be treated as a sign
        this.sourceId = (int) (sourceInformation >>> LINE_NUMBER_BITS);
        return this;
    }

    /**
     * Packs a source id and a line number into one {@code long}.
     *
     * @param sourceId id of the source, must be within {@code 0..0xFFFF}.
     * @param lineNumber line number within that source, must fit in 48 bits.
     * @return the packed value.
     * @throws IllegalArgumentException if either argument is outside its range.
     */
    static long encodeSourceInformation( int sourceId, long lineNumber )
    {
        boolean sourceIdOutOfRange = sourceId < 0 || sourceId > MAX_SOURCE_ID;
        if ( sourceIdOutOfRange )
        {
            throw new IllegalArgumentException( "Collisions in too many sources (currently at " + sourceId + ")" );
        }

        boolean lineNumberOutOfRange = (lineNumber >>> LINE_NUMBER_BITS) != 0;
        if ( lineNumberOutOfRange )
        {
            throw new IllegalArgumentException( "Collision in source with too many lines (" + lineNumber + ")" );
        }

        long sourcePart = ((long) sourceId) << LINE_NUMBER_BITS;
        return sourcePart | lineNumber;
    }

    /**
     * Renders the currently held values as {@code <source description>:<line number>}.
     *
     * @param sourceDescriptions descriptions indexed by source id.
     * @return human-readable description of this source information.
     */
    public String describe( List<String> sourceDescriptions )
    {
        String description = sourceDescriptions.get( sourceId );
        return description + ":" + lineNumber;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
.../java/org/neo4j/unsafe/impl/batchimport/cache/idmapping/string/SourceInformationTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/** | ||
* Copyright (c) 2002-2015 "Neo Technology," | ||
* Network Engine for Objects in Lund AB [http://neotechnology.com] | ||
* | ||
* This file is part of Neo4j. | ||
* | ||
* Neo4j is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
*/ | ||
package org.neo4j.unsafe.impl.batchimport.cache.idmapping.string; | ||
|
||
import org.junit.Test; | ||
|
||
import java.util.Random; | ||
import java.util.concurrent.ThreadLocalRandom; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
import static java.lang.Math.abs; | ||
import static java.lang.String.format; | ||
|
||
public class SourceInformationTest | ||
{ | ||
@Test | ||
public void shouldEncodeAndDecodeInformation() throws Exception | ||
{ | ||
// GIVEN | ||
SourceInformation codec = new SourceInformation(); | ||
Random random = ThreadLocalRandom.current(); | ||
|
||
// WHEN/THEN | ||
for ( int i = 0; i < 100; i++ ) | ||
{ | ||
int sourceId = random.nextInt( 0xFFFF + 1 ); | ||
long lineNumber = abs( random.nextLong() ) & SourceInformation.LINE_NUMBER_MASK; | ||
|
||
long encoded = SourceInformation.encodeSourceInformation( sourceId, lineNumber ); | ||
codec.decode( encoded ); | ||
|
||
String hint = format( "sourceId:%d, lineNumber:%d --> %d --> sourceId:%d, lineNumber:%d", | ||
sourceId, lineNumber, encoded, codec.sourceId, codec.lineNumber ); | ||
assertEquals( hint, sourceId, codec.sourceId ); | ||
assertEquals( hint, lineNumber, codec.lineNumber ); | ||
} | ||
} | ||
} |