Skip to content

Commit

Permalink
Ability to specify input encoding in import tool
Browse files Browse the repository at this point in the history
  • Loading branch information
tinwelint committed Mar 29, 2015
1 parent 533c5ef commit 6d29696
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
import org.neo4j.unsafe.impl.batchimport.staging.ExecutionMonitors;

import static java.lang.System.out;
import static java.nio.charset.Charset.defaultCharset;

import static org.neo4j.graphdb.factory.GraphDatabaseSettings.store_dir;
import static org.neo4j.helpers.Exceptions.launderedException;
Expand Down Expand Up @@ -140,7 +141,12 @@ enum Options
"<max number of bad entries>",
"Number of bad entries before the import is considered failed. This tolerance threshold is "
+ "about relationships refering to missing nodes. Format errors in input data are "
+ "still treated as errors" );
+ "still treated as errors" ),
INPUT_ENCODING( "input-encoding", null,
"<character set>",
"Character set that input data is encoded in. Provided value must be one out of the available "
+ "character sets in the JVM, as provided by Charset#availableCharsets(). "
+ "If no input encoding is provided, the default character set of the JVM will be used." );

private final String key;
private final Object defaultValue;
Expand Down Expand Up @@ -245,6 +251,7 @@ public static void main( String[] incomingArguments, boolean defaultSettingsSuit
Input input = null;
String badFileName;
int badTolerance;
Charset inputEncoding;
try
{
storeDir = args.interpretOption( Options.STORE_DIR.key(), Converters.<File>mandatory(),
Expand All @@ -255,12 +262,13 @@ public static void main( String[] incomingArguments, boolean defaultSettingsSuit
processors = args.getNumber( Options.PROCESSORS.key(), null );
IdType idType = args.interpretOption( Options.ID_TYPE.key(),
withDefault( (IdType)Options.ID_TYPE.defaultValue() ), TO_ID_TYPE );
badTolerance = args.getNumber( Options.BAD_TOLERANCE.key,
badTolerance = args.getNumber( Options.BAD_TOLERANCE.key(),
(Number) Options.BAD_TOLERANCE.defaultValue() ).intValue();
badFileName = args.get( Options.BAD.key );
badFileName = args.get( Options.BAD.key() );
inputEncoding = Charset.forName( args.get( Options.INPUT_ENCODING.key(), defaultCharset().name() ) );
input = new CsvInput(
nodeData( nodesFiles ), defaultFormatNodeFileHeader(),
relationshipData( relationshipsFiles ), defaultFormatRelationshipFileHeader(),
nodeData( inputEncoding, nodesFiles ), defaultFormatNodeFileHeader(),
relationshipData( inputEncoding, relationshipsFiles ), defaultFormatRelationshipFileHeader(),
idType, csvConfiguration( args, defaultSettingsSuitableForTests ),
Collectors.badRelationships( badTolerance ) );
}
Expand Down Expand Up @@ -379,19 +387,20 @@ public void uncaughtException( Thread t, Throwable e )
}

private static Iterable<DataFactory<InputRelationship>>
relationshipData( Collection<Option<File[]>> relationshipsFiles )
relationshipData( final Charset encoding, Collection<Option<File[]>> relationshipsFiles )
{
return new IterableWrapper<DataFactory<InputRelationship>,Option<File[]>>( relationshipsFiles )
{
@Override
protected DataFactory<InputRelationship> underlyingObjectToObject( Option<File[]> group )
{
return data( defaultRelationshipType( group.metadata() ), Charset.defaultCharset(), group.value() );
return data( defaultRelationshipType( group.metadata() ), encoding, group.value() );
}
};
}

private static Iterable<DataFactory<InputNode>> nodeData( Collection<Option<File[]>> nodesFiles )
private static Iterable<DataFactory<InputNode>> nodeData( final Charset encoding,
Collection<Option<File[]>> nodesFiles )
{
return new IterableWrapper<DataFactory<InputNode>,Option<File[]>>( nodesFiles )
{
Expand All @@ -401,7 +410,7 @@ protected DataFactory<InputNode> underlyingObjectToObject( Option<File[]> input
Function<InputNode,InputNode> decorator = input.metadata() != null
? additiveLabels( input.metadata().split( ":" ) )
: NO_NODE_DECORATOR;
return data( decorator, Charset.defaultCharset(), input.value() );
return data( decorator, encoding, input.value() );
}
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import org.junit.Test;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
Expand Down Expand Up @@ -87,7 +86,7 @@ public void shouldImportWithAsManyDefaultsAsAvailable() throws Exception
importTool(
"--into", dbRule.getStoreDir().getAbsolutePath(),
"--nodes", nodeData( true, config, nodeIds, alwaysTrue() ).getAbsolutePath(),
"--relationships", relationshipData( true, config, nodeIds, alwaysTrue() ).getAbsolutePath() );
"--relationships", relationshipData( true, config, nodeIds, alwaysTrue(), true ).getAbsolutePath() );

// THEN
verifyData();
Expand All @@ -110,7 +109,7 @@ public void shouldImportWithHeadersBeingInSeparateFiles() throws Exception
nodeData( false, config, nodeIds, alwaysTrue() ).getAbsolutePath(),
"--relationships",
relationshipHeader( config ).getAbsolutePath() + MULTI_FILE_DELIMITER +
relationshipData( false, config, nodeIds, alwaysTrue() ).getAbsolutePath() );
relationshipData( false, config, nodeIds, alwaysTrue(), true ).getAbsolutePath() );

// THEN
verifyData();
Expand All @@ -135,7 +134,7 @@ public void shouldImportSplitInputFiles() throws Exception
nodeData( false, config, nodeIds, lines( NODE_COUNT * 3 / 4, NODE_COUNT ) ).getAbsolutePath(),
"--relationships",
relationshipHeader( config ).getAbsolutePath() + MULTI_FILE_DELIMITER +
relationshipData( false, config, nodeIds, alwaysTrue() ).getAbsolutePath() );
relationshipData( false, config, nodeIds, alwaysTrue(), true ).getAbsolutePath() );

// THEN
verifyData();
Expand Down Expand Up @@ -427,7 +426,7 @@ public void shouldHandleAdditiveLabelsWithSpaces() throws Exception
"--into", dbRule.getStoreDir().getAbsolutePath(),
"--nodes:My First Label:My Other Label",
nodeData( true, config, nodeIds, alwaysTrue() ).getAbsolutePath(),
"--relationships", relationshipData( true, config, nodeIds, alwaysTrue() ).getAbsolutePath() );
"--relationships", relationshipData( true, config, nodeIds, alwaysTrue(), true ).getAbsolutePath() );

// THEN
verifyData( new Validator<Node>()
Expand All @@ -441,6 +440,26 @@ public void validate( Node node )
}, Validators.<Relationship>emptyValidator() );
}

/**
 * Runs the import tool against node and relationship files written in UTF-16, passing the
 * same character set through {@code --input-encoding}, and then checks the result with
 * {@code verifyData()}.
 */
@Test
public void shouldImportFromInputDataEncodedWithSpecificCharset() throws Exception
{
    // GIVEN
    List<String> ids = nodeIds();
    Configuration commas = Configuration.COMMAS;
    Charset utf16 = Charset.forName( "UTF-16" );

    // WHEN
    // Arguments are resolved in the same left-to-right order in which they are handed to the tool.
    String storeDirPath = dbRule.getStoreDir().getAbsolutePath();
    String encodingName = utf16.name();
    String nodesPath = nodeData( true, commas, ids, alwaysTrue(), utf16 ).getAbsolutePath();
    String relationshipsPath =
            relationshipData( true, commas, ids, alwaysTrue(), true, utf16 ).getAbsolutePath();
    importTool(
            "--into", storeDirPath,
            "--input-encoding", encodingName,
            "--nodes", nodesPath,
            "--relationships", relationshipsPath );

    // THEN
    verifyData();
}

protected void assertNodeHasLabels( Node node, String[] names )
{
for ( String name : names )
Expand Down Expand Up @@ -554,10 +573,16 @@ private String randomNodeId()
}

private File nodeData( boolean includeHeader, Configuration config, List<String> nodeIds,
PrimitiveIntPredicate linePredicate ) throws FileNotFoundException
PrimitiveIntPredicate linePredicate ) throws Exception
{
return nodeData( includeHeader, config, nodeIds, linePredicate, Charset.defaultCharset() );
}

private File nodeData( boolean includeHeader, Configuration config, List<String> nodeIds,
PrimitiveIntPredicate linePredicate, Charset encoding ) throws Exception
{
File file = file( fileName( "nodes.csv" ) );
try ( PrintStream writer = new PrintStream( file ) )
try ( PrintStream writer = writer( file, encoding ) )
{
if ( includeHeader )
{
Expand All @@ -568,15 +593,25 @@ private File nodeData( boolean includeHeader, Configuration config, List<String>
return file;
}

private File nodeHeader( Configuration config ) throws FileNotFoundException
/**
 * Opens a {@link PrintStream} on the given file that encodes written characters using the
 * supplied character set.
 *
 * @param file the file to write to.
 * @param encoding the character set used to encode everything printed to the stream.
 * @return a new stream which the caller is responsible for closing.
 * @throws Exception if the stream cannot be created for the file or character set.
 */
private PrintStream writer( File file, Charset encoding ) throws Exception
{
    // PrintStream takes the character set by name in this constructor.
    String charsetName = encoding.name();
    return new PrintStream( file, charsetName );
}

/**
 * Convenience overload producing a node header file without an id group.
 *
 * @param config the CSV configuration forwarded to the two-argument overload.
 * @return the file returned by {@code nodeHeader( config, null )}.
 */
private File nodeHeader( Configuration config ) throws Exception
{
    String noIdGroup = null;
    return nodeHeader( config, noIdGroup );
}

private File nodeHeader( Configuration config, String idGroup ) throws FileNotFoundException
private File nodeHeader( Configuration config, String idGroup ) throws Exception
{
return nodeHeader( config, idGroup, Charset.defaultCharset() );
}

private File nodeHeader( Configuration config, String idGroup, Charset encoding ) throws Exception
{
File file = file( fileName( "nodes-header.csv" ) );
try ( PrintStream writer = new PrintStream( file ) )
try ( PrintStream writer = writer( file, encoding ) )
{
writeNodeHeader( writer, config, idGroup );
}
Expand Down Expand Up @@ -636,23 +671,31 @@ private String randomName()
}

private File relationshipData( boolean includeHeader, Configuration config, List<String> nodeIds,
PrimitiveIntPredicate linePredicate ) throws FileNotFoundException
PrimitiveIntPredicate linePredicate, boolean specifyType ) throws Exception
{
return relationshipData( includeHeader, config, nodeIds, linePredicate, true );
return relationshipData( includeHeader, config, nodeIds, linePredicate, specifyType, Charset.defaultCharset() );
}

private File relationshipData( boolean includeHeader, Configuration config, List<String> nodeIds,
PrimitiveIntPredicate linePredicate, boolean specifyType ) throws FileNotFoundException
PrimitiveIntPredicate linePredicate, boolean specifyType, Charset encoding ) throws Exception
{
return relationshipData( includeHeader, config, randomRelationships( nodeIds ), linePredicate,
specifyType, encoding );
}

private File relationshipData( boolean includeHeader, Configuration config,
Iterator<RelationshipDataLine> data, PrimitiveIntPredicate linePredicate,
boolean specifyType ) throws Exception
{
return relationshipData( includeHeader, config, randomRelationships( nodeIds ), linePredicate, specifyType );
return relationshipData( includeHeader, config, data, linePredicate, specifyType, Charset.defaultCharset() );
}

private File relationshipData( boolean includeHeader, Configuration config,
Iterator<RelationshipDataLine> data, PrimitiveIntPredicate linePredicate,
boolean specifyType ) throws FileNotFoundException
boolean specifyType, Charset encoding ) throws Exception
{
File file = file( fileName( "relationships.csv" ) );
try ( PrintStream writer = new PrintStream( file ) )
try ( PrintStream writer = writer( file, encoding ) )
{
if ( includeHeader )
{
Expand All @@ -663,16 +706,27 @@ private File relationshipData( boolean includeHeader, Configuration config,
return file;
}

private File relationshipHeader( Configuration config ) throws FileNotFoundException
private File relationshipHeader( Configuration config ) throws Exception
{
return relationshipHeader( config, null, null, true );
return relationshipHeader( config, Charset.defaultCharset() );
}

/**
 * Convenience overload producing a relationship header file with no start/end id groups and
 * with the relationship type specified, written using the given character set.
 *
 * @param config the CSV configuration forwarded to the full overload.
 * @param encoding the character set forwarded to the full overload.
 * @return the file returned by the five-argument {@code relationshipHeader} overload.
 */
private File relationshipHeader( Configuration config, Charset encoding ) throws Exception
{
    String noStartIdGroup = null;
    String noEndIdGroup = null;
    boolean specifyType = true;
    return relationshipHeader( config, noStartIdGroup, noEndIdGroup, specifyType, encoding );
}

private File relationshipHeader( Configuration config, String startIdGroup, String endIdGroup, boolean specifyType )
throws FileNotFoundException
throws Exception
{
return relationshipHeader( config, startIdGroup, endIdGroup, specifyType, Charset.defaultCharset() );
}

private File relationshipHeader( Configuration config, String startIdGroup, String endIdGroup, boolean specifyType,
Charset encoding ) throws Exception
{
File file = file( fileName( "relationships-header.csv" ) );
try ( PrintStream writer = new PrintStream( file ) )
try ( PrintStream writer = writer( file, encoding ) )
{
writeRelationshipHeader( writer, config, startIdGroup, endIdGroup, specifyType );
}
Expand Down Expand Up @@ -724,11 +778,6 @@ public String toString()
}
}

private static RelationshipDataLine relationship( String startNodeId, String endNodeId )
{
return relationship( startNodeId, endNodeId, null );
}

private static RelationshipDataLine relationship( String startNodeId, String endNodeId, String type )
{
return relationship( startNodeId, endNodeId, type, null );
Expand Down

0 comments on commit 6d29696

Please sign in to comment.