Skip to content

Commit

Permalink
QuickImport utility
Browse files Browse the repository at this point in the history
for just getting a dataset of size X imported as quickly as possible. Uses
CsvDataGenerator for generating very crude, random data of varying sizes.
Focuses on size more than layout of the data.

QuickImport uses CsvDataGenerator as Input to BatchImporter for
short-circuiting the data generator and importer.

Made refactorings around InputEntityDeserializer (CharSeeker->InputEntity)
to allow for composition and reuse of code for this purpose.
  • Loading branch information
tinwelint committed Feb 24, 2015
1 parent 46ec0e6 commit 9725329
Show file tree
Hide file tree
Showing 13 changed files with 845 additions and 260 deletions.
Expand Up @@ -27,35 +27,54 @@
import java.util.Iterator;
import java.util.Random;

import org.neo4j.csv.reader.Extractor;
import org.neo4j.csv.reader.Extractors;
import org.neo4j.helpers.Args;
import org.neo4j.helpers.collection.PrefetchingIterator;
import org.neo4j.helpers.progress.ProgressListener;
import org.neo4j.unsafe.impl.batchimport.InputIterator;
import org.neo4j.unsafe.impl.batchimport.input.csv.Configuration;
import org.neo4j.unsafe.impl.batchimport.input.csv.Deserialization;
import org.neo4j.unsafe.impl.batchimport.input.csv.Header;
import org.neo4j.unsafe.impl.batchimport.input.csv.Header.Entry;
import org.neo4j.unsafe.impl.batchimport.input.csv.IdType;
import org.neo4j.unsafe.impl.batchimport.input.csv.Type;

import static java.lang.System.currentTimeMillis;

import static org.neo4j.helpers.progress.ProgressMonitorFactory.textual;

/**
* Utility for generating a nodes.csv and relationships.csv, with random data structured according
* to supplied headers. Mostly for testing and trying out the batch importer tool.
*/
public class CsvDataGenerator
public class CsvDataGenerator<NODEFORMAT,RELFORMAT>
{
private final Random random = new Random();
private int highNodeId;
private final long nodesSeed, relationshipsSeed;
private final Header nodeHeader;
private final Header relationshipHeader;
private final Configuration config;

public CsvDataGenerator( Header nodeHeader, Header relationshipHeader, Configuration config )
private final long nodes;
private final long relationships;
private final Deserialization<NODEFORMAT> nodeDeserialization;
private final Deserialization<RELFORMAT> relDeserialization;
private final int numberOfLabels;
private final int numberOfRelationshipTypes;

public CsvDataGenerator( Header nodeHeader, Header relationshipHeader, Configuration config,
long nodes, long relationships, Deserialization<NODEFORMAT> nodeDeserialization,
Deserialization<RELFORMAT> relDeserialization,
int numberOfLabels, int numberOfRelationshipTypes )
{
this.nodeHeader = nodeHeader;
this.relationshipHeader = relationshipHeader;
this.config = config;
this.nodes = nodes;
this.relationships = relationships;
this.nodeDeserialization = nodeDeserialization;
this.relDeserialization = relDeserialization;
this.numberOfLabels = numberOfLabels;
this.numberOfRelationshipTypes = numberOfRelationshipTypes;
this.nodesSeed = currentTimeMillis();
this.relationshipsSeed = nodesSeed+1;
}

public String serializeNodeHeader()
Expand Down Expand Up @@ -99,149 +118,82 @@ public String serializeRelationshipHeader()
return serializeHeader( relationshipHeader );
}

public Iterator<String> pullNodeData()
{
return pullIterator( nodeHeader );
}

private Iterator<String> pullIterator( final Header header )
{
return new PrefetchingIterator<String>()
{
@Override
protected String fetchNextOrNull()
{
return serializeDataLine( header );
}
};
}

private String serializeDataLine( Header header )
{
StringBuilder builder = new StringBuilder();
for ( Entry entry : header.entries() )
{
if ( builder.length() > 0 )
{
builder.append( config.delimiter() );
}
serializeDataEntry( builder, entry );
}
return builder.toString();
}

private void serializeDataEntry( StringBuilder builder, Entry entry )
{
switch ( entry.type() )
{
case ID:
builder.append( highNodeId++ );
break;
case PROPERTY:
randomValue( builder, entry.extractor() );
break;
case LABEL:
randomLabels( builder, config.arrayDelimiter() );
break;
case START_ID: case END_ID:
builder.append( random.nextInt( highNodeId ) );
break;
case TYPE:
builder.append( "TYPE_" ).append( random.nextInt( 4 ) );
break;
default:
return;
}
}

private void randomValue( StringBuilder builder, Extractor<?> extractor )
public InputIterator<NODEFORMAT> nodeData()
{
// TODO crude way of determining value type
String type = extractor.toString();
if ( type.equals( "String" ) )
{
randomString( builder );
}
else if ( type.equals( "long" ) )
{
builder.append( random.nextInt( Integer.MAX_VALUE ) );
}
else if ( type.equals( "int" ) )
{
builder.append( random.nextInt( 20 ) );
}
else
{
throw new IllegalArgumentException( "" + extractor );
}
return new RandomDataIterator<>( nodeHeader, nodes, new Random( nodesSeed ), nodeDeserialization, nodes,
numberOfLabels, numberOfRelationshipTypes );
}

public Iterator<String> pullRelationshipData()
public InputIterator<RELFORMAT> relationshipData()
{
return pullIterator( relationshipHeader );
}

private void randomLabels( StringBuilder builder, char arrayDelimiter )
{
int length = random.nextInt( 3 );
for ( int i = 0; i < length; i++ )
{
if ( i > 0 )
{
builder.append( arrayDelimiter );
}
builder.append( "LABEL_" ).append( random.nextInt( 4 ) );
}
}

private void randomString( StringBuilder builder )
{
int length = random.nextInt( 10 )+5;
for ( int i = 0; i < length; i++ )
{
builder.append( (char) ('a' + random.nextInt( 20 )) );
}
return new RandomDataIterator<>( relationshipHeader, relationships, new Random( relationshipsSeed ),
relDeserialization, nodes, numberOfLabels, numberOfRelationshipTypes );
}

public static void main( String[] arguments ) throws IOException
{
Args args = Args.parse( arguments );
int nodeCount = args.getNumber( "nodes", null ).intValue();
int relationshipCount = args.getNumber( "relationships", null ).intValue();
int labelCount = args.getNumber( "labels", 4 ).intValue();
int relationshipTypeCount = args.getNumber( "relationship-types", 4 ).intValue();

Configuration config = Configuration.COMMAS;
Extractors extractors = new Extractors( config.arrayDelimiter() );
Header nodeHeader = new Header( new Entry[] {
new Entry( null, Type.ID, null, extractors.string() ),
IdType idType = IdType.ACTUAL;
Header nodeHeader = sillyNodeHeader( idType, extractors );
Header relationshipHeader = bareboneRelationshipHeader( idType, extractors );

ProgressListener progress = textual( System.out ).singlePart( "Generating", nodeCount + relationshipCount );
CsvDataGenerator<String,String> generator = new CsvDataGenerator<>(
nodeHeader, relationshipHeader,
config, nodeCount, relationshipCount,
new StringDeserialization( config ), new StringDeserialization( config ),
labelCount, relationshipTypeCount );
writeData( generator.serializeNodeHeader(), generator.nodeData(),
new File( "target", "nodes.csv" ), progress );
writeData( generator.serializeRelationshipHeader(), generator.relationshipData(),
new File( "target", "relationships.csv" ), progress );
progress.done();
}

public static Header sillyNodeHeader( IdType idType, Extractors extractors )
{
return new Header( new Entry[] {
new Entry( null, Type.ID, null, idType.extractor( extractors ) ),
new Entry( "name", Type.PROPERTY, null, extractors.string() ),
new Entry( "age", Type.PROPERTY, null, extractors.int_() ),
new Entry( "something", Type.PROPERTY, null, extractors.string() ),
new Entry( null, Type.LABEL, null, extractors.stringArray() ),
} );
Header relationshipHeader = new Header( new Entry[] {
new Entry( null, Type.START_ID, null, extractors.string() ),
new Entry( null, Type.END_ID, null, extractors.string() ),
new Entry( null, Type.TYPE, null, extractors.string() )
}

/**
 * Builds a minimal node {@link Header} consisting of two entries: one of {@link Type#ID}
 * followed by one of {@link Type#LABEL}.
 *
 * @param idType provides the extractor for the {@link Type#ID} entry.
 * @param extractors {@link Extractors} to pick entry extractors from.
 * @return a new {@link Header} with the two entries, in that order.
 */
public static Header bareboneNodeHeader( IdType idType, Extractors extractors )
{
    Entry idEntry = new Entry( null, Type.ID, null, idType.extractor( extractors ) );
    Entry labelEntry = new Entry( null, Type.LABEL, null, extractors.stringArray() );
    return new Header( new Entry[] { idEntry, labelEntry } );
}

ProgressListener progress = textual( System.out ).singlePart( "Generating", nodeCount + relationshipCount );
CsvDataGenerator generator = new CsvDataGenerator( nodeHeader, relationshipHeader, config );
writeData( generator.serializeNodeHeader(), generator.pullNodeData(),
new File( "target", "nodes.csv" ), progress, nodeCount );
writeData( generator.serializeRelationshipHeader(), generator.pullRelationshipData(),
new File( "target", "relationships.csv" ), progress, relationshipCount );
progress.done();
/**
 * Builds a minimal relationship {@link Header} consisting of three entries:
 * {@link Type#START_ID}, {@link Type#END_ID} and {@link Type#TYPE}.
 *
 * @param idType provides the extractor for the start and end id entries.
 * @param extractors {@link Extractors} to pick entry extractors from; the type entry
 * uses its string extractor.
 * @return a new {@link Header} with the three entries, in that order.
 */
public static Header bareboneRelationshipHeader( IdType idType, Extractors extractors )
{
    Entry startIdEntry = new Entry( null, Type.START_ID, null, idType.extractor( extractors ) );
    Entry endIdEntry = new Entry( null, Type.END_ID, null, idType.extractor( extractors ) );
    Entry typeEntry = new Entry( null, Type.TYPE, null, extractors.string() );
    return new Header( new Entry[] { startIdEntry, endIdEntry, typeEntry } );
}

private static void writeData( String header, Iterator<String> iterator, File file,
ProgressListener progress, int count ) throws IOException
ProgressListener progress ) throws IOException
{
System.out.println( "Writing " + file.getAbsolutePath() );
try ( Writer out = new BufferedWriter( new FileWriter( file ), 102*1024*10 ) )
{
out.write( header );
out.append( '\n' );
for ( int i = 0; i < count; i++ )
while ( iterator.hasNext() )
{
out.write( iterator.next() );
out.append( '\n' );
Expand Down
@@ -0,0 +1,98 @@
/**
* Copyright (c) 2002-2015 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.tooling;

import org.neo4j.unsafe.impl.batchimport.BatchImporter;
import org.neo4j.unsafe.impl.batchimport.InputIterable;
import org.neo4j.unsafe.impl.batchimport.InputIterator;
import org.neo4j.unsafe.impl.batchimport.cache.idmapping.IdGenerator;
import org.neo4j.unsafe.impl.batchimport.cache.idmapping.IdMapper;
import org.neo4j.unsafe.impl.batchimport.input.Groups;
import org.neo4j.unsafe.impl.batchimport.input.Input;
import org.neo4j.unsafe.impl.batchimport.input.InputNode;
import org.neo4j.unsafe.impl.batchimport.input.InputRelationship;
import org.neo4j.unsafe.impl.batchimport.input.csv.Configuration;
import org.neo4j.unsafe.impl.batchimport.input.csv.Header;
import org.neo4j.unsafe.impl.batchimport.input.csv.IdType;
import org.neo4j.unsafe.impl.batchimport.input.csv.InputNodeDeserialization;
import org.neo4j.unsafe.impl.batchimport.input.csv.InputRelationshipDeserialization;

/**
 * An {@link Input} backed by a {@link CsvDataGenerator}, so that generated data can be handed
 * straight to a {@link BatchImporter}. Nodes and relationships are produced as {@link InputNode}
 * and {@link InputRelationship} instances, and id mapping/generation is decided by the
 * {@link IdType} given at construction.
 */
public class CsvDataGeneratorInput extends CsvDataGenerator<InputNode,InputRelationship> implements Input
{
    private final IdType idType;

    /**
     * @param nodeHeader header describing generated nodes, also used for node deserialization.
     * @param relationshipHeader header describing generated relationships, also used for
     * relationship deserialization.
     * @param config CSV configuration passed on to the generator.
     * @param nodes number of nodes, passed on to the generator.
     * @param relationships number of relationships, passed on to the generator.
     * @param groups {@link Groups} given to both deserializations.
     * @param idType source of {@link IdMapper} and {@link IdGenerator}; its
     * {@link IdType#idsAreExternal()} value is also given to the node deserialization.
     * @param numberOfLabels passed on to the generator.
     * @param numberOfRelationshipTypes passed on to the generator.
     */
    public CsvDataGeneratorInput( Header nodeHeader, Header relationshipHeader,
            Configuration config, long nodes, long relationships, Groups groups, IdType idType,
            int numberOfLabels, int numberOfRelationshipTypes )
    {
        super( nodeHeader, relationshipHeader, config, nodes, relationships,
                new InputNodeDeserialization( nodeHeader, groups, idType.idsAreExternal() ),
                new InputRelationshipDeserialization( relationshipHeader, groups ),
                numberOfLabels, numberOfRelationshipTypes );
        this.idType = idType;
    }

    /**
     * @return an {@link InputIterable} where every call to {@code iterator()} asks the
     * generator for its node data.
     */
    @Override
    public InputIterable<InputNode> nodes()
    {
        InputIterable<InputNode> generatedNodes = new InputIterable<InputNode>()
        {
            @Override
            public InputIterator<InputNode> iterator()
            {
                return nodeData();
            }
        };
        return generatedNodes;
    }

    /**
     * @return an {@link InputIterable} where every call to {@code iterator()} asks the
     * generator for its relationship data.
     */
    @Override
    public InputIterable<InputRelationship> relationships()
    {
        InputIterable<InputRelationship> generatedRelationships = new InputIterable<InputRelationship>()
        {
            @Override
            public InputIterator<InputRelationship> iterator()
            {
                return relationshipData();
            }
        };
        return generatedRelationships;
    }

    /**
     * @return the {@link IdMapper} supplied by the configured {@link IdType}.
     */
    @Override
    public IdMapper idMapper()
    {
        return idType.idMapper();
    }

    /**
     * @return the {@link IdGenerator} supplied by the configured {@link IdType}.
     */
    @Override
    public IdGenerator idGenerator()
    {
        return idType.idGenerator();
    }

    /**
     * @return always {@code false}.
     */
    @Override
    public boolean specificRelationshipIds()
    {
        return false;
    }
}

0 comments on commit 9725329

Please sign in to comment.