/*
* Copyright 2008-2009 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package voldemort.store.readonly.mr;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.Pair;
import org.apache.commons.codec.binary.Hex;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import voldemort.VoldemortException;
import voldemort.cluster.Cluster;
import voldemort.cluster.Node;
import voldemort.store.StoreDefinition;
import voldemort.store.readonly.ReadOnlyStorageFormat;
import voldemort.store.readonly.ReadOnlyStorageMetadata;
import voldemort.store.readonly.checksum.CheckSum;
import voldemort.store.readonly.checksum.CheckSum.CheckSumType;
import voldemort.store.readonly.disk.KeyValueWriter;
import voldemort.utils.Utils;
import voldemort.xml.ClusterMapper;
import voldemort.xml.StoreDefinitionsMapper;
/**
 * Builds a read-only Voldemort store as a Hadoop job from the given input data.
*
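 * <p>
 * A minimal usage sketch. The paths, the chunk size, the choice of
 * SequenceFileInputFormat and the MyStoreBuilderMapper class are illustrative
 * assumptions, not requirements of this class:
 *
 * <pre>
 * {@code
 * Configuration conf = new Configuration();
 * Cluster cluster = new ClusterMapper().readCluster(new FileReader("cluster.xml"));
 * StoreDefinition storeDef = new StoreDefinitionsMapper()
 *         .readStoreList(new FileReader("stores.xml")).get(0);
 *
 * HadoopStoreBuilder builder = new HadoopStoreBuilder(conf,
 *                                                     MyStoreBuilderMapper.class,      // hypothetical mapper
 *                                                     SequenceFileInputFormat.class,
 *                                                     cluster,
 *                                                     storeDef,
 *                                                     512L * 1024 * 1024,              // 512 MB chunks
 *                                                     new Path("/tmp/store-build"),
 *                                                     new Path("/output/my-store"),
 *                                                     new Path("/input/my-store"));
 * builder.build();
 * }
 * </pre>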
*/
@SuppressWarnings("deprecation")
public class HadoopStoreBuilder {
public static final long MIN_CHUNK_SIZE = 1L;
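// Cap each chunk file at roughly 1.9 GB so it stays safely below the 2 GB (signed 32-bit) boundary.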
public static final long MAX_CHUNK_SIZE = (long) (1.9 * 1024 * 1024 * 1024);
public static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
private static final Logger logger = Logger.getLogger(HadoopStoreBuilder.class);
private final Configuration config;
private final Class mapperClass;
@SuppressWarnings("unchecked")
private final Class<? extends InputFormat> inputFormatClass;
private final Cluster cluster;
private final StoreDefinition storeDef;
private final long chunkSizeBytes;
private final Path inputPath;
private final Path outputDir;
private final Path tempDir;
private CheckSumType checkSumType = CheckSumType.NONE;
private boolean saveKeys = false;
private boolean reducerPerBucket = false;
private int numChunks = -1;
private boolean isAvro;
/**
 * Kept for backwards compatibility. The replicationFactor parameter is no
 * longer used, since it is derived from the store definition.
*
* @param conf A base configuration to start with
* @param mapperClass The class to use as the mapper
* @param inputFormatClass The input format to use for reading values
* @param cluster The voldemort cluster for which the stores are being built
* @param storeDef The store definition of the store
* @param replicationFactor NOT USED
* @param chunkSizeBytes The size of the chunks used by the read-only store
* @param tempDir The temporary directory to use in hadoop for intermediate
* reducer output
* @param outputDir The directory in which to place the built stores
* @param inputPath The path from which to read input data
*/
@SuppressWarnings("unchecked")
@Deprecated
public HadoopStoreBuilder(Configuration conf,
Class mapperClass,
Class<? extends InputFormat> inputFormatClass,
Cluster cluster,
StoreDefinition storeDef,
int replicationFactor,
long chunkSizeBytes,
Path tempDir,
Path outputDir,
Path inputPath) {
this(conf,
mapperClass,
inputFormatClass,
cluster,
storeDef,
chunkSizeBytes,
tempDir,
outputDir,
inputPath);
}
/**
* Create the store builder
*
* @param conf A base configuration to start with
* @param mapperClass The class to use as the mapper
* @param inputFormatClass The input format to use for reading values
* @param cluster The voldemort cluster for which the stores are being built
* @param storeDef The store definition of the store
* @param chunkSizeBytes The size of the chunks used by the read-only store
* @param tempDir The temporary directory to use in hadoop for intermediate
* reducer output
* @param outputDir The directory in which to place the built stores
* @param inputPath The path from which to read input data
*/
@SuppressWarnings("unchecked")
public HadoopStoreBuilder(Configuration conf,
Class mapperClass,
Class<? extends InputFormat> inputFormatClass,
Cluster cluster,
StoreDefinition storeDef,
long chunkSizeBytes,
Path tempDir,
Path outputDir,
Path inputPath) {
super();
this.config = conf;
this.mapperClass = Utils.notNull(mapperClass);
this.inputFormatClass = Utils.notNull(inputFormatClass);
this.inputPath = inputPath;
this.cluster = Utils.notNull(cluster);
this.storeDef = Utils.notNull(storeDef);
this.chunkSizeBytes = chunkSizeBytes;
this.tempDir = tempDir;
this.outputDir = Utils.notNull(outputDir);
isAvro = false;
if(chunkSizeBytes > MAX_CHUNK_SIZE || chunkSizeBytes < MIN_CHUNK_SIZE)
throw new VoldemortException("Invalid chunk size, chunk size must be in the range "
+ MIN_CHUNK_SIZE + "..." + MAX_CHUNK_SIZE);
}
/**
* Create the store builder
*
* @param conf A base configuration to start with
* @param mapperClass The class to use as the mapper
* @param inputFormatClass The input format to use for reading values
* @param cluster The voldemort cluster for which the stores are being built
* @param storeDef The store definition of the store
* @param chunkSizeBytes The size of the chunks used by the read-only store
* @param tempDir The temporary directory to use in hadoop for intermediate
* reducer output
* @param outputDir The directory in which to place the built stores
* @param inputPath The path from which to read input data
* @param checkSumType The checksum algorithm to use
*/
@SuppressWarnings("unchecked")
public HadoopStoreBuilder(Configuration conf,
Class mapperClass,
Class<? extends InputFormat> inputFormatClass,
Cluster cluster,
StoreDefinition storeDef,
long chunkSizeBytes,
Path tempDir,
Path outputDir,
Path inputPath,
CheckSumType checkSumType) {
this(conf,
mapperClass,
inputFormatClass,
cluster,
storeDef,
chunkSizeBytes,
tempDir,
outputDir,
inputPath);
this.checkSumType = checkSumType;
}
/**
* Create the store builder
*
* @param conf A base configuration to start with
* @param mapperClass The class to use as the mapper
* @param inputFormatClass The input format to use for reading values
* @param cluster The voldemort cluster for which the stores are being built
* @param storeDef The store definition of the store
* @param chunkSizeBytes The size of the chunks used by the read-only store
* @param tempDir The temporary directory to use in hadoop for intermediate
* reducer output
* @param outputDir The directory in which to place the built stores
* @param inputPath The path from which to read input data
* @param checkSumType The checksum algorithm to use
* @param saveKeys Boolean to signify if we want to save the key as well
 * @param reducerPerBucket Boolean to signify whether we want a single
 * reducer per bucket (thereby resulting in all chunk files for a bucket
 * being generated by a single reducer)
*/
@SuppressWarnings("unchecked")
public HadoopStoreBuilder(Configuration conf,
Class mapperClass,
Class<? extends InputFormat> inputFormatClass,
Cluster cluster,
StoreDefinition storeDef,
long chunkSizeBytes,
Path tempDir,
Path outputDir,
Path inputPath,
CheckSumType checkSumType,
boolean saveKeys,
boolean reducerPerBucket) {
this(conf,
mapperClass,
inputFormatClass,
cluster,
storeDef,
chunkSizeBytes,
tempDir,
outputDir,
inputPath,
checkSumType);
this.saveKeys = saveKeys;
this.reducerPerBucket = reducerPerBucket;
}
/**
* Create the store builder
*
* @param conf A base configuration to start with
* @param mapperClass The class to use as the mapper
* @param inputFormatClass The input format to use for reading values
* @param cluster The voldemort cluster for which the stores are being built
* @param storeDef The store definition of the store
* @param tempDir The temporary directory to use in hadoop for intermediate
* reducer output
* @param outputDir The directory in which to place the built stores
* @param inputPath The path from which to read input data
* @param checkSumType The checksum algorithm to use
* @param saveKeys Boolean to signify if we want to save the key as well
 * @param reducerPerBucket Boolean to signify whether we want a single
 * reducer per bucket (thereby resulting in all chunk files for a bucket
 * being generated by a single reducer)
 * @param numChunks Number of chunks per bucket (partition or partition
 * replica)
*/
@SuppressWarnings("unchecked")
public HadoopStoreBuilder(Configuration conf,
Class mapperClass,
Class<? extends InputFormat> inputFormatClass,
Cluster cluster,
StoreDefinition storeDef,
Path tempDir,
Path outputDir,
Path inputPath,
CheckSumType checkSumType,
boolean saveKeys,
boolean reducerPerBucket,
int numChunks) {
super();
this.config = conf;
this.mapperClass = Utils.notNull(mapperClass);
this.inputFormatClass = Utils.notNull(inputFormatClass);
this.inputPath = inputPath;
this.cluster = Utils.notNull(cluster);
this.storeDef = Utils.notNull(storeDef);
this.chunkSizeBytes = -1;
this.tempDir = tempDir;
this.outputDir = Utils.notNull(outputDir);
this.checkSumType = checkSumType;
this.saveKeys = saveKeys;
this.reducerPerBucket = reducerPerBucket;
this.numChunks = numChunks;
isAvro = false;
if(numChunks <= 0)
throw new VoldemortException("Number of chunks should be greater than zero");
}
/**
 * Run the Hadoop job that builds the read-only store
*/
public void build() {
try {
JobConf conf = new JobConf(config);
conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
conf.set("stores.xml",
new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
conf.setBoolean("save.keys", saveKeys);
conf.setBoolean("reducer.per.bucket", reducerPerBucket);
if(!isAvro) {
conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
conf.setMapperClass(mapperClass);
conf.setMapOutputKeyClass(BytesWritable.class);
conf.setMapOutputValueClass(BytesWritable.class);
if(reducerPerBucket) {
conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
} else {
conf.setReducerClass(HadoopStoreBuilderReducer.class);
}
}
conf.setInputFormat(inputFormatClass);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setOutputKeyClass(BytesWritable.class);
conf.setOutputValueClass(BytesWritable.class);
conf.setJarByClass(getClass());
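// Disable speculative reduces so that each bucket's chunk files are written exactly once.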
conf.setReduceSpeculativeExecution(false);
FileInputFormat.setInputPaths(conf, inputPath);
conf.set("final.output.dir", outputDir.toString());
conf.set("checksum.type", CheckSum.toString(checkSumType));
FileOutputFormat.setOutputPath(conf, tempDir);
FileSystem outputFs = outputDir.getFileSystem(conf);
if(outputFs.exists(outputDir)) {
throw new IOException("Final output directory already exists.");
}
// delete the temp dir if it already exists
FileSystem tempFs = tempDir.getFileSystem(conf);
tempFs.delete(tempDir, true);
long size = sizeOfPath(tempFs, inputPath);
logger.info("Data size = " + size + ", replication factor = "
+ storeDef.getReplicationFactor() + ", numNodes = "
+ cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);
// Derive "rough" number of chunks and reducers
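// Estimate: numChunks ~ (input size * replication factor) / (number of buckets * chunk size),
// floored at 1, unless an explicit numChunks was supplied. A bucket is a partition, or a
// partition replica when saveKeys is set; there is one reducer per bucket when
// reducerPerBucket is set, otherwise one per (bucket, chunk) pair.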
int numReducers;
if(saveKeys) {
if(this.numChunks == -1) {
this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
/ cluster.getNumberOfPartitions()
/ storeDef.getReplicationFactor() / chunkSizeBytes),
1);
} else {
logger.info("Overriding chunk size byte and taking num chunks ("
+ this.numChunks + ") directly");
}
if(reducerPerBucket) {
numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
} else {
numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor()
* numChunks;
}
} else {
if(this.numChunks == -1) {
this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
/ cluster.getNumberOfPartitions() / chunkSizeBytes),
1);
} else {
logger.info("Overriding chunk size byte and taking num chunks ("
+ this.numChunks + ") directly");
}
if(reducerPerBucket) {
numReducers = cluster.getNumberOfPartitions();
} else {
numReducers = cluster.getNumberOfPartitions() * numChunks;
}
}
conf.setInt("num.chunks", numChunks);
conf.setNumReduceTasks(numReducers);
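// Avro flow: same partitioning scheme, but ByteBuffer keys/values and the Avro
// mapper/reducers, with the record schema taken from the "avro.rec.schema" property.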
if(isAvro) {
conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
// conf.setMapperClass(mapperClass);
conf.setMapOutputKeyClass(ByteBuffer.class);
conf.setMapOutputValueClass(ByteBuffer.class);
conf.setInputFormat(inputFormatClass);
conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
conf.setOutputKeyClass(ByteBuffer.class);
conf.setOutputValueClass(ByteBuffer.class);
// AvroJob confs for the avro mapper
AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
AvroJob.setOutputSchema(conf,
Pair.getPairSchema(Schema.create(Schema.Type.BYTES),
Schema.create(Schema.Type.BYTES)));
AvroJob.setMapperClass(conf, mapperClass);
if(reducerPerBucket) {
conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
} else {
conf.setReducerClass(AvroStoreBuilderReducer.class);
}
}
logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers
+ ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
logger.info("Building store...");
RunningJob job = JobClient.runJob(conf);
// Once the job has completed, log the collision counters
Counters counters = job.getCounters();
if(saveKeys) {
logger.info("Number of collisions in the job - "
+ counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
logger.info("Maximum number of collisions for one entry - "
+ counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
}
// Do a CheckSumOfCheckSum - Similar to HDFS
CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
if(!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
throw new VoldemortException("Could not generate checksum digest for type "
+ this.checkSumType);
}
// Make sure each node folder exists, then write its format and checksum metadata
for(Node node: cluster.getNodes()) {
ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
if(saveKeys) {
metadata.add(ReadOnlyStorageMetadata.FORMAT,
ReadOnlyStorageFormat.READONLY_V2.getCode());
} else {
metadata.add(ReadOnlyStorageMetadata.FORMAT,
ReadOnlyStorageFormat.READONLY_V1.getCode());
}
Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
if(!outputFs.exists(nodePath)) {
logger.info("No data generated for node " + node.getId()
+ ". Generating empty folder");
outputFs.mkdirs(nodePath); // Create empty folder
outputFs.setPermission(nodePath, new FsPermission("755"));
logger.info("Setting permission to 755 for " + nodePath);
}
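// Roll up the per-file checksum files found in the node directory into a single
// checksum for this node, then delete the intermediate checksum files.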
if(checkSumType != CheckSumType.NONE) {
FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {
public boolean accept(Path arg0) {
if(arg0.getName().endsWith("checksum")
&& !arg0.getName().startsWith(".")) {
return true;
}
return false;
}
});
if(storeFiles != null && storeFiles.length > 0) {
Arrays.sort(storeFiles, new IndexFileLastComparator());
FSDataInputStream input = null;
for(FileStatus file: storeFiles) {
try {
input = outputFs.open(file.getPath());
byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
input.read(fileCheckSum);
logger.debug("Checksum for file " + file.toString() + " - "
+ new String(Hex.encodeHex(fileCheckSum)));
checkSumGenerator.update(fileCheckSum);
} catch(Exception e) {
logger.error("Error while reading checksum file " + e.getMessage(),
e);
} finally {
if(input != null)
input.close();
}
outputFs.delete(file.getPath(), false);
}
metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE,
CheckSum.toString(checkSumType));
String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
logger.info("Checksum for node " + node.getId() + " - " + checkSum);
metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
}
}
// Write metadata
Path metadataPath = new Path(nodePath, ".metadata");
FSDataOutputStream metadataStream = outputFs.create(metadataPath);
outputFs.setPermission(metadataPath, new FsPermission("755"));
logger.info("Setting permission to 755 for " + metadataPath);
metadataStream.write(metadata.toJsonString().getBytes());
metadataStream.flush();
metadataStream.close();
}
} catch(Exception e) {
logger.error("Error in Store builder", e);
throw new VoldemortException(e);
}
}
/**
 * Run the job, reading the input as Avro records
*/
public void buildAvro() {
isAvro = true;
build();
}
/**
 * A comparator that sorts index files last. This is required to maintain a
 * consistent order (data files before index files) while calculating the
 * checksum of checksums.
*
*/
static class IndexFileLastComparator implements Comparator<FileStatus> {
public int compare(FileStatus fs1, FileStatus fs2) {
// directories before files
if(fs1.isDir())
return fs2.isDir() ? 0 : -1;
if(fs2.isDir())
return fs1.isDir() ? 0 : 1;
String f1 = fs1.getPath().getName(), f2 = fs2.getPath().getName();
// if both are of the same type, order lexicographically
if((f1.contains(".index") && f2.contains(".index"))
|| (f1.contains(".data") && f2.contains(".data"))) {
return f1.compareToIgnoreCase(f2);
}
if(f1.contains(".index")) {
return 1;
} else {
return -1;
}
}
}
private long sizeOfPath(FileSystem fs, Path path) throws IOException {
long size = 0;
FileStatus[] statuses = fs.listStatus(path);
if(statuses != null) {
for(FileStatus status: statuses) {
if(status.isDir())
size += sizeOfPath(fs, status.getPath());
else
size += status.getLen();
}
}
return size;
}
}