# Read and Write a Sequence File

This notebook shows how to create a sequence file, append (K, V) pairs to it and store it. The same file is then opened and the stored structures are read back.

## Dependencies

In [1]:
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import geotrellis.proj4.CRS
import geotrellis.spark.io.hadoop._
import geotrellis.vector.{Extent, ProjectedExtent}
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.hadoop.io._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

## Functions to (de)serialize any structure into Array[Byte]

In [2]:
/**
 * Serializes `value` into a byte array using Java object serialization.
 *
 * @param value the object to serialize; it is handed to `ObjectOutputStream.writeObject`,
 *              so it has to be serializable by the Java serialization mechanism
 * @return the complete serialization stream (header included) as a byte array
 * @throws java.io.IOException if `writeObject` rejects the value
 *                             (e.g. `NotSerializableException`)
 */
def serialize(value: Any): Array[Byte] = {
    val out_stream: ByteArrayOutputStream = new ByteArrayOutputStream()
    val obj_out_stream = new ObjectOutputStream(out_stream)
    try {
        obj_out_stream.writeObject(value)
    } finally {
        // Close on every path: on success this flushes the buffered bytes into
        // out_stream before it is read, on failure it still releases the stream.
        obj_out_stream.close()
    }
    out_stream.toByteArray
}

/**
 * Reads back one object from a byte array holding a Java serialization stream.
 *
 * NOTE(review): Java deserialization can instantiate arbitrary classes; only pass
 * bytes that come from a trusted source.
 *
 * @param bytes a Java serialization stream; only the first object in it is read
 * @return the deserialized object, typed as `Any` (callers must match/cast as needed)
 * @throws java.io.IOException    if the bytes are not a valid serialization stream
 * @throws ClassNotFoundException if the class of the stored object cannot be loaded
 */
def deserialize(bytes: Array[Byte]): Any = {
    val obj_in_stream = new ObjectInputStream(new ByteArrayInputStream(bytes))
    try {
        obj_in_stream.readObject()
    } finally {
        // Release the stream even when readObject throws.
        obj_in_stream.close()
    }
}

serialize: (value: Any)Array[Byte]
deserialize: (bytes: Array[Byte])Any


## Create sequence file

In [3]:
// Hadoop configuration and file-system handle obtained from the notebook's Spark context.
// These stay `var` so that other notebook cells can still reassign them.
var conf = sc.hadoopConfiguration
var fs = org.apache.hadoop.fs.FileSystem.get(conf)

// Destination of the sequence file and the sample values that get stored in it.
var metadata_path = "hdfs:///user/emma/spring-index/LastFreeze/sequence_file"
var projected_extent = new ProjectedExtent(new Extent(10,110,10,110), CRS.fromName("EPSG:3857"))
var num_cols_rows :(Int, Int) = (100, 8000000)

// Records are (IntWritable key, BytesWritable value); each value is the output of serialize().
val writer: SequenceFile.Writer = SequenceFile.createWriter(conf,
    Writer.file(metadata_path),
    Writer.keyClass(classOf[IntWritable]),
    Writer.valueClass(classOf[BytesWritable])
)

try {
    // Keys 1..3 fix the order in which the reading cell expects the values.
    writer.append(new IntWritable(1), new BytesWritable(serialize(projected_extent)))
    writer.append(new IntWritable(2), new BytesWritable(serialize(num_cols_rows._1)))
    writer.append(new IntWritable(3), new BytesWritable(serialize(num_cols_rows._2)))
    writer.hflush()
} finally {
    // Close the writer even if an append or the flush fails, so the file is not left open.
    writer.close()
}

Waiting for a Spark session to start...

conf = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, file:/usr/lib/spark-2.1.1-bin-without-hadoop/conf/hive-site.xml
fs = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1280313744_36, ugi=emma (auth:SIMPLE)]]
metadata_path = hdfs:///user/emma/spring-index/LastFreeze/sequence_file
projected_extent = ProjectedExtent(Extent(10.0, 110.0, 10.0, 110.0),geotrellis.proj4.CRS$$anon$3@a942e1e1)
num_cols_rows = (100,8000000)
writer = org.apache.hadoop.io.SequenceFile$RecordCompressWriter@1dc9de7f


org.apache.hadoop.io.SequenceFile$RecordCompressWriter@1dc9de7f

## Read sequence file

In [4]:
// Load every record of the sequence file and keep only the value payloads.
// copyBytes() is applied per record before collect(), so each array is an
// independent copy trimmed to the value's actual length.
val metadata = {
    val records = sc.sequenceFile(metadata_path, classOf[IntWritable], classOf[BytesWritable])
    records.map { case (_, payload) => payload.copyBytes() }.collect()
}

// Entry 0 holds the projected extent; entries 1 and 2 hold the two tuple components.
val projected_extent_out = deserialize(metadata(0))
val num_cols_rows_out = {
    val first_component = deserialize(metadata(1))
    val second_component = deserialize(metadata(2))
    (first_component, second_component)
}

metadata = Array(Array(-84, -19, 0, 5, 115, 114, 0, 33, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 46, 118, 101, 99, 116, 111, 114, 46, 80, 114, 111, 106, 101, 99, 116, 101, 100, 69, 120, 116, 101, 110, 116, 18, -8, -17, -14, 105, 64, -105, -86, 2, 0, 2, 76, 0, 3, 99, 114, 115, 116, 0, 22, 76, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 47, 112, 114, 111, 106, 52, 47, 67, 82, 83, 59, 76, 0, 6, 101, 120, 116, 101, 110, 116, 116, 0, 26, 76, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 47, 118, 101, 99, 116, 111, 114, 47, 69, 120, 116, 101, 110, 116, 59, 120, 112, 115, 114, 0, 28, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 46, 112, 114, 111, 106, 52, 46, 67, 82, 83, 36, 36, 97, 110, 111, 110, 36, 51, 80, 95, -10, 36, -66, 65, 91, -28, 2, 0, 2, 6...


[[-84, -19, 0, 5, 115, 114, 0, 33, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 46, 118, 101, 99, 116, 111, 114, 46, 80, 114, 111, 106, 101, 99, 116, 101, 100, 69, 120, 116, 101, 110, 116, 18, -8, -17, -14, 105, 64, -105, -86, 2, 0, 2, 76, 0, 3, 99, 114, 115, 116, 0, 22, 76, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 47, 112, 114, 111, 106, 52, 47, 67, 82, 83, 59, 76, 0, 6, 101, 120, 116, 101, 110, 116, 116, 0, 26, 76, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 47, 118, 101, 99, 116, 111, 114, 47, 69, 120, 116, 101, 110, 116, 59, 120, 112, 115, 114, 0, 28, 103, 101, 111, 116, 114, 101, 108, 108, 105, 115, 46, 112, 114, 111, 106, 52, 46, 67, 82, 83, 36, 36, 97, 110, 111, 110, 36, 51, 80, 95, -10, 36, -66, 65, 91, -28, 2, 0, 2, 68, 0, 7, 69, 112, 115, 105, 108, 111, 110, 76, 0, 9, 112, 114, 111, 106, 52, 106, 67, 114, 115, 116, 0, 44, 76, 111, 114, 103, 47, 111, 115, 103, 101, 111, 47, 112, 114, 111, 106, 52, 106, 47, 67, 111, 111, 114, 100, 105, 110, 97, 116, 101, 82,

## Compare outputs

In [5]:
// Print each original value directly followed by the value read back from the
// sequence file, so matching pairs appear on consecutive lines.
Seq(
    projected_extent, projected_extent_out,
    num_cols_rows, num_cols_rows_out
).foreach(println)

ProjectedExtent(Extent(10.0, 110.0, 10.0, 110.0),geotrellis.proj4.CRS$$anon$3@a942e1e1)
ProjectedExtent(Extent(10.0, 110.0, 10.0, 110.0),geotrellis.proj4.CRS$$anon$3@a942e1e1)
(100,8000000)
(100,8000000)
