Created a COGLayerProvider for the hadoop backend

Signed-off-by: Jacob Bouffard <jbouffard@azavea.com>
locationtech · Mar 28, 2018 · b99d179 · b99d179
1 parent a605f8b
commit b99d179
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 0 deletions.
diff --git a/spark/src/main/scala/geotrellis/spark/io/hadoop/cog/HadoopCOGLayerProvider.scala b/spark/src/main/scala/geotrellis/spark/io/hadoop/cog/HadoopCOGLayerProvider.scala
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2017 Azavea
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package geotrellis.spark.io.hadoop.cog
+
+import geotrellis.spark._
+import geotrellis.spark.io._
+import geotrellis.spark.io.cog._
+import geotrellis.spark.io.hadoop._
+import geotrellis.util.UriUtils
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.conf.Configuration
+import org.apache.spark.SparkContext
+import java.net.URI
+
+/**
+ * Provides [[HadoopAttributeStore]] instance for URI with `hdfs`, `hdfs+file`, `s3n`, `s3a`, `wasb` and `wasbs` schemes.
+ * The uri represents Hadoop [[Path]] of catalog root.
+ * `wasb` and `wasbs` provide support for the Hadoop Azure connector. Additional
+ * configuration is required for this.
+ * This Provider intentionally does not handle the `s3` scheme because the Hadoop implementation is poor.
+ * That support is provided by [[HadoopAttributeStore]]
+ */
+class HadoopCOGLayerProvider extends AttributeStoreProvider
+    with COGLayerReaderProvider with COGLayerWriterProvider with COGValueReaderProvider with COGCollectionLayerReaderProvider {
+  /** URI schemes, in lower case, that [[canProcess]] accepts. */
+  val schemes: Array[String] = Array("hdfs", "hdfs+file", "s3n", "s3a", "wasb", "wasbs")
+
+  /**
+   * Lower-cased scheme of `uri`, or `None` when the URI carries no scheme.
+   *
+   * `URI.getScheme` returns `null` for scheme-less (relative) URIs, so it is wrapped in
+   * `Option`. `Locale.ROOT` keeps the lower-casing independent of the JVM default locale.
+   */
+  private def schemeOf(uri: URI): Option[String] =
+    Option(uri.getScheme).map(_.toLowerCase(java.util.Locale.ROOT))
+
+  /**
+   * Drops a leading `hdfs+` from the URI so that e.g. `hdfs+file:/...` becomes `file:/...`.
+   * The scheme is matched case-insensitively, consistent with [[canProcess]].
+   * URIs with any other scheme, or with no scheme, are returned unchanged.
+   */
+  private def trim(uri: URI): URI = {
+    val prefix = "hdfs+"
+    if (schemeOf(uri).exists(_.startsWith(prefix)))
+      // An absolute URI's string form starts with its scheme, so removing the first
+      // `prefix.length` characters strips the prefix whatever its letter case.
+      new URI(uri.toString.substring(prefix.length))
+    else uri
+  }
+
+  /**
+   * Whether the scheme of `uri` is one of [[schemes]] (compared case-insensitively).
+   * A URI without a scheme is reported as not processable.
+   */
+  def canProcess(uri: URI): Boolean =
+    schemeOf(uri).exists(scheme => schemes.contains(scheme))
+
+  /** Builds a [[HadoopAttributeStore]] for the trimmed `uri`, using a fresh Hadoop `Configuration`. */
+  def attributeStore(uri: URI): AttributeStore = {
+    val path = new Path(trim(uri))
+    val conf = new Configuration()
+    HadoopAttributeStore(path, conf)
+  }
+
+  /** Builds a [[HadoopCOGLayerReader]] from the given attribute store and `SparkContext`. */
+  def layerReader(uri: URI, store: AttributeStore, sc: SparkContext): COGLayerReader[LayerId] = {
+    // don't need uri because HadoopLayerHeader contains full path of the layer
+    new HadoopCOGLayerReader(store)(sc)
+  }
+
+  /** Builds a [[HadoopCOGLayerWriter]] whose root path is the string form of the trimmed `uri`. */
+  def layerWriter(uri: URI, store: AttributeStore): COGLayerWriter[LayerId] = {
+    val rootPath = new Path(trim(uri))
+    new HadoopCOGLayerWriter(rootPath.toString, store)
+  }
+
+  /**
+   * Builds a [[HadoopCOGValueReader]] backed by `store`.
+   *
+   * The `maxOpenFiles` URI parameter is passed to the reader and defaults to 16 when absent.
+   * A non-numeric value makes `toInt` throw a `NumberFormatException`.
+   */
+  def valueReader(uri: URI, store: AttributeStore): COGValueReader[LayerId] = {
+    val params = UriUtils.getParams(trim(uri))
+    val conf = new Configuration()
+    val maxOpenFiles = params.getOrElse("maxOpenFiles", "16").toInt
+    new HadoopCOGValueReader(store, conf, maxOpenFiles)
+  }
+
+  /**
+   * Builds a [[HadoopCOGCollectionLayerReader]] for the trimmed `uri`, using a fresh Hadoop
+   * `Configuration`. The `store` argument is not used here.
+   */
+  def collectionLayerReader(uri: URI, store: AttributeStore) = {
+    val path = new Path(trim(uri))
+    val conf = new Configuration()
+    HadoopCOGCollectionLayerReader(path, conf)
+  }
+}
diff --git a/spark/src/main/scala/geotrellis/spark/io/hadoop/cog/HadoopCOGLayerWriter.scala b/spark/src/main/scala/geotrellis/spark/io/hadoop/cog/HadoopCOGLayerWriter.scala
@@ -23,6 +23,7 @@ import scala.reflect.{ClassTag, classTag}
 class HadoopCOGLayerWriter(
   rootPath: String,
   val attributeStore: AttributeStore
+) extends COGLayerWriter[LayerId] {
   def writeCOGLayer[K: SpatialComponent: Ordering: JsonFormat: ClassTag, V <: CellGrid: GeoTiffReader: ClassTag](
     layerName: String,
     cogLayer: COGLayer[K, V],