In [3]:
/**
 The following notebook does:
     - Reads a single- or multi-band GeoTiff from HDFS
     - Filters out or converts NaN values
     - Runs Kmeans
     - Saves the kmeans model to HDFS
**/


import geotrellis.raster.MultibandTile
import geotrellis.vector.Extent
import geotrellis.spark.io.hadoop._
import geotrellis.vector.ProjectedExtent
import org.apache.hadoop.fs.Path
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD

In [10]:
//The following example shows how to get the number of bands from the TIFF tags; it is kept commented out and band_count is hard-coded below.
//val band_count = geotrellis.raster.io.geotiff.reader.TiffTagsReader.read(filepath).bandCount;

// Notebook-wide settings. band_count selects the single-band (1) or multi-band (anything else) path.
val band_count = 1
val pattern: String = "tif"

// HDFS directory to read from, chosen by band mode.
var filepath: String = if (band_count == 1) {
    //Single band GeoTiff
    "hdfs:///user/hadoop/spring-index/LastFreeze/"
} else {
    //Multi band GeoTiff
    "hdfs:///user/hadoop/spring-index/BloomFinal/"
}

// Placeholders that the loading step overwrites: pixel arrays, raster extent and (cols, rows).
var band_RDD: RDD[Array[Double]] = sc.emptyRDD
var extent_USA = new Extent(-126.30312894720473, 14.29219617034159, -56.162671563152486, 49.25462702827337)
var bands_USA: (Double, Double) = (0, 0)
// Load the GeoTiff(s) under `filepath` and turn every tile into an array of its non-NaN cells.
// The extent and the (cols, rows) size come from the first (extent, tile) pair of the same RDD
// that supplies the pixel data, so the multiband branch uses the multiband reader throughout.
// NOTE(review): dropping NaN per tile can yield arrays of different lengths when more than one
// GeoTiff matches; KMeans presumably needs equally sized vectors -- confirm the input is a
// single file, or replace NaN with a fill value instead of filtering.
if (band_count == 1) {
    //Lets load the Singleband GeoTiff(s) as (ProjectedExtent, Tile) pairs.
    val geotiffs_RDD = sc.hadoopGeoTiffRDD(filepath, pattern)

    // take(1) returns the same first element as zipWithIndex + lookup(0), without extra passes.
    val (first_extent, first_tile) = geotiffs_RDD.take(1).headOption.getOrElse(
        sys.error(s"No GeoTiff with extension '$pattern' found under $filepath"))
    extent_USA = first_extent.extent
    bands_USA = (first_tile.cols.toDouble, first_tile.rows.toDouble)

    //Lets filter out NaN
    band_RDD = geotiffs_RDD.values.map(m => m.toArrayDouble().filter(!_.isNaN))
} else {
    //Lets load the Multiband GeoTiff(s) as (ProjectedExtent, MultibandTile) pairs.
    val geotiffs_RDD = sc.hadoopMultibandGeoTiffRDD(filepath, pattern)

    // take(1) returns the same first element as zipWithIndex + lookup(0), without extra passes.
    val (first_extent, first_tile) = geotiffs_RDD.take(1).headOption.getOrElse(
        sys.error(s"No GeoTiff with extension '$pattern' found under $filepath"))
    extent_USA = first_extent.extent
    bands_USA = (first_tile.cols.toDouble, first_tile.rows.toDouble)

    //Extract the 4th band (index 3) and filter out NaN
    band_RDD = geotiffs_RDD.values.map(m => m.band(3).toArrayDouble().filter(v => !v.isNaN))
}



In [11]:
// Print the (cols, rows) pair recorded for the first tile.
println(bands_USA)

(7808.0,3892.0)


In [None]:
// Print the extent recorded for the first GeoTiff.
Console.println(extent_USA)

In [9]:
// Wrap each NaN-free pixel array in a dense MLlib vector and cache the result,
// since both training and cost evaluation read it.
val band_vec = band_RDD.map { pixels => Vectors.dense(pixels) }.cache()

In [11]:
// Clustering settings passed straight to KMeans.train.
val numClusters = 3
val numIterations = 5

// Train the k-means model on the cached vectors.
val clusters = KMeans.train(band_vec, numClusters, numIterations)

// Evaluate clustering by computing Within Set Sum of Squared Errors
val WSSSE = clusters.computeCost(band_vec)
println(s"Within Set Sum of Squared Errors = $WSSSE")

// Release the cached input vectors; this does not touch the trained model.
band_vec.unpersist()



Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 24 in stage 23.0 failed 4 times, most recent failure: Lost task 24.3 in stage 23.0 (TID 886, 145.100.116.163, executor 18): ExecutorLostFailure (executor 18 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
Driver stacktrace:
StackTrace: Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
  at org.apache.spark.schedul

In [22]:
// Print the header for the cluster centers (printing the centers themselves is disabled).
println("Cluster Centers: ")
//clusters.clusterCenters.foreach(println)

// Save the trained model to HDFS; the target directory depends on the band mode.
clusters.save(sc,
    if (band_count == 1) "hdfs:///user/emma/spring_index/LastFreeze/all_kmeans_model"
    else "hdfs:///user/emma/spring_index/BloomFinal/allkmeans_model")


Cluster Centers: 