In [1]:
/**
 This notebook:
     - Reads a single-band or multi-band GeoTiff from HDFS
     - Filters out NaN values (or, optionally, converts them to 0)
     - Runs k-means clustering
     - Saves the k-means model to HDFS
**/


import geotrellis.raster.MultibandTile
import geotrellis.spark.io.hadoop._
import geotrellis.vector.ProjectedExtent
import org.apache.hadoop.fs.Path
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD

In [14]:
// Loads one band of a GeoTiff from HDFS as an RDD holding one Array[Double] per tile,
// then builds a second RDD with the NaN cells removed from every array.

// The band count is hard-coded. It could presumably be read from the TIFF tags as
// sketched in the disabled line below -- TODO confirm and enable once verified.
//val band_count = geotrellis.raster.io.geotiff.reader.TiffTagsReader.read(filepath).bandCount;
val band_count = 4

// Input file selected by band count.
// Declared as `var` (as before) so that other cells can still reassign it.
var filepath: String = if (band_count == 1) {
    //Single band GeoTiff
    "hdfs:///user/hadoop/spring-index/LastFreeze/1980.tif"
} else {
    //Multi band GeoTiff
    "hdfs:///user/hadoop/spring-index/BloomFinal/1980.tif"
}

// Cell values of every tile, NaN included.
// Declared as `var` (as before) so that other cells can still reassign it.
var band_NaN_RDD: RDD[Array[Double]] = if (band_count == 1) {
    //Load a singleband GeoTiff and keep only the tiles (the extents are dropped).
    sc.hadoopGeoTiffRDD(filepath).values.map(tile => tile.toArrayDouble())
} else {
    //Load a multiband GeoTiff, keep only the tiles and extract the 4th band (index 3).
    sc.hadoopMultibandGeoTiffRDD(filepath).values.map(tile => tile.band(3).toArrayDouble())
}

//Print the length of each vector. The lengths are computed on the executors so that only
//one Int per vector is shipped to the driver, not the full pixel arrays.
band_NaN_RDD.map(cells => cells.length).collect()
    .foreach(len => println("vector length with NaN is %d".format(len)))

//Go to each vector and filter out all NaN values
val band_RDD = band_NaN_RDD.map(cells => cells.filter(v => !v.isNaN))

//Same report for the NaN-free vectors, again collecting only the lengths.
band_RDD.map(cells => cells.length).collect()
    .foreach(len => println("vector length without NaN is %d".format(len)))

vector length with NaN is 30388736
vector length without NaN is 13695035


In [15]:
// Alternative (disabled): keep every cell and replace NaN with 0 instead of dropping it.
//val band_vec = band_NaN_RDD.map(s => Vectors.dense(s.map(v => if (v.isNaN) 0 else v))).cache()

// Wrap each NaN-free array in a dense MLlib vector and cache the result, since it is
// used both for training and for the cost evaluation.
// NOTE(review): every array (one per tile) becomes ONE vector whose dimensions are the
// tile's cells; with a single tile k-means therefore sees a single sample -- confirm
// that this is the intended layout.
val band_vec = band_RDD.map { cells =>
    Vectors.dense(cells)
}.cache()

In [16]:
// k-means settings: number of clusters and the iteration limit passed to the trainer.
val numClusters = 2
val numIterations = 20

// Train the k-means model on the cached vectors.
val clusters = KMeans.train(band_vec, numClusters, numIterations)

// Evaluate the clustering: Within Set Sum of Squared Errors over the training vectors.
val WSSSE = clusters.computeCost(band_vec)
println(s"Within Set Sum of Squared Errors = $WSSSE")

Within Set Sum of Squared Errors = 0.0


In [18]:
// Release the cached input vectors (the trained model itself is not touched).
band_vec.unpersist()

// Prints only the header; the per-center output below is disabled.
println("Cluster Centers: ")
//clusters.clusterCenters.foreach(println)

// Save the trained model to HDFS; the target directory depends on which product was loaded.
clusters.save(sc, band_count match {
    case 1 => "hdfs:///user/emma/spring_index/LastFreeze/1980_kmeans_model"
    case _ => "hdfs:///user/emma/spring_index/BloomFinal/1980_kmeans_model"
})


Cluster Centers: 
