In [1]:
/**
 The following notebook does:
     - Reads single- or multi-band GeoTiffs from HDFS
     - Filters out or converts NaN values
     - Runs Kmeans
     - Saves the kmeans model to HDFS
**/


import geotrellis.proj4.CRS
import geotrellis.raster.{DoubleArrayTile, Tile}
import geotrellis.raster.io.geotiff._
import geotrellis.raster.io.geotiff.writer.GeoTiffWriter
import geotrellis.raster.io.geotiff.{GeoTiff, SinglebandGeoTiff}
import geotrellis.spark.io.hadoop._
import geotrellis.vector.{Extent, ProjectedExtent}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

In [2]:
    // ---- Notebook parameters ----
    // band_count == 1 selects the single-band GeoTiff directory, anything else the multi-band one.
    val band_count = 1
    // in_memory == 1 selects the driver-side transpose in a later cell, anything else the distributed one.
    val in_memory = 2
    // Maximum number of non-NaN cells kept per tile.
    val sample = 1000

    // ---- State shared with the following cells (filled in below) ----
    var projected_extent = new ProjectedExtent(new Extent(0,0,0,0), CRS.fromName("EPSG:3857"))
    var num_cols_rows :(Int, Int) = (0, 0)
    var band_RDD: RDD[Array[Double]] = sc.emptyRDD
    var band_vec: RDD[Vector] = sc.emptyRDD
    var band0: RDD[(Long, Double)] = sc.emptyRDD
    var band0_index: Array[Int] = Array.emptyIntArray
    val pattern: String = "tif"
    var filepath: String =
      if (band_count == 1) "hdfs:///user/hadoop/spring-index/LastFreeze/" //Single band GeoTiff
      else "hdfs:///user/hadoop/spring-index/BloomFinal/"                 //Multi band GeoTiff

    // Load the GeoTiffs once per branch and derive the extent, the tile dimensions and the
    // cell arrays from that single RDD. first() only reads the first partition, whereas the
    // previous zipWithIndex + lookup(0) scanned every partition for each of its four calls.
    // NOTE: on an empty input directory first() throws UnsupportedOperationException.
    val bands_RDD: RDD[Array[Double]] =
      if (band_count == 1) {
        //Single band: one tile per GeoTiff.
        val geotiffs_RDD = sc.hadoopGeoTiffRDD(filepath, pattern)
        projected_extent = geotiffs_RDD.keys.first()
        val first_tile = geotiffs_RDD.values.first()
        num_cols_rows = (first_tile.cols, first_tile.rows)
        geotiffs_RDD.values.map(tile => tile.toArrayDouble())
      } else {
        //Multi band: read keys and tiles with the multiband reader and keep band 4 (index 3).
        val geotiffs_RDD = sc.hadoopMultibandGeoTiffRDD(filepath, pattern)
        projected_extent = geotiffs_RDD.keys.first()
        val first_tile = geotiffs_RDD.values.first()
        num_cols_rows = (first_tile.cols, first_tile.rows)
        geotiffs_RDD.values.map(tile => tile.band(3).toArrayDouble())
      }

    // Cell values of the first tile; used both for the cell index and for the output grid.
    val first_band: Array[Double] = bands_RDD.first()

    //Get Index for Cells: positions of the first `sample` non-NaN cells of the first tile.
    band0_index = first_band.iterator.zipWithIndex
      .collect { case (v, i) if !v.isNaN => i }
      .take(sample)
      .toArray

    //Get Array[Double] of a Tile, keyed by cell position, to later store the cluster ids.
    band0 = sc.parallelize(first_band).zipWithIndex.map { case (v, i) => (i, v) }

    //Lets filter out NaN and keep at most `sample` cells per tile.
    // NOTE(review): NaN cells are dropped independently per tile, so position k of two tiles
    // refers to the same cell only if all tiles share the same NaN mask - TODO confirm.
    band_RDD = bands_RDD.map(cells => cells.filter(!_.isNaN).take(sample))

Waiting for a Spark session to start...



In [3]:
    /*
    KMeans has to cluster cells, not years, so the matrix held in band_RDD
    (one row per GeoTiff) must be transposed before it is turned into vectors. If we do:

    val band_vec = band_RDD.map(s => Vectors.dense(s)).cache()

    the vectors are the rows and the matrix looks like this:
    Vectors.dense(0.0, 1.0, 2.0),
    Vectors.dense(3.0, 4.0, 5.0),
    Vectors.dense(6.0, 7.0, 8.0),
    Vectors.dense(9.0, 0.0, 1.0)

    Inspired in:
    http://jacob119.blogspot.nl/2015/11/how-to-convert-matrix-to-rddvector-in.html
    and
    https://stackoverflow.com/questions/29390717/how-to-transpose-an-rdd-in-spark
    */

    if (in_memory != 1) {
      //B) Large RDDs: transpose in distributed mode.

      // Explode the matrix into (cell index, (tile index, value)) entries.
      val cell_entries = band_RDD.zipWithIndex.flatMap { case (tile_values, tile_idx) =>
        tile_values.zipWithIndex.map { case (value, cell_idx) => (cell_idx, (tile_idx, value)) }
      }

      // One group per cell, groups ordered by cell index.
      val cell_series = cell_entries.groupByKey.sortByKey().values

      // Inside each group order the values by tile index and build the dense vector.
      band_vec = cell_series.map { entries =>
        Vectors.dense(entries.toSeq.sortBy(_._1).map(_._2).toArray)
      }.cache()
    } else {
      //A) Small RDDs: bring everything to the driver and transpose there.
      //Transpose first and parallelize afterwards, otherwise the compiler reports:
      //error: polymorphic expression cannot be instantiated to expected type;
      val cells_by_tile = band_RDD.collect().transpose
      band_vec = sc.parallelize(cells_by_tile).map(cell => Vectors.dense(cell)).cache()
    }



In [4]:
    /*
     Collect the transposed vectors and print a few facts to check the transpose.
    */

    val band_vec_col = band_vec.collect()

    //Number of vectors, i.e., cells kept after filtering out NaN and the take()
    println(band_vec_col.length)

    //Length of one vector, i.e., number of GeoTiffs (years) per cell
    val first_cell = band_vec_col(0)
    println(first_cell.size)

    //Values of the first cell over the years.
    println(first_cell)



36
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.0]


In [5]:
    /*
     Train a KMeans model on the per-cell vectors and report its cost.
    */

    val numClusters = 3
    val numIterations = 5
    val clusters = KMeans.train(band_vec, numClusters, numIterations)

    // Clustering quality: Within Set Sum of Squared Errors over the training vectors
    val WSSSE = clusters.computeCost(band_vec)
    println(s"Within Set Sum of Squared Errors = $WSSSE")

    //Release the cached training vectors (the model itself is kept in `clusters`)
    band_vec.unpersist()

[Stage 48:>                                                        (0 + 0) / 36]Within Set Sum of Squared Errors = 2.4225394591567043E7


MapPartitionsRDD[39] at map at <console>:91

In [6]:
    /*
     Cluster model's result management: print the centers, optionally persist the model.
    */

    // Print one line per cluster center.
    println("Cluster Centers: ")
    for (center <- clusters.clusterCenters) println(center)

    //Saving the model into HDFS is currently disabled.
    //If the target already exists, save aborts and reports an error.
    /*
    if (band_count == 1) {
        clusters.save(sc, "hdfs:///user/emma/spring_index/LastFreeze/all_kmeans_model")
    } else {
        clusters.save(sc, "hdfs:///user/emma/spring_index/BloomFinal/all_kmeans_model")
    }
    */

Cluster Centers: 
[141.4864376130199,142.8245931283906,146.18444846292948,131.62206148282098,154.124773960217,157.07956600361663,151.12115732368898,147.86618444846292,142.5388788426763,140.06690777576853,139.39783001808317,150.58770343580468,136.16817359855335,136.09764918625677,127.51717902350813,139.88969258589512,152.17902350813742,150.62748643761302,141.0379746835443,164.16817359855335,162.39602169981916,161.50813743218805,164.875226039783,156.2368896925859,125.9873417721519,128.877034358047,130.72151898734177,127.4755877034358,139.251356238698,132.59493670886076,136.21699819168174,129.77396021699818,131.9240506329114,128.54611211573237,116.60397830018083,136.0]
[17.50462962962963,16.98611111111111,16.62037037037037,9.833333333333332,8.268518518518517,17.828703703703702,20.944444444444443,7.592592592592593,13.347222222222221,25.314814814814813,20.37037037037037,21.38425925925926,2.2268518518518516,22.98148148148148,19.953703703703702,22.23611111111111,25.35648148148148,13.560185185

Name: Syntax Error.
Message: 
StackTrace: 

In [7]:
    /*
     Run the trained KMeans model on every cell vector and collect the cluster ids.
    */

    //Cache the input vectors (not the model) so the prediction job reads them from memory
    band_vec.cache()

    val res = clusters.predict(band_vec)

    //predict() is lazy, so run the action while band_vec is still cached.
    //All predictions are collected; re-enable take(50) to look at the first 50 only.
    val res_out = res.collect()//.take(50)

    //Release the cached vectors only after the job above has used them.
    //Unpersisting before the first action made the cache() call pointless.
    band_vec.unpersist()

In [8]:
    /*
     Sanity check: the number of predicted cluster ids must equal
     the number of sampled cell indices.
    */

    //res_out.foreach(println)
    println(res_out.length)
    println(band0_index.length)

1000
1000


In [9]:
    /*
     Save the result as GeoTiff. However, it is not straightforward.
     We need to get the clusterCenter which is a RDD[Vectors]
     It contains a vector per year. However, the vector indices
     are the ones from the ArrayOfDoubles with the NaN values.

     This cell pairs every predicted cluster id with the position of its cell
     in the original (NaN-including) tile array: (cluster id, cell index).
    */

    //Pair the two sequences by position.
    //res.repartition(1) shuffles and does not guarantee element order, so zipping it with
    //band0_index could pair a cluster id with the wrong cell. res_out was collected in
    //RDD order, and band0_index already lives on the driver, so zip them there.
    require(res_out.length == band0_index.length,
      s"Got ${res_out.length} predictions for ${band0_index.length} cell indices")
    val cluster_cell_pos = sc.parallelize(res_out.zip(band0_index), 1)

    //Show the first 50 pairs without collecting the whole RDD.
    cluster_cell_pos.take(50)

Array((1,0), (1,1), (1,2), (1,3), (1,4), (1,5), (1,6), (1,7), (1,8), (1,9), (1,10), (1,11), (1,12), (1,13), (1,14), (1,15), (1,16), (1,17), (1,18), (1,19), (1,20), (1,21), (1,22), (1,23), (1,24), (1,25), (1,26), (1,27), (2,28), (2,29), (2,30), (2,31), (1,32), (1,33), (1,34), (1,35), (2,36), (0,37), (0,38), (0,39), (0,40), (0,41), (2,42), (2,43), (2,44), (0,45), (0,46), (0,47), (0,48), (0,49))

In [10]:
    /*
     Join the RDD with clusters with the Grid of cells from GeoTiff.
     Inspired in:
      https://stackoverflow.com/questions/31257077/how-do-you-perform-basic-joins-of-two-rdd-tables-in-spark-using-python

     Result: (cell index, (cell value, Option[cluster id])); cells without a cluster get None.
    */
    //val grid_clusters = band0.join(cluster_cell_pos)
    //val grid_clusters = band0.take(1000).leftOuterJoin(cluster_cell_pos.map{ case (c,i) => (i.toLong, c)})

    //filterByRange is inclusive on both ends. Keep cells 0..1009, i.e. exactly the
    //1010 cells of the 101 x 10 tile built later (0..1010 would yield 1011 cells).
    val band0_0_1010 = band0.filterByRange(0L, 1009L)

    //Re-key the (cluster id, cell index) pairs by cell index so they can be joined with the grid.
    val clusters_by_cell = cluster_cell_pos.map{ case (c,i) => (i.toLong, c)}
    val grid_clusters = band0_0_1010.repartition(1000).leftOuterJoin(clusters_by_cell.repartition(1000))

In [35]:
    //Earlier variants, kept for reference:
    //val grid_clusters_res = grid_clusters.take(50).foreach(println)
    //val grid_clusters_res = grid_clusters.sortByKey(true).take(50).foreach(println)
    //val grid_clusters_res = grid_clusters.sortByKey(true).map{case (k, (v, c)) => if (c == None) (k, Double.NaN) else (k, c.get)}//.collect().foreach(println)

    //Order the grid by cell index and turn the optional cluster id into a Double,
    //using -1.0 for cells that were not assigned to any cluster.
    val grid_clusters_res = grid_clusters.sortByKey(true).map {
      case (cell_idx, (_, Some(cluster_id))) => (cell_idx, cluster_id.toDouble)
      case (cell_idx, (_, None))             => (cell_idx, -1.0)
    }//.take(50).foreach(println)



In [38]:
   /*
     Create a GeoTiff with the cluster id of every cell and save it to HDFS.
    */

    //Test-sized grid: 101 x 10 cells instead of the real tile dimensions.
    //NOTE(review): the full projected_extent is attached to this reduced grid, so the
    //georeferencing of the output presumably does not match the real cells - TODO confirm.
    val cluster_cols = 101
    val cluster_rows = 10

    //Cluster ids ordered by cell index; keep exactly as many cells as the tile holds.
    val cluster_cells :Array[Double] = grid_clusters_res.values.collect().take(cluster_cols * cluster_rows)
    require(cluster_cells.length == cluster_cols * cluster_rows,
      s"Expected ${cluster_cols * cluster_rows} cells but got ${cluster_cells.length}")

    //val cluster_tile = DoubleArrayTile(cluster_cells, num_cols_rows._1, num_cols_rows._2)
    val cluster_tile = DoubleArrayTile(cluster_cells, cluster_cols, cluster_rows)
    val geoTiff = SinglebandGeoTiff(cluster_tile, projected_extent.extent, projected_extent.crs, Tags.empty, GeoTiffOptions.DEFAULT)

    //Write the encoded GeoTiff bytes as one HDFS file.
    //saveAsObjectFile on an RDD[Byte] produced a directory of serialized Spark objects,
    //which is not a TIFF file.
    //overwrite = false: like saveAsObjectFile, fail if the target already exists.
    val tif_path = new org.apache.hadoop.fs.Path("hdfs:///user/emma/spring-index/BloomFinal/cluster.tif")
    val tif_out = tif_path.getFileSystem(sc.hadoopConfiguration).create(tif_path, false)
    try {
      tif_out.write(geoTiff.toByteArray)
    } finally {
      tif_out.close()
    }

    //Write to the local file system
    //val path = "~/clusters.tif"
    //GeoTiffWriter.write(geoTiff, path)



Name: Syntax Error.
Message: 
StackTrace: 