In [66]:
/**
 The following notebook does:
     - Reads single- or multi-band GeoTiffs from HDFS
     - Filters out or converts NaN values
     - Runs Kmeans
     - Saves the kmeans model to HDFS
**/


import geotrellis.raster.MultibandTile
import geotrellis.spark.io.hadoop._
import geotrellis.vector.ProjectedExtent
import org.apache.hadoop.fs.Path
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

In [67]:
/*
 Notebook configuration and GeoTiff loading.

 The band count could be read from the GeoTiff tags as in the example below;
 it is kept for reference only and band_count is set by hand instead:
val band_count = geotrellis.raster.io.geotiff.reader.TiffTagsReader.read(filepath).bandCount;
*/

// 1 selects the single-band input, any other value the multi-band input.
val band_count = 1
// 1 selects the driver-side transpose, any other value the distributed one.
val in_memory = 2
// Passed to the GeoTiff readers together with the input path.
val pattern: String = "tif"

// Assigned below (and in later cells), hence declared as vars.
var filepath: String = ""
var band_RDD: RDD[Array[Double]] = sc.emptyRDD
var band_vec: RDD[Vector] = sc.emptyRDD

band_count match {
    case 1 =>
        // Single-band GeoTiff: keep only the tiles (.values drops the keys).
        filepath = "hdfs:///user/hadoop/spring-index/LastFreeze/"
        val singleband_tiles = sc.hadoopGeoTiffRDD(filepath, pattern).values

        // Per tile: drop the NaN cells and keep the first 100 remaining values.
        band_RDD = singleband_tiles.map { tile =>
            val cells = tile.toArrayDouble()
            cells.filterNot(_.isNaN).take(100)
        }
    case _ =>
        // Multi-band GeoTiff: keep only the tiles (.values drops the keys).
        filepath = "hdfs:///user/hadoop/spring-index/BloomFinal/"
        val multiband_tiles = sc.hadoopMultibandGeoTiffRDD(filepath, pattern).values

        // Per tile: take the 4th band (index 3), drop the NaN cells and keep
        // the first 1000000 remaining values.
        band_RDD = multiband_tiles.map { tile =>
            val cells = tile.band(3).toArrayDouble()
            cells.filterNot(_.isNaN).take(1000000)
        }
}

In [68]:
/*
Transpose band_RDD so that k-means clusters cells instead of tiles.

Without the transpose, i.e. with

val band_vec = band_RDD.map(s => Vectors.dense(s)).cache()

every vector would be one row of band_RDD (one input tile):
Vectors.dense(0.0, 1.0, 2.0),
Vectors.dense(3.0, 4.0, 5.0),
Vectors.dense(6.0, 7.0, 8.0),
Vectors.dense(9.0, 0.0, 1.0)

After the transpose every vector holds the values found at one cell
position across all rows.

Inspired by:
http://jacob119.blogspot.nl/2015/11/how-to-convert-matrix-to-rddvector-in.html
and
https://stackoverflow.com/questions/29390717/how-to-transpose-an-rdd-in-spark
*/

if (in_memory == 1) {
    //A) Small data: bring everything to the driver and transpose it there.
    //The transpose has to happen before parallelize, otherwise the compiler reports:
    //error: polymorphic expression cannot be instantiated to expected type;
    val transposed_on_driver = band_RDD.collect().transpose
    val driver_columns = sc.parallelize(transposed_on_driver)
    band_vec = driver_columns.map(column => Vectors.dense(column)).cache()
} else {
    //B) Large data: transpose in distributed mode.

    // Explode the matrix into one record per value:
    // column index -> (row index, value).
    val cells_by_column = band_RDD.zipWithIndex.flatMap {
        case (row_values, row_idx) =>
            row_values.zipWithIndex.map {
                case (value, col_idx) => (col_idx, (row_idx, value))
            }
    }

    // Gather the values of every column, order the columns by their index and,
    // inside each column, order the values by their original row index before
    // turning them into a dense vector.
    band_vec = cells_by_column
        .groupByKey
        .sortByKey()
        .values
        .map { column_entries =>
            val ordered_by_row = column_entries.toSeq.sortBy(_._1)
            Vectors.dense(ordered_by_row.map(_._2).toArray)
        }
        .cache()
}

2
                                                                                

In [69]:
/*
    Collect the transposed vectors on the driver and print a few figures
    to check that the transpose worked correctly.
*/

val band_vec_col = band_vec.collect()

//Number of vectors, i.e., cell positions kept after filtering out NaN and the take()
println(band_vec_col.length)

//Length of the first vector, i.e., one value per input tile (presumably one tile
//per year), followed by the vector itself: the values of that cell over the tiles.
locally {
    val first_cell = band_vec_col(0)
    println(first_cell.size)
    println(first_cell)
}

36
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,136.0]


In [70]:
/*
 Train the k-means model on the transposed vectors and report its cost.
*/

val numClusters = 3
val numIterations = 5
val clusters = KMeans.train(band_vec, numClusters, numIterations)

// Evaluate the clustering through the Within Set Sum of Squared Errors.
val WSSSE = clusters.computeCost(band_vec)
println(s"Within Set Sum of Squared Errors = $WSSSE")

// Release the cached input vectors (this un-persists the data, not the model).
band_vec.unpersist()



MapPartitionsRDD[153] at map at <console>:116

In [71]:
/*
 Report the trained cluster centers and, optionally, persist the model.
*/

// Print a header followed by one cluster center per line.
println("Cluster Centers: ")
for (center <- clusters.clusterCenters) println(center)

// Saving the model into HDFS is disabled. If the target already exists the save
// aborts and reports an error.
/*
if (band_count == 1) {
    clusters.save(sc, "hdfs:///user/emma/spring_index/LastFreeze/all_kmeans_model")    
} else {
    clusters.save(sc, "hdfs:///user/emma/spring_index/BloomFinal/all_kmeans_model")
}
*/

Cluster Centers: 
[140.6,114.7,130.3,128.4,127.93333333333334,105.8,119.0,119.06666666666666,113.36666666666666,135.8,68.4,104.5,60.266666666666666,104.39999999999999,89.7,99.66666666666667,102.13333333333333,138.6,134.86666666666667,154.46666666666667,158.36666666666667,161.36666666666667,159.6,148.63333333333333,147.76666666666665,130.63333333333333,144.8,146.66666666666666,157.2,151.9,153.53333333333333,143.53333333333333,148.4,126.7,118.56666666666666,136.0]
[7.518518518518518,10.11111111111111,10.703703703703702,0.0,4.407407407407407,2.462962962962963,12.518518518518517,0.25925925925925924,7.703703703703703,14.925925925925926,11.962962962962962,15.407407407407407,1.1296296296296295,13.648148148148147,6.796296296296296,12.018518518518517,12.574074074074073,6.277777777777778,2.888888888888889,0.0,5.666666666666666,5.259259259259259,16.62962962962963,2.2777777777777777,1.259259259259259,7.425925925925926,13.814814814814815,15.055555555555555,9.796296296296296,13.462962962962962,0.0,1

Name: Syntax Error.
Message: 
StackTrace: 

In [72]:
/*
 Run Kmeans prediction to obtain the cluster ID of each cell and keep the
 first 50 results on the driver.
*/

//Cache the input vectors (the data, not the model) for the prediction
band_vec.cache()

//predict() only defines the result RDD; nothing is computed until an action runs on it
val res = clusters.predict(band_vec)

//Fetch just the first 50 cluster IDs instead of collecting the whole RDD to the driver
val res_out = res.take(50)

//Un-persist the input vectors only now: doing it before the action above
//would drop the cache before it was ever used
band_vec.unpersist()

In [73]:
/*
 Print the cluster ID of the first 50 cells, one per line
*/

for (cluster_id <- res_out) println(cluster_id)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
0
0
2
1
1
1
1
1
1
1
1
1
1
1


In [17]:
/* 
 Save the result as GeoTiff. However, it is not straightforward.
 We need to get the clusterCenters, which is an Array[Vector].
 It contains one vector per cluster, each with one value per year.
 However, the cell indices needed to write the GeoTiff are the ones
 from the original array of doubles, i.e. the one that still has the NaN values.
*/


100
