1 change: 1 addition & 0 deletions core/src/main/resources/reference.conf
@@ -3,6 +3,7 @@ rasterframes {
   prefer-gdal = true
   showable-tiles = true
   showable-max-cells = 20
+  max-truncate-row-element-length = 40
   raster-source-cache-timeout = 120 seconds
 }
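
The new `max-truncate-row-element-length` key caps how many characters of each rendered cell survive when truncation is enabled. A hedged sketch of overriding the default, assuming the `rasterframes` block is loaded through Typesafe Config (as the `rfConfig.getInt` call later in this diff suggests):

```scala
// Hypothetical override — Typesafe Config gives system properties precedence
// over reference.conf, provided this runs before the config is first loaded.
System.setProperty("rasterframes.max-truncate-row-element-length", "60")
```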

@@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.rf._
-import org.apache.spark.sql.types.StringType
+import org.apache.spark.sql.types.{StringType, StructField}
 import org.apache.spark.sql._
 import org.slf4j.LoggerFactory
 import spire.syntax.cfor._
@@ -184,24 +184,58 @@ package object util {
     }
   }
 
+  private val truncateWidth = rfConfig.getInt("max-truncate-row-element-length")
+
   implicit class DFWithPrettyPrint(val df: Dataset[_]) extends AnyVal {
 
+    def stringifyRowElements(cols: Seq[StructField], truncate: Boolean) = {
+      cols
+        .map(c => s"`${c.name}`")
+        .map(c => df.col(c).cast(StringType))
+        .map(c => if (truncate) {
+          when(length(c) > lit(truncateWidth), concat(substring(c, 1, truncateWidth), lit("...")))
+            .otherwise(c)
+        } else c)
+    }
+
     def toMarkdown(numRows: Int = 5, truncate: Boolean = false): String = {
       import df.sqlContext.implicits._
-      val cols = df.columns
-      val header = cols.mkString("| ", " | ", " |") + "\n" + ("|---" * cols.length) + "|\n"
-      val stringifiers = cols
-        .map(c => s"`$c`")
-        .map(c => df.col(c).cast(StringType))
-        .map(c => if (truncate) substring(c, 1, 40) else c)
+      val cols = df.schema.fields
+      val header = cols.map(_.name).mkString("| ", " | ", " |") + "\n" + ("|---" * cols.length) + "|\n"
+      val stringifiers = stringifyRowElements(cols, truncate)
       val cat = concat_ws(" | ", stringifiers: _*)
-      val body = df
-        .select(cat).limit(numRows)
+      val rows = df
+        .select(cat)
+        .limit(numRows)
         .as[String]
         .collect()
         .map(_.replaceAll("\\[", "\\\\["))
         .map(_.replace('\n', '↩'))
+
+      val body = rows
         .mkString("| ", " |\n| ", " |")
-      header + body
+
+      val caption = if (rows.length >= numRows) s"\n_Showing only top $numRows rows_.\n\n" else ""
+      caption + header + body
     }
+
+    def toHTML(numRows: Int = 5, truncate: Boolean = false): String = {
+      import df.sqlContext.implicits._
+      val cols = df.schema.fields
+      val header = "<thead>\n" + cols.map(_.name).mkString("<tr><th>", "</th><th>", "</th></tr>\n") + "</thead>\n"
+      val stringifiers = stringifyRowElements(cols, truncate)
+      val cat = concat_ws("</td><td>", stringifiers: _*)
+      val rows = df
+        .select(cat).limit(numRows)
+        .as[String]
+        .collect()
+
+      val body = rows
+        .mkString("<tr><td>", "</td></tr>\n<tr><td>", "</td></tr>\n")
+
+      val caption = if (rows.length >= numRows) s"<caption>Showing only top $numRows rows</caption>\n" else ""
+
+      "<table>\n" + caption + header + "<tbody>\n" + body + "</tbody>\n" + "</table>"
+    }
   }
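
For review context, a minimal usage sketch of the new extension methods (not part of this diff; assumes a SparkSession with RasterFrames initialized and an arbitrary DataFrame `df`):

```scala
// Sketch only — brings the DFWithPrettyPrint implicits into scope.
import org.locationtech.rasterframes.util._

// Markdown table of the first 3 rows; cell text longer than the configured
// max-truncate-row-element-length (default 40) is cut and suffixed with "...".
val md: String = df.toMarkdown(numRows = 3, truncate = true)

// The same rows as an HTML <table>, with a caption when rows were elided.
val html: String = df.toHTML(numRows = 3)
```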

@@ -26,7 +26,9 @@ import geotrellis.raster.{ByteCellType, GridBounds, TileLayout}
 import geotrellis.spark.tiling.{CRSWorldExtent, LayoutDefinition}
 import geotrellis.spark.{KeyBounds, SpatialKey, TileLayerMetadata}
 import org.apache.spark.sql.Encoders
-import org.locationtech.rasterframes.util.SubdivideSupport
+import org.locationtech.rasterframes.util._
+
+import scala.xml.parsing.XhtmlParser
 
 /**
  * Tests miscellaneous extension methods.
@@ -111,8 +113,18 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSupport {
     }
 
     it("should render Markdown") {
-      import org.locationtech.rasterframes.util._
-      rf.toMarkdown().count(_ == '|') shouldBe >=(3 * 5)
+      val md = rf.toMarkdown()
+      md.count(_ == '|') shouldBe >=(3 * 5)
+      md.count(_ == '\n') should be >= (6)
+
+      val md2 = rf.toMarkdown(truncate = true)
+      md2 should include ("...")
+    }
+
+    it("should render HTML") {
+      noException shouldBe thrownBy {
+        XhtmlParser(scala.io.Source.fromString(rf.toHTML()))
+      }
     }
   }
 }
3 changes: 3 additions & 0 deletions docs/src/main/paradox/_template/page.st
@@ -33,6 +33,9 @@
     .md-clear { clear: both; }
     table { font-size: 80%; }
     code { font-size: 0.75em !important; }
+    table a {
+      word-break: break-all;
+    }
   </style>
 </head>

1 change: 1 addition & 0 deletions docs/src/main/paradox/release-notes.md
@@ -4,6 +4,7 @@
 
 ### 0.8.1
 
+* Added `toMarkdown()` and `toHTML()` extension methods for `DataFrame`, and registered them with the IPython formatter system when `rf_ipython` is imported.
 * Fixed: Removed false return type guarantee in cases where an `Expression` accepts either `Tile` or `ProjectedRasterTile` [(#295)](https://github.com/locationtech/rasterframes/issues/295)
 
 ### 0.8.0
27 changes: 14 additions & 13 deletions pyrasterframes/src/main/python/docs/aggregation.pymd
@@ -33,14 +33,16 @@ print(tiles[1]['tile'].cells)
 
 We use the @ref:[`rf_tile_mean`](reference.md#rf-tile-mean) function to compute the _tile_ aggregate mean of cells in each row of column `tile`. The mean of each _tile_ is computed separately, so the first mean is 1.0 and the second mean is 3.0. Notice that the number of rows in the DataFrame is the same before and after the aggregation.
 
-```python, tile_mean, results='raw'
-rf.select(F.col('id'), rf_tile_mean(F.col('tile'))).show()
+```python, tile_mean
+means = rf.select(F.col('id'), rf_tile_mean(F.col('tile')))
+means
 ```
 
 We use the @ref:[`rf_agg_mean`](reference.md#rf-agg-mean) function to compute the DataFrame aggregate, which averages 25 values of 1.0 and 25 values of 3.0, across the fifty cells in two rows. Note that only a single row is returned since the average is computed over the full DataFrame.
 
-```python, agg_mean, results='raw'
-rf.agg(rf_agg_mean(F.col('tile'))).show()
+```python, agg_mean
+mean = rf.agg(rf_agg_mean(F.col('tile')))
+mean
 ```
 
 We use the @ref:[`rf_agg_local_mean`](reference.md#rf-agg-local-mean) function to compute the element-wise local aggregate mean across the two rows. For this aggregation, we are computing the mean of one value of 1.0 and one value of 3.0 to arrive at the element-wise mean, but doing so twenty-five times, one for each position in the _tile_.
@@ -57,11 +59,10 @@ print(t.cells)
 
 We can also count the total number of data and NoData cells over all the _tiles_ in a DataFrame using @ref:[`rf_agg_data_cells`](reference.md#rf-agg-data-cells) and @ref:[`rf_agg_no_data_cells`](reference.md#rf-agg-no-data-cells). There are ~3.8 million data cells and ~1.9 million NoData cells in this DataFrame. See the section on @ref:["NoData" handling](nodata-handling.md) for additional discussion on handling missing data.
 
-```python, cell_counts, results='raw'
+```python, cell_counts
 rf = spark.read.raster('https://s22s-test-geotiffs.s3.amazonaws.com/MCD43A4.006/11/05/2018233/MCD43A4.A2018233.h11v05.006.2018242035530_B02.TIF')
 stats = rf.agg(rf_agg_data_cells('proj_raster'), rf_agg_no_data_cells('proj_raster'))
-
-stats.show()
+stats
 ```
 
 ## Statistical Summaries
@@ -77,16 +78,16 @@ stats = rf.select(rf_tile_stats('proj_raster').alias('stats'))
 stats.printSchema()
 ```
 
-```python, show_stats, results='raw'
-stats.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').show(10, truncate=False)
+```python, show_stats
+stats.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance')
 ```
 
 The @ref:[`rf_agg_stats`](reference.md#rf-agg-stats) function aggregates over all of the _tiles_ in a DataFrame and returns a statistical summary of all cell values as shown below.
 
-```python, agg_stats, results='raw'
-rf.agg(rf_agg_stats('proj_raster').alias('stats')) \
-  .select('stats.min', 'stats.max', 'stats.mean', 'stats.variance') \
-  .show()
+```python, agg_stats
+stats = rf.agg(rf_agg_stats('proj_raster').alias('stats')) \
+  .select('stats.min', 'stats.max', 'stats.mean', 'stats.variance')
+stats
 ```
 
 The @ref:[`rf_agg_local_stats`](reference.md#rf-agg-local-stats) function computes the element-wise local aggregate statistical summary as shown below. The DataFrame used in the previous two code blocks has unequal _tile_ dimensions, so a different DataFrame is used in this code block to avoid a runtime error.
8 changes: 4 additions & 4 deletions pyrasterframes/src/main/python/docs/getting-started.pymd
@@ -34,17 +34,17 @@ spark = pyrasterframes.get_spark_session()
 
 Then, you can read a raster and work with it in a Spark DataFrame.
 
-```python, local_add, results='raw'
+```python, local_add
 from pyrasterframes.rasterfunctions import *
 from pyspark.sql.functions import lit
 
 # Read a MODIS surface reflectance granule
 df = spark.read.raster('https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF')
 
 # Add 3 element-wise, show some rows of the DataFrame
-df.withColumn('added', rf_local_add(df.proj_raster, lit(3))) \
-  .select(rf_crs('added'), rf_extent('added'), rf_tile('added')) \
-  .show(3)
+sample = df.withColumn('added', rf_local_add(df.proj_raster, lit(3))) \
+  .select(rf_crs('added'), rf_extent('added'), rf_tile('added'))
+sample
 ```
 
 This example is extended in the [getting started Jupyter notebook](https://nbviewer.jupyter.org/github/locationtech/rasterframes/blob/develop/rf-notebook/src/main/notebooks/Getting%20Started.ipynb).
2 changes: 2 additions & 0 deletions pyrasterframes/src/main/python/docs/index.md
@@ -10,6 +10,8 @@ The source code can be found on GitHub at [locationtech/rasterframes](https://github.com/locationtech/rasterframes)
 
 <img src="RasterFramePipeline.png" width="600px"/>
 
+RasterFrames is released under the [Apache 2.0 License](https://github.com/locationtech/rasterframes/blob/develop/LICENSE).
+
 <hr/>
 
 @@@ div { .md-left}
15 changes: 8 additions & 7 deletions pyrasterframes/src/main/python/docs/languages.pymd
@@ -50,7 +50,7 @@ red_nir_tiles_monthly_2017 = spark.read.raster(
 
 ### Step 4: Compute aggregates
 
-```python, step_4_python, results='raw'
+```python, step_4_python
 result = red_nir_tiles_monthly_2017 \
     .where(st_intersects(
         st_reproject(rf_geometry(col('red')), rf_crs(col('red')).crsProj4, rf_mk_crs('EPSG:4326')),
@@ -60,7 +60,7 @@ result = red_nir_tiles_monthly_2017 \
     .agg(rf_agg_stats(rf_normalized_difference(col('nir'), col('red'))).alias('ndvi_stats')) \
     .orderBy(col('month')) \
     .select('month', 'ndvi_stats.*')
-result.show()
+result
 ```
 
 ## SQL
@@ -80,14 +80,14 @@ sql("CREATE OR REPLACE TEMPORARY VIEW modis USING `aws-pds-modis-catalog`")
 
 ### Step 2: Down-select data by month
 
-```python, step_2_sql, results='raw'
+```python, step_2_sql
 sql("""
 CREATE OR REPLACE TEMPORARY VIEW red_nir_monthly_2017 AS
 SELECT granule_id, month(acquisition_date) as month, B01 as red, B02 as nir
 FROM modis
 WHERE year(acquisition_date) = 2017 AND day(acquisition_date) = 15 AND granule_id = 'h21v09'
 """)
-sql('DESCRIBE red_nir_monthly_2017').show()
+sql('DESCRIBE red_nir_monthly_2017')
 ```
 
 ### Step 3: Read tiles
@@ -106,16 +106,17 @@ OPTIONS (
 
 ### Step 4: Compute aggregates
 
-```python, step_4_sql, results='raw'
-sql("""
+```python, step_4_sql
+grouped = sql("""
 SELECT month, ndvi_stats.* FROM (
   SELECT month, rf_agg_stats(rf_normalized_difference(nir, red)) as ndvi_stats
   FROM red_nir_tiles_monthly_2017
   WHERE st_intersects(st_reproject(rf_geometry(red), rf_crs(red), 'EPSG:4326'), st_makePoint(34.870605, -4.729727))
  GROUP BY month
  ORDER BY month
 )
-""").show()
+""")
+grouped
 ```
 
 ## Scala