diff --git a/pyrasterframes/src/main/python/docs/nodata-handling.pymd b/pyrasterframes/src/main/python/docs/nodata-handling.pymd index 568da4a75..b9e2d641d 100644 --- a/pyrasterframes/src/main/python/docs/nodata-handling.pymd +++ b/pyrasterframes/src/main/python/docs/nodata-handling.pymd @@ -2,7 +2,7 @@ ## What is NoData? -In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value. +In raster operations, the preservation and correct processing of missing observations are very important. In [most DataFrames and in scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to the use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value. RasterFrames provides a variety of functions to inspect and manage NoData within _tiles_. 
diff --git a/pyrasterframes/src/main/python/docs/numpy-pandas.pymd b/pyrasterframes/src/main/python/docs/numpy-pandas.pymd index 537ffe3df..04c0bd4ce 100644 --- a/pyrasterframes/src/main/python/docs/numpy-pandas.pymd +++ b/pyrasterframes/src/main/python/docs/numpy-pandas.pymd @@ -45,11 +45,11 @@ import pyrasterframes.rf_ipython from pyspark.sql.functions import lit, col cat = spark.read.format('aws-pds-modis-catalog').load() \ - .filter( - (col('granule_id') == 'h11v04') & - (col('acquisition_date') > lit('2018-02-19')) & - (col('acquisition_date') < lit('2018-02-22')) - ) + .filter( + (col('granule_id') == 'h11v04') & + (col('acquisition_date') > lit('2018-02-19')) & + (col('acquisition_date') < lit('2018-02-22')) + ) spark_df = spark.read.raster(catalog=cat, catalog_col_names=['B01']) \ .select( @@ -92,7 +92,7 @@ np.abs(diff.cells).max() We can also inspect an image of the difference between the two _tiles_, which is just random noise. Both _tiles_ have the same structure of NoData, as exhibited by the white areas. 
```python udf_diff_noise_tile -display(diff) +diff.show(0, 100) ``` ## Creating a Spark DataFrame @@ -105,12 +105,11 @@ The example below will create a Pandas DataFrame with ten rows of noise _tiles_ import pandas as pd from shapely.geometry import Point -pandas_df = pd.DataFrame([ - { - 'tile': Tile(np.random.randn(100, 100)), - 'geom': Point(-90 + 90 * np.random.random((2, 1))) - } for _ in range(10) - ]) +pandas_df = pd.DataFrame([{ + 'tile': Tile(np.random.randn(100, 100)), + 'geom': Point(-90 + 90 * np.random.random((2, 1))) + } for _ in range(10) +]) spark_df = spark.createDataFrame(pandas_df) diff --git a/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py b/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py index c1fe857d4..1e2d32d86 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py +++ b/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py @@ -19,9 +19,58 @@ # import pyrasterframes.rf_types +import numpy as np + + +def plot_tile(tile, normalize, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args): + """ + Display an image of the tile + + Parameters + ---------- + normalize: if True, will normalize the data to the range [0, 1] using + lower_percentile and upper_percentile as bounds + lower_percentile: between 0 and 100 inclusive. + Specifies to clip values below this percentile + upper_percentile: between 0 and 100 inclusive. + Specifies to clip values above this percentile + axis : matplotlib axis object to plot onto. 
Uses the current pyplot axis (plt.gca()) if None + imshow_args : parameters to pass into matplotlib.pyplot.imshow + see https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.imshow.html + Returns + ------- + created or modified axis object + """ + + if axis is None: + import matplotlib.pyplot as plt + axis = plt.gca() + + arr = tile.cells + + def normalize_cells(cells): + assert upper_percentile > lower_percentile, 'invalid upper and lower percentiles {}, {}'.format(lower_percentile, upper_percentile) + sans_mask = np.array(cells) + lower = np.nanpercentile(sans_mask, lower_percentile) + upper = np.nanpercentile(sans_mask, upper_percentile) + cells_clipped = np.clip(cells, lower, upper) + return (cells_clipped - lower) / (upper - lower) + axis.set_aspect('equal') + axis.xaxis.set_ticks([]) + axis.yaxis.set_ticks([]) + + if normalize: + cells = normalize_cells(arr) + else: + cells = arr + + axis.imshow(cells, **imshow_args) -def tile_to_png(tile, fig_size=None): + return axis + + +def tile_to_png(tile, lower_percentile=1, upper_percentile=99, title=None, fig_size=None): """ Provide image of Tile.""" if tile.cells is None: return None @@ -31,7 +80,7 @@ def tile_to_png(tile, fig_size=None): from matplotlib.figure import Figure # Set up matplotlib objects - nominal_size = 2 # approx full size for a 256x256 tile + nominal_size = 3 # approx full size for a 256x256 tile if fig_size is None: fig_size = (nominal_size, nominal_size) @@ -39,15 +88,16 @@ def tile_to_png(tile, fig_size=None): canvas = FigureCanvas(fig) axis = fig.add_subplot(1, 1, 1) - data = tile.cells - - axis.imshow(data) + plot_tile(tile, True, lower_percentile, upper_percentile, axis=axis) axis.set_aspect('equal') axis.xaxis.set_ticks([]) axis.yaxis.set_ticks([]) - axis.set_title('{}, {}'.format(tile.dimensions(), tile.cell_type.__repr__()), - fontsize=fig_size[0]*4) # compact metadata as title + if title is None: + axis.set_title('{}, {}'.format(tile.dimensions(), tile.cell_type.__repr__()), + fontsize=fig_size[0]*4) # 
compact metadata as title + else: + axis.set_title(title, fontsize=fig_size[0]*4) # caller-supplied title with io.BytesIO() as output: canvas.print_png(output) @@ -58,7 +108,7 @@ def tile_to_html(tile, fig_size=None): """ Provide HTML string representation of Tile image.""" import base64 b64_img_html = '' - png_bits = tile_to_png(tile, fig_size) + png_bits = tile_to_png(tile, fig_size=fig_size) b64_png = base64.b64encode(png_bits).decode('utf-8').replace('\n', '') return b64_img_html.format(b64_png) @@ -102,6 +152,7 @@ def _safe_tile_to_html(t): pd.set_option('display.max_colwidth', default_max_colwidth) return return_html + def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False): from pyrasterframes import RFContext return RFContext.active().call("_dfToMarkdown", df._jdf, num_rows, truncate) @@ -122,7 +173,9 @@ def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False): markdown_formatter = ip.display_formatter.formatters['text/markdown'] html_formatter.for_type(pyspark.sql.DataFrame, spark_df_to_markdown) - Tile.show = lambda t: display_png(t._repr_png_(), raw=True) + Tile.show = lambda tile, normalize=False, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args: \ + plot_tile(tile, normalize, lower_percentile, upper_percentile, axis, **imshow_args) + Tile.show.__doc__ = plot_tile.__doc__ # See if we're in documentation mode and register a custom show implementation. if 'InProcessInteractiveShell' in ip.__class__.__name__: @@ -130,6 +183,4 @@ def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False): pyspark.sql.DataFrame.show = lambda df, num_rows=5, truncate=True: display_markdown(spark_df_to_markdown(df, num_rows, truncate), raw=True) except ImportError as e: - print(e) - raise e pass