Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyrasterframes/src/main/python/docs/nodata-handling.pymd
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

## What is NoData?

In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.
In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and in scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored as compactly as possible for space efficiency, which typically leads to the use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value.

RasterFrames provides a variety of functions to inspect and manage NoData within _tiles_.

Expand Down
23 changes: 11 additions & 12 deletions pyrasterframes/src/main/python/docs/numpy-pandas.pymd
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ import pyrasterframes.rf_ipython
from pyspark.sql.functions import lit, col

cat = spark.read.format('aws-pds-modis-catalog').load() \
.filter(
(col('granule_id') == 'h11v04') &
(col('acquisition_date') > lit('2018-02-19')) &
(col('acquisition_date') < lit('2018-02-22'))
)
.filter(
(col('granule_id') == 'h11v04') &
(col('acquisition_date') > lit('2018-02-19')) &
(col('acquisition_date') < lit('2018-02-22'))
)

spark_df = spark.read.raster(catalog=cat, catalog_col_names=['B01']) \
.select(
Expand Down Expand Up @@ -92,7 +92,7 @@ np.abs(diff.cells).max()
We can also inspect an image of the difference between the two _tiles_, which is just random noise. Both _tiles_ have the same structure of NoData, as exhibited by the white areas.

```python udf_diff_noise_tile
display(diff)
diff.show(0, 100)
```

## Creating a Spark DataFrame
Expand All @@ -105,12 +105,11 @@ The example below will create a Pandas DataFrame with ten rows of noise _tiles_
import pandas as pd
from shapely.geometry import Point

pandas_df = pd.DataFrame([
{
'tile': Tile(np.random.randn(100, 100)),
'geom': Point(-90 + 90 * np.random.random((2, 1)))
} for _ in range(10)
])
pandas_df = pd.DataFrame([{
'tile': Tile(np.random.randn(100, 100)),
'geom': Point(-90 + 90 * np.random.random((2, 1)))
} for _ in range(10)
])

spark_df = spark.createDataFrame(pandas_df)

Expand Down
73 changes: 62 additions & 11 deletions pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,58 @@
#

import pyrasterframes.rf_types
import numpy as np


def plot_tile(tile, normalize, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args):
    """
    Display an image of the tile

    Parameters
    ----------
    tile : object whose ``cells`` attribute holds the array to draw
    normalize : if True, clip the cells to the values found at
        lower_percentile and upper_percentile, then rescale them so the
        lower bound maps to 0 and the upper bound maps to 1
    lower_percentile : between 0 and 100 inclusive.
        Specifies to clip values below this percentile
    upper_percentile : between 0 and 100 inclusive.
        Specifies to clip values above this percentile
    axis : matplotlib axis object to plot onto. Uses the current pyplot
        axis (``plt.gca()``) if None
    imshow_args : parameters to pass into matplotlib.pyplot.imshow
        see https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.imshow.html

    Returns
    -------
    created or modified axis object

    Raises
    ------
    ValueError : if normalize is True and upper_percentile is not greater
        than lower_percentile
    """

    if axis is None:
        # Imported lazily so matplotlib.pyplot is only needed when no axis is supplied.
        import matplotlib.pyplot as plt
        axis = plt.gca()

    arr = tile.cells

    def normalize_cells(cells):
        # Explicit exception rather than `assert`, which is stripped under `python -O`.
        if not upper_percentile > lower_percentile:
            raise ValueError(
                'invalid upper and lower percentiles {}, {}'.format(lower_percentile, upper_percentile)
            )
        # Percentiles are computed on a plain ndarray copy of the cells; NaNs are ignored.
        sans_mask = np.array(cells)
        lower = np.nanpercentile(sans_mask, lower_percentile)
        upper = np.nanpercentile(sans_mask, upper_percentile)
        cells_clipped = np.clip(cells, lower, upper)
        span = upper - lower
        if span == 0:
            # Both percentiles landed on the same value (e.g. a constant tile).
            # Dividing by zero would turn the whole image into NaN, so map
            # everything to 0 instead.
            span = 1.0
        return (cells_clipped - lower) / span

    axis.set_aspect('equal')
    axis.xaxis.set_ticks([])
    axis.yaxis.set_ticks([])

    if normalize:
        cells = normalize_cells(arr)
    else:
        cells = arr

    axis.imshow(cells, **imshow_args)

    return axis


def tile_to_png(tile, lower_percentile=1, upper_percentile=99, title=None, fig_size=None):
""" Provide image of Tile."""
if tile.cells is None:
return None
Expand All @@ -31,23 +80,24 @@ def tile_to_png(tile, fig_size=None):
from matplotlib.figure import Figure

# Set up matplotlib objects
nominal_size = 2 # approx full size for a 256x256 tile
nominal_size = 3 # approx full size for a 256x256 tile
if fig_size is None:
fig_size = (nominal_size, nominal_size)

fig = Figure(figsize=fig_size)
canvas = FigureCanvas(fig)
axis = fig.add_subplot(1, 1, 1)

data = tile.cells

axis.imshow(data)
plot_tile(tile, True, lower_percentile, upper_percentile, axis=axis)
axis.set_aspect('equal')
axis.xaxis.set_ticks([])
axis.yaxis.set_ticks([])

axis.set_title('{}, {}'.format(tile.dimensions(), tile.cell_type.__repr__()),
fontsize=fig_size[0]*4) # compact metadata as title
if title is None:
axis.set_title('{}, {}'.format(tile.dimensions(), tile.cell_type.__repr__()),
fontsize=fig_size[0]*4) # compact metadata as title
else:
axis.set_title(title, fontsize=fig_size[0]*4) # compact metadata as title

with io.BytesIO() as output:
canvas.print_png(output)
Expand All @@ -58,7 +108,7 @@ def tile_to_html(tile, fig_size=None):
""" Provide HTML string representation of Tile image."""
import base64
b64_img_html = '<img src="data:image/png;base64,{}" />'
png_bits = tile_to_png(tile, fig_size)
png_bits = tile_to_png(tile, fig_size=fig_size)
b64_png = base64.b64encode(png_bits).decode('utf-8').replace('\n', '')
return b64_img_html.format(b64_png)

Expand Down Expand Up @@ -102,6 +152,7 @@ def _safe_tile_to_html(t):
pd.set_option('display.max_colwidth', default_max_colwidth)
return return_html


def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False):
    """Return the result of the active RFContext's "_dfToMarkdown" call for `df`.

    The DataFrame's underlying `_jdf` handle is passed along with `num_rows`
    and `truncate`. The `vertical` argument is accepted but is not forwarded
    to the call.
    """
    from pyrasterframes import RFContext
    context = RFContext.active()
    return context.call("_dfToMarkdown", df._jdf, num_rows, truncate)
Expand All @@ -122,14 +173,14 @@ def spark_df_to_markdown(df, num_rows=5, truncate=True, vertical=False):
markdown_formatter = ip.display_formatter.formatters['text/markdown']
html_formatter.for_type(pyspark.sql.DataFrame, spark_df_to_markdown)

Tile.show = lambda t: display_png(t._repr_png_(), raw=True)
Tile.show = lambda tile, normalize=False, lower_percentile=1, upper_percentile=99, axis=None, **imshow_args: \
plot_tile(tile, normalize, lower_percentile, upper_percentile, axis, **imshow_args)
Tile.show.__doc__ = plot_tile.__doc__

# See if we're in documentation mode and register a custom show implementation.
if 'InProcessInteractiveShell' in ip.__class__.__name__:
pyspark.sql.DataFrame._repr_markdown_ = spark_df_to_markdown
pyspark.sql.DataFrame.show = lambda df, num_rows=5, truncate=True: display_markdown(spark_df_to_markdown(df, num_rows, truncate), raw=True)

except ImportError as e:
print(e)
raise e
pass