Skip to content

Commit

Permalink
Merge 945b5d0 into 202b06c
Browse files Browse the repository at this point in the history
  • Loading branch information
mblackgeo committed Jul 8, 2018
2 parents 202b06c + 945b5d0 commit edd4ce2
Show file tree
Hide file tree
Showing 11 changed files with 337 additions and 135 deletions.
16 changes: 2 additions & 14 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,33 +1,21 @@
language: python
python:
# We don't actually use the Travis Python, but this keeps it organized.
- "3.5"
- "3.6"
install:
- sudo apt-get update
# We do this conditionally because it saves us some downloading if the
# version is the same.
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
else
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
fi
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- hash -r
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
# Useful for debugging any issues with conda
- conda info -a

# Replace dep1 dep2 ... with your dependencies
- conda create -q -n test-environment -c conda-forge python=$TRAVIS_PYTHON_VERSION gdal geopandas pandas pyproj pytest pytest-cov
- conda create -q -n test-environment -c conda-forge python=$TRAVIS_PYTHON_VERSION numpy gdal geopandas pandas pyproj pytest pytest-cov
- source activate test-environment
- pip install python-coveralls
- python setup.py install

script:
# Your test script goes here
python -m pytest -v --junitxml=result.xml --cov rastertodataframe --cov-report term-missing
after_success:
- coveralls
5 changes: 5 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
History
=======

0.1.1 (2018-07-08)
------------------

* All logic implement with unit tests. Prepare for PyPI release.

0.1.0 (2018-07-07)
------------------

Expand Down
27 changes: 26 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ Raster To DataFrame

A simple python module that converts a raster to a Pandas DataFrame.

.. code-block:: python
from rastertodataframe import raster_to_dataframe
raster_path = '/some/gdal/compatible/file.tif'
vector_path = '/some/ogr/compatible/file.geojson'
# Extract all image pixels (no vector).
df = raster_to_dataframe(raster_path)
# Extract only pixels the vector touches and include the vector metadata.
df = raster_to_dataframe(raster_path, vector_path=vector_path)
* Free software: MIT license
* Documentation: https://rastertodataframe.readthedocs.io.
Expand All @@ -31,7 +44,19 @@ A simple python module that converts a raster to a Pandas DataFrame.
Features
--------

* TODO
* Convert any GDAL compatible raster to a Pandas DataFrame.
* Optionally, if any OGR compatible vector file is given, only pixels touched by the vector are extracted from the raster. The output DataFrame includes these pixels as well as any attributed from the vector file.


Installation
------------

.. code-block:: python
pip install rastertodataframe
* A working GDAL/OGR installation is required. This is best accomplished with ``conda``.


Credits
-------
Expand Down
11 changes: 10 additions & 1 deletion docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,13 @@ Usage

To use Raster To DataFrame in a project::

import rastertodataframe
from rastertodataframe import raster_to_dataframe

raster_path = '/some/gdal/compatible/file.tif'
vector_path = '/some/ogr/compatible/file.geojson'

# Extract all image pixels (no vector).
df = raster_to_dataframe(raster_path)

# Extract only pixels the vector touches and include the vector metadata.
df = raster_to_dataframe(raster_path, vector_path=vector_path)
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: dev
name: rastertodataframe
channels:
- conda-forge
- defaults
Expand All @@ -7,6 +7,7 @@ dependencies:
- geopandas=0.3.0=py36_0
- pandas=0.23.2=py36_0
- pyproj=1.9.5.1=py36_0
- numpy=1.14.2=py36hdbf6ddf_1
- pip:
- coverage==4.5.1
- flake8==3.5.0
Expand Down
3 changes: 3 additions & 0 deletions rastertodataframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
__author__ = """Martin Black"""
__email__ = 'mblack@posteo.de'
__version__ = '0.1.0'

from .rastertodataframe import *
from . import util
76 changes: 67 additions & 9 deletions rastertodataframe/rastertodataframe.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
# -*- coding: utf-8 -*-
import os
import logging
import tempfile

import numpy as np
import pandas as pd
import geopandas as gpd

from . import util
from rastertodataframe import util

log = logging.getLogger(__name__)


def raster_to_dataframe(raster_path, vector_path=None):
"""Convert a raster to a Pandas DataFrame.
.. note:: This requires enough memory to load the entire raster.
Parameters
----------
raster_path : str
Expand All @@ -25,12 +29,66 @@ def raster_to_dataframe(raster_path, vector_path=None):
-------
pandas.core.frame.DataFrame
"""
raster = util.open_raster(raster_path)
vector = util.open_vector(vector_path) if vector_path is not None else None
# Placeholders for possible temporary files.
fid1 = fid2 = tmp_fname = vector_mask_fname = None

# Get raster band names.
ras = util.open_raster(raster_path)
raster_band_names = util.get_raster_band_names(ras)

# Create a mask from the pixels touched by the vector.
if vector_path is not None:
fid1, tmp_fname = tempfile.mkstemp(suffix='.geojson')

# Add a dummy feature ID column to the vector.
# This is not always present in OGR features.
vec_gdf = util.open_vector(vector_path, with_geopandas=True)
mask_values = list(range(1, len(vec_gdf) + 1))
vec_gdf['__fid__'] = pd.Series(mask_values)
vec_gdf.to_file(tmp_fname, driver='GeoJSON')

# Mask the vector using the feature ID column.
fid2, vector_mask_fname = tempfile.mkstemp()
vector_mask = util.burn_vector_mask_into_raster(
raster_path, tmp_fname, vector_mask_fname,
vector_field='__fid__')

# Loop over mask values to extract pixels.
mask_arr = vector_mask.GetRasterBand(1).ReadAsArray()
ras_arr = ras.ReadAsArray() # TODO this is not memory efficient.

out_dfs = []
for mask_val in mask_values:
pixels = util.extract_masked_px(ras_arr, mask_arr, mask_val=mask_val)\
.transpose()
fid_px = np.ones(pixels.shape[0]) * mask_val

mask_df = pd.DataFrame(pixels, columns=raster_band_names)
mask_df['__fid__'] = fid_px
out_dfs.append(mask_df)

# Fill DataFrame with pixels.
out_df = pd.concat(out_dfs)

# Join with vector attributes.
out_df = out_df.merge(vec_gdf, how='left', on='__fid__')

else:
# No vector given, simply load the raster.
ras_arr = ras.ReadAsArray() # TODO not memory efficient.
mask_arr = np.ones((ras_arr.shape[1], ras_arr.shape[2]))
pixels = util.extract_masked_px(ras_arr, mask_arr).transpose()
out_df = pd.DataFrame(pixels, columns=raster_band_names)

# TODO screen no data values.

# If vector:
# Get vector mask (either all features or by a specific vector field).
# Loop over mask values to extract pixels.
# Remove temporary files.
if fid1 is not None and tmp_fname is not None:
os.close(fid1)
os.remove(tmp_fname)
if fid2 is not None and vector_mask_fname is not None:
os.close(fid2)
os.remove(vector_mask_fname)

# Assemble dataframe from raster pixels.
pass
# Return dropping any extra cols.
return out_df.drop(columns=['__fid__', 'geometry'], errors='ignore')
66 changes: 64 additions & 2 deletions rastertodataframe/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,8 @@ def _create_empty_raster(template, out_path, n_bands=1, no_data_value=None):
return out_dataset


def _burn_vector_mask_into_raster(raster_path, vector_path, out_path,
vector_field=None):
def burn_vector_mask_into_raster(raster_path, vector_path, out_path,
vector_field=None):
"""Create a new raster based on the input raster with vector features
burned into the raster. To be used as a mask for pixels in the vector.
Expand Down Expand Up @@ -253,3 +253,65 @@ def _burn_vector_mask_into_raster(raster_path, vector_path, out_path,
out_ds = None

return open_raster(out_path)


def get_raster_band_names(raster):
"""Obtain the names of bands from a raster. The raster metadata is queried
first, if no names a present, a 1-index list of band_N is returned.
Parameters
----------
raster : gdal.Dataset
Returns
-------
list[str]
"""
band_names = []
for i in range(1, raster.RasterCount + 1):
band = raster.GetRasterBand(i)

if band.GetDescription():
# Use the band description.
band_names.append(band.GetDescription())
else:
# Check for metedata.
this_band_name = 'Band_{}'.format(band.GetBand())
metadata = band.GetDataset().GetMetadata_Dict()

# If in metadata, return the metadata entry, else Band_N.
if this_band_name in metadata and metadata[this_band_name]:
band_names.append(metadata[this_band_name])
else:
band_names.append(this_band_name)

return band_names


def extract_masked_px(ras, mask, mask_val=None):
"""Extract raster values according to a mask.
Parameters
----------
ras : np.ndarray
2D or 3D input data in the form [bands][y][x].
mask : np.ndarray
1D or 2D with zeroes to mask data.
mask_val : int
Value of the data pixels in the mask. Default: non-zero.
Returns
-------
np.ndarray
Array of non-masked data.
"""
if mask is None:
return ras

# Use the mask to get the indices of the non-zero pixels.
if mask_val:
(i, j) = (mask == mask_val).nonzero()
else:
(i, j) = mask.nonzero()

return (ras[i, j] if ras.ndim == 2 else ras[:, i, j])
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ GDAL==2.2.4
geopandas==0.3.0
pandas==0.23.2
pyproj==1.9.5.1
numpy==1.14.2

0 comments on commit edd4ce2

Please sign in to comment.