diff --git a/.circleci/.dockerignore b/.circleci/.dockerignore new file mode 100644 index 000000000..dbe9a91d7 --- /dev/null +++ b/.circleci/.dockerignore @@ -0,0 +1,3 @@ +* +!requirements-conda.txt +!fix-permissions diff --git a/.circleci/Dockerfile b/.circleci/Dockerfile index d498294c2..f4629597a 100644 --- a/.circleci/Dockerfile +++ b/.circleci/Dockerfile @@ -1,63 +1,35 @@ -FROM circleci/openjdk:8-jdk - -ENV MINICONDA_VERSION=4.8.2 \ - MINICONDA_MD5=87e77f097f6ebb5127c77662dfc3165e \ - CONDA_VERSION=4.8.2 \ - CONDA_DIR=/opt/conda \ - PYTHON_VERSION=3.7.7 +FROM circleci/openjdk:11-jdk +#LABEL org.opencontainers.image.source=https://github.com/locationtech/rasterframes USER root -ENV PATH=$CONDA_DIR/bin:$PATH - -# circleci is 3434 -COPY --chown=3434:3434 fix-permissions /tmp - +# See: https://docs.conda.io/projects/conda/en/latest/user-guide/install/rpm-debian.html RUN \ - apt-get update && \ - apt-get install -yq --no-install-recommends \ - sudo \ - wget \ - bzip2 \ - file \ - libtinfo5 \ - ca-certificates \ - gettext-base \ - locales && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* + curl -s https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \ + install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \ + gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list RUN \ - cd /tmp && \ - mkdir -p $CONDA_DIR && \ - wget --quiet https://repo.continuum.io/miniconda/Miniconda3-py37_${MINICONDA_VERSION}-Linux-x86_64.sh && \ - echo "${MINICONDA_MD5} *Miniconda3-py37_${MINICONDA_VERSION}-Linux-x86_64.sh" | md5sum -c - && \ - /bin/bash Miniconda3-py37_${MINICONDA_VERSION}-Linux-x86_64.sh -f -b -p $CONDA_DIR && \ - rm Miniconda3-py37_${MINICONDA_VERSION}-Linux-x86_64.sh && \ - conda config --system --set auto_update_conda false && \ - conda config --system --set show_channel_urls true && \ - conda config --system --set channel_priority strict && \ - if [ ! $PYTHON_VERSION = 'default' ]; then conda install --yes python=$PYTHON_VERSION; fi && \ - conda list python | grep '^python ' | tr -s ' ' | cut -d '.' 
-f 1,2 | sed 's/$/.*/' >> $CONDA_DIR/conda-meta/pinned && \
-    conda install --quiet --yes conda && \
-    conda install --quiet --yes pip && \
-    pip config set global.progress_bar off && \
-    echo "$CONDA_DIR/lib" > /etc/ld.so.conf.d/conda.conf && \
-    conda clean --all --force-pkgs-dirs --yes --quiet && \
-    sh /tmp/fix-permissions $CONDA_DIR 2> /dev/null
+    apt-get update && \
+    apt-get install -yq --no-install-recommends conda && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
-COPY requirements-conda.txt /tmp/
+ENV CONDA_DIR=/opt/conda
+ENV PATH=$CONDA_DIR/bin:$PATH
+COPY requirements-conda.txt fix-permissions /tmp/
 RUN \
-    conda install --channel conda-forge --no-channel-priority --freeze-installed \
-       --file /tmp/requirements-conda.txt && \
+    conda install --quiet --yes --channel=conda-forge --file=/tmp/requirements-conda.txt && \
+    echo "$CONDA_DIR/lib" > /etc/ld.so.conf.d/conda.conf && \
+    ldconfig && \
     conda clean --all --force-pkgs-dirs --yes --quiet && \
-    sh /tmp/fix-permissions $CONDA_DIR 2> /dev/null && \
-    ldconfig 2> /dev/null
+    sh /tmp/fix-permissions $CONDA_DIR
+
 # Work-around for pyproj issue https://github.com/pyproj4/pyproj/issues/415
 ENV PROJ_LIB=/opt/conda/share/proj
 
 USER 3434
-
 WORKDIR /home/circleci
diff --git a/.circleci/Makefile b/.circleci/Makefile
index 35d44a7a5..578140c4e 100644
--- a/.circleci/Makefile
+++ b/.circleci/Makefile
@@ -1,19 +1,27 @@
-IMAGE_NAME=miniconda-gdal
-VERSION=latest
-HOST=docker.pkg.github.com
-REPO=${HOST}/locationtech/rasterframes
-FULL_NAME=${REPO}/${IMAGE_NAME}:${VERSION}
+IMAGE_NAME=circleci-openjdk-conda-gdal
+SHA=$(shell git log -n1 --format=format:"%H" | cut -c 1-7)
+VERSION?=$(SHA)
+HOST=docker.io
+REPO=$(HOST)/s22s
+FULL_NAME=$(REPO)/$(IMAGE_NAME):$(VERSION)
 
-all: build login push
+.DEFAULT_GOAL := help
+help:
+# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+	@echo "Usage: make [target]"
+	@echo "Targets: "
+	@grep -E '^[a-zA-Z0-9_%/-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\t\033[36m%-20s\033[0m %s\n", $$1, $$2}'
 
-build:
+all: build push ## Build and then push image
+
+build: ## Build the docker image
	docker build . 
-t ${FULL_NAME} -login: - docker login ${HOST} +login: ## Login to the docker registry + docker login -push: +push: login ## Push docker image to registry docker push ${FULL_NAME} -shell: build +run: build ## Build image and launch shell docker run --rm -it ${FULL_NAME} bash diff --git a/.circleci/config.yml b/.circleci/config.yml index 07bd8ce85..aafc628f2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,11 +6,10 @@ orbs: executors: default: docker: - - image: s22s/miniconda-gdal:latest + - image: s22s/circleci-openjdk-conda-gdal:b8e30ee working_directory: ~/repo environment: - SBT_VERSION: 1.3.8 - SBT_OPTS: -Xmx768m + SBT_OPTS: "-Xms64m -Xmx1536m -Djava.awt.headless=true -Dsun.io.serialization.extendedDebugInfo=true" commands: setup: description: Setup for sbt build @@ -24,8 +23,7 @@ orbs: steps: - run: name: "Compile Scala via sbt" - command: |- - sbt -v -batch compile test:compile it:compile + command: sbt -v -batch compile test:compile it:compile python: commands: @@ -60,6 +58,7 @@ orbs: mkdir -p /tmp/core_dumps ls -lh /tmp cp core.* *.hs /tmp/core_dumps 2> /dev/null || true + cp core/* /tmp/core_dumps/ 2> /dev/null || true cp -r /tmp/hsperfdata* /tmp/*.hprof /tmp/core_dumps 2> /dev/null || true when: on_fail @@ -125,24 +124,23 @@ jobs: - run: name: "Scala Tests: core" - command: sbt -batch core/test + command: sbt -v -batch core/test - run: name: "Scala Tests: datasource" - command: sbt -batch datasource/test + command: sbt -v -batch datasource/test - run: name: "Scala Tests: experimental" - command: sbt -batch experimental/test + command: sbt -v -batch experimental/test - run: name: "Create PyRasterFrames package" - command: |- - sbt -v -batch pyrasterframes/package + command: sbt -v -batch pyrasterframes/package - run: name: "Python Tests" - command: sbt -batch pyrasterframes/test + command: sbt -v -batch pyrasterframes/test - rasterframes/save-artifacts - rasterframes/save-cache @@ -249,4 +247,4 @@ workflows: - test - it - it-no-gdal - - docs + - docs \ No newline at end of file diff --git a/.circleci/fix-permissions b/.circleci/fix-permissions index 2a2bb9d7d..d8e14920f 100755 --- a/.circleci/fix-permissions +++ b/.circleci/fix-permissions @@ -1,19 +1,4 @@ #!/usr/bin/env bash -# set permissions on a directory -# after any installation, if a directory needs to be (human) user-writable, -# run this script on it. -# It will make everything in the directory owned by the group $NB_GID -# and writable by that group. -# Deployments that want to set a specific user id can preserve permissions -# by adding the `--group-add users` line to `docker run`. 
- -# uses find to avoid touching files that already have the right permissions, -# which would cause massive image explosion - -# right permissions are: -# group=$NB_GID -# AND permissions include group rwX (directory-execute) -# AND directories have setuid,setgid bits set set -e diff --git a/.circleci/requirements-conda.txt b/.circleci/requirements-conda.txt index 17c4761d9..a8ebfd56b 100644 --- a/.circleci/requirements-conda.txt +++ b/.circleci/requirements-conda.txt @@ -1,3 +1,5 @@ -gdal==2.4.4 +python==3.8 +gdal==3.1.2 libspatialindex +rasterio[s3] rtree \ No newline at end of file diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml new file mode 100644 index 000000000..9462aaba1 --- /dev/null +++ b/.github/workflows/build-test.yml @@ -0,0 +1,131 @@ +name: Build and Test + +on: + pull_request: + branches: ['**'] + push: + branches: ['main'] + tags: [v*] + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + container: + image: s22s/circleci-openjdk-conda-gdal:b8e30ee + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: coursier/cache-action@v6 + - uses: olafurpg/setup-scala@v13 + with: + java-version: adopt@1.11 + + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install Conda dependencies + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + $CONDA/bin/conda install -c conda-forge --yes --file pyrasterframes/src/main/python/requirements-condaforge.txt + + - run: ulimit -c unlimited -S + + # Do just the compilation stage to minimize sbt memory footprint + - name: Compile + run: sbt -v -batch compile test:compile it:compile + + - name: Core tests + run: sbt -batch core/test + + - name: Datasource tests + run: sbt -batch datasource/test + + - name: Experimental tests + run: sbt -batch experimental/test + + - name: Create PyRasterFrames package + run: sbt -v -batch pyrasterframes/package + + - name: Python tests + run: sbt -batch pyrasterframes/test + + - name: Collect artifacts + if: ${{ failure() }} + run: | + mkdir -p /tmp/core_dumps + ls -lh /tmp + cp core.* *.hs /tmp/core_dumps/ 2> /dev/null || true + cp ./core/*.log /tmp/core_dumps/ 2> /dev/null || true + cp -r /tmp/hsperfdata* /tmp/*.hprof /tmp/core_dumps/ 2> /dev/null || true + cp repo/core/core/* /tmp/core_dumps/ 2> /dev/null || true + + - name: Upload core dumps + if: ${{ failure() }} + uses: actions/upload-artifact@v2 + with: + name: core-dumps + path: /tmp/core_dumps + + docs: + runs-on: ubuntu-latest + container: + image: s22s/circleci-openjdk-conda-gdal:b8e30ee + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: coursier/cache-action@v6 + - uses: olafurpg/setup-scala@v13 + with: + java-version: adopt@1.11 + + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install Conda dependencies + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + $CONDA/bin/conda install -c conda-forge --yes --file pyrasterframes/src/main/python/requirements-condaforge.txt + + - run: ulimit -c unlimited -S + + - name: Build documentation + run: sbt makeSite + + - name: Collect artifacts + if: ${{ failure() }} + run: | + mkdir -p /tmp/core_dumps + cp core.* *.hs /tmp/core_dumps 2> /dev/null || true + mkdir -p /tmp/markdown + cp pyrasterframes/target/python/docs/*.md /tmp/markdown 2> /dev/null || true + + - name: Upload core dumps + if: ${{ failure() }} + uses: 
actions/upload-artifact@v2
+        with:
+          name: core-dumps
+          path: /tmp/core_dumps
+
+      - name: Upload markdown
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: markdown
+          path: /tmp/markdown
+
+      - name: Upload rf-site
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: rf-site
+          path: docs/target/site
\ No newline at end of file
diff --git a/.sbtopts b/.sbtopts
deleted file mode 100644
index 8b1378917..000000000
--- a/.sbtopts
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/.scalafmt.conf b/.scalafmt.conf
index 82235ad30..499bd1da7 100644
--- a/.scalafmt.conf
+++ b/.scalafmt.conf
@@ -1,4 +1,4 @@
-version = 3.0.3
+version = 3.0.4
 runner.dialect = scala212
 indent.main = 2
 indent.significant = 2
diff --git a/build.sbt b/build.sbt
index bc4f3cceb..a04cadd66 100644
--- a/build.sbt
+++ b/build.sbt
@@ -19,6 +19,9 @@
  *
  */
 
+// Leave me and my custom keys alone!
+Global / lintUnusedKeysOnLoad := false
+
 addCommandAlias("makeSite", "docs/makeSite")
 addCommandAlias("previewSite", "docs/previewSite")
 addCommandAlias("ghpagesPushSite", "docs/ghpagesPushSite")
@@ -52,6 +55,10 @@ lazy val core = project
     libraryDependencies ++= Seq(
       `slf4j-api`,
       shapeless,
+      circe("core").value,
+      circe("generic").value,
+      circe("parser").value,
+      circe("generic-extras").value,
       frameless excludeAll ExclusionRule("com.github.mpilquist", "simulacrum"),
       `jts-core`,
       `spray-json`,
@@ -152,14 +159,14 @@ lazy val docs = project
   .dependsOn(core, datasource, pyrasterframes)
   .enablePlugins(SiteScaladocPlugin, ParadoxPlugin, ParadoxMaterialThemePlugin, GhpagesPlugin, ScalaUnidocPlugin)
   .settings(
-    apiURL := Some(url("http://rasterframes.io/latest/api")),
+    apiURL := Some(url("https://rasterframes.io/latest/api")),
     autoAPIMappings := true,
     ghpagesNoJekyll := true,
     ScalaUnidoc / siteSubdirName := "latest/api",
     paradox / siteSubdirName := ".",
     paradoxProperties ++= Map(
       "version" -> version.value,
-      "scaladoc.org.apache.spark.sql.rf" -> "http://rasterframes.io/latest",
+      "scaladoc.org.apache.spark.sql.rf" -> "https://rasterframes.io/latest",
       "github.base_url" -> ""
     ),
     paradoxNavigationExpandDepth := Some(3),
diff --git a/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala b/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala
index 2c2077fe4..6c4f38654 100644
--- a/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala
+++ b/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala
@@ -45,7 +45,7 @@ class TileUDT extends UserDefinedType[Tile] {
   def userClass: Class[Tile] = classOf[Tile]
 
   def sqlType: StructType = StructType(Seq(
-    StructField("cell_type", StringType, false),
+    StructField("cellType", StringType, false),
     StructField("cols", IntegerType, false),
     StructField("rows", IntegerType, false),
     StructField("cells", BinaryType, true),
diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala
index b570b7b4a..1fd99725e 100644
--- a/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala
+++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala
@@ -82,7 +82,6 @@ package object expressions {
     registry.registerExpression[GetCRS]("rf_crs")
     registry.registerExpression[RealizeTile]("rf_tile")
     registry.registerExpression[CreateProjectedRaster]("rf_proj_raster")
-    registry.registerExpression[Subtract]("rf_local_subtract")
     registry.registerExpression[Multiply]("rf_local_multiply")
    registry.registerExpression[Divide]("rf_local_divide") 
registry.registerExpression[NormalizedDifference]("rf_normalized_difference") diff --git a/core/src/main/scala/org/locationtech/rasterframes/functions/package.scala b/core/src/main/scala/org/locationtech/rasterframes/functions/package.scala index 6545da41a..322f7d4df 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/functions/package.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/functions/package.scala @@ -99,38 +99,38 @@ package object functions { private[rasterframes] val tileOnes: (Int, Int, String) => Tile = (cols, rows, cellTypeName) => makeConstantTile(1, cols, rows, cellTypeName) - val reproject_and_merge_f: (Row, CRS, Seq[Tile], Seq[Row], Seq[CRS], Row, String) => Tile = (leftExtentEnc: Row, leftCRSEnc: CRS, tiles: Seq[Tile], rightExtentEnc: Seq[Row], rightCRSEnc: Seq[CRS], leftDimsEnc: Row, resampleMethod: String) => { - if (tiles.isEmpty) null + val reproject_and_merge_f: (Row, CRS, Seq[Tile], Seq[Row], Seq[CRS], Row, String) => Option[Tile] = (leftExtentEnc: Row, leftCRS: CRS, tiles: Seq[Tile], rightExtentEnc: Seq[Row], rightCRSs: Seq[CRS], leftDimsEnc: Row, resampleMethod: String) => { + if (tiles.isEmpty) None else { - require(tiles.length == rightExtentEnc.length && tiles.length == rightCRSEnc.length, "size mismatch") + require(tiles.length == rightExtentEnc.length && tiles.length == rightCRSs.length, "size mismatch") - val leftExtent: Extent = leftExtentEnc.as[Extent] - val leftDims: Dimensions[Int] = leftDimsEnc.as[Dimensions[Int]] - val leftCRS: CRS = leftCRSEnc - lazy val rightExtents: Seq[Extent] = rightExtentEnc.map(_.as[Extent]) - lazy val rightCRSs: Seq[CRS] = rightCRSEnc + val leftExtent = Option(leftExtentEnc).map(_.as[Extent]) + val leftDims = Option(leftDimsEnc).map(_.as[Dimensions[Int]]) + lazy val rightExtents = rightExtentEnc.map(_.as[Extent]) lazy val resample = resampleMethod match { case ResampleMethod(mm) => mm case _ => throw new IllegalArgumentException(s"Unable to parse ResampleMethod for ${resampleMethod}.") } - - if (leftExtent == null || leftDims == null || leftCRS == null) null - else { - - val cellType = tiles.map(_.cellType).reduceOption(_ union _).getOrElse(tiles.head.cellType) - - // TODO: how to allow control over... expression? - val projOpts = Reproject.Options(resample) - val dest: Tile = ArrayTile.empty(cellType, leftDims.cols, leftDims.rows) - //is there a GT function to do all this? - tiles.zip(rightExtents).zip(rightCRSs).map { - case ((tile, extent), crs) => - tile.reproject(extent, crs, leftCRS, projOpts) - }.foldLeft(dest)((d, t) => - d.merge(leftExtent, t.extent, t.tile, projOpts.method) - ) - } - } + (leftExtent, leftDims, Option(leftCRS)) + .zipped + .map((leftExtent, leftDims, leftCRS) => { + val cellType = tiles + .map(_.cellType) + .reduceOption(_ union _) + .getOrElse(tiles.head.cellType) + + // TODO: how to allow control over... expression? + val projOpts = Reproject.Options(resample) + val dest: Tile = ArrayTile.empty(cellType, leftDims.cols, leftDims.rows) + //is there a GT function to do all this? + tiles.zip(rightExtents).zip(rightCRSs).map { + case ((tile, extent), crs) => + tile.reproject(extent, crs, leftCRS, projOpts) + }.foldLeft(dest)((d, t) => + d.merge(leftExtent, t.extent, t.tile, projOpts.method) + ) + }) + }.headOption } // NB: Don't be tempted to make this a `val`. Spark will barf if `withRasterFrames` hasn't been called first. 
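
A minimal standalone sketch of the null-tolerant pattern the rewritten `reproject_and_merge_f` above relies on. Only the `(a, b, c).zipped.map(...).headOption` shape comes from the change above; the object and method names here are hypothetical. In Scala 2.12 each `Option` lifts to a zero- or one-element `Iterable`, so the mapped body runs only when all three inputs are non-null, and `.headOption` collapses the result back to an `Option`:

object ZippedOptionDemo extends App {
  // java.lang.Integer (not Int) so that null is a legal input, mirroring the
  // nullable Row/CRS arguments handled by reproject_and_merge_f above.
  def combine(a: Integer, b: Integer, c: Integer): Option[Int] =
    (Option(a), Option(b), Option(c))
      .zipped                       // each Option acts as a 0- or 1-element Iterable
      .map((x, y, z) => x + y + z)  // body runs only if all three are defined
      .headOption                   // 0- or 1-element result back to an Option

  println(combine(1, 2, 3))    // Some(6)
  println(combine(1, null, 3)) // None
}
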
diff --git a/core/src/main/scala/org/locationtech/rasterframes/ref/Subgrid.scala b/core/src/main/scala/org/locationtech/rasterframes/ref/Subgrid.scala
index 811b191f7..665fae5d1 100644
--- a/core/src/main/scala/org/locationtech/rasterframes/ref/Subgrid.scala
+++ b/core/src/main/scala/org/locationtech/rasterframes/ref/Subgrid.scala
@@ -1,3 +1,23 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2021 Azavea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
 package org.locationtech.rasterframes.ref
 
 import geotrellis.raster.GridBounds
diff --git a/core/src/test/scala/org/locationtech/rasterframes/RasterJoinSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/RasterJoinSpec.scala
index b5752b46b..e57255ea0 100644
--- a/core/src/test/scala/org/locationtech/rasterframes/RasterJoinSpec.scala
+++ b/core/src/test/scala/org/locationtech/rasterframes/RasterJoinSpec.scala
@@ -25,6 +25,7 @@ import geotrellis.proj4.CRS
 import geotrellis.raster.resample._
 import geotrellis.raster.testkit.RasterMatchers
 import geotrellis.raster.{Dimensions, IntConstantNoDataCellType, Raster, Tile}
+import geotrellis.vector.Extent
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.functions._
 import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate
@@ -195,6 +196,72 @@ class RasterJoinSpec extends TestEnvironment with TestData with RasterMatchers {
     // This just tests that the tiles are not identical
     result.getAs[Double]("min") should be > (0.0)
   }
+
+  // Regression note: this scenario previously failed inside the reproject_and_merge
+  // UDF with "Failed to execute user defined function
+  // (package$$$Lambda$4417/0x00000008019e2840: ...)" when the left-hand tile
+  // column contained nulls.
+
+  it("should raster join with null left head") {
+    // https://github.com/locationtech/rasterframes/issues/462
+    val prt = TestData.projectedRasterTile(
+      10, 10, 1,
+      Extent(0.0, 0.0, 40.0, 40.0),
+      CRS.fromEpsgCode(32611),
+    )
+
+    val left = Seq(
+      (1, "a", prt.tile, prt.tile, prt.extent, prt.crs),
+      (1, "b", null, prt.tile, prt.extent, prt.crs)
+    ).toDF("i", "j", "t", "u", "e", "c")
+
+    val right = Seq(
+      (1, prt.tile, prt.extent, prt.crs)
+    ).toDF("i", "r", "e", "c")
+
+    val joined = left.rasterJoin(right,
+      left("i") === right("i"),
+      left("e"), left("c"),
+      right("e"), right("c"),
+      NearestNeighbor
+    )
+    joined.count() should be (2)
+
+    // In the case where the head column is null it will be passed through
+    val t1 = joined
+      .select(isnull($"t"))
+      .filter($"j" === "b")
+      .first()
+
+    t1.getBoolean(0) should be(true)
+
+    // The right-hand side tile should still get its dimensions from col `u`
+    val collected = joined.select(rf_dimensions($"r")).collect()
+    collected.headOption should be (Some(Dimensions(10, 10)))
+
+    // If there is no non-null tile on the LHS then the RHS is ill defined
+    val joinedNoLeftTile = left
+      .drop($"u")
+      .rasterJoin(right,
+        left("i") === right("i"),
+        left("e"), left("c"),
+        right("e"), right("c"),
+        NearestNeighbor
+      )
+    joinedNoLeftTile.count() should be (2)
+
+    // The null left-hand tile is still passed through
+    val t2 = joinedNoLeftTile
+      .select(isnull($"t"))
+      .filter($"j" === "b")
+      .first()
+    t2.getBoolean(0) should be(true)
+
+    // Because there is no non-null tile column on the left side, the right side is null too
+    val t3 = joinedNoLeftTile
+      .select(isnull($"r"))
+      .filter($"j" === "b")
+      .first()
+    t3.getBoolean(0) should be(true)
+  }
+
 }
 
 override def additionalConf: SparkConf = super.additionalConf.set("spark.sql.codegen.comments", "true")
diff --git a/core/src/test/scala/org/locationtech/rasterframes/RasterLayerSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/RasterLayerSpec.scala
index 4decaf7cf..1dce2a6ca 100644
--- a/core/src/test/scala/org/locationtech/rasterframes/RasterLayerSpec.scala
+++ b/core/src/test/scala/org/locationtech/rasterframes/RasterLayerSpec.scala
@@ -26,7 +26,6 @@ package org.locationtech.rasterframes
 import java.net.URI
 import java.sql.Timestamp
 import java.time.ZonedDateTime
-
 import geotrellis.layer.{withMergableMethods => _, _}
 import geotrellis.proj4.{CRS, LatLng}
 import geotrellis.raster._
@@ -37,6 +36,7 @@ import org.apache.spark.sql.{SQLContext, SparkSession}
 import org.locationtech.rasterframes.ref.RFRasterSource
 import org.locationtech.rasterframes.tiles.ProjectedRasterTile
 import org.locationtech.rasterframes.util._
+import org.scalatest.BeforeAndAfterEach
 
 import scala.util.control.NonFatal
 
@@ -46,10 +46,16 @@ import scala.util.control.NonFatal
  * @since 7/10/17
  */
 class RasterLayerSpec extends TestEnvironment with MetadataKeys
-  with TestData {
+  with BeforeAndAfterEach with TestData {
   import TestData.randomTile
   import spark.implicits._
 
+  override def beforeEach(): Unit = {
+    // Try to GC to avoid OOM on low memory instances. 
+ // TODO: remove once we have a larger CI + System.gc() + } + describe("Runtime environment") { it("should provide build info") { //assert(RFBuildInfo.toMap.nonEmpty) diff --git a/core/src/test/scala/org/locationtech/rasterframes/StandardEncodersSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/StandardEncodersSpec.scala index d5ddbcc18..a2cbad0b7 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/StandardEncodersSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/StandardEncodersSpec.scala @@ -41,8 +41,6 @@ class StandardEncodersSpec extends TestEnvironment with TestData with Inspectors import spark.implicits._ val data = Dimensions[Int](256, 256) val df = List(data).toDF() - df.show() - df.printSchema() val fs = df.as[Dimensions[Int]] val out = fs.first() out shouldBe data @@ -53,8 +51,6 @@ class StandardEncodersSpec extends TestEnvironment with TestData with Inspectors import spark.implicits._ val data = TileDataContext(IntCellType, Dimensions[Int](256, 256)) val df = List(data).toDF() - df.show() - df.printSchema() val fs = df.as[TileDataContext] val out = fs.first() out shouldBe data @@ -65,8 +61,6 @@ class StandardEncodersSpec extends TestEnvironment with TestData with Inspectors import spark.implicits._ val data = ProjectedExtent(Extent(0, 0, 1, 1), LatLng) val df = List(data).toDF() - df.show() - df.printSchema() df.select($"crs".cast(StringType)).show() val fs = df.as[ProjectedExtent] val out = fs.first() @@ -84,8 +78,6 @@ class StandardEncodersSpec extends TestEnvironment with TestData with Inspectors KeyBounds(SpatialKey(0,0), SpatialKey(9,9)) ) val df = List(data).toDF() - df.show() - df.printSchema() val fs = df.as[TileLayerMetadata[SpatialKey]] val out = fs.first() out shouldBe data diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/Poke.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/Poke.scala new file mode 100644 index 000000000..37cd3898c --- /dev/null +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/Poke.scala @@ -0,0 +1,15 @@ +package org.locationtech.rasterframes.datasource +import _root_.geotrellis.raster._ + +object Poke extends App { +// import _root_.geotrellis.raster.io.geotiff.TiffType +// val enc = TiffType.tiffTypeEncoder +// println(enc(TiffType.fromCode(43))) + + val rnd = new scala.util.Random(42) + val (cols, rows) = (10, 10) + val bytes = Array.ofDim[Byte](cols * rows) + rnd.nextBytes(bytes) + val tile = ArrayTile.fromBytes(bytes, UByteCellType, cols, rows) + println(tile.renderAscii()) +} diff --git a/project/RFAssemblyPlugin.scala b/project/RFAssemblyPlugin.scala index 577af4e30..6a3646509 100644 --- a/project/RFAssemblyPlugin.scala +++ b/project/RFAssemblyPlugin.scala @@ -30,7 +30,7 @@ import scala.util.matching.Regex * Standard support for creating assembly jars. 
*/
 object RFAssemblyPlugin extends AutoPlugin {
-  override def requires = AssemblyPlugin
+  override def requires = AssemblyPlugin && RFDependenciesPlugin
 
   implicit class RichRegex(val self: Regex) extends AnyVal {
     def =~(s: String) = self.pattern.matcher(s).matches
@@ -51,6 +51,7 @@ object RFAssemblyPlugin extends AutoPlugin {
     assembly / assemblyShadeRules:= {
       val shadePrefixes = Seq(
         "shapeless",
+        "com.github.mpilquist",
         "com.amazonaws",
         "org.apache.avro",
         "org.apache.http",
@@ -58,7 +59,9 @@
         "com.google.common",
         "com.typesafe.config",
         "com.fasterxml.jackson",
-        "io.netty"
+        "io.netty",
+        "spire",
+        "cats.kernel"
       )
       shadePrefixes.map(p => ShadeRule.rename(s"$p.**" -> s"shaded.rasterframes.$p.@1").inAll)
     },
diff --git a/project/RFDependenciesPlugin.scala b/project/RFDependenciesPlugin.scala
index 1b2a8c6fe..c3f830930 100644
--- a/project/RFDependenciesPlugin.scala
+++ b/project/RFDependenciesPlugin.scala
@@ -39,7 +39,12 @@ object RFDependenciesPlugin extends AutoPlugin {
     def geomesa(module: String) = Def.setting {
       "org.locationtech.geomesa" %% s"geomesa-$module" % rfGeoMesaVersion.value
     }
-
+    def circe(module: String) = Def.setting {
+      module match {
+        case "json-schema" => "io.circe" %% s"circe-$module" % "0.1.0"
+        case _ => "io.circe" %% s"circe-$module" % "0.14.1"
+      }
+    }
     val scalatest = "org.scalatest" %% "scalatest" % "3.2.5" % Test
     val shapeless = "com.chuusai" %% "shapeless" % "2.3.7"
     val `jts-core` = "org.locationtech.jts" % "jts-core" % "1.17.0"
diff --git a/project/RFProjectPlugin.scala b/project/RFProjectPlugin.scala
index e250cb8ea..7aba41f30 100644
--- a/project/RFProjectPlugin.scala
+++ b/project/RFProjectPlugin.scala
@@ -69,6 +69,12 @@ object RFProjectPlugin extends AutoPlugin {
         email = "echeipesh@gmail.com",
         url = url("https://github.com/echeipesh")
       ),
+      Developer(
+        id = "pomadchin",
+        name = "Grigory Pomadchin",
+        email = "gpomadchin@azavea.com",
+        url = url("https://github.com/pomadchin")
+      ),
       Developer(
         id = "bguseman",
         name = "Ben Guseman",
diff --git a/pyrasterframes/src/main/python/pyrasterframes/pyproject.toml b/pyrasterframes/src/main/python/pyrasterframes/pyproject.toml
new file mode 100644
index 000000000..b62a44d4a
--- /dev/null
+++ b/pyrasterframes/src/main/python/pyrasterframes/pyproject.toml
@@ -0,0 +1,4 @@
+[build-system]
+# Minimum requirements for the build system to execute.
+requires = ["setuptools", "wheel"]  # PEP 508 specifications.
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/pyrasterframes/src/main/python/pyrasterframes/rf_types.py b/pyrasterframes/src/main/python/pyrasterframes/rf_types.py
index dfc75b941..516a0eb2c 100644
--- a/pyrasterframes/src/main/python/pyrasterframes/rf_types.py
+++ b/pyrasterframes/src/main/python/pyrasterframes/rf_types.py
@@ -26,6 +26,8 @@ class here provides the PyRasterFrames entry point.
 """
 from itertools import product
 import functools, math
+
+import pyproj
 from pyspark import SparkContext
 from pyspark.sql import DataFrame, Column
 from pyspark.sql.types import (UserDefinedType, StructType, StructField, BinaryType, DoubleType, ShortType, IntegerType, StringType)
@@ -42,7 +44,8 @@ class here provides the PyRasterFrames entry point. 
from typing import List, Tuple -__all__ = ['RasterFrameLayer', 'Tile', 'TileUDT', 'CellType', 'Extent', 'CRS', 'RasterSourceUDT', 'TileExploder', 'NoDataFilter'] +__all__ = ['RasterFrameLayer', 'Tile', 'TileUDT', 'CellType', 'Extent', + 'CRS', 'CrsUDT', 'RasterSourceUDT', 'TileExploder', 'NoDataFilter'] class cached_property(object): @@ -227,7 +230,12 @@ def __str__(self): class CRS(object): # NB: The name `crsProj4` has to match what's used in StandardSerializers.crsSerializers def __init__(self, crsProj4): - self.crsProj4 = crsProj4 + if isinstance(crsProj4, pyproj.CRS): + self.crsProj4 = crsProj4.to_proj4() + elif isinstance(crsProj4, str): + self.crsProj4 = crsProj4 + else: + raise ValueError('Unexpected CRS definition type: {}'.format(type(crsProj4))) @cached_property def __jvm__(self): @@ -242,9 +250,13 @@ def proj4_str(self): """Alias for `crsProj4`""" return self.crsProj4 + def __eq__(self, other): + return isinstance(other, CRS) and self.crsProj4 == other.crsProj4 + class CellType(object): def __init__(self, cell_type_name): + assert(isinstance(cell_type_name, str)) self.cell_type_name = cell_type_name @classmethod @@ -443,29 +455,34 @@ def sqlType(cls): """ Mirrors `schema` in scala companion object org.apache.spark.sql.rf.TileUDT """ + extent = StructType([ + StructField("xmin",DoubleType(), True), + StructField("ymin",DoubleType(), True), + StructField("xmax",DoubleType(), True), + StructField("ymax",DoubleType(), True) + ]) + subgrid = StructType([ + StructField("colMin", IntegerType(), True), + StructField("rowMin", IntegerType(), True), + StructField("colMax", IntegerType(), True), + StructField("rowMax", IntegerType() ,True) + ]) + + ref = StructType([ + StructField("source", StructType([ + StructField("raster_source_kryo", BinaryType(), False) + ]),True), + StructField("bandIndex", IntegerType(), True), + StructField("subextent", extent ,True), + StructField("subgrid", subgrid, True), + ]) + return StructType([ - StructField("cell_context", StructType([ - StructField("cellType", StructType([ - StructField("cellTypeName", StringType(), False) - ]), False), - StructField("dimensions", StructType([ - StructField("cols", ShortType(), False), - StructField("rows", ShortType(), False) - ]), False), - ]), False), - StructField("cell_data", StructType([ - StructField("cells", BinaryType(), True), - StructField("ref", StructType([ - StructField("source", RasterSourceUDT(), False), - StructField("bandIndex", IntegerType(), False), - StructField("subextent", StructType([ - StructField("xmin", DoubleType(), False), - StructField("ymin", DoubleType(), False), - StructField("xmax", DoubleType(), False), - StructField("ymax", DoubleType(), False) - ]), True) - ]), True) - ]), False) + StructField("cellType", StringType(), False), + StructField("cols", IntegerType(), False), + StructField("rows", IntegerType(), False), + StructField("cells", BinaryType(), True), + StructField("ref", ref, True) ]) @classmethod @@ -478,20 +495,14 @@ def scalaUDT(cls): def serialize(self, tile): cells = bytearray(tile.cells.flatten().tobytes()) - row = [ - # cell_context - [ - [tile.cell_type.cell_type_name], - tile.dimensions() - ], - # cell_data - [ - # cells - cells, - None - ] + dims = tile.dimensions() + return [ + tile.cell_type.cell_type_name, + dims[0], + dims[1], + cells, + None ] - return row def deserialize(self, datum): """ @@ -500,21 +511,21 @@ def deserialize(self, datum): :return: A Tile object from row data. 
""" - cell_data_bytes = datum.cell_data.cells + cell_data_bytes = datum.cells if cell_data_bytes is None: - if datum.cell_data.ref is None: + if datum.ref is None: raise Exception("Invalid Tile structure. Missing cells and reference") else: - payload = datum.cell_data.ref + payload = datum.ref ref = RFContext.active()._resolve_raster_ref(payload) cell_type = CellType(ref.cellType().name()) cols = ref.cols() rows = ref.rows() cell_data_bytes = ref.tile().toBytes() else: - cell_type = CellType(datum.cell_context.cellType.cellTypeName) - cols = datum.cell_context.dimensions.cols - rows = datum.cell_context.dimensions.rows + cell_type = CellType(datum.cellType) + cols = datum.cols + rows = datum.rows if cell_data_bytes is None: raise Exception("Unable to fetch cell data from: " + repr(datum)) @@ -540,6 +551,34 @@ def deserialize(self, datum): Tile.__UDT__ = TileUDT() +class CrsUDT(UserDefinedType): + @classmethod + def sqlType(cls): + """ + Mirrors `schema` in scala companion object org.apache.spark.sql.rf.CrsUDT + """ + return StringType() + + @classmethod + def module(cls): + return 'pyrasterframes.rf_types' + + @classmethod + def scalaUDT(cls): + return 'org.apache.spark.sql.rf.CrsUDT' + + def serialize(self, crs): + return crs.proj4_str + + def deserialize(self, datum): + return CRS(datum) + + deserialize.__safe_for_unpickling__ = True + + +CRS.__UDT__ = CrsUDT() + + class TileExploder(JavaTransformer, DefaultParamsReadable, DefaultParamsWritable): """ Python wrapper for TileExploder.scala @@ -559,3 +598,9 @@ def __init__(self): super(NoDataFilter, self).__init__() self._java_obj = self._new_java_obj("org.locationtech.rasterframes.ml.NoDataFilter", self.uid) + + def setInputCols(self, value): + """ + Sets the value of :py:attr:`inputCol`. + """ + return self._set(inputCols=value) diff --git a/pyrasterframes/src/main/python/pyrasterframes/utils.py b/pyrasterframes/src/main/python/pyrasterframes/utils.py index 54916d3db..d7ee97e2e 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/utils.py +++ b/pyrasterframes/src/main/python/pyrasterframes/utils.py @@ -52,7 +52,7 @@ def pdir(curr): # See if we're running outside of sbt build and adjust if os.path.basename(target_dir) != "target": target_dir = os.path.join(pdir(pdir(target_dir)), 'target') - jar_dir = os.path.join(target_dir, 'scala-2.11') + jar_dir = os.path.join(target_dir, 'scala-2.12') return os.path.realpath(jar_dir) diff --git a/pyrasterframes/src/main/python/pyrasterframes/version.py b/pyrasterframes/src/main/python/pyrasterframes/version.py index 5f3ec66c6..cfcfa2bf4 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/version.py +++ b/pyrasterframes/src/main/python/pyrasterframes/version.py @@ -20,4 +20,4 @@ # # Translating Java version from version.sbt to PEP440 norms -__version__: str = '0.9.1' +__version__: str = '0.9.2' diff --git a/pyrasterframes/src/main/python/requirements-condaforge.txt b/pyrasterframes/src/main/python/requirements-condaforge.txt index 9680a4129..900b3789f 100644 --- a/pyrasterframes/src/main/python/requirements-condaforge.txt +++ b/pyrasterframes/src/main/python/requirements-condaforge.txt @@ -1,4 +1,4 @@ # These packages should be installed from conda-forge, given their complex binary components. 
-gdal==2.4.4 +gdal==3.1.2 rasterio[s3] rtree diff --git a/pyrasterframes/src/main/python/setup.py b/pyrasterframes/src/main/python/setup.py index c6ad71acc..ecb516c1f 100644 --- a/pyrasterframes/src/main/python/setup.py +++ b/pyrasterframes/src/main/python/setup.py @@ -140,7 +140,6 @@ def dest_file(self, src_file): # to throw a `NotImplementedError: Can't perform this operation for unregistered loader type` pytest = 'pytest>=4.0.0,<5.0.0' - pyspark = 'pyspark==3.1.1' boto3 = 'boto3' deprecation = 'deprecation' @@ -148,13 +147,14 @@ def dest_file(self, src_file): matplotlib = 'matplotlib' fiona = 'fiona' folium = 'folium' -gdal = 'gdal==2.4.4' +gdal = 'gdal==3.1.2' geopandas = 'geopandas' ipykernel = 'ipykernel' ipython = 'ipython' numpy = 'numpy' pandas = 'pandas' pypandoc = 'pypandoc' +pyproj = 'pyproj' pytest_runner = 'pytest-runner' pytz = 'pytz' rasterio = 'rasterio' @@ -185,7 +185,7 @@ def dest_file(self, src_file): 'Bug Reports': 'https://github.com/locationtech/rasterframes/issues', 'Source': 'https://github.com/locationtech/rasterframes', }, - python_requires=">=3.5", + python_requires=">=3.7", install_requires=[ gdal, pytz, @@ -193,6 +193,7 @@ def dest_file(self, src_file): pyspark, numpy, pandas, + pyproj, tabulate, deprecation, ], diff --git a/pyrasterframes/src/main/python/tests/GeoTiffWriterTests.py b/pyrasterframes/src/main/python/tests/GeoTiffWriterTests.py index ef28c6562..e8f34f3a4 100644 --- a/pyrasterframes/src/main/python/tests/GeoTiffWriterTests.py +++ b/pyrasterframes/src/main/python/tests/GeoTiffWriterTests.py @@ -66,7 +66,7 @@ def test_unstructured_write_schemaless(self): from pyrasterframes.rasterfunctions import rf_agg_stats, rf_crs rf = self.spark.read.raster(self.img_uri) max = rf.agg(rf_agg_stats('proj_raster').max.alias('max')).first()['max'] - crs = rf.select(rf_crs('proj_raster').crsProj4.alias('c')).first()['c'] + crs = rf.select(rf_crs('proj_raster').alias('crs')).first()['crs'] dest_file = self._tmpfile() self.assertTrue(not dest_file.startswith('file://')) diff --git a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py index 0f9fab13c..620e96a6c 100644 --- a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py +++ b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py @@ -29,7 +29,6 @@ from . 
import TestEnvironment - class UtilTest(TestEnvironment): def test_spark_confs(self): @@ -100,168 +99,6 @@ def test_cell_type_conversion(self): ) -class UDT(TestEnvironment): - - def setUp(self): - self.create_layer() - - def test_mask_no_data(self): - t1 = Tile(np.array([[1, 2], [3, 4]]), CellType("int8ud3")) - self.assertTrue(t1.cells.mask[1][0]) - self.assertIsNotNone(t1.cells[1][1]) - self.assertEqual(len(t1.cells.compressed()), 3) - - t2 = Tile(np.array([[1.0, 2.0], [float('nan'), 4.0]]), CellType.float32()) - self.assertEqual(len(t2.cells.compressed()), 3) - self.assertTrue(t2.cells.mask[1][0]) - self.assertIsNotNone(t2.cells[1][1]) - - def test_tile_udt_serialization(self): - from pyspark.sql.types import StructType, StructField - - udt = TileUDT() - cell_types = (ct for ct in rf_cell_types() if not (ct.is_raw() or ("bool" in ct.base_cell_type_name()))) - - for ct in cell_types: - cells = (100 + np.random.randn(3, 3) * 100).astype(ct.to_numpy_dtype()) - - if ct.is_floating_point(): - nd = 33.0 - else: - nd = 33 - - cells[1][1] = nd - a_tile = Tile(cells, ct.with_no_data_value(nd)) - round_trip = udt.fromInternal(udt.toInternal(a_tile)) - self.assertEqual(a_tile, round_trip, "round-trip serialization for " + str(ct)) - - schema = StructType([StructField("tile", TileUDT(), False)]) - df = self.spark.createDataFrame([{"tile": a_tile}], schema) - - long_trip = df.first()["tile"] - self.assertEqual(long_trip, a_tile) - - def test_masked_deser(self): - t = Tile(np.array([[1, 2, 3,], [4, 5, 6], [7, 8, 9]]), - CellType('uint8')) - - df = self.spark.createDataFrame([Row(t=t)]) - roundtrip = df.select(rf_mask_by_value('t', - rf_local_greater('t', lit(6)), - 1)) \ - .first()[0] - self.assertEqual( - roundtrip.cells.mask.sum(), - 3, - f"Expected {3} nodata values but found Tile" - f"{roundtrip}" - ) - - def test_udf_on_tile_type_input(self): - import numpy.testing - df = self.spark.read.raster(self.img_uri) - rf = self.rf - - # create trivial UDF that does something we already do with raster_Functions - @udf('integer') - def my_udf(t): - a = t.cells - return a.size # same as rf_dimensions.cols * rf_dimensions.rows - - rf_result = rf.select( - (rf_dimensions('tile').cols.cast('int') * rf_dimensions('tile').rows.cast('int')).alias('expected'), - my_udf('tile').alias('result')).toPandas() - - numpy.testing.assert_array_equal( - rf_result.expected.tolist(), - rf_result.result.tolist() - ) - - df_result = df.select( - (rf_dimensions(df.proj_raster).cols.cast('int') * rf_dimensions(df.proj_raster).rows.cast('int') - - my_udf(rf_tile(df.proj_raster))).alias('result') - ).toPandas() - - numpy.testing.assert_array_equal( - np.zeros(len(df_result)), - df_result.result.tolist() - ) - - def test_udf_on_tile_type_output(self): - import numpy.testing - - rf = self.rf - - # create a trivial UDF that does something we already do with a raster_functions - @udf(TileUDT()) - def my_udf(t): - import numpy as np - return Tile(np.log1p(t.cells)) - - rf_result = rf.select( - rf_tile_max( - rf_local_subtract( - my_udf(rf.tile), - rf_log1p(rf.tile) - ) - ).alias('expect_zeros') - ).collect() - - # almost equal because of different implemenations under the hoods: C (numpy) versus Java (rf_) - numpy.testing.assert_almost_equal( - [r['expect_zeros'] for r in rf_result], - [0.0 for _ in rf_result], - decimal=6 - ) - - def test_no_data_udf_handling(self): - from pyspark.sql.types import StructType, StructField - - t1 = Tile(np.array([[1, 2], [0, 4]]), CellType.uint8()) - self.assertEqual(t1.cell_type.to_numpy_dtype(), 
np.dtype("uint8")) - e1 = Tile(np.array([[2, 3], [0, 5]]), CellType.uint8()) - schema = StructType([StructField("tile", TileUDT(), False)]) - df = self.spark.createDataFrame([{"tile": t1}], schema) - - @udf(TileUDT()) - def increment(t): - return t + 1 - - r1 = df.select(increment(df.tile).alias("inc")).first()["inc"] - self.assertEqual(r1, e1) - - def test_udf_np_implicit_type_conversion(self): - import math - import pandas - - a1 = np.array([[1, 2], [0, 4]]) - t1 = Tile(a1, CellType.uint8()) - exp_array = a1.astype('>f8') - - @udf(TileUDT()) - def times_pi(t): - return t * math.pi - - @udf(TileUDT()) - def divide_pi(t): - return t / math.pi - - @udf(TileUDT()) - def plus_pi(t): - return t + math.pi - - @udf(TileUDT()) - def less_pi(t): - return t - math.pi - - df = self.spark.createDataFrame(pandas.DataFrame([{"tile": t1}])) - r1 = df.select( - less_pi(divide_pi(times_pi(plus_pi(df.tile)))) - ).first()[0] - - self.assertTrue(np.all(r1.cells == exp_array)) - self.assertEqual(r1.cells.dtype, exp_array.dtype) - class TileOps(TestEnvironment): diff --git a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py index 3f17c1c3d..7b94c2f05 100644 --- a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py +++ b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py @@ -642,5 +642,5 @@ def test_rf_proj_raster(self): df = self.prdf.select(rf_proj_raster(rf_tile('proj_raster'), rf_extent('proj_raster'), rf_crs('proj_raster')).alias('roll_your_own')) - self.assertIn('tile_context', df.schema['roll_your_own'].dataType.fieldNames()) + self.assertIn('extent', df.schema['roll_your_own'].dataType.fieldNames()) diff --git a/pyrasterframes/src/main/python/tests/UDTTests.py b/pyrasterframes/src/main/python/tests/UDTTests.py new file mode 100644 index 000000000..6ed39391d --- /dev/null +++ b/pyrasterframes/src/main/python/tests/UDTTests.py @@ -0,0 +1,195 @@ +import unittest + +import numpy as np +from pyrasterframes.rasterfunctions import * +from pyrasterframes.rf_types import * +from pyspark.sql.functions import * +from pyspark.sql import Row, DataFrame +from pyproj import CRS as pyCRS + +from . 
import TestEnvironment
+
+
+class TileUDTTests(TestEnvironment):
+
+    def setUp(self):
+        self.create_layer()
+
+    def test_mask_no_data(self):
+        t1 = Tile(np.array([[1, 2], [3, 4]]), CellType("int8ud3"))
+        self.assertTrue(t1.cells.mask[1][0])
+        self.assertIsNotNone(t1.cells[1][1])
+        self.assertEqual(len(t1.cells.compressed()), 3)
+
+        t2 = Tile(np.array([[1.0, 2.0], [float('nan'), 4.0]]), CellType.float32())
+        self.assertEqual(len(t2.cells.compressed()), 3)
+        self.assertTrue(t2.cells.mask[1][0])
+        self.assertIsNotNone(t2.cells[1][1])
+
+    def test_tile_udt_serialization(self):
+        from pyspark.sql.types import StructType, StructField
+
+        udt = TileUDT()
+        cell_types = (ct for ct in rf_cell_types() if not (ct.is_raw() or ("bool" in ct.base_cell_type_name())))
+
+        for ct in cell_types:
+            cells = (100 + np.random.randn(3, 3) * 100).astype(ct.to_numpy_dtype())
+
+            if ct.is_floating_point():
+                nd = 33.0
+            else:
+                nd = 33
+
+            cells[1][1] = nd
+            a_tile = Tile(cells, ct.with_no_data_value(nd))
+            round_trip = udt.fromInternal(udt.toInternal(a_tile))
+            self.assertEqual(a_tile, round_trip, "round-trip serialization for " + str(ct))
+
+            schema = StructType([StructField("tile", TileUDT(), False)])
+            df = self.spark.createDataFrame([{"tile": a_tile}], schema)
+
+            long_trip = df.first()["tile"]
+            self.assertEqual(long_trip, a_tile)
+
+    def test_masked_deser(self):
+        t = Tile(np.array([[1, 2, 3,], [4, 5, 6], [7, 8, 9]]),
+                 CellType('uint8'))
+
+        df = self.spark.createDataFrame([Row(t=t)])
+        roundtrip = df.select(rf_mask_by_value('t',
+                                               rf_local_greater('t', lit(6)),
+                                               1)) \
+            .first()[0]
+        self.assertEqual(
+            roundtrip.cells.mask.sum(),
+            3,
+            f"Expected {3} nodata values but found Tile"
+            f"{roundtrip}"
+        )
+
+    def test_udf_on_tile_type_input(self):
+        import numpy.testing
+        df = self.spark.read.raster(self.img_uri)
+        rf = self.rf
+
+        # create a trivial UDF that does something we already do with raster_functions
+        @udf('integer')
+        def my_udf(t):
+            a = t.cells
+            return a.size  # same as rf_dimensions.cols * rf_dimensions.rows
+
+        rf_result = rf.select(
+            (rf_dimensions('tile').cols.cast('int') * rf_dimensions('tile').rows.cast('int')).alias('expected'),
+            my_udf('tile').alias('result')).toPandas()
+
+        numpy.testing.assert_array_equal(
+            rf_result.expected.tolist(),
+            rf_result.result.tolist()
+        )
+
+        df_result = df.select(
+            (rf_dimensions(df.proj_raster).cols.cast('int') * rf_dimensions(df.proj_raster).rows.cast('int') -
+             my_udf(rf_tile(df.proj_raster))).alias('result')
+        ).toPandas()
+
+        numpy.testing.assert_array_equal(
+            np.zeros(len(df_result)),
+            df_result.result.tolist()
+        )
+
+    def test_udf_on_tile_type_output(self):
+        import numpy.testing
+
+        rf = self.rf
+
+        # create a trivial UDF that does something we already do with raster_functions
+        @udf(TileUDT())
+        def my_udf(t):
+            import numpy as np
+            return Tile(np.log1p(t.cells))
+
+        rf_result = rf.select(
+            rf_tile_max(
+                rf_local_subtract(
+                    my_udf(rf.tile),
+                    rf_log1p(rf.tile)
+                )
+            ).alias('expect_zeros')
+        ).collect()
+
+        # almost equal because of different implementations under the hood: C (numpy) versus Java (rf_)
+        numpy.testing.assert_almost_equal(
+            [r['expect_zeros'] for r in rf_result],
+            [0.0 for _ in rf_result],
+            decimal=6
+        )
+
+    def test_no_data_udf_handling(self):
+        from pyspark.sql.types import StructType, StructField
+
+        t1 = Tile(np.array([[1, 2], [0, 4]]), CellType.uint8())
+        self.assertEqual(t1.cell_type.to_numpy_dtype(), np.dtype("uint8"))
+        e1 = Tile(np.array([[2, 3], [0, 5]]), CellType.uint8())
+        schema = 
StructType([StructField("tile", TileUDT(), False)])
+        df = self.spark.createDataFrame([{"tile": t1}], schema)
+
+        @udf(TileUDT())
+        def increment(t):
+            return t + 1
+
+        r1 = df.select(increment(df.tile).alias("inc")).first()["inc"]
+        self.assertEqual(r1, e1)
+
+    def test_udf_np_implicit_type_conversion(self):
+        import math
+        import pandas
+
+        a1 = np.array([[1, 2], [0, 4]])
+        t1 = Tile(a1, CellType.uint8())
+        exp_array = a1.astype('>f8')
+
+        @udf(TileUDT())
+        def times_pi(t):
+            return t * math.pi
+
+        @udf(TileUDT())
+        def divide_pi(t):
+            return t / math.pi
+
+        @udf(TileUDT())
+        def plus_pi(t):
+            return t + math.pi
+
+        @udf(TileUDT())
+        def less_pi(t):
+            return t - math.pi
+
+        df = self.spark.createDataFrame(pandas.DataFrame([{"tile": t1}]))
+        r1 = df.select(
+            less_pi(divide_pi(times_pi(plus_pi(df.tile))))
+        ).first()[0]
+
+        self.assertTrue(np.all(r1.cells == exp_array))
+        self.assertEqual(r1.cells.dtype, exp_array.dtype)
+
+
+class CrsUDTTests(TestEnvironment):
+
+    def setUp(self):
+        pass
+
+    def test_crs_udt_serialization(self):
+        udt = CrsUDT()
+
+        crs = CRS(pyCRS.from_epsg(4326).to_proj4())
+
+        roundtrip = udt.fromInternal(udt.toInternal(crs))
+        assert(crs == roundtrip)
+
+    def test_extract_from_raster(self):
+        # reading a raster should surface rf_crs as a CrsUDT-typed column
+        from pyrasterframes.rasterfunctions import rf_crs
+        rf = self.spark.read.raster(self.img_uri)
+        crs: DataFrame = rf.select(rf_crs('proj_raster').alias('crs')).distinct()
+        assert(crs.schema.fields[0].dataType == CrsUDT())
+        assert(crs.first()['crs'].proj4_str == '+proj=utm +zone=16 +datum=WGS84 +units=m +no_defs ')
diff --git a/pyrasterframes/src/main/python/tests/__init__.py b/pyrasterframes/src/main/python/tests/__init__.py
index e7fe61dba..d273f8188 100644
--- a/pyrasterframes/src/main/python/tests/__init__.py
+++ b/pyrasterframes/src/main/python/tests/__init__.py
@@ -51,7 +51,7 @@ def pdir(curr):
         return os.path.dirname(curr)
 
     here = os.path.dirname(os.path.realpath(__file__))
-    scala_target = os.path.realpath(os.path.join(pdir(pdir(here)), 'scala-2.11'))
+    scala_target = os.path.realpath(os.path.join(pdir(pdir(here)), 'scala-2.12'))
     rez_dir = os.path.realpath(os.path.join(scala_target, 'test-classes'))
     # If not running in build mode, try source dirs.
     if not os.path.exists(rez_dir):
@@ -63,7 +63,8 @@ def spark_test_session():
     spark = create_rf_spark_session(**{
         'spark.master': 'local[*, 2]',
         'spark.ui.enabled': 'false',
-        'spark.app.name': app_name
+        'spark.app.name': app_name,
+        #'spark.driver.extraJavaOptions': '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005'
     })
     spark.sparkContext.setLogLevel('ERROR')
diff --git a/rf-notebook/src/main/docker/requirements-nb.txt b/rf-notebook/src/main/docker/requirements-nb.txt
index d06f2bd94..cff89d025 100644
--- a/rf-notebook/src/main/docker/requirements-nb.txt
+++ b/rf-notebook/src/main/docker/requirements-nb.txt
@@ -1,5 +1,5 @@
-pyspark>=2.4.7,<=3.0
-gdal==2.4.4
+pyspark>=3.1
+gdal==3.1.2
 numpy
 pandas
 shapely
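
As a closing note on the TileUDT change above, a minimal sketch of how the flattened schema surfaces in a DataFrame. The object name `TileSchemaDemo` is illustrative; the tile construction mirrors `Poke.scala` above, and the sketch assumes the implicit `Tile` encoder the RasterFrames package object provides once `withRasterFrames` has been called:

import geotrellis.raster._
import org.apache.spark.sql.SparkSession
import org.locationtech.rasterframes._

object TileSchemaDemo extends App {
  val spark = SparkSession.builder()
    .master("local[*]")
    .getOrCreate()
    .withRasterFrames  // registers the UDTs; required before using tile encoders

  import spark.implicits._

  // Same construction as Poke.scala above
  val rnd = new scala.util.Random(42)
  val bytes = Array.ofDim[Byte](100)
  rnd.nextBytes(bytes)
  val tile: Tile = ArrayTile.fromBytes(bytes, UByteCellType, 10, 10)

  Seq(tile).toDF("tile").printSchema()
  // Per the sqlType in this PR, expect the flattened layout, roughly:
  //   tile: struct<cellType: string, cols: int, rows: int, cells: binary, ref: struct<...>>
  // replacing the old cell_context/cell_data nesting.
}
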