Skip to content

Commit

Permalink
0. Heavily refactored DataLoader.py
Browse files Browse the repository at this point in the history
  • Loading branch information
calebrob6 committed Dec 20, 2018
1 parent a16ea1c commit 92ac83e
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 229 deletions.
297 changes: 102 additions & 195 deletions DataLoader.py
@@ -1,10 +1,9 @@
import sys
import os
import time
import string

import numpy as np
from collections import defaultdict
from enum import Enum

import fiona
import fiona.transform
Expand All @@ -20,6 +19,25 @@
import pickle
import glob

import GeoTools

class GeoDataTypes(Enum):
NAIP = 1
NLCD = 2
LANDSAT_LEAFON = 3
LANDSAT_LEAFOFF = 4
BUILDINGS = 5
LANDCOVER = 6

# ------------------------------------------------------------------------------
# Le's methods for finding which NAIP tiles exist for a given "filename id"
#
# E.g. given "m_3907638_nw_18_1_20150815.mrf", find all files from other years that
# match "m_3907638_nw_18_1*"
#
# TODO: Make this lookup faster, currently O(n) where n is total number of tiles 100k-1000k
# ------------------------------------------------------------------------------

def naip2id(naip):
return '_'.join(naip.split('/')[-1].split('_')[:-1])

Expand All @@ -28,122 +46,94 @@ def get_naip_same_loc(naip):
return naip_d[naip2id(naip)]
return [naip,]

def naip_fn_to_pred_tile_fn(naip_fn):
return os.path.basename(naip_fn).split('.mrf')[0] + '_tilepred'

def get_landsat_by_extent(naip_fn, extent, pad_rad):
ls_on_fn = naip_fn.replace("/esri-naip/data/v1/", "/resampled-landsat8/data/leaf_on/")[:-4] + "_landsat.tif"
ls_off_fn = naip_fn.replace("/esri-naip/data/v1/", "/resampled-landsat8/data/leaf_off/")[:-4] + "_landsat.tif"
#print(ls_on_fn)
#print(ls_off_fn)

f = rasterio.open(ls_on_fn, "r")
geom = extent_to_transformed_geom(extent, f.crs["init"])
minx, miny, maxx, maxy = shapely.geometry.shape(geom).buffer(pad_rad).bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))
out_image1, out_transform = rasterio.mask.mask(f, [geom], crop=True)
#raster_mask, _, win = rasterio.mask.raster_geometry_mask(f, [geom], crop=True)
f.close()

f = rasterio.open(ls_off_fn, "r")
geom = extent_to_transformed_geom(extent, f.crs["init"])
minx, miny, maxx, maxy = shapely.geometry.shape(geom).buffer(pad_rad).bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))
out_image2, out_transform = rasterio.mask.mask(f, [geom], crop=True)
f.close()

return np.concatenate((out_image1, out_image2), axis=0)

def get_blg_by_extent(naip_fn, extent, pad_rad):
blg_fn = naip_fn.replace("/esri-naip/data/v1/", "/resampled-buildings/data/v1/")[:-4] + "_building.tif"

f = rasterio.open(blg_fn, "r")
geom = extent_to_transformed_geom(extent, f.crs["init"])
minx, miny, maxx, maxy = shapely.geometry.shape(geom).buffer(pad_rad).bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))
out_image, out_transform = rasterio.mask.mask(f, [geom], crop=True)
f.close()

return out_image

def get_nlcd_by_extent(naip_fn, extent, pad_rad):
nlcd_fn = naip_fn.replace("/esri-naip/", "/resampled-nlcd/")[:-4] + "_nlcd.tif"

f = rasterio.open(nlcd_fn, "r")
geom = extent_to_transformed_geom(extent, f.crs["init"])
minx, miny, maxx, maxy = shapely.geometry.shape(geom).buffer(pad_rad).bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))
out_image, out_transform = rasterio.mask.mask(f, [geom], crop=True)
f.close()

out_image = np.squeeze(out_image)
out_image = np.vectorize(cid.__getitem__)(out_image)
assert all([os.path.exists(fn) for fn in [
"data/list_all_naip.txt",
]])

return out_image
naip_d = {}
fdid = open('data/list_all_naip.txt', 'r')
while True:
line = fdid.readline().strip()
if not line:
break
naipid = naip2id(line)
if naipid in naip_d:
naip_d[naipid] += [line,]
else:
naip_d[naipid] = [line,]
fdid.close()

def get_lc_by_extent(naip_fn, extent, pad_rad):
naip_fns = get_naip_same_loc(naip_fn)
for naip_fname in naip_fns:
lc_fn = naip_fname.replace("/esri-naip/", "/resampled-lc/")[:-4] + "_lc.tif"
if os.path.exists(lc_fn):
#print(lc_fn)
# ------------------------------------------------------------------------------

f = rasterio.open(lc_fn, "r")
geom = extent_to_transformed_geom(extent, f.crs["init"])
minx, miny, maxx, maxy = shapely.geometry.shape(geom).buffer(pad_rad).bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))
out_image, out_transform = rasterio.mask.mask(f, [geom], crop=True)
f.close()

out_image = np.squeeze(out_image)
out_image[out_image>=7] = 0
# ------------------------------------------------------------------------------
# Caleb's methods for finding which NAIP tiles are assosciated with an input extent
#
# TODO: Assume that the tile_index.dat file is already created, make a separate script
# for generating it
# ------------------------------------------------------------------------------

return out_image

return np.zeros((10, 10), dtype=np.float32)
assert all([os.path.exists(fn) for fn in [
"data/tile_index.dat",
"data/tile_index.idx",
"data/tiles.p"
]])
TILES = pickle.load(open("data/tiles.p", "rb"))

def get_naip_by_extent(naip_fn, extent):
naip_fns = get_naip_same_loc(naip_fn)
naip_fn = naip_fns[-1]
#print(naip_fn)

f = rasterio.open(naip_fn, "r")
geom = extent_to_transformed_geom(extent, f.crs["init"])
def lookup_tile_by_geom(geom):
tile_index = rtree.index.Index("data/tile_index")

pad_rad = 4
bounds = shapely.geometry.shape(geom).buffer(pad_rad).bounds
minx, miny, maxx, maxy = bounds
while (maxx-minx) < 280 or (maxy-miny) < 280:
pad_rad += 20
bounds = shapely.geometry.shape(geom).buffer(pad_rad).bounds
minx, miny, maxx, maxy = bounds

# Add some margin
#minx, miny, maxx, maxy = shape(geom).buffer(50).bounds
minx, miny, maxx, maxy = shapely.geometry.shape(geom).bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))

out_image, out_transform = rasterio.mask.mask(f, [geom], crop=True)
crs = f.crs.copy()
f.close()

return out_image, out_transform, crs, bounds, pad_rad
geom = shapely.geometry.shape(geom)
intersected_indices = list(tile_index.intersection(geom.bounds))
for idx in intersected_indices:
intersected_fn = TILES[idx][0]
intersected_geom = TILES[idx][1]
if intersected_geom.contains(geom):
return intersected_fn

def get_naip_by_extent_fixed(naip_fn, extent):
naip_fns = get_naip_same_loc(naip_fn)
naip_fn = naip_fns[-1]
#print(naip_fn)
if len(intersected_indices) > 0:
raise ValueError("Error, there are overlaps with tile index, but no tile completely contains selection")
else:
raise ValueError("No tile intersections")

# ------------------------------------------------------------------------------

def get_data_by_extent(naip_fn, extent, geo_data_type):

if geo_data_type == GeoDataTypes.NAIP:
fn = naip_fn
elif geo_data_type == GeoDataTypes.NLCD:
fn = naip_fn.replace("/esri-naip/", "/resampled-nlcd/")[:-4] + "_nlcd.tif"
elif geo_data_type == GeoDataTypes.LANDSAT_LEAFON:
fn = naip_fn.replace("/esri-naip/data/v1/", "/resampled-landsat8/data/leaf_on/")[:-4] + "_landsat.tif"
elif geo_data_type == GeoDataTypes.LANDSAT_LEAFOFF:
fn = naip_fn.replace("/esri-naip/data/v1/", "/resampled-landsat8/data/leaf_off/")[:-4] + "_landsat.tif"
elif geo_data_type == GeoDataTypes.BUILDINGS:
fn = naip_fn.replace("/esri-naip/", "/resampled-buildings/")[:-4] + "_building.tif"
elif geo_data_type == GeoDataTypes.LANDCOVER:
# TODO: Add existence check
fn = naip_fname.replace("/esri-naip/", "/resampled-lc/")[:-4] + "_lc.tif"
else:
raise ValueError("GeoDataType not recognized")


f = rasterio.open(naip_fn, "r")
geom = extent_to_transformed_geom(extent, f.crs["init"])
pad_rad = 15
f = rasterio.open(fn, "r")
geom = GeoTools.extent_to_transformed_geom(extent, f.crs["init"])
pad_rad = 15 # TODO: this might need to be changed for much larger inputs
buffed_geom = shapely.geometry.shape(geom).buffer(pad_rad)
minx, miny, maxx, maxy = buffed_geom.bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))
out_image, out_transform = rasterio.mask.mask(f, [geom], crop=True)
src_crs = f.crs.copy()
f.close()


dst_crs = {"init": "EPSG:3857"}
dst_crs = {"init": "EPSG:%s" % (extent["spatialReference"]["latestWkid"])}
dst_transform, width, height = rasterio.warp.calculate_default_transform(
src_crs,
dst_crs,
Expand All @@ -168,38 +158,22 @@ def get_naip_by_extent_fixed(naip_fn, extent):

# Calculate the correct padding
w = extent["xmax"] - extent["xmin"]
pad = int(np.round((dst_image.shape[1] - w) / 2))
padding = int(np.round((dst_image.shape[1] - w) / 2))

return dst_image, dst_transform, src_crs, 0, pad

#return out_image, out_transform, src_crs, 0, 0
return dst_image, padding

def lookup_tile_by_geom(geom):
tile_index = rtree.index.Index("data/tile_index")
# ------------------------------------------------------------------------------

# Add some margin
#minx, miny, maxx, maxy = shape(geom).buffer(50).bounds
minx, miny, maxx, maxy = shapely.geometry.shape(geom).bounds
geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True))

geom = shapely.geometry.shape(geom)
intersected_indices = list(tile_index.intersection(geom.bounds))
for idx in intersected_indices:
intersected_fn = tiles[idx][0]
intersected_geom = tiles[idx][1]
if intersected_geom.contains(geom):
return True, intersected_fn

if len(intersected_indices) > 0:
# we picked something that overlaps the border between tiles but isn't totally in one
msg = "Error, there are overlaps with tile index, but no tile completely contains selection"
print(msg)
return False, msg
else:
# we didn't find anything
msg = "No tile intersections"
print(msg)
return False, msg
# ------------------------------------------------------------------------------
# Le's methods for predicting entire tile worth of data
#
# NOTE: I have not refactored these --Caleb
# ------------------------------------------------------------------------------

def naip_fn_to_pred_tile_fn(naip_fn):
return os.path.basename(naip_fn).split('.mrf')[0] + '_tilepred'

def find_tile_and_load_pred(centerp):
geom, naip_fn = center_to_tile_geom(centerp)
Expand Down Expand Up @@ -287,80 +261,13 @@ def center_to_tile_geom(centerp):
tile_index = rtree.index.Index("data/tile_index")
intersected_indices = list(tile_index.intersection(geom.bounds))
for idx in intersected_indices:
intersected_fn = tiles[idx][0]
intersected_geom = tiles[idx][1]
intersected_fn = TILES[idx][0]
intersected_geom = TILES[idx][1]
geom = shapely.geometry.mapping(intersected_geom)
geom = fiona.transform.transform_geom("EPSG:4269", "EPSG:3857", geom)
return geom, intersected_fn

print('No tile intersecton')
return None, None

def extent_to_transformed_geom(extent, dest_crs="EPSG:4269"):
left, right = extent["xmin"], extent["xmax"]
top, bottom = extent["ymax"], extent["ymin"]

geom = {
"type": "Polygon",
"coordinates": [[(left, top), (right, top), (right, bottom), (left, bottom), (left, top)]]
}

# The map navigator uses EPSG:3857 and Caleb's indices use EPSG:4269
return fiona.transform.transform_geom("EPSG:3857", dest_crs, geom)

#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------

NLCD_CLASSES = [
0, 11, 12, 21, 22, 23, 24, 31, 41, 42, 43, 51, 52, 71, 72, 73, 74, 81, 82, 90, 95, 255
]
cid = defaultdict(lambda: 0, {cl:i for i,cl in enumerate(NLCD_CLASSES)})

if not os.path.exists("data/tile_index.dat"):
#------------------------------------
# We need to load the shapefile and create a tile index for quick point queries
#------------------------------------
tic = float(time.time())
print("Creating spatial index and pickling tile name dictionary")
tile_index = rtree.index.Index("data/tile_index")
tiles = {}
f = fiona.open("data/best_tiles.shp", "r")
count = 0
for feature in f:
if count % 10000 == 0:
print("Loaded %d shapes..." % (count))

fid = feature["properties"]["location"]
geom = shapely.geometry.shape(feature["geometry"])
tile_index.insert(count, geom.bounds)
tiles[count] = (fid, geom)

count += 1
f.close()
tile_index.close()

pickle.dump(tiles, open("data/tiles.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)

print("Finished creating spatial index in %0.4f seconds" % (time.time() - tic))
else:
#------------------------------------
# Tile index already exists
#------------------------------------
print("Spatial index already exists, loading")
tiles = pickle.load(open("data/tiles.p", "rb"))

print("Spatial index loaded")

# Load naip dictionary
naip_d = {}
fdid = open('data/list_all_naip.txt')
while True:
line = fdid.readline().strip()
if not line:
break
naipid = naip2id(line)
if naipid in naip_d:
naip_d[naipid] += [line,]
else:
naip_d[naipid] = [line,]
fdid.close()
# ------------------------------------------------------------------------------
15 changes: 15 additions & 0 deletions GeoTools.py
@@ -0,0 +1,15 @@
import numpy as np
import rasterio
import fiona

def extent_to_transformed_geom(extent, dest_crs="EPSG:4269"):
left, right = extent["xmin"], extent["xmax"]
top, bottom = extent["ymax"], extent["ymin"]

geom = {
"type": "Polygon",
"coordinates": [[(left, top), (right, top), (right, bottom), (left, bottom), (left, top)]]
}

# The map navigator uses EPSG:3857 and Caleb's indices use EPSG:4269
return fiona.transform.transform_geom("EPSG:3857", dest_crs, geom)

0 comments on commit 92ac83e

Please sign in to comment.