In [211]:
import matplotlib.pyplot as plt, rdflib, pandas as pd, numpy as np, sys, os, random, math, fiona, uuid, copy
from osgeo import gdal, osr, gdal_array
from collections import defaultdict, Counter
from dotenv import load_dotenv
from tqdm.auto import tqdm
from typing import *
from ruamel.yaml import YAML
import xarray as xr

%matplotlib inline
# Notebook-wide plotting defaults: default figure size, nearest-neighbour
# image interpolation and a grayscale colormap.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# next cell
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [212]:
# Load environment variables from a .env file; HOME_DIR is read from
# os.environ in the configuration cell.
load_dotenv(verbose=True)
# Put extra source directories at the front of sys.path (skipping entries that
# are already present). This has to run before the `drepr` imports below --
# presumably they are resolved from these directories; confirm.
paths = ["../", "/workspace/d-repr/pydrepr"]
for path in paths:
    if path not in sys.path:
        sys.path.insert(0, path)

# Shared ruamel YAML instance; used to load the crop_bb.yml configuration.
yaml = YAML()

from drepr import __version__, DRepr, outputs
from drepr.executors.readers.reader_container import ReaderContainer
from drepr.executors.readers.np_dict import NPDictReader
# Print the drepr version that was actually imported.
print("drepr version:", __version__)

drepr version: 2.8


In [3]:
from funcs import DcatReadFunc
from funcs.readers.dcat_read_func import ShardedClassID
from funcs.gdal.raster import *
from dateutil.parser import parse

**configuration & global variables**

In [278]:
# Base directory for the input configuration and the output files written by
# this notebook. Must be set in the environment (KeyError otherwise).
HOME_DIR = os.environ['HOME_DIR']

# Dataset ids passed to read_datasets.
gldas = "5babae3f-c468-4e01-862e-8b201468e3b5"
gpm = "ea0e86f3-9470-4e7e-a581-df85b4a7075d"

# Standard name of the variable to extract from the dataset.
# Alternative: "atmosphere_water__precipitation_mass_flux"
variable = "land_surface_air__temperature"

# Bounding box used when cropping rasters.
ethiopia = BoundingBox(32.75418, 3.22206, 47.98942, 15.15943)

**download the weather dataset**

In [218]:
def read_datasets(dataset_id, start_time, end_time):
  """Read the resources of one dataset for a time window.

  Args:
    dataset_id: id of the dataset handed to `DcatReadFunc`.
    start_time: start of the window, as a string accepted by `dateutil.parser.parse`.
    end_time: end of the window, as a string accepted by `dateutil.parser.parse`.

  Returns:
    The 'data' entry of the `DcatReadFunc` result, requested with the
    {"data": "array"} preference.
  """
  reader = DcatReadFunc(dataset_id, parse(start_time), parse(end_time))
  reader.set_preferences({"data": "array"})
  return reader.exec()['data']

In [283]:
# Read the GLDAS dataset for a single day (2011-09-01).
weather_dataset = read_datasets(gldas, "2011-09-01T00:00:00", "2011-09-01T23:59:59")

2020-03-04 01:13:20,959 | funcs.readers.dcat_read_func | INFO - Overwrite GLDAS
2020-03-04 01:13:20,960 | funcs.readers.dcat_read_func | DEBUG - Found key 'resource_repr'
2020-03-04 01:13:20,961 | funcs.readers.dcat_read_func | DEBUG - Downloading 8 resources ...
2020-03-04 01:13:20,961 | funcs.readers.dcat_read_func | DEBUG - Skipping resource 7e945d34-73e3-46b8-aa5d-17d5e61a23da, found in cache
2020-03-04 01:13:20,962 | funcs.readers.dcat_read_func | DEBUG - Skipping resource a4d67e6f-3f5b-475a-9ac2-a27dd527f4a2, found in cache
2020-03-04 01:13:20,963 | funcs.readers.dcat_read_func | DEBUG - Skipping resource 0f4a8ce6-fcab-450a-96bd-29d1819c4cf1, found in cache
2020-03-04 01:13:20,964 | funcs.readers.dcat_read_func | DEBUG - Skipping resource a404e94b-78ef-49d8-9f73-38b2368b3a7c, found in cache
2020-03-04 01:13:20,966 | funcs.readers.dcat_read_func | DEBUG - Skipping resource c7764c4a-0936-4513-b359-e2f6deafe267, found in cache
2020-03-04 01:13:20,968 | funcs.readers.dcat_read_func |

**crop the data**

Helper functions that convert datasets to rasters and back, and export them to netCDF.

In [277]:
# Load the D-REPR configuration that raster2dataset copies and parses for
# every raster.
crop_bb_conf_path = HOME_DIR + "/examples/d3m/crop_bb.yml"
with open(crop_bb_conf_path, "r") as conf_file:
  crop_bb_conf = yaml.load(conf_file)

def dataset2raster(sm, variable):
  """Convert the `mint:Variable` records of a dataset into `Raster` objects.

  Only variables whose `mint:standardName` equals `variable` are converted;
  one raster is produced per `mint-geo:raster` group.

  Args:
    sm: dataset exposing the `c(...)` / `get_record_by_id(...)` API used below.
    variable: standard name of the variable to extract.

  Returns:
    A list of `Raster` objects. Each one gets an extra `timestamp` attribute
    holding the single timestamp of its group.

  Raises:
    NotImplementedError: if a raster group does not have exactly one timestamp.
  """
  rasters = []
  for c in sm.c('mint:Variable').filter(outputs.FCondition("mint:standardName", "==", variable)):
    for raster_id, sc in c.group_by("mint-geo:raster"):
      # TODO: handle time properly
      timestamp = sc.p("mint:timestamp").as_ndarray([])
      if timestamp.data.size != 1:
        # NotImplemented is a non-callable constant; the exception type is NotImplementedError
        raise NotImplementedError(
          f"expected exactly one timestamp per raster, got {timestamp.data.size}")
      timestamp = timestamp.data[0]

      # values indexed by latitude and longitude
      data = sc.p("rdf:value").as_ndarray([sc.p("mint-geo:lat"), sc.p("mint-geo:long")])
      gt_info = sm.get_record_by_id(raster_id)
      gt = GeoTransform(x_0=gt_info.s("mint-geo:x_0"),
                        y_0=gt_info.s("mint-geo:y_0"),
                        dx=gt_info.s("mint-geo:dx"), dy=gt_info.s("mint-geo:dy"))
      # the dataset may not declare a nodata value
      nodata = float(data.nodata.value) if data.nodata is not None else None
      raster = Raster(data.data, gt, int(gt_info.s("mint-geo:epsg")), nodata)
      raster.timestamp = timestamp
      rasters.append(raster)
  return rasters
  
def raster2dataset(r, variable):
  """Convert a raster back into a dataset using the `crop_bb_conf` D-REPR model.

  The raster's content is exposed to D-REPR through a temporary `NPDictReader`
  registered in the `ReaderContainer` under a random resource id.

  Args:
    r: raster to convert; it must carry a `timestamp` attribute in addition to
      its data, geotransform, epsg and nodata.
    variable: standard name stored with the data.

  Returns:
    The `outputs.ArrayBackend` built from the D-REPR model and the raster.
  """
  reader = NPDictReader({
    "variable": r.data,
    "lat": r.get_center_latitude(),
    "long": r.get_center_longitude(),
    "timestamp": r.timestamp,
    "standard_name": variable,
    "gt_x_0": r.geotransform.x_0,
    "gt_y_0": r.geotransform.y_0,
    "gt_dx": r.geotransform.dx,
    "gt_dy": r.geotransform.dy,
    "gt_epsg": r.epsg,
    "gt_x_slope": r.geotransform.x_slope,
    "gt_y_slope": r.geotransform.y_slope,
  })
  resource_id = str(uuid.uuid4())
  ReaderContainer.get_instance().set(resource_id, reader)

  try:
    # work on a copy so the shared configuration is not modified between calls
    conf = copy.deepcopy(crop_bb_conf)
    conf['attributes']['variable']['missing_values'].append(r.nodata)
    drepr = DRepr.parse(conf)
    return outputs.ArrayBackend.from_drepr(drepr, resource_id)
  finally:
    # always unregister the temporary reader, even when the conversion fails
    ReaderContainer.get_instance().delete(resource_id)

def raster2netcdf(r, variable, outfile):
  """Write a single raster to a netCDF file.

  Args:
    r: raster providing `data`, `nodata` and the center latitude/longitude
      coordinates.
    variable: standard name; used both as the `standard_name` attribute and as
      the name of the data variable in the file.
    outfile: path of the netCDF file to write.
  """
  lat = r.get_center_latitude()
  long = r.get_center_longitude()
  data = xr.DataArray(r.data, dims=('lat', 'long'), coords={'lat': lat, 'long': long})
  data.attrs['standard_name'] = variable
  if r.nodata is not None:
    # only declare a fill value when the raster has one; a None attribute
    # cannot be serialized to netCDF
    data.attrs['_FillValue'] = r.nodata
    data.attrs['missing_values'] = r.nodata

  # the data variable is keyed by `variable`; the previous code referenced an
  # undefined name here
  ds = xr.Dataset({variable: data})
  ds.to_netcdf(outfile)
  
def dataset2netcdf(sm):
  """Convert a dataset into an `xarray.Dataset` with one variable per standard name.

  Every (standard name, timestamp) group becomes a DataArray with dimensions
  ('time', 'lat', 'long') where the time dimension has length 1.

  Args:
    sm: dataset exposing the `c(...)` API used below.

  Returns:
    An `xarray.Dataset` keyed by standard name.

  Raises:
    NotImplementedError: if a variable has a `mint:Place` property.
    AssertionError: if the same standard name is produced more than once
      (e.g. several timestamps for one variable).
  """
  datasets = {}
  for c in sm.c("mint:Variable"):
    if c.p("mint:Place") is not None:
      # NotImplemented is a non-callable constant; the exception type is NotImplementedError
      raise NotImplementedError("variables with a mint:Place property are not supported")

    for standard_name, sc1 in c.group_by("mint:standardName"):
      for time, sc2 in sc1.group_by("mint:timestamp"):
        val = sc2.p("rdf:value").as_ndarray([sc2.p("mint-geo:lat"), sc2.p("mint-geo:long")])
        # prepend a time axis of length 1 to the (lat, long) values
        data = xr.DataArray(val.data.reshape(1, *val.data.shape), dims=('time', 'lat', 'long'), coords={
          'lat': val.index_props[0], 'long': val.index_props[1], 'time': np.asarray([time])
        })
        data.attrs['standard_name'] = standard_name
        if val.nodata is not None:
          # the dataset may not declare a nodata value
          data.attrs['_FillValue'] = val.nodata.value
          data.attrs['missing_values'] = val.nodata.value

        assert standard_name not in datasets
        datasets[standard_name] = data
  ds = xr.Dataset(datasets)
  # NOTE(review): CF names this global attribute "Conventions" (capitalized) -- confirm
  # whether consumers rely on the lowercase key before changing it.
  ds.attrs.update({
    "conventions": "CF-1.6"
  })
  return ds

Crop the data to a bounding box

In [295]:
# Convert the selected variable of the downloaded dataset into rasters.
rasters = dataset2raster(weather_dataset, variable)

In [286]:
# Pick the first raster for inspection.
sr = rasters[0]

In [294]:
# Show the timestamp attribute that dataset2raster attaches to each raster.
sr.timestamp

AttributeError: 'Raster' object has no attribute 'timestamp'

In [290]:
# Interpret the raster timestamp as seconds since the Unix epoch (UTC).
# NOTE(review): the recorded output is a date in 1970, which suggests the
# timestamp may not be expressed in epoch seconds -- confirm the unit.
datetime.datetime.utcfromtimestamp(sr.timestamp)#.strftime("%Y-%M-%D")

datetime.datetime(1970, 3, 13, 0, 21)

In [301]:
# Crop every raster to the Ethiopia bounding box, convert the result back to a
# dataset and write it to one netCDF file per timestamp.
subrasters = []
for raster in rasters:
  sr = raster.crop(bounds=ethiopia, resampling_algo=ReSample.BILINEAR)
  # re-attach the timestamp to the cropped raster; raster2dataset reads r.timestamp
  sr.timestamp = raster.timestamp
  sm = raster2dataset(sr, variable)
  # the output file is named after the integer part of the timestamp
  dataset2netcdf(sm).to_netcdf(HOME_DIR + f"/data/tmp_out/{int(sr.timestamp)}.nc4")
  subrasters.append(sr)

In [257]:
# Convert the first (uncropped) raster back into a dataset.
sm = raster2dataset(rasters[0], variable)

In [272]:
# Export the dataset held in `sm` to a test netCDF file.
dataset2netcdf(sm).to_netcdf(HOME_DIR + "/data/tmp_out/test.nc4")

In [222]:
# Write each raster and its Ethiopia crop as GeoTIFF files.
# NOTE(review): both output paths are constant inside the loop, so every
# iteration presumably overwrites the files of the previous one and only the
# last raster remains on disk -- confirm this is intended.
for raster in rasters:
  raster.to_geotiff(HOME_DIR + "/data/full.tif")
  ethiopia_raster = raster.crop(bounds=ethiopia, resampling_algo=ReSample.BILINEAR)
  ethiopia_raster.to_geotiff(HOME_DIR + "/data/small.tif")

In [223]:
# Convert the cropped raster back into a dataset.
# NOTE(review): raster2dataset reads r.timestamp, but the cell that creates
# ethiopia_raster does not assign one -- confirm the cropped raster has it.
sm = raster2dataset(ethiopia_raster, variable)

In [235]:
# Build the xarray dataset for display.
# NOTE(review): the recorded run of this cell ends in an IndexError, and the
# debug output shows an empty list for 'mint:timestamp' -- possibly the
# dataset has no timestamp values to group by; confirm.
dataset2netcdf(sm)

None
defaultdict(<class 'list'>, {'rdf:value': [0], 'mint-geo:lat': [1], 'mint-geo:long': [2], 'mint:standardName': [3], 'mint-geo:raster': [4], 'mint:timestamp': []})
defaultdict(<class 'list'>, {'rdf:value': [0], 'mint-geo:lat': [1], 'mint-geo:long': [2], 'mint:standardName': [3], 'mint-geo:raster': [4], 'mint:timestamp': []})


IndexError: list index out of range