In [11]:
import datetime
import matplotlib.pyplot as plt, rdflib, pandas as pd, numpy as np, sys, os, random, math, fiona, uuid, copy, glob
from osgeo import gdal, osr, gdal_array
from collections import defaultdict, Counter
from dotenv import load_dotenv
from tqdm.auto import tqdm
from typing import *
from ruamel.yaml import YAML
import xarray as xr

%matplotlib inline
# Notebook-wide matplotlib defaults; they apply to every figure drawn after this cell.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'  # no smoothing between pixels
plt.rcParams['image.cmap'] = 'gray'  # default colormap for images

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# next cell
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load variables from a .env file into the environment (HOME_DIR is read from os.environ later on).
load_dotenv(verbose=True)
# Directories prepended to sys.path before the project imports below.
# NOTE(review): two of these are absolute, machine-specific paths; presumably only
# one of them exists on any given machine - consider moving them to configuration.
paths = ["../", "/workspace/d-repr/pydrepr", "/home/rook/workspace/d-repr/pydrepr"]
for path in paths:
    # guard keeps sys.path free of duplicates when the cell is re-run
    if path not in sys.path:
        sys.path.insert(0, path)

# Shared YAML parser, used further down to load the crop_bb.yml template.
yaml = YAML()

from drepr import __version__, DRepr, outputs
from drepr.executors.readers.reader_container import ReaderContainer
from drepr.executors.readers.np_dict import NPDictReader
print("drepr version:", __version__)

drepr version: 2.8


In [3]:
from funcs import DcatReadFunc
from funcs.readers.dcat_read_func import ShardedClassID
from funcs.gdal.raster import *
from dateutil.parser import parse

**1. configuration & global variables**

In [27]:
# Base directory from which every example/data path in this notebook is built.
# Raises KeyError if HOME_DIR is missing from the environment.
HOME_DIR = os.environ['HOME_DIR']

# Dataset ids handed to read_datasets (gpm is not used in the cells shown here).
gldas = "5babae3f-c468-4e01-862e-8b201468e3b5"
gpm = "ea0e86f3-9470-4e7e-a581-df85b4a7075d"
region = "74e6f707-d5e9-4cbd-ae26-16ffa21a1d84"
# Standard name of the variable to process. The second assignment overrides the
# first, so only "land_surface_air__temperature" is in effect; swap or comment
# out a line to switch variables.
variable = "atmosphere_water__precipitation_mass_flux"
variable = "land_surface_air__temperature"

# Bounding box used as the crop bounds.
# NOTE(review): presumably (min lon, min lat, max lon, max lat) - confirm against BoundingBox.
ethiopia = BoundingBox(32.75418, 3.22206, 47.98942, 15.15943)

**2. download the weather dataset**

In [30]:
def read_datasets(dataset_id, start_time, end_time):
  """Fetch a dataset through `DcatReadFunc` and return its 'data' output.

  Args:
    dataset_id: id of the dataset, passed to `DcatReadFunc` unchanged.
    start_time: start of the time window as a string understood by
      `dateutil.parser.parse`, or None to pass no start time.
    end_time: end of the time window, same convention as `start_time`.

  Returns:
    The value stored under the 'data' key of the function's result, with the
    "data" preference set to "array".
  """
  window_start = None if start_time is None else parse(start_time)
  window_end = None if end_time is None else parse(end_time)

  reader = DcatReadFunc(dataset_id, window_start, window_end)
  reader.set_preferences({"data": "array"})
  return reader.exec()['data']

def read_local_datasets(repr_file, resource_path):
  """Load local resource files through a D-REPR description.

  Args:
    repr_file: path of the D-REPR file, handed to `DRepr.parse_from_file`.
    resource_path: glob pattern; every matching file is loaded with
      `outputs.ArrayBackend.from_drepr`.

  Returns:
    The single backend when exactly one file matches, otherwise a
    `ShardedBackend` built from all of them.

  Raises:
    FileNotFoundError: if `resource_path` matches no file (previously this
      surfaced as an opaque IndexError).
  """
  drepr = DRepr.parse_from_file(repr_file)
  # glob yields files in arbitrary order; sort so the shard order is reproducible
  files = sorted(glob.glob(resource_path))
  if not files:
    raise FileNotFoundError(f"no resource file matches {resource_path!r}")

  datasets = [outputs.ArrayBackend.from_drepr(drepr, file) for file in files]
  if len(datasets) > 1:
    # NOTE(review): `ShardedBackend` is not imported by name in the visible cells;
    # presumably it comes from one of the star imports - confirm.
    return ShardedBackend(datasets)
  return datasets[0]

In [6]:
# Read the GLDAS dataset restricted to a three-hour window on 2011-09-01.
weather_dataset = read_datasets(gldas, "2011-09-01T00:00:00", "2011-09-01T03:00:00")

2020-03-04 13:14:02,996 | funcs.readers.dcat_read_func | INFO - Overwrite GLDAS
2020-03-04 13:14:02,997 | funcs.readers.dcat_read_func | INFO - Found key 'resource_repr'
2020-03-04 13:14:02,997 | funcs.readers.dcat_read_func | INFO - Downloading 1 resources ...
2020-03-04 13:14:02,998 | funcs.readers.dcat_read_func | INFO - Download Complete. Skip 1 and download 0 resources


**3. crop the data**

Some useful functions to convert datasets to rasters and back.

In [7]:
# Load the D-REPR template once; raster2dataset deep-copies it for every raster it converts.
with open(HOME_DIR + "/examples/d3m/crop_bb.yml", "r") as f:
  crop_bb_conf = yaml.load(f)

def dataset2raster(sm, variable):
  """Build one `Raster` per `mint-geo:raster` group of the requested variable.

  Args:
    sm: dataset exposing `c(...)` and `get_record_by_id(...)`.
    variable: value matched against `mint:standardName` to select the records.

  Returns:
    A list of `Raster` objects; each one gets an extra `timestamp` attribute
    holding the single timestamp of its group.

  Raises:
    NotImplementedError: if a raster group does not have exactly one timestamp.
  """
  rasters = []
  for c in sm.c('mint:Variable').filter(outputs.FCondition("mint:standardName", "==", variable)):
    for raster_id, sc in c.group_by("mint-geo:raster"):
      # TODO: handle time properly
      timestamp = sc.p("mint:timestamp").as_ndarray([])
      if timestamp.data.size != 1:
        # `NotImplemented` is a constant, not an exception: calling it raised a
        # confusing TypeError instead of signalling the unsupported case.
        raise NotImplementedError(
          f"expected exactly one timestamp per raster, found {timestamp.data.size}")
      timestamp = timestamp.data[0]

      # values indexed by (lat, long)
      data = sc.p("rdf:value").as_ndarray([sc.p("mint-geo:lat"), sc.p("mint-geo:long")])
      # the raster record carries the geotransform parameters and the EPSG code
      gt_info = sm.get_record_by_id(raster_id)
      gt = GeoTransform(x_0=gt_info.s("mint-geo:x_0"),
                        y_0=gt_info.s("mint-geo:y_0"),
                        dx=gt_info.s("mint-geo:dx"), dy=gt_info.s("mint-geo:dy"))
      # nodata is optional: pass None when the values carry no nodata marker
      nodata = float(data.nodata.value) if data.nodata is not None else None
      raster = Raster(data.data, gt, int(gt_info.s("mint-geo:epsg")), nodata)
      raster.timestamp = timestamp
      rasters.append(raster)
  return rasters
  
def raster2dataset(r, variable):
  """Convert a raster back into an array-backed D-REPR dataset.

  The raster's fields are exposed through a temporary in-memory reader that is
  registered under a random resource id, loaded with a copy of the
  module-level `crop_bb_conf` template, and unregistered again.

  Args:
    r: raster providing `data`, `timestamp`, `geotransform`, `epsg`, `nodata`
      and the `get_center_latitude` / `get_center_longitude` methods.
    variable: standard name stored alongside the data.

  Returns:
    The dataset produced by `outputs.ArrayBackend.from_drepr`.
  """
  reader = NPDictReader({
    "variable": r.data,
    "lat": r.get_center_latitude(),
    "long": r.get_center_longitude(),
    "timestamp": r.timestamp,
    "standard_name": variable,
    "gt_x_0": r.geotransform.x_0,
    "gt_y_0": r.geotransform.y_0,
    "gt_dx": r.geotransform.dx,
    "gt_dy": r.geotransform.dy,
    "gt_epsg": r.epsg,
    "gt_x_slope": r.geotransform.x_slope,
    "gt_y_slope": r.geotransform.y_slope,
  })
  # random id so that concurrent or repeated conversions never collide
  resource_id = str(uuid.uuid4())
  container = ReaderContainer.get_instance()
  container.set(resource_id, reader)

  try:
    # work on a copy: the shared template must not accumulate nodata values
    conf = copy.deepcopy(crop_bb_conf)
    conf['attributes']['variable']['missing_values'].append(r.nodata)
    drepr = DRepr.parse(conf)
    sm = outputs.ArrayBackend.from_drepr(drepr, resource_id)
  finally:
    # always unregister the reader, even when parsing or loading fails
    container.delete(resource_id)
  return sm

def raster2netcdf(r, variable, outfile):
  """Write a single raster to a NetCDF file.

  Args:
    r: raster providing `data`, `nodata` and the `get_center_latitude` /
      `get_center_longitude` methods.
    variable: standard name; used both as the `standard_name` attribute and as
      the name of the data variable in the file.
    outfile: destination path passed to `Dataset.to_netcdf`.
  """
  lat = r.get_center_latitude()
  long = r.get_center_longitude()
  data = xr.DataArray(r.data, dims=('lat', 'long'), coords={'lat': lat, 'long': long})
  data.attrs['standard_name'] = variable
  data.attrs['_FillValue'] = r.nodata
  data.attrs['missing_values'] = r.nodata

  # the original used the undefined name `standard_name` here (NameError)
  ds = xr.Dataset({variable: data})
  ds.to_netcdf(outfile)
  
def dataset2netcdf(sm):
  """Convert the `mint:Variable` records of a dataset into an `xarray.Dataset`.

  Every (standard name, timestamp) group becomes one data variable with
  dimensions ('time', 'lat', 'long') and a time axis of length 1.

  Args:
    sm: dataset exposing `c("mint:Variable")`.

  Returns:
    An `xarray.Dataset` keyed by standard name, with the global attribute
    "conventions" set to "CF-1.6".

  Raises:
    NotImplementedError: if a variable class has a `mint:Place` property.
    AssertionError: if a standard name would be written twice, which includes
      a standard name that has more than one timestamp.
  """
  datasets = {}
  for c in sm.c("mint:Variable"):
    if c.p("mint:Place") is not None:
      # `NotImplemented` is a constant, not an exception; calling it raised a TypeError
      raise NotImplementedError("variables with a mint:Place property are not supported")

    for standard_name, sc1 in c.group_by("mint:standardName"):
      for time, sc2 in sc1.group_by("mint:timestamp"):
        val = sc2.p("rdf:value").as_ndarray([sc2.p("mint-geo:lat"), sc2.p("mint-geo:long")])
        # prepend a length-1 time axis to the (lat, long) values
        data = xr.DataArray(val.data.reshape(1, *val.data.shape), dims=('time', 'lat', 'long'), coords={
          'lat': val.index_props[0], 'long': val.index_props[1], 'time': np.asarray([time])
        })
        data.attrs['standard_name'] = standard_name
        # nodata is optional (dataset2raster guards it the same way)
        if val.nodata is not None:
          data.attrs['_FillValue'] = val.nodata.value
          data.attrs['missing_values'] = val.nodata.value

        assert standard_name not in datasets
        datasets[standard_name] = data
  ds = xr.Dataset(datasets)
  ds.attrs.update({
    "conventions": "CF-1.6"
  })
  return ds

**3.1 crop the data by a bounding box**

In [9]:
# Crop every raster of `variable` to the Ethiopia bounding box and write one
# NetCDF file per raster, named after the raster's timestamp.
out_dir = HOME_DIR + f"/data/gldas/{variable}"
# make sure the destination exists before the first file is written
os.makedirs(out_dir, exist_ok=True)

subrasters = []
for raster in tqdm(dataset2raster(weather_dataset, variable)):
  sr = raster.crop(bounds=ethiopia, resampling_algo=ReSample.BILINEAR)
  # raster2dataset reads `timestamp` from the raster it is given, so copy it over
  sr.timestamp = raster.timestamp
  # timezone-aware replacement for the deprecated utcfromtimestamp; same formatted result
  filename = datetime.datetime.fromtimestamp(sr.timestamp, tz=datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
  sm = raster2dataset(sr, variable)
  dataset2netcdf(sm).to_netcdf(out_dir + f"/{filename}.nc4")
  subrasters.append(sr)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Debug: write the rasters to GeoTIFF and check that the data read back from the NetCDF files matches the in-memory cropped raster.

In [10]:
# Dump the last cropped raster and its full-size source to GeoTIFF for visual inspection.
subrasters[-1].to_geotiff(HOME_DIR + "/data/small.tif")
# `raster` is the loop variable left over from the cropping cell, i.e. the last raster processed.
raster.to_geotiff(HOME_DIR + "/data/full.tif")
# Read the NetCDF files written by the cropping cell back from disk.
sm = read_local_datasets(HOME_DIR + "/examples/d3m/gldas.crop.yml", HOME_DIR + f"/data/gldas/{variable}/*.nc4")
# NOTE(review): this compares the FIRST raster read back with the LAST cropped
# raster; the two only correspond when a single file matches the glob.
a = dataset2raster(sm, variable)[0].data
b = subrasters[-1].data

# the round trip through NetCDF must preserve the values
assert np.allclose(a, b)

**3.2 crop data by shapefiles**

In [85]:
# Read the region dataset without a time window (both bounds are None).
region_dataset = read_datasets(region, None, None)

2020-03-04 16:30:08,730 | funcs.readers.dcat_read_func | INFO - Found key 'dataset_repr'
2020-03-04 16:30:08,731 | funcs.readers.dcat_read_func | INFO - Downloading 1 resources ...
2020-03-04 16:30:08,733 | funcs.readers.dcat_read_func | INFO - Download Complete. Skip 1 and download 0 resources


In [86]:
# List every mint:Place record of the region dataset as a plain dict.
for c in region_dataset.c("mint:Place"):
  for r in c.iter_records():
    print(r.to_dict())

{'@id': (0,), 'mint:region': ['Addis Ababa'], 'mint-geo:bounding': [(0,)]}
{'@id': (1,), 'mint:region': ['Tigray'], 'mint-geo:bounding': [(1,)]}
{'@id': (2,), 'mint:region': ['Somali'], 'mint-geo:bounding': [(2,)]}
{'@id': (3,), 'mint:region': ['Dire Dawa'], 'mint-geo:bounding': [(3,)]}
{'@id': (4,), 'mint:region': ['Hareri'], 'mint-geo:bounding': [(4,)]}
{'@id': (5,), 'mint:region': ['SNNPR'], 'mint-geo:bounding': [(5,)]}
{'@id': (6,), 'mint:region': ['Gambela'], 'mint-geo:bounding': [(6,)]}
{'@id': (7,), 'mint:region': ['Beneshangul Gumu'], 'mint-geo:bounding': [(7,)]}
{'@id': (8,), 'mint:region': ['Amhara'], 'mint-geo:bounding': [(8,)]}
{'@id': (9,), 'mint:region': ['Afar'], 'mint-geo:bounding': [(9,)]}
{'@id': (10,), 'mint:region': ['Oromia'], 'mint-geo:bounding': [(10,)]}


In [93]:
from funcs.trans_cropping_func import CroppingTransFunc

# Pull the shapes out of the region dataset and write each one to its own shapefile.
_tmp = CroppingTransFunc.extract_shape(region_dataset)
shape_files = []

for s in _tmp:
  # random file name so repeated runs never overwrite an earlier shapefile
  tmp_file = HOME_DIR + f"/data/tmp_shapefiles/{str(uuid.uuid4())}.shp"
  # NOTE(review): the recorded run of this cell ended with
  # "TypeError: must be real number, not tuple"; the traceback is not shown,
  # presumably it was raised by this call - confirm the expected nesting of `s`.
  CroppingTransFunc.shape_array_to_shapefile(s, tmp_file)
  shape_files.append(tmp_file)

/Users/rook/workspace/MINT/MINT-Transformation/data/tmp_shapefiles/cf51e4dc-8822-4aa1-b67b-c5b9a7af8086.shp


TypeError: must be real number, not tuple

In [52]:
# Scratch: grab the geometry coordinates of the first feature of a reference
# shapefile, to compare their nesting with the shapes extracted above.
# NOTE(review): absolute, machine-specific path - will not resolve elsewhere.
with fiona.open("/Users/rook/workspace/MINT/MINT-Transformation/data/tmp/15193822-1f8f-4a29-b9ba-f6b8ef48b8e4/EthiopiaRegions.shp", "r") as f:
  for line in f:
    coords = line['geometry']['coordinates']
    # only the first feature is needed
    break

In [72]:
# Scratch: display the coordinates read from the reference shapefile.
coords

[[[(38.701832718591, 8.92536056339308),
   (38.7018271553244, 8.92537946315634),
   (38.7018269684748, 8.92538507087577),
   (38.701832718591, 8.92536056339308)]],
 [[(38.7019174557734, 8.92569941274066),
   (38.7019149503721, 8.92569773637211),
   (38.7018985839659, 8.92568913000775),
   (38.701923446638, 8.92570528668625),
   (38.7019174557734, 8.92569941274066)]],
 [[(38.701832718591, 8.92536056339308),
   (38.7018026546733, 8.92553149009404),
   (38.701806369207, 8.92555021591077),
   (38.7018096273966, 8.92557066069262),
   (38.7018151610187, 8.92558866413339),
   (38.7018239690001, 8.9256077555713),
   (38.7019174557734, 8.92569941274066),
   (38.702020385637, 8.92576827784696),
   (38.701923446638, 8.92570528668625),
   (38.7019579419449, 8.92573910513413),
   (38.7019889284323, 8.925763400494),
   (38.7020540517989, 8.92578055642307),
   (38.7020897975607, 8.92578561126317),
   (38.7021830211276, 8.9259155710364),
   (38.7021773644362, 8.92593637077239),
   (38.7021681620944, 8

In [67]:
# Scratch: only the last expression of a cell is displayed, so the len(...) result is discarded.
len(coords[0])
coords[0][0]

[(38.701832718591, 8.92536056339308),
 (38.7018271553244, 8.92537946315634),
 (38.7018269684748, 8.92538507087577),
 (38.701832718591, 8.92536056339308)]

In [68]:
# Scratch: inspect the extracted shape. `s` is whatever an earlier-run cell left
# bound (the shapefile loop variable or the `s = _tmp[0]` cell).
s[0][0][0]

[(38.701832718591, 8.92536056339308),
 (38.7018271553244, 8.92537946315634),
 (38.7018269684748, 8.92538507087577),
 (38.701832718591, 8.92536056339308)]

In [78]:
# Scratch: take the first extracted shape and check the length of one nested element.
s = _tmp[0]
len(s[0][1])

1

-----------

In [257]:
# Scratch. NOTE(review): `rasters` is not defined in any visible cell; this only
# works with state left in the kernel by a cell that is no longer shown.
sm = raster2dataset(rasters[0], variable)

In [272]:
# Scratch: write the converted dataset to a test NetCDF file.
dataset2netcdf(sm).to_netcdf(HOME_DIR + "/data/tmp_out/test.nc4")

In [222]:
# Scratch: crop each raster to the Ethiopia bounding box and dump both versions.
# Every iteration writes to the same two paths, and `ethiopia_raster` ends up
# holding the crop of the last raster only.
# NOTE(review): `rasters` is not defined in any visible cell.
for raster in rasters:
  raster.to_geotiff(HOME_DIR + "/data/full.tif")
  ethiopia_raster = raster.crop(bounds=ethiopia, resampling_algo=ReSample.BILINEAR)
  ethiopia_raster.to_geotiff(HOME_DIR + "/data/small.tif")

In [223]:
# Scratch: convert the last cropped raster back into a dataset.
sm = raster2dataset(ethiopia_raster, variable)

In [235]:
# Scratch. NOTE(review): the recorded run of this cell ended with
# "IndexError: list index out of range"; the printed debug output shows an empty
# index list for 'mint:timestamp'.
dataset2netcdf(sm)

None
defaultdict(<class 'list'>, {'rdf:value': [0], 'mint-geo:lat': [1], 'mint-geo:long': [2], 'mint:standardName': [3], 'mint-geo:raster': [4], 'mint:timestamp': []})
defaultdict(<class 'list'>, {'rdf:value': [0], 'mint-geo:lat': [1], 'mint-geo:long': [2], 'mint:standardName': [3], 'mint-geo:raster': [4], 'mint:timestamp': []})


IndexError: list index out of range