In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append("../")

# Data storage setup

#### The following notebook may be used to download all data used in the `coralshift` paper from source. Due to the huge quantity of data, the sequential calls to the various APIs take a very long time (on the order of days).

#### It is therefore recommended to make use of the example datasets provided at the following link: [https://doi.org/10.5281/zenodo.8110926](https://doi.org/10.5281/zenodo.8110926)

## Dataset structure

Datasets will be downloaded and processed into the following directory structure:

- coralshift (cloned repository containing Python scripts, Jupyter notebooks etc.)
- datasets
    - **bathymetry**
        - REGION_NAME_1_2020_30m_MSL_cog.tif
        - REGION_NAME_2_2020_30m_MSL_cog.tif
        - ...
        - REGION_NAME_N_2020_30m_MSL_cog.tif
        - bathymetry_REGION_NAME_1_RESOLUTION.nc
        - bathymetry_REGION_NAME_2_RESOLUTION.nc
        - ...
        - bathymetry_REGION_NAME_N_RESOLUTION.nc
    - **gradients**
        - region_REGION_NAME_1_RESOLUTION_gradients.nc
        - region_REGION_NAME_2_RESOLUTION_gradients.nc
        - ...
        - region_REGION_NAME_N_RESOLUTION_gradients.nc
    - **reef_baseline**
        - region_name
            - benthic.gpkg
            - benthic.pkl
            - REGION_NAME shapefile components
        - gt_files
            - RESOLUTION_arrays
                - coral_region_REGION_NAME_1_1000m_RESOLUTION.nc
                - coral_region_REGION_NAME_2_1000m_RESOLUTION.nc
                - ...
                - coral_region_REGION_NAME_N_1000m_RESOLUTION.nc
        - coral_region_REGION_NAME_1_1000m.tif
        - coral_region_REGION_NAME_2_1000m.tif
        - ...
        - coral_region_REGION_NAME_N_1000m.tif
    - **global_ocean_reanalysis**
        - daily_means
            - region_name e.g. Great_Barrier_Reef_A
                - var1 (.nc files and accompanying metadata containing variable data grouped by year)
                - var2
                - ...
                - varN
                - merged_vars (.nc files and accompanying metadata for each variable)
                - cmems_gopr_daily_REGION_NAME.nc (merged variables for region + metadata)
        - monthly_means
            - region_name e.g. Great_Barrier_Reef_A
                - var1 (.nc files and accompanying metadata containing variable data grouped by year)
                - var2
                - ...
                - varN
                - merged_vars (.nc files and accompanying metadata for each variable)
                - cmems_gopr_monthly_REGION_NAME.nc (merged variables for region + metadata)
    - **era5**
        - region_name e.g. Great_Barrier_Reef_A
            - var1 (.nc files containing variable data grouped by year)
            - var2
            - ...
            - varN
            - weather_parameters (.nc file for each variable for whole specified time period)

In [None]:
# Set this to the directory in which the cloned GitHub repository is located;
# datasets will be installed into the directory one level above it.
REPO_DIR = "/lustre_scratch/orlando-code/coralshift/"
os.chdir(REPO_DIR)

In [None]:
# import necessary packages
from IPython.display import HTML

from coralshift.dataloading import data_structure, climate_data, bathymetry, reef_extent
from coralshift.plotting import spatial_plots
from coralshift.processing import spatial_data
from coralshift.utils import directories, file_ops

## Specify your area of interest

The availability of high-resolution (30m) bathymetric data means that areas of interest are currently confined to 4 areas on the Great Barrier Reef (GBR). The following code downloads the specified area of bathymetry data:

| Reef Area Name                	| Latitudes 	| Longitudes 	|
|-------------------------------	|-----------	|------------	|
| Great Barrier Reef A 2020 30m 	| 10-17°S   	| 142-147°E  	|
| Great Barrier Reef B 2020 30m 	| 16-23°S   	| 144-149°E  	|
| Great Barrier Reef C 2020 30m 	| 18-24°S   	| 148-154°E  	|
| Great Barrier Reef D 2020 30m 	| 23-29°S   	| 150-156°E  	|


![bathymetry_regions.png](https://github.com/orlando-code/coralshift/blob/dev-setup/bathymetry_regions.png?raw=true)



In [None]:
# Choose the target resolution (should be above 4000m for processing in decent time).
# Native resolutions are 1 (1, "m") or 1/12 degrees (1/12, "d"), or 1/27 degrees (1/27, "d").
resolution_pair = spatial_data.choose_resolution(resolution=1 / 27, unit="d")
# The pair is unpacked as (metres, degrees), matching how the values are reported below.
target_resolution_m, target_resolution_d = resolution_pair

print(f"Data will be resampled to {target_resolution_d:.04f} degrees (~{target_resolution_m:.0f}m).")

In [None]:
# Regions to download, specified by letter or by name.
region_letters = [
    "A",
    "B",
    "C",
    "D",
]
# Visualise the selected area(s).
spatial_plots.plot_reef_areas(region_letters)

## Bathymetry

In [None]:
# Region lookup object: used here and in later cells for short file names and lat/lon limits.
reef_areas = bathymetry.ReefAreas()
# The output directory does not depend on the region, so it is looked up once.
bath_dir = directories.get_bathymetry_datasets_dir()

for region in region_letters:
    # Short file name for the region currently being processed (keyed on the loop variable).
    file_name = reef_areas.get_short_filename(region)
    _, xa_bath = bathymetry.generate_bathymetry_xa_da(region)
    # Pass the region's bathymetry on for resampling to the target resolution and saving in bath_dir.
    _, _ = spatial_data.upsample_and_save_xa_a(
        bath_dir, xa_d=xa_bath, name=file_name, target_resolution_d=target_resolution_d)

# NOTE: after the loop, `file_name` holds the value for the last entry of `region_letters`;
# the Allen Coral Atlas cells further down reuse it.

## Calculate slopes

In [None]:
# Compute absolute gradients from the bathymetry for every selected region and save them to file.
bathymetry.generate_gradient_magnitude_ncs(
    regions=region_letters,
    resolution_d=target_resolution_d,
    sigma=1,
)

In [None]:
# Plot the upsampled bathymetry (left panel) and the seafloor gradient magnitude (right panel)
# side by side on a PlateCarree projection.
# NOTE(review): `plt`, `ccrs`, `xa_bath_upsampled` and `grads` are not imported or defined in any
# cell of this notebook. Presumably they are matplotlib.pyplot, cartopy.crs, and the arrays written
# by the bathymetry and gradient cells -- define/load them before running this cell.
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(16,9), subplot_kw=dict(projection=ccrs.PlateCarree()))

# Depth panel: colour limits span -50 to 0.
ax1 = spatial_plots.plot_spatial(xa_bath_upsampled, 
    fax= (fig,ax_left), val_lims=(-50,0), name="depth", title=f"Bathymetry at {target_resolution_m}m resolution")
# Gradient panel: colour limits span 0 to 10.
ax2 = spatial_plots.plot_spatial(grads, 
    fax=(fig, ax_right), val_lims=(0,10), name="gradient magnitude", 
    title=f"Absolute seafloor gradients at {target_resolution_m}m resolution")

## Coral ground truth: Allen Coral Atlas


There is currently no API for accessing data directly from your local machine. Please follow the instructions* below:
1. Make an account on the [Allen Coral Atlas](https://allencoralatlas.org/atlas/#6.00/-13.5257/144.5000) webpage
2. Generate a geojson file using the code cell below (generated in the `reef_baseline` directory)

*Instructions correct as of 30.06.23

<iframe width="560" height="315" src="https://www.youtube.com/embed/N8vPKXc0W4k" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>

In [None]:
# Generate a geojson file (in the reef_baseline directory) describing the area of interest.
# This file is uploaded to the Allen Coral Atlas to request the benthic map download.
# NOTE: `file_name` is assigned in the bathymetry loop, so it refers to the last region processed there.
geojson_path = reef_extent.generate_area_geojson(area_class=reef_areas, area_name=file_name)

print(f"geoJSON file saved at {geojson_path} for upload to the Allen Coral Atlas")

3. Upload the geojson file via:

    \> My Areas > Upload a GeoJSON or KML file
4. Specify a region name and navigate to the "Download data" tab when it becomes available.
5. Select "Benthic Map (OGC GeoPackage (.gpkg))". Sign the terms and conditions and select "Prepare Download". After ~two minutes a sequence of emails will arrive notifying you that your download is ready.
6. Download the file and unzip it using an unzipping utility. Then, add the `benthic.gpkg` file to the region's subdirectory of the `reef_baseline` directory (see the dataset structure above).
7. Continue with the subsequent code cells.

----

You have now downloaded:

**`benthic.gpkg`**

This is a dataframe of Shapely objects ("geometry" polygons) defining the boundaries of different benthic classes:

| Class           	| Number of polygons 	|
|-----------------	|--------------------	|
| Coral/Algae     	| 877787             	|
| Rock            	| 766391             	|
| Rubble          	| 568041             	|
| Sand            	| 518805             	|
| Microalgal Mats 	| 27569              	|

In [None]:
# Load the benthic data for the region: the .gpkg file is read and saved in .pkl format,
# which is faster to read on later runs.
region_baseline_dir = directories.get_reef_baseline_dir() / file_name
region_benthic_df = file_ops.check_pkl_else_read_gpkg(region_baseline_dir, filename="benthic.pkl")
region_benthic_df.head()

### Rasterize polygons

Rasterized arrays are necessary to process the geospatial data e.g. to align different gridcells. Doing this locally through rasterio requires such significant compute that cloud computing is the only reasonable option. A JavaScript file (`rasterization.js`) for use in Google Earth Engine (GEE) is accessible [here](https://code.earthengine.google.com/ae68c68309b04643e8f5f5dc45f0dbca). Visit [this page](https://developers.google.com/earth-engine/guides/getstarted) for information regarding setting up a GEE account and getting started.

GEE requires shapefile (.shp) format to ingest data. This is generated in the following cell:


In [None]:
# Convert the benthic dataframe to a gpd.GeoDataFrame.
# Only the "Coral/Algae" class is of interest, so the result is limited to these rows by default.
gdf_coral = reef_extent.process_benthic_pd(region_benthic_df)
# Save as a shapefile (if not already present) for rasterisation in GEE.
reef_extent.generate_coral_shp(gdf_coral, file_name)

1. Ingest the shapefile (and all accompanying files: .cpg, .dbf, .prj, .shx) as a GEE asset.
2. Import the subsequent `Table` into the script.
3. Update the `resolution` variable as desired (usually that matching the target resolution specified above).
4. Run the script, and submit the `coral_raster_Xm` task. Sit back and wait! After ~1 hour (depending on the chosen resolution) the rasters will be available to download from your Google Drive as GeoTIFFs: after this, add them to the `reef_baseline` directory and carry on with the following cells.

In [None]:
# Process all tifs in the reef_extent directory to nc files at the specified target resolution.
# NOTE(review): no bare `process_reef_extent_tifs` is imported in this notebook; it is assumed
# to live in the imported `reef_extent` module -- confirm.
reef_extent.process_reef_extent_tifs(target_resolution_d=target_resolution_d)

## Global Ocean Physics Reanalysis

The dataset and its metadata can be accessed [here](https://doi.org/10.48670/moi-00021).

### Download data

You're required to set up an account with the [Copernicus Marine Service](https://marine.copernicus.eu/). 


**Warning:** this is a large amount of data, and the only way to gather it is to query the Copernicus API via motu. Requests are queued, and smaller requests are floated to the top of the queue. The following functions take advantage of this by splitting a single request up by date and variable before amalgamating the files, but this can still take a **very long time**, and vary significantly depending on overall website traffic. For those who aren't interested in the entire database, it's highly recommended that you use the toy dataset provided as a `.npy` file in the GitHub repository.


In [None]:
# Download the monthly reanalysis data for every selected area.
# The request can be adjusted to specify a subset of variables, dates, and depths to download;
# the values used here are those reported in the accompanying paper.

for area_name in region_letters:
    # Assemble the request for this area, then submit it in a single call.
    monthly_request = dict(
        download_dir=directories.get_monthly_cmems_dir(),
        region=reef_areas.get_short_filename(area_name),
        final_filename=f"cmems_gopr_monthly_{area_name}",
        lat_lims=reef_areas.get_lat_lon_limits(area_name)[0],
        lon_lims=reef_areas.get_lat_lon_limits(area_name)[1],
        product_id="cmems_mod_glo_phy_my_0.083_P1M-m",
    )
    _, _ = climate_data.download_reanalysis(**monthly_request)


In [None]:
# Download the daily reanalysis data for every selected area.
for area_name in region_letters:
    # Assemble the request for this area, then submit it in a single call.
    daily_request = dict(
        download_dir=directories.get_daily_cmems_dir(),
        region=reef_areas.get_short_filename(area_name),
        final_filename=f"cmems_gopr_daily_{area_name}",
        lat_lims=reef_areas.get_lat_lon_limits(area_name)[0],
        lon_lims=reef_areas.get_lat_lon_limits(area_name)[1],
        product_id="cmems_mod_glo_phy_my_0.083_P1D-m",
    )
    _, _ = climate_data.download_reanalysis(**daily_request)

In [None]:
# Resample the daily climate data to the desired resolution and save it in the region's
# daily CMEMS directory.
# NOTE(review): `xa_cmems_daily` and `cmems_daily_path` are not defined in any cell of this
# notebook, and `area_name` is whatever the daily download loop left behind (its last region),
# so only one region is handled here -- confirm the intended inputs before running.
_, _ = spatial_data.upsample_and_save_xa_a(
    directories.get_daily_cmems_dir() / reef_areas.get_short_filename(area_name), 
    xa_d=xa_cmems_daily, name=cmems_daily_path.stem, target_resolution_d=target_resolution_d)

## Load ERA5 data

European Reanalysis v.5 (ERA5) is the fifth-generation reanalysis of the global climate and weather over the past 8 decades, produced by the European Centre for Medium-Range Weather Forecasts (ECMWF).

The dataset and its metadata may be accessed [here](https://doi.org/10.24381/cds.adbb2d47).

In [None]:
# Download ERA5 data for each selected region.
# NOTE(review): no bare `generate_era5_data` is imported in this notebook; it is assumed to
# live in the imported `climate_data` module -- confirm.
for region_letter in region_letters:
    # Look the limits up once; index 0 holds the latitude limits and index 1 the longitude limits.
    lat_lon_limits = reef_areas.get_lat_lon_limits(region_letter)
    # The loop variable is left untouched; the short file name is passed directly as `region`.
    climate_data.generate_era5_data(
        lat_lims=lat_lon_limits[0],
        lon_lims=lat_lon_limits[1],
        region=reef_areas.get_short_filename(region_letter),
    )

#### All necessary data has now been downloaded.

