<a href="https://colab.research.google.com/github/nithecs-biomath/mini-schools/blob/main/cube_prac_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BUILDING DATA CUBES
## NITheCS mini school: lecture 2

### Install missing packages

In [None]:
%pip install pygbif
%pip install mgrs

### Only execute the following block when using the TPU kernel

In [None]:
%pip install geopandas
%pip install pydrive
%pip install ee
%pip install eerepr
%pip install geemap

### Loading packages

In [None]:
from pygbif import occurrences as occ
import pandas as pd
import geopandas as gpd
from pyproj import Proj, Transformer
from shapely.geometry import mapping
from shapely.geometry import Polygon
import matplotlib.pyplot as plt

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import drive
from oauth2client.client import GoogleCredentials
import io
import zipfile
import mgrs
import math

### Loading Earth Engine

In [None]:
import ee
import eerepr
import geemap

# Authenticate (force=True is passed on every run) and initialise the client
# against the course's Cloud project.
ee.Authenticate(force=True)
ee.Initialize(project='nithecs-436810')

# Earth Engine asset identifiers used by the collections below.
LANDSAT_ID = "LANDSAT/LC08/C02/T1_L2"
BOUNDARIES_ID = 'FAO/GAUL/2015/level1'
WDPA_ID = 'WCMC/WDPA/current/polygons'

# Image collection LANDSAT_ID, filtered to the 2021-05-01 .. 2021-06-01 window.
dataset = ee.ImageCollection(LANDSAT_ID).filterDate('2021-05-01', '2021-06-01')

# Boundary features whose ADM0_NAME attribute equals "South Africa".
sa = ee.FeatureCollection(BOUNDARIES_ID).filter('ADM0_NAME == "South Africa"')

# Unfiltered feature collection WDPA_ID (added to the map later on).
protected_areas = ee.FeatureCollection(WDPA_ID)

# Keep only the images whose footprint intersects the `sa` features.
sa_landsat = dataset.filterBounds(sa)


### Example of the GBIF API through pygbif

In [None]:
from pygbif import occurrences
data = occurrences.search(speciesKey=5229490, limit=10)

print(data['results'])

## GBIF data Cubes

### Generating the Cube

#### Exemplar JSON query for generating a data cube

In [None]:
import json

# Exemplar request body for generating a data cube, kept as a Python dict so
# that this code cell actually runs (raw JSON uses `true` and a multi-line
# double-quoted string, neither of which is valid Python).
cube_query = {
    "sendNotification": True,
    "notificationAddresses": [
        "maarten.trekels@plantentuinmeise.be"
    ],
    "format": "SQL_TSV_ZIP",
    # The SQL groups occurrences by year-month, grid cell and species and
    # counts records and distinct observers per group.
    "sql": """SELECT  PRINTF('%04d-%02d', "year", "month") AS yearMonth,
   GBIF_EEARGCode(10000, decimalLatitude,  decimalLongitude,  COALESCE(coordinateUncertaintyInMeters, 1000) ) AS eeaCellCode,
   speciesKey,
   species,
   establishmentMeans,
   degreeOfEstablishment,
   pathway,
   COUNT(*) AS occurrences,
   COUNT(DISTINCT recordedBy) AS distinctObservers
   FROM  occurrence
   WHERE occurrenceStatus = 'PRESENT'
   AND countryCode = 'BE'
   AND hasCoordinate = TRUE
   AND NOT ARRAY_CONTAINS(issue, 'ZERO_COORDINATE')
   AND NOT ARRAY_CONTAINS(issue, 'COORDINATE_OUT_OF_RANGE')
   AND NOT ARRAY_CONTAINS(issue, 'COORDINATE_INVALID')
   AND NOT ARRAY_CONTAINS(issue, 'COUNTRY_COORDINATE_MISMATCH')
   AND "month" IS NOT NULL
   GROUP BY yearMonth,
   eeaCellCode,
   speciesKey,
   species,
   establishmentMeans,
   degreeOfEstablishment,
   pathway
   ORDER BY  yearMonth DESC,
   eeaCellCode ASC,
   speciesKey ASC""",
}

# Render the query as JSON text (newlines inside the SQL are escaped as \n).
print(json.dumps(cube_query, indent=2))


## Loading the Data cube in pandas



#### Download from GitHub

You can download a pre-generated data cube from GitHub or any other online resource.

In [None]:
#data = pd.read_csv('https://raw.githubusercontent.com/nithecs-biomath/mini-schools/refs/heads/main/data/sample_data_SA.csv', sep='\t')

#print(data)

#### Download from Google Drive

In [None]:
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/Shareddrives/NiTheCS mini school/demo_data/Cube_ZA_QDGC_l3.csv', sep='\t')


In [None]:
print(data)

## Getting a GeoPackage file for the grid that you use

In [None]:
# Load QDGC code

input_file = "/content/drive/Shareddrives/NiTheCS mini school/demo_data/qdgc_south_africa.gpkg"

qdgc_ref = gpd.read_file(input_file, layer='tbl_qdgc_03')

In [None]:
print(qdgc_ref)

## Merging the Data cube with the grid

In [None]:
#testing if I can merge data and qdgc

test_merge = pd.merge(data, qdgc_ref, left_on='qdgccode', right_on='qdgc')

print(test_merge)


In [None]:
# Convert to GeoDataFrame

gdf = gpd.GeoDataFrame(test_merge, geometry='geometry')


## Filtering data (e.g. on species)

In [None]:
#check for a single species
filtered_gdf = gdf[gdf['specieskey'].eq(2435350.0)]

print(filtered_gdf)


## Convert the filtered GeoDataFrame to an Earth Engine object

In [None]:

filtered_gdf = filtered_gdf.set_crs(epsg=4326, inplace=False)

data_raw = geemap.geopandas_to_ee(filtered_gdf)

print(type(data_raw))


## Visualization of the data cubes on a map with different layers

In [None]:
Map = geemap.Map(layout={"height": "400px", "width": "800px"})


# Add the original data layer in blue
Map.addLayer(data_raw, {"color": "blue"}, "Original data")

Map.addLayer(sa_landsat)

Map.addLayer(protected_areas)


# Centre the map on the study area. geemap's setCenter takes (lon, lat), so
# longitude 29.41 comes first and latitude -28.50 second; passing them the
# other way round centres the map in the North Atlantic instead of South Africa.
Map.setCenter(29.41, -28.50)
Map

In [None]:
### Test with NetCDF format

# EBV data cubes in NetCDF format

In [None]:
%pip install netCDF4

In [None]:
%pip install rioxarray
%pip install cartopy
%pip install basemap

In [None]:
import netCDF4 as nc
import xarray as xr


birds_file = xr.open_dataset('/content/drive/Shareddrives/NiTheCS mini school/demo_data/viti_spepop_id77_20240206_v1.nc')

print(birds_file)

In [None]:
print(birds_file.variables)

In [None]:
time = birds_file.variables['time']
print(time)

print(birds_file['entity'])

In [None]:
# Print a detailed view of all data variables
for var in birds_file.data_vars:
    print(f"Variable: {var}")
    print(birds_file[var])
    print("\n")

In [None]:
print(birds_file['entity'].values)

In [None]:
from netCDF4 import Dataset as NetCDFFile
from mpl_toolkits.basemap import Basemap
import numpy as np


In [None]:
drive.mount('/content/drive')

In [None]:
nc = NetCDFFile('/content/drive/Shareddrives/NiTheCS mini school/demo_data/viti_spepop_id77_20240206_v1.nc')

In [None]:
print(nc)

In [None]:
lat = nc.variables['lat'][:]
lon = nc.variables['lon'][:]
time = nc.variables['time'][:]

In [None]:
crs = nc.variables['crs'][:]
ent = nc.variables['entity'][:]

In [None]:
entities = nc.variables['entity'][:]
print(entities)
species_names = [''.join(entity.astype(str)).strip() for entity in entities]

# Print the species names for inspection
for idx, species in enumerate(species_names):
    print(f'{idx}: {species}')

In [None]:
print(nc.groups['metric_1'].variables)

In [None]:
from pyproj import Transformer

# Build one EPSG:3035 -> EPSG:4326 transformer. This replaces the deprecated
# Proj(init='epsg:...') objects and the deprecated module-level
# pyproj.transform() function. always_xy=True pins the axis order to
# (x/lon, y/lat) on both sides, which is the order the old init= syntax used,
# so the call below keeps the same argument and result order.
to_wgs84 = Transformer.from_crs("EPSG:3035", "EPSG:4326", always_xy=True)

# Convert lon and lat arrays from meters (EPSG:3035) to degrees (EPSG:4326)
lon_deg, lat_deg = to_wgs84.transform(lon, lat)

# Now lon_deg and lat_deg are in degrees, and can be used with the 'aea' projection in Basemap


In [None]:
mp = Basemap(projection='aea', ellps='WGS84', lon_0=10, lat_0=52, lat_1=37, lat_2=62,
            llcrnrlon=min(lon_deg.flatten()), llcrnrlat=min(lat_deg.flatten()),
            urcrnrlon=max(lon_deg.flatten()), urcrnrlat=max(lat_deg.flatten()))


In [None]:
species_index = 10

species_distribution = nc.groups['metric_1'].variables['ebv_cube'][species_index, :, :]

In [None]:
# Remove length-1 axes from the selected slice before it is inspected or plotted.
species_distribution_2d = np.squeeze(species_distribution)
# Mask NaN values in the species distribution data
species_distribution_masked = np.ma.masked_invalid(species_distribution_2d)

# Report the shapes only after species_distribution_2d has been assigned;
# printing it first raises NameError when the notebook is run top to bottom.
print(f'lon shape: {lon.shape}')
print(f'lat shape: {lat.shape}')
print(f'species_distribution shape: {species_distribution_2d.shape}')

In [None]:
print(lon)

In [None]:
# Draw the masked species-distribution grid on the Basemap instance `mp`.
# NOTE(review): this uses `lon`/`lat` as read from the NetCDF file, not the
# `lon_deg`/`lat_deg` arrays computed earlier. If the file coordinates really
# are EPSG:3035 meters (as the conversion cell states), the degree arrays are
# presumably what `mp(...)` expects; confirm against the data.
#lon, lat = np.meshgrid(lon,lat)  # would expand the 1-D coordinates into 2-D arrays
x, y = mp(lon,lat) # project the coordinates into the map's x/y space

# Colour mesh of the masked values using the 'YlGn' colormap.
cs = mp.pcolormesh(x, y, species_distribution_masked, cmap='YlGn', shading='auto')

# Outline layers drawn on top of the mesh: coastlines, states, countries.
mp.drawcoastlines()
mp.drawstates()
mp.drawcountries()
#plt.colorbar(cs, label='Species Distribution')

plt.show()

# Random functions to test: do not use! :-)

In [None]:
def qdgc_to_polygon(qdgc):
    """Build a square shapely Polygon from a grid-cell code string.

    The code is read by position: character 0 is the longitude hemisphere
    letter, characters 1-3 the longitude degrees, character 4 the latitude
    hemisphere letter, characters 5-6 the latitude degrees, and everything
    from position 7 onwards is a two-letter sub-cell label ('AA' .. 'DD').

    The first label letter selects the latitude offset and the second the
    longitude offset (A=0, B=0.25, C=0.5, D=0.75 degrees). The offsets are
    added to the signed integer degrees, and the returned square is 0.25
    degrees on each side.

    NOTE(review): the label is handled as row/column indices into a 4 x 4
    grid, and only two-letter labels are accepted. Confirm that this matches
    the grid's own sub-division scheme before relying on the output.

    Args:
        qdgc: Grid-cell code string laid out as described above.

    Returns:
        shapely Polygon with (lon, lat) vertices, explicitly closed.

    Raises:
        ValueError: if a degree field is not an integer.
        KeyError: if the sub-cell label is not one of the 16 two-letter keys.
    """
    # Signed whole degrees; 'W' and 'S' flip the sign.
    base_lon = int(qdgc[1:4])
    base_lat = int(qdgc[5:7])
    if qdgc[0] == 'W':
        base_lon = -base_lon
    if qdgc[4] == 'S':
        base_lat = -base_lat

    cell = 1 / 4  # edge length of one sub-cell, in degrees
    letters = 'ABCD'
    # label -> (longitude offset, latitude offset); second letter drives longitude.
    offsets = {
        row_letter + col_letter: (col_idx * cell, row_idx * cell)
        for row_idx, row_letter in enumerate(letters)
        for col_idx, col_letter in enumerate(letters)
    }
    lon_offset, lat_offset = offsets[qdgc[7:]]

    west = base_lon + lon_offset
    south = base_lat + lat_offset
    east = west + cell
    north = south + cell

    ring = [(west, south), (east, south), (east, north), (west, north), (west, south)]
    return Polygon(ring)




# Apply function to get polygons
#df = pd.DataFrame(data['qdgccode'].unique())



#ata['geometry'] = data['qdgccode'].apply(qdgc_to_polygon)
data['geometry'] = data['qdgccode'].apply(qdgc_to_polygon)
#geom = qdgc_to_polygon(df[0].values())


In [None]:
def meters_to_degrees(lat, meters):
    """Convert a ground distance in meters to approximate spans in degrees.

    One degree of latitude is taken as 111,320 m everywhere; one degree of
    longitude is that figure scaled by cos(lat), so it shrinks towards the poles.

    Args:
        lat: Latitude, in degrees, at which the longitude span is evaluated.
        meters: Distance to convert.

    Returns:
        Tuple ``(deg_lat, deg_lon)``: the distance expressed as degrees of
        latitude and degrees of longitude respectively.
    """
    meters_per_degree = 111320
    parallel_scale = math.cos(math.radians(lat))
    return meters / meters_per_degree, meters / (meters_per_degree * parallel_scale)

def mgrs_to_polygon(mgrs_code, grid_size_meters=10000):
    """Approximate an MGRS grid square as a lat/lon-aligned shapely Polygon.

    The reference point returned by ``mgrs.MGRS().toLatLon`` is used as one
    corner (presumably the lower-left / south-west corner; verify against the
    mgrs package). The square is then extended north and east by
    ``grid_size_meters``, converted to degrees with ``meters_to_degrees`` at
    the corner's latitude.

    Args:
        mgrs_code: MGRS reference string.
        grid_size_meters: Edge length of the grid square in meters. Defaults
            to 10000 (the value previously hard-coded); pass the size that
            matches the precision of ``mgrs_code``.

    Returns:
        shapely Polygon with (lon, lat) vertices in the order lower-left,
        lower-right, upper-right, upper-left.
    """
    mgrs_converter = mgrs.MGRS()

    # Reference corner of the grid square, returned as (lat, lon).
    lat, lon = mgrs_converter.toLatLon(mgrs_code)

    # Edge length in degrees; the longitude span depends on the latitude.
    grid_size_lat_deg, grid_size_lon_deg = meters_to_degrees(lat, grid_size_meters)

    polygon_points = [
        (lon, lat),  # lower-left
        (lon + grid_size_lon_deg, lat),  # lower-right
        (lon + grid_size_lon_deg, lat + grid_size_lat_deg),  # upper-right
        (lon, lat + grid_size_lat_deg)  # upper-left
    ]

    return Polygon(polygon_points)

m = mgrs.MGRS()
def mgrs_to_utm_polygon(mgrs_code, grid_size_meters=10000):
    """Build an MGRS grid square in UTM space and return it in lon/lat.

    The reference point of ``mgrs_code`` is projected into the UTM zone named
    by the code. A ``grid_size_meters`` square is laid out in UTM meters and
    its corners are projected back to EPSG:4326.

    Both transformers are created with ``always_xy=True`` so coordinates are
    always handled as (lon, lat) / (easting, northing). Without it the
    EPSG:4326 side uses (lat, lon) order, and the polygon came back with
    latitude as x and longitude as y, which is inconsistent with
    ``mgrs_to_polygon`` and with shapely/geopandas conventions.

    Args:
        mgrs_code: MGRS reference string that starts with a one- or two-digit
            UTM zone number followed by the latitude-band letter.
        grid_size_meters: Edge length of the grid square in meters. Defaults
            to 10000 (the value previously hard-coded).

    Returns:
        shapely Polygon with (lon, lat) vertices in the order lower-left,
        lower-right, upper-right, upper-left.

    Raises:
        ValueError: if the code does not start with a zone number.
    """
    # Reference corner of the grid square as (lat, lon), via the module-level converter.
    lat, lon = m.toLatLon(mgrs_code)

    # The zone number is the run of leading digits (one or two characters);
    # the latitude-band letter follows it directly.
    zone_digits = len(mgrs_code) - len(mgrs_code.lstrip('0123456789'))
    utm_zone_number = int(mgrs_code[:zone_digits])
    band_letter = mgrs_code[zone_digits].upper()

    # Latitude bands N and later are in the northern hemisphere.
    is_south = band_letter < 'N'

    utm_proj = Proj(proj='utm', zone=utm_zone_number, ellps='WGS84', south=is_south)

    # always_xy=True: inputs/outputs are (lon, lat) on the geographic side.
    transformer_to_utm = Transformer.from_crs("epsg:4326", utm_proj.srs, always_xy=True)
    transformer_to_latlon = Transformer.from_crs(utm_proj.srs, "epsg:4326", always_xy=True)

    # Reference corner in UTM meters.
    x_utm, y_utm = transformer_to_utm.transform(lon, lat)

    utm_corners = [
        (x_utm, y_utm),                                        # lower-left
        (x_utm + grid_size_meters, y_utm),                     # lower-right
        (x_utm + grid_size_meters, y_utm + grid_size_meters),  # upper-right
        (x_utm, y_utm + grid_size_meters),                     # upper-left
    ]

    # Project every corner back; each result is a (lon, lat) pair.
    return Polygon([transformer_to_latlon.transform(x, y) for x, y in utm_corners])

#data['geometry'] = data['mgrscode'].apply(mgrs_to_polygon)