In [1]:
# print the current working directory of the kernel
!pwd

/Users/orlandotimmerman/Library/CloudStorage/OneDrive-UniversityofCambridge/cambridge/phd/coralshift


In [12]:
# reload imported modules automatically before running code, so edits to local
# .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path

os.chdir("/lustre_scratch/orlando-code/coralshift/")

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
import xarray as xa
from rasterio import features

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Random forest classifier using scikit-learn

In [2]:
# print the current working directory (this cell performs no imports)
!pwd


/lustre_scratch/orlando-code/coralshift


In [3]:
# navigate to data directory: relative dataset paths used further down
# (e.g. "unep-wcmc/...") are resolved against this location
# NOTE(review): absolute, machine-specific path - consider a configurable DATA_DIR
os.chdir("/lustre_scratch/orlando-code/datasets/")

In [13]:
def rasterize_geodf(geo_df: "gpd.GeoDataFrame", lat_resolution: float=1., lon_resolution: float=1.) -> np.ndarray:
    """Rasterize the geometries of a geodataframe to a binary numpy array.

    The grid covers the total bounds of the 'geometry' column. Every cell touched
    by at least one geometry is set to 1; all remaining cells are 0.

    Args:
        geo_df (gpd.GeoDataFrame): Geodataframe whose 'geometry' column is rasterized
        lat_resolution (float): Resolution of the raster in degrees latitude
        lon_resolution (float): Resolution of the raster in degrees longitude

    Returns:
        np.ndarray: uint8 array of shape (height, width)
    TODO: add crs customisation. Probably from class object elsewhere. Currently assumes EPSG:4326."""

    xmin, ymin, xmax, ymax, width, height = lat_lon_vals_from_geo_df(geo_df, lon_resolution, lat_resolution)
    # Create the transform based on the extent and resolution.
    # The transform only maps pixel indices to coordinates; it carries no CRS, and
    # rasterize() does not read one from it, so no CRS attribute is attached here
    # (the coordinates are simply taken to be in the geodataframe's own CRS).
    transform = rasterio.transform.from_bounds(xmin, ymin, xmax, ymax, width, height)

    # Any chance of a loading bar? No: would have to dig into the function itself.
    # could be interesting...
    return features.rasterize(
        [(shape, 1) for shape in geo_df['geometry']],
        out_shape=(height, width),
        transform=transform,
        fill=0,
        all_touched=True,
        dtype=rasterio.uint8)


def raster_to_xarray(raster: np.ndarray, x_y_limits: np.ndarray, lat_resolution: float=1., lon_resolution: float=1.) -> xa.DataArray:
    """Wrap a 2D raster in an xarray DataArray with latitude/longitude coordinates.

    Args:
        raster (np.ndarray): Raster to convert, laid out as (latitude, longitude)
        x_y_limits (np.ndarray): Extent as (lon_min, lat_min, lon_max, lat_max)
        lat_resolution (float): Resolution of the raster in degrees latitude
        lon_resolution (float): Resolution of the raster in degrees longitude

    Returns:
        xa.DataArray: DataArray of the raster with dims ('latitude', 'longitude')
            and a 'crs' attribute of 'EPSG:4326'
    TODO: add attributes kwarg (e.g. a description) and make the CRS a kwarg
    """

    west, south, east, north = x_y_limits

    # number of coordinate values along each axis implied by extent / resolution
    n_lon = int((east - west) / lon_resolution)
    n_lat = int((north - south) / lat_resolution)

    # NOTE(review): linspace includes both end points, so the spacing between
    # coordinates is extent / (n - 1) rather than exactly the resolution - confirm
    # this is the intended cell registration.
    lon_values = np.linspace(west, east, n_lon)
    # latitude runs from lat_max down to lat_min: the first raster row is the northern edge
    lat_values = np.linspace(north, south, n_lat)

    coordinates = {'latitude': lat_values, 'longitude': lon_values}
    data_array = xa.DataArray(raster, coords=coordinates, dims=['latitude', 'longitude'])

    # record the coordinate reference system (currently fixed)
    data_array.attrs['crs'] = 'EPSG:4326'
    return data_array


def lat_lon_vals_from_geo_df(geo_df: "gpd.GeoDataFrame", lon_resolution: float=1., lat_resolution: float=1.) -> tuple:
    """Return the bounding box of a geodataframe and the raster size it implies.

    Args:
        geo_df (gpd.GeoDataFrame): Geodataframe whose 'geometry' column provides
            total_bounds as (lon_min, lat_min, lon_max, lat_max)
        lon_resolution (float): Resolution of the raster in degrees longitude
        lat_resolution (float): Resolution of the raster in degrees latitude

    Returns:
        tuple: (lon_min, lat_min, lon_max, lat_max, width, height), where width and
            height are the whole number of pixels that fit in the extent (truncated)

    Raises:
        ValueError: if a resolution is not a positive number, or if the bounds are
            not finite (which is what an empty set of geometries is expected to yield)
    """
    # `not (x > 0)` also rejects NaN resolutions
    if not (lon_resolution > 0 and lat_resolution > 0):
        raise ValueError(
            f"Resolutions must be positive, got lon_resolution={lon_resolution}, "
            f"lat_resolution={lat_resolution}")

    # Calculate the extent in degrees from bounds of geometry objects
    lon_min, lat_min, lon_max, lat_max = geo_df['geometry'].total_bounds
    if not np.all(np.isfinite([lon_min, lat_min, lon_max, lat_max])):
        raise ValueError("Geometry bounds are not finite; cannot derive a raster extent")

    # Calculate the width and height of the raster in pixels based on the extent and resolution
    width = int((lon_max - lon_min) / lon_resolution)
    height = int((lat_max - lat_min) / lat_resolution)

    return lon_min, lat_min, lon_max, lat_max, width, height


def rasterise_points_df(df: pd.DataFrame, lat_column: str, lon_column: str,
    lat_resolution: float=1., lon_resolution: float=1., 
    bbox: list[float]=[-90,-180,90,180]) -> np.ndarray:
    """Rasterize a pandas dataframe of points to a numpy array.
    
    Args:
        df (pd.DataFrame): Dataframe of points to rasterize
        lat_resolution (float): Resolution of the raster in degrees latitude
        lon_resolution (float): Resolution of the raster in degrees longitude

    Returns:
        np.ndarray: Rasterized numpy array"""

    # extract bbox limits of your raster
    min_lat, min_lon, max_lat, max_lon = bbox

    # Calculate the number of rows and columns in the raster
    num_rows = int((max_lat - min_lat) / lat_resolution)
    num_cols = int((max_lon - min_lon) / lon_resolution)

    # Initialize an empty raster (of zeros)
    raster = np.zeros((num_rows, num_cols), dtype=int)

    # Convert latitude and longitude points to row and column indices of raster
    row_indices = ((max_lat - df[lat_column]) // lat_resolution).astype(int)
    col_indices = ((df[lon_column] - min_lon) // lon_resolution).astype(int)

    # Filter coordinates that fall within the bounding box: this produces a binary mask
    valid_indices = ((min_lat <= df[lat_column]) & (df[lat_column] <= max_lat) &
                    (min_lon <= df[lon_column]) & (df[lon_column] <= max_lon))

    # # Update the raster with counts of valid coordinates
    raster[row_indices[valid_indices], col_indices[valid_indices]] += 1

    # list of row, column indices corresponding to each latitude/longitude point
    valid_coordinates = list(zip(row_indices[valid_indices], col_indices[valid_indices]))
    # count number of repeated index pairs and return unique
    unique_coordinates, counts = np.unique(valid_coordinates, axis=0, return_counts=True)
    # assign number of counts to each unique raster
    raster[unique_coordinates[:, 0], unique_coordinates[:, 1]] = counts

    return raster

In [None]:
# load in ground truth: read the shapefile into a geodataframe
# (relative path - resolved against the current working directory)
unep_fp = Path("unep-wcmc/01_Data/WCMC008_CoralReef2021_Py_v4_1.shp")
unep_gdf = gpd.read_file(unep_fp)

# rasterise the geometries on a 0.01 x 0.01 degree grid (1 where a geometry touches a cell)
lon_resolution, lat_resolution = 0.01, 0.01
unep_raster = rasterize_geodf(unep_gdf, lat_resolution=lat_resolution, lon_resolution=lon_resolution)


: 

In [None]:
# load in environmental data


: 