In [1]:
# Standard libraries
import datetime as dt
import os

# Third-party libraries
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rioxarray
import xarray as xr
import matplotlib.pyplot as plt

# Local libraries
from utilities import netcdf


In [2]:
# Directory layout: every path hangs off the "data" folder in the current
# working directory.
ROOT_DIR = os.path.join(os.getcwd(), "data")

# Input folder holding the netCDF files to process
HAD_DIR = os.path.join(ROOT_DIR, "intermediate/hadukgrid/monthly-1km")

# Output folders: one sub-folder per temporal aggregation level
OUT_DIR = os.path.join(ROOT_DIR, "processed/climate")
OUT_DIR_MONTHLY, OUT_DIR_ANNUAL, OUT_DIR_SEASONAL = (
    os.path.join(OUT_DIR, period)
    for period in ("monthly", "annual", "seasonal")
)


In [3]:
# 1km climate data: the netCDF files found under HAD_DIR. Each entry is later
# handed to xr.open_dataset(), so `path=True` presumably yields usable file
# paths -- confirm against utilities.netcdf.
nc_files_1km = netcdf.list_files(HAD_DIR, path=True)

# Location geometries, read from the processed GeoJSON file
location_fp = os.path.join(ROOT_DIR, "processed/location", "location.geojson")
location = gpd.read_file(location_fp)

In [6]:
def get_season_code(month_number):
    """
    Maps a month number to its three-letter season code.

    12, 1, 2 -> "win"; 3, 4, 5 -> "spr"; 6, 7, 8 -> "sum"; 9, 10, 11 -> "aut".
    Any value outside 1-12 yields None.
    """
    season_months = (
        ("win", (12, 1, 2)),
        ("spr", (3, 4, 5)),
        ("sum", (6, 7, 8)),
        ("aut", (9, 10, 11)),
    )
    for code, months in season_months:
        if month_number in months:
            return code
    # No season matched: keep the implicit "no result" contract explicit
    return None


def to_table(ds, data_var):
    """
    Converts a spatially aggregated HadUK-Grid Dataset to a pandas DataFrame
    formatted for the CWUK database.

    `ds[data_var]` is flattened with `to_dataframe()`; the resulting frame must
    expose `time`, `location`, `month_number` and `season_year` columns.

    Parameters:
        ds: dataset-like object; `ds[data_var]` must provide `to_dataframe()`.
        data_var: name of the data variable to export. It is also written to
            the `variable_id` column of every row.

    Returns:
        DataFrame with the columns `variable_id`, `location_id`, `year`,
        `month`, `value`, `season_year`, `season` (in that order).

    Raises:
        KeyError: if one of the required columns is missing.
    """
    df = (
        ds[data_var]
        .to_dataframe()
        .reset_index()
        .rename(
            columns={
                data_var: "value",
                "location": "location_id",
                "month_number": "month",
            }
        )
    )
    # Vectorised year extraction: avoids a Python-level call per row and also
    # works on an empty frame (row-wise apply returns a DataFrame there).
    df["year"] = pd.to_datetime(df["time"]).dt.year.astype("int64")
    # Element-wise lookup on the month column only; months outside 1-12 map to
    # None, exactly as get_season_code() reports them.
    df["season"] = df["month"].map(get_season_code)
    # A scalar is broadcast to every row, no need to build a list
    df.insert(loc=0, column="variable_id", value=data_var)
    # Selecting the output columns also discards `time` and anything else
    return df[
        [
            "variable_id",
            "location_id",
            "year",
            "month",
            "value",
            "season_year",
            "season",
        ]
    ]


def create_mask(gdf, da):
    """
    Builds one boolean spatial mask per row of `gdf`.

    An all-ones array shaped like `da` is clipped to each row's geometry (with
    `drop=False`, so the original extent is kept). A cell is True where the
    clipped array still equals 1. The masks are concatenated along a new
    `location` dimension labelled with each row's `id` attribute, so `gdf`
    needs an `id` column.

    NOTE(review): the geometries are passed to `rio.clip` without a CRS
    argument, so `gdf` is presumably already in the same CRS as `da` -- confirm.
    """
    template = xr.ones_like(da)
    template.name = "mask"
    # Clipping needs a CRS on the array; reuse the one attached to `da`
    template = template.rio.write_crs(da.rio.crs)

    masks = []
    for row in gdf.itertuples():
        clipped = template.rio.clip([row.geometry], drop=False)
        inside = clipped == 1
        masks.append(inside.expand_dims(dim={"location": [row.id]}))

    return xr.concat(masks, dim="location")


In [10]:
Y_COORD = "projection_y_coordinate"
X_COORD = "projection_x_coordinate"

# Coordinates that to_table() relies on; every other coordinate left after the
# spatial mean is dropped before the conversion to a DataFrame.
COORDS_TO_KEEP = ("time", "month_number", "year", "season_year", "location")

# DataFrame.to_csv() does not create missing folders, so make sure the output
# folders exist before the first file is written.
for out_dir in (OUT_DIR_MONTHLY, OUT_DIR_SEASONAL, OUT_DIR_ANNUAL):
    os.makedirs(out_dir, exist_ok=True)

for fp in nc_files_1km:
    with xr.open_dataset(fp, decode_coords="all", chunks="auto") as ds:
        # Each nc file is assumed to hold a single data variable; only the
        # first one is processed.
        DATA_VAR = list(ds.data_vars)[0]
        # Create spatial masks. One for each location
        mask_da = create_mask(location, ds[DATA_VAR])
        # Mean over both spatial axes of the masked data, for each location
        mean_da = ds.where(mask_da).mean([Y_COORD, X_COORD])
        # Drop every coordinate that to_table() does not need. drop_vars() is
        # the supported replacement for the deprecated Dataset.drop().
        coords_to_drop = [
            coord for coord in mean_da.coords if coord not in COORDS_TO_KEEP
        ]
        df_monthly = to_table(mean_da.drop_vars(coords_to_drop), DATA_VAR)

        # Seasonal aggregation: mean of the monthly values per season,
        # grouped on the dataset's own `season_year`.
        df_seasonal = (
            df_monthly[
                ["variable_id", "location_id", "season_year", "season", "value"]
            ]
            .groupby(["season_year", "season", "location_id", "variable_id"])
            .mean()
            .reset_index()
        )
        df_seasonal = df_seasonal.rename(columns={"season_year": "year"})
        # Annual aggregation: mean of the monthly values per calendar year
        df_annual = (
            df_monthly[["variable_id", "location_id", "year", "value"]]
            .groupby(["year", "location_id", "variable_id"])
            .mean()
            .reset_index()
        )
        # Save to disk: one CSV per variable and aggregation level
        df_monthly.to_csv(
            os.path.join(OUT_DIR_MONTHLY, DATA_VAR + "-monthly" + ".csv"),
            index=False,
        )
        df_seasonal.to_csv(
            os.path.join(OUT_DIR_SEASONAL, DATA_VAR + "-seasonal" + ".csv"),
            index=False,
        )
        df_annual.to_csv(
            os.path.join(OUT_DIR_ANNUAL, DATA_VAR + "-annual" + ".csv"),
            index=False,
        )
