In [1]:
# Standard libraries
import datetime as dt
import os

# Third-party libraries
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rioxarray
import xarray as xr
import matplotlib.pyplot as plt

# Local libraries
from utilities import netcdf


In [2]:
# Constants
# Every directory used by the notebook hangs off <current working dir>/data,
# so the notebook must be started from the project root.
ROOT_DIR = os.path.join(os.getcwd(), "data")
HAD_DIR, GEO_DIR, VAR_DIR, OUT_DIR = (
    os.path.join(ROOT_DIR, relative)
    for relative in (
        "intermediate/hadukgrid",  # HAD_DIR: HadUK-Grid input files
        "processed/space",  # GEO_DIR: region boundaries (space.geojson)
        "processed/variable",  # VAR_DIR: not referenced in the cells shown here
        "processed/climate",  # OUT_DIR: one CSV per climate variable
    )
)


In [3]:
# Climate data
# Collect the input files found under HAD_DIR using the project's
# `utilities.netcdf` helper. The aggregation loop at the end of the notebook
# iterates over this collection and opens each entry with `xr.open_dataset`.
# NOTE(review): `path=True` presumably makes the helper return full file paths
# rather than bare file names -- confirm against utilities/netcdf.py.
NC_FILES = netcdf.list_files(HAD_DIR, path=True)


In [5]:
# Spatial data
# Load the region boundaries from GEO_DIR/space.geojson into a GeoDataFrame.
# Downstream, `create_mask` reads each row's `geometry` and `id` attributes.
# NOTE(review): the geometries are later clipped against the climate grid
# without passing an explicit CRS, so they are presumably already expressed in
# the grid's projection -- confirm before swapping in a different boundary file.
space = gpd.read_file(os.path.join(GEO_DIR, "space.geojson"))
space  # bare last expression: shows the table as the cell output

Unnamed: 0,name,x_coord,y_coord,id,parent_id,geometry
0,United Kingdom,348890.323001,461087.31387,gb,,"MULTIPOLYGON (((336177.917 1014260.958, 336272..."
1,England,436325.368279,296147.848719,gb-eng,gb,"MULTIPOLYGON (((84009.402 5361.604, 84022.099 ..."
2,Isle of Man,234265.323626,484561.023312,gb-iom,gb,"MULTIPOLYGON (((216561.496 466536.198, 216335...."
3,Northern Ireland,97015.507927,533949.890497,gb-nir,gb,"MULTIPOLYGON (((45089.390 487652.021, 44926.19..."
4,Scotland,266651.514962,771689.385884,gb-sct,gb,"MULTIPOLYGON (((336177.917 1014260.958, 336272..."
5,Wales,279739.337741,271715.748248,gb-wls,gb,"MULTIPOLYGON (((322349.697 164637.902, 322360...."
6,East Midlands,481004.614478,337237.029489,gb-emd,gb-eng,"POLYGON ((534657.901 406164.796, 533900.897 40..."
7,East Scotland,327936.681532,732569.478746,gb-esc,gb-sct,"MULTIPOLYGON (((320766.200 680581.505, 320697...."
8,East of England,573749.53721,263770.9148,gb-een,gb-eng,"POLYGON ((585951.803 181704.897, 585934.901 18..."
9,London,531331.013781,179645.800174,gb-lnd,gb-eng,"POLYGON ((531788.999 200757.503, 531643.597 20..."


In [40]:
def create_mask(gdf, da):
    """Builds one boolean grid mask per row of `gdf`, stacked along "space".

    Parameters
    ----------
    gdf : GeoDataFrame
        Regions to rasterise. Each row must expose `geometry` and `id`.
    da : xarray.DataArray
        Template grid. Only its shape, coordinates and CRS (`da.rio.crs`) are
        used; its values are ignored.

    Returns
    -------
    xarray.DataArray
        Boolean array named "mask" with the dimensions of `da` plus a "space"
        dimension labelled by the `id` of each row. A cell is True where
        rioxarray's clip keeps it for that row's geometry.

    Notes
    -----
    The geometries are handed to `rio.clip` without a CRS argument.
    NOTE(review): this presumably requires `gdf` to be in the same projection
    as `da` -- confirm.
    """
    # A grid of ones carrying the template's CRS; clipping blanks out every
    # cell outside the geometry, so comparing with 1 afterwards gives the mask.
    template = xr.ones_like(da).rename("mask")
    template = template.rio.write_crs(da.rio.crs)

    layers = []
    for row in gdf.itertuples():
        # drop=False keeps the full grid extent so all layers share one shape.
        inside = template.rio.clip([row.geometry], drop=False) == 1
        layers.append(inside.expand_dims(dim={"space": [row.id]}))

    return xr.concat(layers, dim="space")


In [41]:
def to_table(ds, data_var):
    """Converts a spatially aggregated HadUK-Grid Dataset to a pandas DataFrame formatted for the CWUK database.

    Parameters
    ----------
    ds : xarray.Dataset (or any mapping whose items offer `to_dataframe()`)
        `ds[data_var].to_dataframe()` must yield a frame indexed by "space"
        and "time" with a `data_var` column.
    data_var : str
        Name of the variable to export; also written to the `variable_id`
        column of every row.

    Returns
    -------
    pandas.DataFrame
        Long-format table with the columns `variable_id`, then the former
        index levels renamed to `space_id` / `time_id`, then `value`.
        `time_id` is an int64 in YYYYMM form (e.g. 202001 for January 2020).

    Raises
    ------
    Whatever `pd.to_datetime` raises when the "time" values cannot be
    interpreted as timestamps.
    """
    df = (
        ds[data_var]
        .to_dataframe()[data_var]
        .reset_index()
        .rename(
            columns={data_var: "value", "time": "time_id", "space": "space_id"}
        )
    )
    # Build the YYYYMM key with vectorised datetime accessors instead of a
    # per-row apply/strftime round trip: same integers, far less Python
    # overhead, and it also works when the table has no rows.
    timestamps = pd.to_datetime(df["time_id"]).dt
    df["time_id"] = (timestamps.year * 100 + timestamps.month).astype("int64")
    # A scalar is broadcast to every row, so no throw-away list is needed.
    df.insert(loc=0, column="variable_id", value=data_var)
    return df


In [44]:
# Names of the two horizontal grid dimensions that are averaged away below.
Y_COORD = "projection_y_coordinate"
X_COORD = "projection_x_coordinate"

# Create the output folder up front so `to_csv` cannot fail on a fresh
# checkout where data/processed/climate does not exist yet.
os.makedirs(OUT_DIR, exist_ok=True)

# For every input file: build a per-region mask on the file's grid, average the
# masked cells for each region and time step, and write one CSV per variable.
# NOTE(review): the CSV is named after the variable only, so two input files
# holding the same variable overwrite each other -- confirm HAD_DIR holds
# exactly one file per variable.
for fp in NC_FILES:
    # decode_coords="all" is requested so that auxiliary variables are decoded
    # as coordinates; chunks="auto" opens the data lazily (dask-backed).
    with xr.open_dataset(fp, decode_coords="all", chunks="auto") as ds:
        # The first data variable is taken to be the climate variable.
        DATA_VAR = list(ds.data_vars)[0]
        # One time slice is enough to describe the grid for the mask.
        mask_template = ds[DATA_VAR].isel({"time": 0})
        # NOTE(review): the mask is rebuilt for every file; if all files share
        # one grid it could be computed once and reused.
        mask = create_mask(space, mask_template)
        # `where` broadcasts the data against the "space" dimension of the
        # mask; the mean then collapses the grid to one value per region/time.
        mean = ds.where(mask).mean([Y_COORD, X_COORD])
        df = to_table(mean, DATA_VAR)
        df.to_csv(os.path.join(OUT_DIR, DATA_VAR + ".csv"), index=False)
