In [28]:
import numpy as np
import pandas as pd
import xarray as xr
import rioxarray
import geopandas as gpd
from shapely.geometry import box
import os

In [29]:
def fine_grid_no_interpolation(cdf, regridding_factor, dif_lon=None, dif_lat=None, epsilon = 1e-9):
    """
    Resample a dataset onto a finer grid without interpolation.

    Every coarse cell is split into ``regridding_factor`` x ``regridding_factor``
    sub-cells that all carry the value of their parent cell (Kronecker product
    of the data with a block of ones).

    cdf: NetCDF4 dataset read by xarray. The longitude/latitude index names are
    detected by substring ('lon'/'eas'/'wes'/'x' and 'lat'/'nor'/'sou'/'y') and
    the time coordinate by 'time'/'date'/'year'. Every data variable is assumed
    to be laid out as (time, latitude, longitude).

    regridding_factor: positive integer number of sub-cells per coarse cell along
    each axis. E.g. a factor of 5 turns 0.25-degree ERA5 into a 0.05-degree dataset.

    dif_lon, dif_lat: coarse cell size. Only used for an axis that holds a single
    coordinate; otherwise the spacing is taken from the first two coordinates
    (``lons[1] - lons[0]`` and ``lats[0] - lats[1]``).

    epsilon: kept for backward compatibility only. It is no longer used because
    the fine coordinates are built from exact per-cell offsets instead of a
    floating-point ``np.arange`` step.

    Returns an xarray Dataset with dimensions "time", "latitude", "longitude".

    Raises ValueError when regridding_factor is smaller than 1, or when an axis
    has a single coordinate and the matching dif_lon/dif_lat was not supplied.
    """
    if regridding_factor < 1:
        raise ValueError("regridding_factor must be a positive integer")

    def _first_index(keys):
        # First index name containing any of the given substrings (case-insensitive).
        return [name for name in cdf.indexes if any(key in name.lower() for key in keys)][0]

    lons = np.asarray(cdf.variables[_first_index(('lon', 'eas', 'wes', 'x'))][:], dtype=np.float64)
    lats = np.asarray(cdf.variables[_first_index(('lat', 'nor', 'sou', 'y'))][:], dtype=np.float64)

    # With two or more coordinates the spacing is measured from the data itself;
    # the caller-supplied value is only a fallback for a single-cell axis.
    if lons.size > 1:
        dif_lon = lons[1] - lons[0]
    if lats.size > 1:
        dif_lat = lats[0] - lats[1]
    if dif_lon is None or dif_lat is None:
        raise ValueError("dif_lon and dif_lat are required when an axis has a single coordinate")

    # Sub-cell centres expressed as fractions of one coarse cell, measured from its edge.
    fractions = (np.arange(regridding_factor) + 0.5) / regridding_factor

    # Offsetting every coarse centre guarantees exactly len(axis) * regridding_factor
    # values, matching the shape produced by np.kron below. An np.arange with a
    # float step can gain/lose an element and accumulates error along the axis.
    lats_fine = np.round((lats[:, None] + dif_lat * (0.5 - fractions)).ravel(), 6)
    lons_fine = np.round((lons[:, None] + dif_lon * (fractions - 0.5)).ravel(), 6)

    t = cdf[[name for name in cdf.coords if any(key in name.lower() for key in ['time', 'date', 'year'])][0]]
    times = pd.DatetimeIndex(t.to_pandas())

    # Repeat each value over its block of sub-cells; np.kron leaves the leading
    # (time) axis untouched because the block of ones is two-dimensional.
    block = np.ones((regridding_factor, regridding_factor))
    resampled_vars = {}
    for var_name, data_array in cdf.data_vars.items():
        resampled_vars[var_name] = xr.DataArray(
            np.kron(data_array.values, block),
            coords=[times, lats_fine, lons_fine],
            dims=["time", "latitude", "longitude"],
            name=var_name
        )

    return xr.Dataset(resampled_vars)

In [30]:
def def_dims(nc_file):
    """
    Return the grid spacing and the coordinate values of a dataset.

    The longitude-like and latitude-like index names are the first entries of
    ``nc_file.indexes`` that contain 'lon'/'eas'/'wes'/'x' and
    'lat'/'nor'/'sou'/'y' respectively (case-insensitive).

    Returns ``(dif_lon, dif_lat, lons, lats)``: the absolute float64 distance
    between the first two coordinates of each axis, followed by the coordinate
    arrays themselves. Each axis therefore needs at least two coordinates.
    """
    def coordinate_values(keys):
        # Collect every matching index name, then read the first one as an array.
        matches = [name for name in nc_file.indexes if any(key in name.lower() for key in keys)]
        return np.array(nc_file.variables[matches[0]][:])

    lons = coordinate_values(('lon', 'eas', 'wes', 'x'))
    lats = coordinate_values(('lat', 'nor', 'sou', 'y'))

    lon_step = np.abs(lons[0] - lons[1], dtype= np.float64)
    lat_step = np.abs(lats[0] - lats[1], dtype= np.float64)
    return lon_step, lat_step, lons, lats

In [31]:
def extracting_and_averaging_polygon(poly, nc_file, regridding_factor, crs = 'epsg:4326', epsilon = 1e-9):
    """
    Average every data variable of ``nc_file`` over a polygon, per time step.

    Steps:
      1. clip ``nc_file`` to the coarse cells touched by the polygon
         (``all_touched=True``);
      2. split those cells into finer sub-cells with
         ``fine_grid_no_interpolation``;
      3. clip the fine grid with the polygon again (rioxarray's default
         ``all_touched`` setting);
      4. take the mean of the remaining non-NaN rows for each time step.

    poly: object exposing ``.geometry`` and ``.crs`` (e.g. the GeoDataFrame
    returned by ``create_box``).
    nc_file: xarray dataset. NOTE: it is modified in place - its spatial
    dimensions and CRS are written onto it through the ``rio`` accessor.
    regridding_factor: number of sub-cells per coarse cell along each axis.
    crs: CRS written onto both the coarse and the fine dataset.
    epsilon: forwarded to ``fine_grid_no_interpolation``.

    Returns a pandas DataFrame with a 'time' column and one column per data
    variable holding the polygon average.
    """
    def _first_index(ds, keys):
        # First index name containing any of the given substrings (case-insensitive).
        return [name for name in ds.indexes if any(key in name.lower() for key in keys)][0]

    lon_keys = ('lon', 'eas', 'wes', 'x')
    lat_keys = ('lat', 'nor', 'sou', 'y')

    nc_file.rio.set_spatial_dims(x_dim=_first_index(nc_file, lon_keys),
                                 y_dim=_first_index(nc_file, lat_keys),
                                 inplace=True)
    nc_file.rio.write_crs(crs, inplace=True)
    clipped = nc_file.rio.clip(poly.geometry, poly.crs, all_touched=True)

    # fine_grid_no_interpolation emits a dimension literally named 'time', so
    # normalise whatever the input calls it (e.g. 'valid_time').
    original_time_dim = [name for name in clipped.dims if 'time' in name.lower()][0]
    if original_time_dim != 'time':
        clipped = clipped.rename({original_time_dim: 'time'})

    # The spacing comes from the full dataset: the clipped one may hold a single
    # cell per axis, from which no spacing can be derived.
    dif_lon, dif_lat = def_dims(nc_file)[:2]
    data_set = fine_grid_no_interpolation(cdf=clipped, regridding_factor=regridding_factor,
                                          dif_lon=dif_lon,
                                          dif_lat=dif_lat,
                                          epsilon=epsilon)
    data_set.rio.set_spatial_dims(x_dim=_first_index(data_set, lon_keys),
                                  y_dim=_first_index(data_set, lat_keys),
                                  inplace=True)
    data_set.rio.write_crs(crs, inplace=True)

    c = data_set.rio.clip(poly.geometry, poly.crs)

    # Long format, one row per (time, latitude, longitude); rows with NaN are dropped.
    df_before_groupby = c.to_dataframe().dropna().reset_index()

    avg_df = df_before_groupby.groupby('time')[list(c.data_vars.keys())].mean().reset_index()
    return avg_df

In [32]:
def create_box(center_x, center_y, lon_size, lat_size, crs = 'epsg:4326'):
    """
    Build a one-row GeoDataFrame holding a rectangle centred on a point.

    center_x, center_y: centre of the rectangle.
    lon_size, lat_size: full width (x direction) and height (y direction).
    crs: coordinate reference system assigned to the GeoDataFrame.

    Returns a GeoDataFrame with a single 'geometry' entry.
    """
    dx, dy = lon_size / 2, lat_size / 2
    min_x, max_x = center_x - dx, center_x + dx
    min_y, max_y = center_y - dy, center_y + dy
    rectangle = box(min_x, min_y, max_x, max_y)
    return gpd.GeoDataFrame({'geometry': [rectangle]}, crs=crs)
    

In [34]:
def assimiliate(target_ds, to_be_assi_ds, regridding_factor, crs = 'epsg:4326', epsilon = 1e-9):
    """
    Aggregate ``to_be_assi_ds`` onto the grid of ``target_ds``.

    For every (longitude, latitude) cell centre of ``target_ds`` a box of one
    cell size is built and the variables of ``to_be_assi_ds`` are averaged over
    it with ``extracting_and_averaging_polygon``.

    target_ds: dataset whose grid defines the output cells (read via ``def_dims``).
    to_be_assi_ds: dataset to be averaged; it is passed to
    ``extracting_and_averaging_polygon`` for every cell.
    regridding_factor: sub-cells per source cell used while averaging.
    crs: CRS used for the boxes and the source dataset.
    epsilon: forwarded to ``extracting_and_averaging_polygon``.

    Returns an xarray Dataset indexed by (time, longitude, latitude).
    """
    dif_lon, dif_lat, lons, lats = def_dims(target_ds)

    frames = []
    for lon in lons:
        for lat in lats:
            cell = create_box(lon, lat, dif_lon, dif_lat, crs = crs)
            cell_df = extracting_and_averaging_polygon(cell, to_be_assi_ds, regridding_factor,
                                                       crs = crs, epsilon = epsilon)
            # Label the averages with the centre of the cell they belong to
            # (the previous hard-coded 35.5 / 32.1 put every cell on one point).
            cell_df['longitude'] = lon
            cell_df['latitude'] = lat
            frames.append(cell_df)

    # Build the Dataset once from all cells instead of merging cell by cell.
    return pd.concat(frames).set_index(['time', 'longitude', 'latitude']).to_xarray()


In [69]:
# NOTE(review): machine-specific absolute path - point it at your local copy of the AOD file.
# drop_vars replaces the deprecated Dataset.drop for removing a variable by name.
aod = xr.open_dataset(r"C:\Users\user\OneDrive\Desktop\gdal trials\AOD nc\2022.nc").drop_vars('spatial_ref')
folder = './Era5 Land/'
# os.listdir returns entries in arbitrary order; sort so the merge input is deterministic.
files = sorted(os.listdir(folder))
# Average every block of 24 consecutive valid_time steps of each file
# (daily means if the data are hourly - TODO confirm), then merge the files.
era5 = [xr.open_dataset(os.path.join(folder, file)).coarsen(valid_time=24).mean() for file in files]
era5 = xr.merge(era5)

In [82]:
# Detect the AOD longitude/latitude index names by substring, then give them
# the same names ERA5 uses ('latitude' / 'longitude').
lon_dim = [name for name in aod.indexes if any(key in name.lower() for key in ('lon', 'eas', 'wes', 'x'))][0]
lat_dim = [name for name in aod.indexes if any(key in name.lower() for key in ('lat', 'nor', 'sou', 'y'))][0]
aod = aod.rename({lat_dim: 'latitude'}).rename({lon_dim: 'longitude'})

In [83]:
# Display the AOD dataset to check its dimension names.
aod

In [37]:
# Display the merged ERA5 dataset.
era5

In [38]:
# Disabled code kept as a bare string: box-averaging of era5 over every AOD cell.
# Nothing in it runs; the cell only echoes the text as its output.
# NOTE(review): presumably an earlier approach - confirm, then delete or restore it.
'''dif_lon, dif_lat, lons, lats = def_dims(aod)
dss = []
crs = 'epsg:4326'
regridding_factor = 20
for i in range(len(lons)):
    for j in range(len(lats)):
        boxx = create_box(lons[i], lats[j], dif_lon, dif_lat, crs = crs)
        df = extracting_and_averaging_polygon(boxx, era5, regridding_factor, crs = crs)
        df['longitude'] = lons[i]
        df['latitude'] = lats[j]
        df.set_index(['time', 'longitude', 'latitude'], inplace=True)
        
        ds = df.to_xarray()
        del df, boxx
        if len(dss) == 0:
            dss = ds
        else:
            dss = xr.merge([dss, ds])
            del ds'''

"dif_lon, dif_lat, lons, lats = def_dims(aod)\ndss = []\ncrs = 'epsg:4326'\nregridding_factor = 20\nfor i in range(len(lons)):\n    for j in range(len(lats)):\n        boxx = create_box(lons[i], lats[j], dif_lon, dif_lat, crs = crs)\n        df = extracting_and_averaging_polygon(boxx, era5, regridding_factor, crs = crs)\n        df['longitude'] = lons[i]\n        df['latitude'] = lats[j]\n        df.set_index(['time', 'longitude', 'latitude'], inplace=True)\n        \n        ds = df.to_xarray()\n        del df, boxx\n        if len(dss) == 0:\n            dss = ds\n        else:\n            dss = xr.merge([dss, ds])\n            del ds"

In [94]:
# Only the AOD coordinate arrays are needed here; the spacings are discarded.
dif_lon, dif_lat, lons, lats = def_dims(aod)
del dif_lat, dif_lon

# For every AOD cell centre take the nearest ERA5 grid point and store its time
# series as a DataFrame labelled with the AOD coordinates.
df = []
for i in range(len(lons)):
    for j in range(len(lats)):
        dat = (
            era5.sel(latitude=lats[j], longitude=lons[i], method='nearest')
            .to_pandas()
            .drop(columns=['number', 'expver'])
            .reset_index()
            .assign(
                # Keep only the calendar date of valid_time.
                time=lambda frame: pd.DatetimeIndex(frame['valid_time'].dt.date),
                latitude=lats[j],
                longitude=lons[i],
            )
            .drop(columns='valid_time')
        )
        df.append(dat)

In [96]:
# Stack the per-cell frames and convert them into a Dataset indexed by (time, longitude, latitude).
era5_as = pd.concat(df).set_index(['time', 'longitude', 'latitude']).to_xarray()

In [97]:
# Display the ERA5 samples laid out on the AOD coordinates.
era5_as

In [98]:
# Merge the AOD dataset with the ERA5 samples.
# NOTE(review): the merged dataset is only displayed here, not bound to a name.
xr.merge([aod, era5_as])