# ERA5 yearly datasets 

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

In [2]:
# Data directories, located next to the notebook's working directory
# (i.e. siblings of the current working directory's folder).
cwd_path = Path.cwd()
data_path = cwd_path.parent / 'data'
data_push_path = cwd_path.parent / 'data_to_push'

In [26]:
# Show every entry currently stored in the ERA5 data directory
for entry in (data_path / 'ERA5').iterdir():
    print(entry)

/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2020.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2023_old.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2021.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_combined.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2018.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2019.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2023_the_whole_year_how.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2022.nc
/Users/brand/my_code/meteoviz/course_project/data/ERA5/era5_2023.nc


#### Note:
- Year 2023 has an unexpected dimension called `expver`
    - "expver is used to tell the difference between the initial release (expver=5, called ERA5T) and validated ERA5 data (expver=1). In most cases, ERA5 is identical to ERA5T"
    - More here: https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-Dataupdatefrequency

### Read ERA5 datasets into Xarray

In [7]:
def open_era5_year(year):
    """Open the yearly ERA5 NetCDF file for `year` as an xarray dataset.

    year: calendar year used in the file name, e.g. 2018 -> `era5_2018.nc`
    returns the result of `xr.open_dataset` for `<data_path>/ERA5/era5_<year>.nc`
    """
    return xr.open_dataset(data_path.joinpath('ERA5', f'era5_{year}.nc'))


# One dataset per year, 2018 through 2023 (range end is exclusive).
# The individual names are kept because later cells refer to them directly.
ds_2018, ds_2019, ds_2020, ds_2021, ds_2022, ds_2023 = (
    open_era5_year(year) for year in range(2018, 2024)
)

#### Concat the xarray datasets along dimension `time`

In [29]:
# Yearly datasets, listed from 2018 to 2023; joined end-to-end along `time`
datasets = [
    ds_2018,
    ds_2019,
    ds_2020,
    ds_2021,
    ds_2022,
    ds_2023,
]
ds_combined = xr.concat(datasets, dim='time')

In [31]:
# ds_combined.time.to_dataframe()['time'].value_counts()

#### Save combined dataset to file and check size
- Should I go ahead and subset against used grid points?
- Get latest used grid points from latest turbine df

In [32]:
# Write the concatenated dataset next to the yearly files
ds_combined.to_netcdf(data_path / 'ERA5' / 'era5_combined.nc')

In [33]:
# Re-open the file that was just written so its structure can be inspected
ds = xr.open_dataset(data_path / 'ERA5' / 'era5_combined.nc')

In [35]:
ds_combined

In [34]:
ds

-----

## Explore 

In [8]:
ds_2018

In [6]:
ds_2023

-----

### Check that all lats and lons match between data sets!
- Add 2023 dataset to check and run again!

In [24]:
def check_dimensions_match(datasets, dim_name):
    """Check if a given dimension's values are the same across multiple datasets.

    Parameters
    ----------
    datasets : sequence
        Objects that support ``ds[dim_name].values`` (e.g. xarray datasets).
        The first entry is the reference every other entry is compared to.
    dim_name : str
        Name of the dimension/coordinate to compare, e.g. ``'latitude'``.

    Returns
    -------
    bool
        True if every dataset has the same shape and the same values, in the
        same order, as the first one (trivially True for a single dataset);
        False otherwise.

    Raises
    ------
    IndexError
        If ``datasets`` is an empty list or tuple (there is no reference).
    """
    reference_values = datasets[0][dim_name].values
    # np.array_equal checks the shapes before the values. A plain `==` would
    # broadcast instead, so a length-1 coordinate could "match" a longer
    # constant one, and mismatched lengths could raise instead of giving False.
    return all(
        np.array_equal(ds[dim_name].values, reference_values)
        for ds in datasets[1:]
    )

In [25]:
# Compare the latitude and longitude coordinates across all yearly datasets
latitude_match, longitude_match = (
    check_dimensions_match(datasets, dim) for dim in ('latitude', 'longitude')
)

# Report each mismatching coordinate; only announce success when both agree
if not latitude_match:
    print("Datasets do not have matching latitude values.")
if not longitude_match:
    print("Datasets do not have matching longitude values.")
if latitude_match and longitude_match:
    print("All datasets have matching latitude and longitude values!")

All datasets have matching latitude and longitude values!


------

## Derive 10m and 100m mean wind speeds 
- Subset the data first?!

In [None]:
def calc_wind_speed_using_ortho_components(u_zonal, v_meridional):
    """
    Compute the wind speed from the two orthogonal horizontal wind components.

    u_zonal (e.g. u10): wind component along local parallel of latitude; positive from west, negative from east
    v_meridional (e.g. v10): wind component along local meridian; positive from south, negative from north
    returns the magnitude of the wind vector (i.e. wind speed), sqrt(u**2 + v**2), computed element-wise
    """
    # np.hypot works element-wise on arrays (math.hypot only accepts scalars)
    # and avoids overflow in the squared intermediates of sqrt(u**2 + v**2).
    wind_speed = np.hypot(u_zonal, v_meridional)
    return wind_speed