diff --git a/notebooks/2021-09/2021-09-28/get_raw_sun_data.py b/notebooks/2021-09/2021-09-28/get_raw_sun_data.py
new file mode 100644
index 00000000..3c3456c4
--- /dev/null
+++ b/notebooks/2021-09/2021-09-28/get_raw_sun_data.py
@@ -0,0 +1,120 @@
+############
+# Look into the year-to-year differences in the sun's elevation and azimuth angles.
+
+# Looked at 2018-2020, for January, April, July and October.
+# Found the difference from year to year was less than 1 degree.
+
+############
+
+import logging
+
+logging.basicConfig()
+logging.getLogger().setLevel(logging.DEBUG)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+from datetime import datetime
+from pathlib import Path
+import pandas as pd
+import numpy as np
+import os
+import nowcasting_dataset
+from nowcasting_dataset.data_sources.gsp.eso import get_gsp_metadata_from_eso
+from nowcasting_dataset.data_sources.sun.raw_data_load_save import (
+    save_to_zarr,
+    get_azimuth_and_elevation,
+)
+from nowcasting_dataset.geospatial import lat_lon_to_osgb
+
+# set up
+BUCKET = Path("solar-pv-nowcasting-data")
+PV_PATH = BUCKET / "PV/PVOutput.org"
+PV_METADATA_FILENAME = PV_PATH / "UK_PV_metadata.csv"
+
+# set up variables
+local_path = os.path.dirname(nowcasting_dataset.__file__) + "/.."
+metadata_filename = f"gs://{PV_METADATA_FILENAME}"
+
+# PV metadata
+pv_metadata = pd.read_csv(metadata_filename, index_col="system_id")
+pv_metadata = pv_metadata.dropna(subset=["longitude", "latitude"])
+pv_longitudes = pv_metadata["longitude"]
+pv_latitudes = pv_metadata["latitude"]
+
+# GSP Metadata
+gsp_metadata = get_gsp_metadata_from_eso()
+gsp_metadata = gsp_metadata.dropna(subset=["centroid_lon", "centroid_lat"])
+# probably need to change this to centroid
+gsp_lon = gsp_metadata["centroid_lon"]
+gsp_lat = gsp_metadata["centroid_lat"]
+
+# join all sites together
+longitudes = list(pv_longitudes.values) + list(gsp_lon.values)
+latitudes = list(pv_latitudes.values) + list(gsp_lat.values)
+
+# convert to OSGB, since get_azimuth_and_elevation expects OSGB x/y coordinates
+x_centers, y_centers = lat_lon_to_osgb(latitudes, longitudes)
+
+# make datetimes
+start_dt = datetime.fromisoformat("2019-01-01 00:00:00.000+00:00")
+end_dt = datetime.fromisoformat("2019-01-02 00:00:00.000+00:00")
+
+azimuths = {}
+azimuths_sin = {}
+azimuths_cos = {}
+elevations = {}
+months = [1, 4, 7, 10]
+years = [2018, 2019, 2020]
+for month in months:
+    for year in years:
+        print(year)
+        print(month)
+        start_dt = start_dt.replace(year=year, month=month)
+        end_dt = end_dt.replace(year=year, month=month)
+        datestamps = pd.date_range(start=start_dt, end=end_dt, freq="5T")
+
+        azimuth, elevation = get_azimuth_and_elevation(
+            x_centers=x_centers, y_centers=y_centers, datestamps=datestamps
+        )
+
+        azimuths[f"{year}_{month}"] = azimuth
+        azimuths_sin[f"{year}_{month}"] = np.sin(np.deg2rad(azimuth))
+        azimuths_cos[f"{year}_{month}"] = np.cos(np.deg2rad(azimuth))
+        elevations[f"{year}_{month}"] = elevation
+
+m_azimuths = []
+m_azimuths_sin = []
+m_azimuths_cos = []
+m_elevations = []
+for month in months:
+    for year in years[1:]:
+        print(year)
+        print(month)
+
+        m_azimuths.append(
+            (np.abs(azimuths[f"{year}_{month}"].values - azimuths[f"2018_{month}"].values)).max()
+        )
+        m_azimuths_sin.append(
+            (
+                np.abs(
+                    azimuths_sin[f"{year}_{month}"].values - azimuths_sin[f"2018_{month}"].values
+                )
+            ).max()
+        )
+        m_azimuths_cos.append(
+            (
+                np.abs(
+                    azimuths_cos[f"{year}_{month}"].values - azimuths_cos[f"2018_{month}"].values
+                )
+            ).max()
+        )
+        m_elevations.append(
+            (
+                np.abs(elevations[f"{year}_{month}"].values - elevations[f"2018_{month}"].values)
+            ).max()
+        )
+
+
+# for small x in radians, sin(x) ~ x, so sin(x) * 180 / pi ~ degrees
+m_azimuths = np.array(m_azimuths_sin) * 180 / np.pi
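+# Quick illustrative check of this conversion (made-up number): for a difference d in
+# degrees, np.sin(np.deg2rad(d)) * 180 / np.pi ~ d when d is small, e.g. d = 0.5 gives
+# np.sin(np.deg2rad(0.5)) * 180 / np.pi ~ 0.49999, so the sin-based maximum is a good
+# proxy for the difference in degrees.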
+# m_azimuths = np.array(m_azimuths_cos) * 180 / np.pi
+
+print(f"Maximum azimuth difference is {max(m_azimuths)} degrees")
+print(f"Maximum elevation difference is {max(m_elevations)} degrees")
+
+# largest difference in both azimuth and elevation is < 1 degree --> happy to use one year of data
diff --git a/nowcasting_dataset/config/model.py b/nowcasting_dataset/config/model.py
index 00d3e743..30b1c0a2 100644
--- a/nowcasting_dataset/config/model.py
+++ b/nowcasting_dataset/config/model.py
@@ -72,6 +72,11 @@ class InputData(BaseModel):
         description="Path to the GeoTIFF Topographic data source",
     )
 
+    sun_zarr_path: str = Field(
+        "gs://solar-pv-nowcasting-data/Sun/v0/sun.zarr/",
+        description="Path to the Sun data source, i.e. azimuth and elevation",
+    )
+
 
 class OutputData(BaseModel):
     """ Output data model """
diff --git a/nowcasting_dataset/consts.py b/nowcasting_dataset/consts.py
index 5ba98fe6..2b6b0245 100644
--- a/nowcasting_dataset/consts.py
+++ b/nowcasting_dataset/consts.py
@@ -28,8 +28,10 @@
 PV_SYSTEM_ROW_NUMBER = "pv_system_row_number"
 PV_SYSTEM_X_COORDS = "pv_system_x_coords"
 PV_SYSTEM_Y_COORDS = "pv_system_y_coords"
-PV_AZIMUTH_ANGLE = "pv_azimuth_angle"
-PV_ELEVATION_ANGLE = "pv_elevation_angle"
+
+SUN_AZIMUTH_ANGLE = "sun_azimuth_angle"
+SUN_ELEVATION_ANGLE = "sun_elevation_angle"
+
 PV_YIELD = "pv_yield"
 PV_DATETIME_INDEX = "pv_datetime_index"
 DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE = 128
diff --git a/nowcasting_dataset/data_sources/pv_data_source.py b/nowcasting_dataset/data_sources/pv_data_source.py
index fde4a5c5..2bf4cfed 100644
--- a/nowcasting_dataset/data_sources/pv_data_source.py
+++ b/nowcasting_dataset/data_sources/pv_data_source.py
@@ -1,10 +1,9 @@
 """ PV Data Source """
+
 import datetime
 import functools
 import io
 import logging
-import time
-from concurrent import futures
 from dataclasses import dataclass
 from numbers import Number
 from pathlib import Path
@@ -15,7 +14,6 @@
 import pandas as pd
 import torch
 import xarray as xr
-from tqdm import tqdm
 
 from nowcasting_dataset import geospatial, utils
 from nowcasting_dataset.consts import (
@@ -23,8 +21,6 @@
     PV_SYSTEM_ROW_NUMBER,
     PV_SYSTEM_X_COORDS,
     PV_SYSTEM_Y_COORDS,
-    PV_AZIMUTH_ANGLE,
-    PV_ELEVATION_ANGLE,
     PV_YIELD,
     DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
     OBJECT_AT_CENTER,
@@ -65,8 +61,6 @@ def load(self):
         """
         self._load_metadata()
         self._load_pv_power()
-        if self.load_azimuth_and_elevation:
-            self._calculate_azimuth_and_elevation()
         self.pv_metadata, self.pv_power = align_pv_system_ids(self.pv_metadata, self.pv_power)
 
     def _load_metadata(self):
@@ -132,18 +126,8 @@ def _get_time_slice(self, t0_dt: pd.Timestamp) -> [pd.DataFrame]:
         del t0_dt  # t0 is not used in the rest of this method!
         selected_pv_power = self.pv_power.loc[start_dt:end_dt].dropna(axis="columns", how="any")
-        if self.load_azimuth_and_elevation:
-            selected_pv_azimuth_angle = self.pv_azimuth.loc[start_dt:end_dt].dropna(
-                axis="columns", how="any"
-            )
-            selected_pv_elevation_angle = self.pv_elevation.loc[start_dt:end_dt].dropna(
-                axis="columns", how="any"
-            )
-        else:
-            selected_pv_azimuth_angle = None
-            selected_pv_elevation_angle = None
 
-        return selected_pv_power, selected_pv_azimuth_angle, selected_pv_elevation_angle
+        return selected_pv_power
 
     def _get_central_pv_system_id(
         self,
@@ -231,11 +217,7 @@ def get_example(
         """
         logger.debug("Getting PV example data")
 
-        (
-            selected_pv_power,
-            selected_pv_azimuth_angle,
-            selected_pv_elevation_angle,
-        ) = self._get_time_slice(t0_dt)
+        selected_pv_power = self._get_time_slice(t0_dt)
         all_pv_system_ids = self._get_all_pv_system_ids_in_roi(
             x_meters_center, y_meters_center, selected_pv_power.columns
         )
@@ -252,9 +234,6 @@
             all_pv_system_ids = all_pv_system_ids[: self.n_pv_systems_per_example]
 
         selected_pv_power = selected_pv_power[all_pv_system_ids]
-        if self.load_azimuth_and_elevation:
-            selected_pv_azimuth_angle = selected_pv_azimuth_angle[all_pv_system_ids]
-            selected_pv_elevation_angle = selected_pv_elevation_angle[all_pv_system_ids]
 
         pv_system_row_number = np.flatnonzero(self.pv_metadata.index.isin(all_pv_system_ids))
         pv_system_x_coords = self.pv_metadata.location_x[all_pv_system_ids]
@@ -272,10 +251,6 @@
             pv_datetime_index=selected_pv_power.index,
         )
 
-        if self.load_azimuth_and_elevation:
-            example[PV_AZIMUTH_ANGLE] = selected_pv_azimuth_angle
-            example[PV_ELEVATION_ANGLE] = selected_pv_elevation_angle
-
         if self.get_center:
             example[OBJECT_AT_CENTER] = "pv"
 
@@ -290,9 +265,6 @@
         ]
 
         pad_nans_variables = [PV_YIELD]
-        if self.load_azimuth_and_elevation:
-            pad_nans_variables.append(PV_AZIMUTH_ANGLE)
-            pad_nans_variables.append(PV_ELEVATION_ANGLE)
 
         example = utils.pad_data(
             data=example,
@@ -341,79 +313,6 @@ def datetime_index(self) -> pd.DatetimeIndex:
         """Returns a complete list of all available datetimes."""
         return self.pv_power.index
 
-    def _calculate_azimuth_and_elevation(self):
-        """
-        Calculate the azimuth and elevation angles for each datestamp, for each pv system.
-        """
-        logger.debug("Calculating azimuth and elevation angles")
-
-        self.pv_azimuth, self.pv_elevation = calculate_azimuth_and_elevation_all_pv_systems(
-            self.datetime_index().to_pydatetime(), self.pv_metadata
-        )
-
-
-def calculate_azimuth_and_elevation_all_pv_systems(
-    datestamps: List[datetime.datetime], pv_metadata: pd.DataFrame
-) -> (pd.Series, pd.Series):
-    """
-    Calculate the azimuth and elevation angles for each datestamp, for each pv system.
-
-    Args:
-        datestamps: list of timestamps for when to collected data for
-        pv_metadata: pv metadata, so we know where to collected data for
-
-    Returns: Azimuth and Elevations data
-
-    """
-    logger.debug(
-        f"Will be calculating for {len(datestamps)} datestamps and {len(pv_metadata)} pv systems"
-    )
-
-    # create array of index datetime, columns of system_id for both azimuth and elevation
-    pv_azimuth = []
-    pv_elevation = []
-
-    t = time.time()
-    # loop over all metadata and fine azimuth and elevation angles,
-    # not sure this is the best method to use, as currently this step takes ~2 minute for 745 pv systems,
-    # and 235 datestamps (~100,000 point). But this only needs to be done once.
-    with futures.ThreadPoolExecutor(max_workers=len(pv_metadata)) as executor:
-
-        logger.debug("Setting up jobs")
-
-        # Submit tasks to the executor.
-        future_azimuth_and_elevation_per_pv_system = []
-        for i in tqdm(range(len(pv_metadata))):
-            future_azimuth_and_elevation = executor.submit(
-                geospatial.calculate_azimuth_and_elevation_angle,
-                latitude=pv_metadata.iloc[i].latitude,
-                longitude=pv_metadata.iloc[i].longitude,
-                datestamps=datestamps,
-            )
-            future_azimuth_and_elevation_per_pv_system.append(
-                [future_azimuth_and_elevation, pv_metadata.iloc[i].name]
-            )
-
-        logger.debug(f"Getting results")
-
-        # Collect results from each thread.
-        for i in tqdm(range(len(future_azimuth_and_elevation_per_pv_system))):
-            future_azimuth_and_elevation, name = future_azimuth_and_elevation_per_pv_system[i]
-            azimuth_and_elevation = future_azimuth_and_elevation.result()
-
-            azimuth = azimuth_and_elevation.loc[:, "azimuth"].rename(name)
-            elevation = azimuth_and_elevation.loc[:, "elevation"].rename(name)
-
-            pv_azimuth.append(azimuth)
-            pv_elevation.append(elevation)
-
-    pv_azimuth = pd.concat(pv_azimuth, axis=1)
-    pv_elevation = pd.concat(pv_elevation, axis=1)
-
-    logger.debug(f"Calculated Azimuth and Elevation angles in {time.time() - t} seconds")
-
-    return pv_azimuth, pv_elevation
-
 
 def load_solar_pv_data_from_gcs(
     filename: Union[str, Path],
diff --git a/nowcasting_dataset/data_sources/sun/__init__.py b/nowcasting_dataset/data_sources/sun/__init__.py
new file mode 100644
index 00000000..9c6af9bd
--- /dev/null
+++ b/nowcasting_dataset/data_sources/sun/__init__.py
@@ -0,0 +1 @@
+""" Sun Data Source, i.e. azimuth and elevation angles """
diff --git a/nowcasting_dataset/data_sources/sun/raw_data_load_save.py b/nowcasting_dataset/data_sources/sun/raw_data_load_save.py
new file mode 100644
index 00000000..8ec1b531
--- /dev/null
+++ b/nowcasting_dataset/data_sources/sun/raw_data_load_save.py
@@ -0,0 +1,180 @@
+""" Sun Data Source """
+import datetime
+import logging
+import time
+from concurrent import futures
+from pathlib import Path
+from typing import List, Optional, Union
+
+import numcodecs
+import pandas as pd
+import xarray as xr
+from tqdm import tqdm
+
+from nowcasting_dataset import geospatial
+
+logger = logging.getLogger(__name__)
+
+
+def get_azimuth_and_elevation(
+    datestamps: List[datetime.datetime], x_centers: List[int], y_centers: List[int]
+) -> (pd.DataFrame, pd.DataFrame):
+    """
+    Get azimuth and elevation positions of the sun
+
+    For a list of datestamps and a list of coordinates, get the azimuth and elevation degrees.
+    Note that the degrees are rounded to 2 decimal places, which is all the precision we need.
+
+    Args:
+        datestamps: list of datestamps that are needed
+        x_centers: list of x coordinates - ref. OSGB
+        y_centers: list of y coordinates - ref. OSGB
+
+    Returns: Tuple of dataframes for azimuth and elevation.
+        The index is timestamps, and the columns are the x and y coordinates in OSGB projection
+
+    """
+    logger.debug(
+        f"Will be calculating for {len(datestamps)} datestamps and {len(x_centers)} locations"
+    )
+
+    assert len(x_centers) == len(y_centers)
+
+    # build dataframes with a datetime index and one column per location,
+    # for both azimuth and elevation
+    azimuth = []
+    elevation = []
+
+    t = time.time()
+    names = []
+    # loop over locations and find azimuth and elevation angles
+    with futures.ThreadPoolExecutor() as executor:
+
+        logger.debug("Setting up jobs")
+
+        # Submit tasks to the executor.
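+        # Note: duplicate coordinates are skipped via the "x,y" name check below, so a
+        # location shared by several sites is only computed once.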
+        future_azimuth_and_elevation_per_location = []
+        for i in tqdm(range(len(x_centers))):
+
+            name = x_y_to_name(x_centers[i], y_centers[i])
+            if name not in names:
+
+                lat, lon = geospatial.osgb_to_lat_lon(x=x_centers[i], y=y_centers[i])
+
+                future_azimuth_and_elevation = executor.submit(
+                    geospatial.calculate_azimuth_and_elevation_angle,
+                    latitude=lat,
+                    longitude=lon,
+                    datestamps=datestamps,
+                )
+                future_azimuth_and_elevation_per_location.append(
+                    [future_azimuth_and_elevation, name]
+                )
+                names.append(name)
+
+        logger.debug("Getting results")
+
+        # Collect results from each thread.
+        for future_azimuth_and_elevation, name in tqdm(future_azimuth_and_elevation_per_location):
+            azimuth_and_elevation = future_azimuth_and_elevation.result()
+
+            azimuth_per_location = azimuth_and_elevation.loc[:, "azimuth"].rename(name)
+            elevation_per_location = azimuth_and_elevation.loc[:, "elevation"].rename(name)
+
+            azimuth.append(azimuth_per_location)
+            elevation.append(elevation_per_location)
+
+    azimuth = pd.concat(azimuth, axis=1)
+    elevation = pd.concat(elevation, axis=1)
+
+    # remove timezone
+    elevation.index = elevation.index.tz_localize(None)
+    azimuth.index = azimuth.index.tz_localize(None)
+
+    logger.debug(f"Calculated Azimuth and Elevation angles in {time.time() - t} seconds")
+
+    return azimuth.round(2), elevation.round(2)
+
+
+def save_to_zarr(azimuth: pd.DataFrame, elevation: pd.DataFrame, filename: Union[str, Path]):
+    """
+    Save azimuth and elevation to a zarr file
+
+    Args:
+        azimuth: data to be saved
+        elevation: data to be saved
+        filename: the file name where it should be saved; can be local or GCS
+
+    """
+    # change pandas dataframes to an xr Dataset
+    elevation_xr = xr.DataArray(elevation, dims=["time_5", "locations"]).to_dataset(
+        name="elevation"
+    )
+    azimuth_xr = xr.DataArray(azimuth, dims=["time_5", "locations"]).to_dataset(name="azimuth")
+
+    # merge the datasets
+    merged_ds = xr.merge([elevation_xr, azimuth_xr])
+
+    # make encoding
+    encoding = {
+        var: {"compressor": numcodecs.Blosc(cname="zstd", clevel=5)} for var in merged_ds.data_vars
+    }
+
+    # save to file
+    merged_ds.to_zarr(filename, mode="w", encoding=encoding)
+
+
+def load_from_zarr(
+    filename: Union[str, Path],
+    start_dt: Optional[datetime.datetime] = None,
+    end_dt: Optional[datetime.datetime] = None,
+) -> (pd.DataFrame, pd.DataFrame):
+    """
+    Load sun data
+
+    Args:
+        filename: the filename to be loaded; can be local or GCS
+        start_dt: optional start datetime. Both start and end need to be set to be used.
+        end_dt: optional end datetime. Both start and end need to be set to be used.
+
+    Returns: Tuple of dataframes for azimuth and elevation.
+        The index is timestamps, and the columns are the x and y coordinates
+
+    """
+    logger.debug("Loading sun data")
+
+    # open the zarr store; this is lazy, so only the slice selected below is read into memory
+    sun = xr.open_dataset(filename, engine="zarr")
+
+    if (start_dt is not None) and (end_dt is not None):
+        sun = sun.sel(time_5=slice(start_dt, end_dt))
+
+    elevation = pd.DataFrame(
+        index=pd.to_datetime(sun.time_5), data=sun["elevation"].values, columns=sun.locations
+    )
+    azimuth = pd.DataFrame(
+        index=pd.to_datetime(sun.time_5), data=sun["azimuth"].values, columns=sun.locations
+    )
+
+    return azimuth, elevation
+
+
+def x_y_to_name(x, y) -> str:
+    """
+    Make a name from x, y coords
+
+    Args:
+        x: x coordinate
+        y: y coordinate
+
+    Returns: name made from x and y
+
+    """
+    return f"{x},{y}"
diff --git a/nowcasting_dataset/data_sources/sun/sun_data_source.py b/nowcasting_dataset/data_sources/sun/sun_data_source.py
new file mode 100644
index 00000000..1b91650f
--- /dev/null
+++ b/nowcasting_dataset/data_sources/sun/sun_data_source.py
@@ -0,0 +1,94 @@
+""" Sun Data Source - loads the sun's azimuth and elevation angles """
+from dataclasses import dataclass
+from datetime import datetime
+from numbers import Number
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+
+from nowcasting_dataset.consts import SUN_AZIMUTH_ANGLE, SUN_ELEVATION_ANGLE
+from nowcasting_dataset.data_sources.data_source import DataSource
+from nowcasting_dataset.data_sources.sun.raw_data_load_save import load_from_zarr, x_y_to_name
+from nowcasting_dataset.dataset.example import Example
+
+
+@dataclass
+class SunDataSource(DataSource):
+    """Add azimuth and elevation angles of the sun."""
+
+    filename: Union[str, Path]
+    start_dt: Optional[datetime] = None
+    end_dt: Optional[datetime] = None
+
+    def __post_init__(self):
+        """ Post Init """
+        super().__post_init__()
+        self._load()
+
+    def get_example(
+        self, t0_dt: pd.Timestamp, x_meters_center: Number, y_meters_center: Number
+    ) -> Example:
+        """
+        Get example data for t0_dt and the x and y coordinates
+
+        Args:
+            t0_dt: the timestamp to get the sun data for
+            x_meters_center: the x coordinate (OSGB)
+            y_meters_center: the y coordinate (OSGB)
+
+        Returns: Dictionary of azimuth and elevation data
+        """
+        # All the sun data is from 2019. Analysis showed that, over the timescales we are
+        # interested in, the elevation and azimuth angles change by less than 1 degree from
+        # year to year, so to save data we just use data from 2019.
+        t0_dt = t0_dt.replace(year=2019)
+
+        start_dt = self._get_start_dt(t0_dt)
+        end_dt = self._get_end_dt(t0_dt)
+
+        # The names of the columns get truncated when saving, therefore we need to look for
+        # the name of the column near the location we are looking for.
+        locations = np.array(
+            [[float(z.split(",")[0]), float(z.split(",")[1])] for z in self.azimuth.columns]
+        )
+        location = locations[
+            np.isclose(locations[:, 0], x_meters_center)
+            & np.isclose(locations[:, 1], y_meters_center)
+        ]
+        # let's make sure there is at least one match
+        assert len(location) > 0
+        # take the first match; its x and y coordinates are the first and second entries
+        location = location[0]
+        # make the name of the column to pull data from. The column name will be
+        # something like '22222.555,3333.6666'.
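+        # e.g. (illustrative, made-up coordinates) x_y_to_name(x=533000.0, y=180000.0)
+        # returns "533000.0,180000.0", matching how column names were written on save.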
+        name = x_y_to_name(x=location[0], y=location[1])
+
+        del x_meters_center, y_meters_center
+        azimuth = self.azimuth.loc[start_dt:end_dt][name]
+        elevation = self.elevation.loc[start_dt:end_dt][name]
+
+        example = Example()
+        example[SUN_AZIMUTH_ANGLE] = azimuth.values
+        example[SUN_ELEVATION_ANGLE] = elevation.values
+
+        return example
+
+    def _load(self):
+        """ Load the azimuth and elevation data from zarr """
+        self.azimuth, self.elevation = load_from_zarr(
+            filename=self.filename, start_dt=self.start_dt, end_dt=self.end_dt
+        )
+
+    def get_locations_for_batch(
+        self, t0_datetimes: pd.DatetimeIndex
+    ) -> Tuple[List[Number], List[Number]]:
+        """ Sun data should not be used to get batch locations """
+        raise NotImplementedError("Sun data should not be used to get batch locations")
+
+    def datetime_index(self) -> pd.DatetimeIndex:
+        """ The datetime index of this data source """
+        return self.azimuth.index
diff --git a/nowcasting_dataset/dataset/datamodule.py b/nowcasting_dataset/dataset/datamodule.py
index b478f40c..b7f208d4 100644
--- a/nowcasting_dataset/dataset/datamodule.py
+++ b/nowcasting_dataset/dataset/datamodule.py
@@ -14,6 +14,7 @@
 from nowcasting_dataset import time as nd_time
 from nowcasting_dataset import utils
 from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
+from nowcasting_dataset.data_sources.sun.sun_data_source import SunDataSource
 from nowcasting_dataset.dataset import datasets
 from nowcasting_dataset.dataset.split.split import split_data, SplitMethod
 
@@ -63,6 +64,7 @@ class NowcastingDataModule(pl.LightningDataModule):
     )
     satellite_image_size_pixels: int = 128  #: Passed to Data Sources.
    topographic_filename: Optional[Union[str, Path]] = None
+    sun_filename: Optional[Union[str, Path]] = None
     nwp_image_size_pixels: int = 2  #: Passed to Data Sources.
     meters_per_pixel: int = 2000  #: Passed to Data Sources.
     convert_to_numpy: bool = True  #: Passed to Data Sources.
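For reviewers, here is a minimal sketch of how the new source can be exercised on its own (an editor's sketch, not part of the PR). The constructor arguments mirror the `prepare_data()` call below; the zarr path and the OSGB coordinates are hypothetical and must match locations the zarr was generated for:

```python
import pandas as pd

from nowcasting_dataset.data_sources.sun.sun_data_source import SunDataSource

# Hedged usage sketch: load sun angles for one location and one time window.
sun_source = SunDataSource(
    filename="tests/data/sun/test.zarr",  # hypothetical path, e.g. the test zarr in this PR
    history_minutes=30,
    forecast_minutes=60,
    convert_to_numpy=True,
)

# x/y must match (via np.isclose) a location present in the zarr, in OSGB metres.
example = sun_source.get_example(
    t0_dt=pd.Timestamp("2021-06-01 12:00"),  # internally remapped to 2019
    x_meters_center=256895.6,  # hypothetical coordinates
    y_meters_center=666180.3,
)
print(example["sun_azimuth_angle"].shape, example["sun_elevation_angle"].shape)
```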
@@ -186,6 +188,16 @@ def prepare_data(self) -> None:
 
             self.data_sources.append(self.topo_data_source)
 
+        # Sun data
+        if self.sun_filename is not None:
+            self.sun_data_source = SunDataSource(
+                filename=self.sun_filename,
+                history_minutes=self.history_minutes,
+                forecast_minutes=self.forecast_minutes,
+                convert_to_numpy=self.convert_to_numpy,
+            )
+            self.data_sources.append(self.sun_data_source)
+
         self.datetime_data_source = data_sources.DatetimeDataSource(
             history_minutes=self.history_minutes,
             forecast_minutes=self.forecast_minutes,
diff --git a/nowcasting_dataset/dataset/datasets.py b/nowcasting_dataset/dataset/datasets.py
index a781bc1c..9038cdd3 100644
--- a/nowcasting_dataset/dataset/datasets.py
+++ b/nowcasting_dataset/dataset/datasets.py
@@ -24,8 +24,8 @@
     SATELLITE_DATA,
     NWP_DATA,
     PV_YIELD,
-    PV_AZIMUTH_ANGLE,
-    PV_ELEVATION_ANGLE,
+    SUN_AZIMUTH_ANGLE,
+    SUN_ELEVATION_ANGLE,
     SATELLITE_DATETIME_INDEX,
     NWP_TARGET_TIME,
     PV_DATETIME_INDEX,
@@ -479,14 +479,14 @@ def subselect_data(
     if PV_YIELD in required_keys and PV_DATETIME_INDEX in batch:
         batch = select_time_period(
             batch,
-            keys=[PV_DATETIME_INDEX, PV_YIELD, PV_AZIMUTH_ANGLE, PV_ELEVATION_ANGLE],
+            keys=[PV_DATETIME_INDEX, PV_YIELD, SUN_AZIMUTH_ANGLE, SUN_ELEVATION_ANGLE],
             time_of_first_example=batch[PV_DATETIME_INDEX][0].data,
             start_time=start_time,
             end_time=end_time,
         )
         _LOG.debug(
             f"PV Datetime Shape: {batch[PV_DATETIME_INDEX].shape} PV Data Shape: {batch[PV_YIELD].shape}"
-            f" PV Azimuth Shape: {batch[PV_AZIMUTH_ANGLE].shape} PV Elevation Shape: {batch[PV_ELEVATION_ANGLE].shape}"
+            f" Sun Azimuth Shape: {batch[SUN_AZIMUTH_ANGLE].shape} Sun Elevation Shape: {batch[SUN_ELEVATION_ANGLE].shape}"
         )
 
     return batch
diff --git a/nowcasting_dataset/dataset/example.py b/nowcasting_dataset/dataset/example.py
index 7b48db4f..96cf9101 100644
--- a/nowcasting_dataset/dataset/example.py
+++ b/nowcasting_dataset/dataset/example.py
@@ -46,9 +46,9 @@ class Example(TypedDict):
     pv_yield: Array
 
-    # PV azimuth and elevation angles i.e where the sun is.
-    #: shape = [batch_size, ] seq_length, n_pv_systems_per_example
-    pv_azimuth_angle: Array
-    pv_elevation_angle: Array
+    # Sun azimuth and elevation angles, i.e. where the sun is.
+    #: shape = [batch_size, ] seq_length
+    sun_azimuth_angle: Array
+    sun_elevation_angle: Array
 
     #: PV identification.
     #: shape = [batch_size, ] n_pv_systems_per_example
diff --git a/nowcasting_dataset/dataset/validate.py b/nowcasting_dataset/dataset/validate.py
index e59da512..80a063d8 100644
--- a/nowcasting_dataset/dataset/validate.py
+++ b/nowcasting_dataset/dataset/validate.py
@@ -20,8 +20,8 @@
     PV_SYSTEM_X_COORDS,
     PV_SYSTEM_Y_COORDS,
     PV_SYSTEM_ROW_NUMBER,
-    PV_AZIMUTH_ANGLE,
-    PV_ELEVATION_ANGLE,
+    SUN_AZIMUTH_ANGLE,
+    SUN_ELEVATION_ANGLE,
     DATETIME_FEATURE_NAMES,
     TOPOGRAPHIC_X_COORDS,
     TOPOGRAPHIC_DATA,
@@ -311,10 +311,10 @@ def validate_example(
         np.nanmin(data[PV_YIELD]) >= 0.0
     ), f"Maximum PV value is {np.nanmin(data[PV_YIELD])} but it should be <= 1"
 
-    if PV_AZIMUTH_ANGLE in data.keys():
-        assert data[PV_AZIMUTH_ANGLE].shape[-2:] == (seq_len_5_minutes, n_pv_systems_per_example)
-    if PV_AZIMUTH_ANGLE in data.keys():
-        assert data[PV_ELEVATION_ANGLE].shape[-2:] == (seq_len_5_minutes, n_pv_systems_per_example)
+    if SUN_AZIMUTH_ANGLE in data.keys():
+        assert data[SUN_AZIMUTH_ANGLE].shape[-1] == seq_len_5_minutes
+    if SUN_ELEVATION_ANGLE in data.keys():
+        assert data[SUN_ELEVATION_ANGLE].shape[-1] == seq_len_5_minutes
 
     assert data["sat_data"].shape[-4:] == (
         seq_len_5_minutes,
diff --git a/scripts/get_raw_sun_data.py b/scripts/get_raw_sun_data.py
new file mode 100644
index 00000000..c15d7188
--- /dev/null
+++ b/scripts/get_raw_sun_data.py
@@ -0,0 +1,80 @@
+############
+# Compute raw sun data using pvlib
+#
+# 2021-09-01
+# Peter Dudfield
+#
+# Rough sizes and timings (on a MacBook Pro), for ~2000 sites:
+# - 2 days of data: ~1 MB, takes ~1 minute
+# - 10 days of data: ~6 MB, takes ~1 minute
+# - 365 days of data: ~252 MB, takes ~11 minutes
+
+# Decided to just go for one year of data.
+# Comparing 1st Jan 2019 and 2020, the biggest difference in elevation was 1 degree.
+# More investigation has been done (link), and we are happy the difference is less than
+# 1 degree, so it is fine to use one year of data for all years.
+############
+
+import logging
+from datetime import datetime
+from pathlib import Path
+import pandas as pd
+import os
+import nowcasting_dataset
+from nowcasting_dataset.data_sources.gsp.eso import get_gsp_metadata_from_eso
+from nowcasting_dataset.data_sources.sun.raw_data_load_save import (
+    save_to_zarr,
+    get_azimuth_and_elevation,
+)
+from nowcasting_dataset.geospatial import lat_lon_to_osgb
+
+logging.basicConfig()
+logging.getLogger().setLevel(logging.DEBUG)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+
+# set up
+BUCKET = Path("solar-pv-nowcasting-data")
+PV_PATH = BUCKET / "PV/PVOutput.org"
+PV_METADATA_FILENAME = PV_PATH / "UK_PV_metadata.csv"
+
+# set up variables
+local_path = os.path.dirname(nowcasting_dataset.__file__) + "/.."
+metadata_filename = f"gs://{PV_METADATA_FILENAME}"
+start_dt = datetime.fromisoformat("2019-01-01 00:00:00.000+00:00")
+end_dt = datetime.fromisoformat("2020-01-01 00:00:00.000+00:00")
+datestamps = pd.date_range(start=start_dt, end=end_dt, freq="5T")
+
+# PV metadata
+pv_metadata = pd.read_csv(metadata_filename, index_col="system_id")
+pv_metadata = pv_metadata.dropna(subset=["longitude", "latitude"])
+pv_metadata["location_x"], pv_metadata["location_y"] = lat_lon_to_osgb(
+    pv_metadata["latitude"], pv_metadata["longitude"]
+)
+pv_x = pv_metadata["location_x"]
+pv_y = pv_metadata["location_y"]
+
+# GSP metadata
+gsp_metadata = get_gsp_metadata_from_eso()
+gsp_metadata = gsp_metadata.dropna(subset=["centroid_lon", "centroid_lat"])
+gsp_x = gsp_metadata["centroid_x"]
+gsp_y = gsp_metadata["centroid_y"]
+
+# join all sites together
+x_centers = list(pv_x.values) + list(gsp_x.values)
+y_centers = list(pv_y.values) + list(gsp_y.values)
+
+# make the data
+azimuth, elevation = get_azimuth_and_elevation(
+    x_centers=x_centers, y_centers=y_centers, datestamps=datestamps
+)
+
+azimuth = azimuth.astype(int)
+elevation = elevation.astype(int)
+
+# save it both locally and in the cloud, in case saving to the cloud fails
+save_to_zarr(azimuth=azimuth, elevation=elevation, filename="./sun.zarr")
+save_to_zarr(
+    azimuth=azimuth, elevation=elevation, filename="gs://solar-pv-nowcasting-data/Sun/v0/sun.zarr/"
+)
+# This has been uploaded to 'gs://solar-pv-nowcasting-data/Sun/v0'
diff --git a/tests/config/nwp_size_test.yaml b/tests/config/nwp_size_test.yaml
index 7a6affa7..2cc24549 100644
--- a/tests/config/nwp_size_test.yaml
+++ b/tests/config/nwp_size_test.yaml
@@ -8,6 +8,7 @@ input_data:
   solar_pv_metadata_filename: tests/data/pv_metadata/UK_PV_metadata.csv
   gsp_zarr_path: tests/data/gsp/test.zarr
   topographic_filename: tests/data/europe_dem_2km_osgb.tif
+  sun_zarr_path: tests/data/sun/test.zarr
 output_data:
   filepath: not used by unittests!
 process:
diff --git a/tests/config/test.yaml b/tests/config/test.yaml
index 29ae15d5..d86ad42b 100644
--- a/tests/config/test.yaml
+++ b/tests/config/test.yaml
@@ -8,6 +8,7 @@ input_data:
   solar_pv_metadata_filename: tests/data/pv_metadata/UK_PV_metadata.csv
   gsp_zarr_path: tests/data/gsp/test.zarr
   topographic_filename: tests/data/europe_dem_2km_osgb.tif
+  sun_zarr_path: tests/data/sun/test.zarr
 output_data:
   filepath: not used by unittests!
 process:
diff --git a/tests/data/sun/test.zarr/.zattrs b/tests/data/sun/test.zarr/.zattrs
new file mode 100644
index 00000000..0967ef42
--- /dev/null
+++ b/tests/data/sun/test.zarr/.zattrs
@@ -0,0 +1 @@
+{}
diff --git a/tests/data/sun/test.zarr/.zgroup b/tests/data/sun/test.zarr/.zgroup
new file mode 100644
index 00000000..3f3fad2d
--- /dev/null
+++ b/tests/data/sun/test.zarr/.zgroup
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
diff --git a/tests/data/sun/test.zarr/.zmetadata b/tests/data/sun/test.zarr/.zmetadata
new file mode 100644
index 00000000..fe1a2ad8
--- /dev/null
+++ b/tests/data/sun/test.zarr/.zmetadata
@@ -0,0 +1,117 @@
+{
+    "metadata": {
+        ".zattrs": {},
+        ".zgroup": {
+            "zarr_format": 2
+        },
+        "azimuth/.zarray": {
+            "chunks": [
+                3286,
+                130
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "zstd",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "