diff --git a/.github/workflows/linters.yaml b/.github/workflows/linters.yaml new file mode 100644 index 00000000..8d8027a1 --- /dev/null +++ b/.github/workflows/linters.yaml @@ -0,0 +1,24 @@ +name: Lint Python + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.9] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pydocstyle + - name: Docstyle linting + run: | + pydocstyle --convention=google --add-ignore=D200,D210,D212,D415 nowcasting_dataset diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index cc8ccd13..60a4ed1c 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a single version of Python # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python application +name: Python Tests on: [push, pull_request] diff --git a/nowcasting_dataset/__init__.py b/nowcasting_dataset/__init__.py index 9172424d..01b773db 100644 --- a/nowcasting_dataset/__init__.py +++ b/nowcasting_dataset/__init__.py @@ -1 +1,2 @@ +""" init file """ from nowcasting_dataset.square import Square diff --git a/nowcasting_dataset/cloud/__init__.py b/nowcasting_dataset/cloud/__init__.py index e69de29b..e09f83e3 100644 --- a/nowcasting_dataset/cloud/__init__.py +++ b/nowcasting_dataset/cloud/__init__.py @@ -0,0 +1 @@ +""" Cloud functions """ diff --git a/nowcasting_dataset/cloud/aws.py b/nowcasting_dataset/cloud/aws.py index a062eb9f..128c92fa 100644 --- a/nowcasting_dataset/cloud/aws.py +++ b/nowcasting_dataset/cloud/aws.py @@ -1,3 +1,4 @@ +""" AWS functions """ import logging from pathlib import Path import 
os @@ -13,14 +14,17 @@ def aws_upload_and_delete_local_files( aws_path: str, local_path: Path, bucket: str = "solar-pv-nowcasting-data" ): """ + Upload and delete files + 1. Upload the files in a local path, to a path in aws 2. Delete files in that local path - @param aws_path: the folder in the aws bucket that files will be saved too - @param local_path: the local path where fiels will be copied from - @param bucket: the aws bucket that files are saved too - @return: - """ + Args: + aws_path: the folder in the aws bucket that files will be saved to + local_path: the local path where files will be copied from + bucket: the aws bucket that files are saved to + + """ _LOG.info("Uploading to AWS!") # create s3 resource @@ -55,12 +59,14 @@ def aws_download_to_local( ): """ Download file from gcs - @param remote_filename: the gcs file name, should start with gs:// - @param local_filename: - @param s3_resource: s3 resource, means a new one doesnt have to be made everytime. - @param bucket: The s3 bucket name, from which to load the file from. - """ + Args: + remote_filename: the gcs file name, should start with gs:// + local_filename: the local file name + s3_resource: s3 resource, means a new one doesn't have to be made every time. + bucket: The s3 bucket name, from which to load the file from.
+ + """ _LOG.debug(f"Downloading {remote_filename} from AWS to {local_filename}") if s3_resource is None: @@ -74,15 +80,19 @@ def aws_download_to_local( def upload_one_file( - remote_filename: str, local_filename: str, bucket: str = "solar-pv-nowcasting-data", + remote_filename: str, + local_filename: str, + bucket: str = "solar-pv-nowcasting-data", ): """ Upload one file to s3 - @param remote_filename: the aws key name - @param local_filename: the local file name - @param bucket: the s3 bucket - """ + Args: + remote_filename: the aws key name + local_filename: the local file name + bucket: the s3 bucket + + """ # create s3 resource s3 = boto3.client("s3") @@ -96,8 +106,13 @@ def get_all_filenames_in_path_aws( ) -> List[str]: """ Get all the files names from one folder in gcp - @param remote_path: the path that we should look in - @return: a list of strings, of files names + + Args: + remote_path:the path that we should look in + bucket: the aws bucket + + Returns: a list of strings, of files names + """ # get client s3 = boto3.client("s3") diff --git a/nowcasting_dataset/cloud/gcp.py b/nowcasting_dataset/cloud/gcp.py index 70c6b600..6b7777b3 100644 --- a/nowcasting_dataset/cloud/gcp.py +++ b/nowcasting_dataset/cloud/gcp.py @@ -1,3 +1,4 @@ +""" GCP general functions """ import logging from pathlib import Path from typing import List, Union @@ -34,14 +35,14 @@ def gcp_upload_and_delete_local_files(dst_path: str, local_path: Union[str, Path def gcp_download_to_local( remote_filename: str, local_filename: str, gcs: gcsfs.GCSFileSystem = None ): - """Download file from gcs. + """ + Download file from gcs. Args: remote_filename: the gcs file name, should start with gs:// - local_filename: + local_filename: the local filename gcs: gcsfs.GCSFileSystem connection, means a new one doesnt have to be made everytime. 
""" - _LOG.debug(f"Downloading from GCP {remote_filename} to {local_filename}") if gcs is None: diff --git a/nowcasting_dataset/cloud/local.py b/nowcasting_dataset/cloud/local.py index 6c2f402e..c0dcfc0a 100644 --- a/nowcasting_dataset/cloud/local.py +++ b/nowcasting_dataset/cloud/local.py @@ -1,3 +1,4 @@ +""" Functions for local files """ import glob import os import shutil @@ -10,10 +11,7 @@ def delete_all_files_and_folder_in_temp_path(path: str): - """ - Delete all the files and folders in a temporary path - """ - + """ Delete all the files and folders in a temporary path """ _LOG.info(f"Deleting files and folder from {path} .") for files in os.listdir(path): diff --git a/nowcasting_dataset/cloud/utils.py b/nowcasting_dataset/cloud/utils.py index 38b776c2..ebbe323e 100644 --- a/nowcasting_dataset/cloud/utils.py +++ b/nowcasting_dataset/cloud/utils.py @@ -1,3 +1,4 @@ +""" General utils functions """ import logging from pathlib import Path import gcsfs @@ -14,7 +15,6 @@ def upload_and_delete_local_files(dst_path: str, local_path: Path, cloud: str = """ Upload and delete local files to either AWS or GCP """ - assert cloud in ["gcp", "aws"] if cloud == "gcp": @@ -26,12 +26,14 @@ def upload_and_delete_local_files(dst_path: str, local_path: Path, cloud: str = def gcp_to_aws(gcp_filename: str, gcs: gcsfs.GCSFileSystem, aws_filename: str, aws_bucket: str): """ Download a file from gcp and upload it to aws - @param gcp_filename: the gcp file name - @param gcs: the gcs file system (so it doesnt have to be made more than once) - @param aws_filename: the aws filename and path - @param aws_bucket: the asw bucket - """ + Args: + gcp_filename: the gcp file name + gcs: the gcs file system (so it doesnt have to be made more than once) + aws_filename: the aws filename and path + aws_bucket: the aws bucket + + """ # create temp file with tempfile.NamedTemporaryFile() as fp: local_filename = fp.name diff --git a/nowcasting_dataset/config/__init__.py 
b/nowcasting_dataset/config/__init__.py index e69de29b..93de233c 100644 --- a/nowcasting_dataset/config/__init__.py +++ b/nowcasting_dataset/config/__init__.py @@ -0,0 +1 @@ +""" Configuration of the dataset """ diff --git a/nowcasting_dataset/config/load.py b/nowcasting_dataset/config/load.py index 977c3462..1b84fdf5 100644 --- a/nowcasting_dataset/config/load.py +++ b/nowcasting_dataset/config/load.py @@ -1,3 +1,4 @@ +""" Loading configuration functions """ import logging import gcsfs import os @@ -14,11 +15,14 @@ def load_yaml_configuration(filename: Union[str, Pathy]) -> Configuration: """ Load a yaml file which has a configuration in it - filename: the file name that you want to load. Will load from local, AWS, or GCP - depending on the protocol suffix (e.g. 's3://bucket/config.yaml'). - Returns: pydantic class - """ + Args: + filename: the file name that you want to load. Will load from local, AWS, or GCP + depending on the protocol suffix (e.g. 's3://bucket/config.yaml'). + + Returns:pydantic class + + """ # load the file to a dictionary with fsspec.open(filename, mode="r") as stream: configuration = yaml.safe_load(stream) @@ -41,7 +45,6 @@ def load_configuration_from_gcs( Returns: configuration class """ - logger.info("Loading configuration from gcs") bucket_and_dir = os.path.join(f"gs://{bucket}", gcp_dir) diff --git a/nowcasting_dataset/config/model.py b/nowcasting_dataset/config/model.py index 64ceee55..298c82aa 100644 --- a/nowcasting_dataset/config/model.py +++ b/nowcasting_dataset/config/model.py @@ -1,3 +1,4 @@ +""" Configuration model for the dataset """ from pydantic import BaseModel, Field, validator from pydantic import BaseModel, Field @@ -12,6 +13,8 @@ class General(BaseModel): + """ General pydantic model """ + name: str = Field("example", description="The name of this configuration file.") description: str = Field( "example configuration", description="Description of this confgiruation file" @@ -27,6 +30,8 @@ class General(BaseModel): class 
Git(BaseModel): + """ Git model """ + hash: str = Field(..., description="The git hash has for when a dataset is created.") message: str = Field(..., description="The git message has for when a dataset is created.") committed_date: datetime = Field( @@ -35,9 +40,13 @@ class Git(BaseModel): class InputData(BaseModel): - # All paths must include the protocol prefix. For local files, - # it's sufficient to just start with a '/'. For aws, start with 's3://', - # for gcp start with 'gs://'. + """ + Input data model + + All paths must include the protocol prefix. For local files, + it's sufficient to just start with a '/'. For aws, start with 's3://', + for gcp start with 'gs://'. + """ solar_pv_data_filename: str = Field( "gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc", @@ -66,6 +75,8 @@ class InputData(BaseModel): class OutputData(BaseModel): + """ Output data model """ + filepath: str = Field( "gs://solar-pv-nowcasting-data/prepared_ML_training_data/v5/", description=( @@ -76,6 +87,8 @@ class OutputData(BaseModel): class Process(BaseModel): + """ Pydantic model of how the data is processed """ + seed: int = Field(1234, description="Random seed, so experiments can be repeatable") batch_size: int = Field(32, description="the number of examples per batch") upload_every_n_batches: int = Field( @@ -100,24 +113,30 @@ class Process(BaseModel): @property def seq_len_30_minutes(self): + """ How many steps are there in 30 minute datasets """ return int((self.history_minutes + self.forecast_minutes) / 30 + 1) @property def seq_len_5_minutes(self): + """ How many steps are there in 5 minute datasets """ return int((self.history_minutes + self.forecast_minutes) / 5 + 1) @validator("history_minutes") def history_minutes_divide_by_30(cls, v): + """ Validate 'history_minutes' """ assert v % 30 == 0 # this means it also divides by 5 return v @validator("forecast_minutes") def forecast_minutes_divide_by_30(cls, v): + """ Validate 'forecast_minutes' """ assert 
v % 30 == 0 # this means it also divides by 5 return v class Configuration(BaseModel): + """ Configuration model for the dataset """ + general: General = General() input_data: InputData = InputData() output_data: OutputData = OutputData() @@ -125,9 +144,7 @@ class Configuration(BaseModel): git: Optional[Git] = None def set_base_path(self, base_path: str): - """Append base_path to all paths. - - Mostly used for testing.""" + """Append base_path to all paths. Mostly used for testing.""" base_path = Pathy(base_path) path_attrs = [ "solar_pv_data_filename", diff --git a/nowcasting_dataset/config/save.py b/nowcasting_dataset/config/save.py index 9a8914d4..7eacf893 100644 --- a/nowcasting_dataset/config/save.py +++ b/nowcasting_dataset/config/save.py @@ -1,3 +1,4 @@ +""" Save functions for the configuration model""" import yaml import logging import fsspec @@ -18,7 +19,6 @@ def save_yaml_configuration( Will save to GCP, AWS, or local, depending on the protocol suffix of filepath. """ - # make a dictionary from the configuration d = configuration.dict() if filename is None: diff --git a/nowcasting_dataset/consts.py b/nowcasting_dataset/consts.py index 0bdd295b..f1237569 100644 --- a/nowcasting_dataset/consts.py +++ b/nowcasting_dataset/consts.py @@ -1,3 +1,4 @@ +""" Constants that can be imported when needed """ from typing import Union import numpy as np import xarray as xr diff --git a/nowcasting_dataset/data_sources/__init__.py b/nowcasting_dataset/data_sources/__init__.py index 3171f7a8..cd3bdf58 100644 --- a/nowcasting_dataset/data_sources/__init__.py +++ b/nowcasting_dataset/data_sources/__init__.py @@ -1,3 +1,4 @@ +""" Various DataSources """ from nowcasting_dataset.data_sources.data_source import DataSource from nowcasting_dataset.data_sources.satellite_data_source import SatelliteDataSource from nowcasting_dataset.data_sources.pv_data_source import PVDataSource diff --git a/nowcasting_dataset/data_sources/data_source.py 
b/nowcasting_dataset/data_sources/data_source.py index df8f20d5..e887e527 100644 --- a/nowcasting_dataset/data_sources/data_source.py +++ b/nowcasting_dataset/data_sources/data_source.py @@ -1,3 +1,4 @@ +""" General Data Source Class """ from numbers import Number import pandas as pd import numpy as np @@ -34,7 +35,7 @@ class DataSource: convert_to_numpy: bool def __post_init__(self): - + """ Post Init """ self.sample_period_minutes = self._get_sample_period_minutes() self.history_len = self.history_minutes // self.sample_period_minutes @@ -69,7 +70,9 @@ def _get_end_dt(self, t0_dt: pd.Timestamp) -> pd.Timestamp: # ************* METHODS THAT CAN BE OVERRIDDEN **************************** def _get_sample_period_minutes(self): """ - This is the default sample period in minutes. This functions may be overwritten if + This is the default sample period in minutes. + + This functions may be overwritten if the sample period of the data source is not 5 minutes """ logging.debug( @@ -96,8 +99,16 @@ def get_batch( y_locations: Iterable[Number], ) -> List[Example]: """ - Returns: - List of Examples with data converted to Numpy data structures. + Get Batch Data + + Args: + t0_datetimes: list of timestamps for the datetime of the batches. The batch will also include data + for historic and future depending on 'history_minutes' and 'future_minutes'. + x_locations: x center batch locations + y_locations: y center batch locations + + Returns: Batch data + """ examples = [] zipped = zip(t0_datetimes, x_locations, y_locations) @@ -148,14 +159,18 @@ def get_example( @dataclass class ImageDataSource(DataSource): """ + Image Data source + Args: image_size_pixels: Size of the width and height of the image crop - returned by get_sample(). """ + returned by get_sample(). 
+ """ image_size_pixels: InitVar[int] meters_per_pixel: InitVar[int] def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): + """ Post Init """ super().__post_init__() self._square = square.Square( size_pixels=image_size_pixels, meters_per_pixel=meters_per_pixel @@ -165,6 +180,8 @@ def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): @dataclass class ZarrDataSource(ImageDataSource): """ + A General Zarr Data source + Attributes: _data: xr.DataArray data, opened by open(). x is left-to-right. @@ -180,6 +197,7 @@ class ZarrDataSource(ImageDataSource): consolidated: bool = True def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): + """ Post init """ super().__post_init__(image_size_pixels, meters_per_pixel) self._data = None if self.n_timesteps_per_batch is None: @@ -187,6 +205,7 @@ def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): @property def data(self): + """ Data property """ if self._data is None: raise RuntimeError("Please run `open()` before accessing data!") return self._data @@ -194,6 +213,18 @@ def data(self): def get_example( self, t0_dt: pd.Timestamp, x_meters_center: Number, y_meters_center: Number ) -> Example: + """ + Get Example data + + Args: + t0_dt: list of timestamps for the datetime of the batches. The batch will also include data + for historic and future depending on 'history_minutes' and 'future_minutes'. + x_meters_center: x center batch locations + y_meters_center: y center batch locations + + Returns: Example Data + + """ selected_data = self._get_time_slice(t0_dt) bounding_box = self._square.bounding_box_centered_on( x_meters_center=x_meters_center, y_meters_center=y_meters_center @@ -225,8 +256,8 @@ def get_example( return self._put_data_into_example(selected_data) def geospatial_border(self) -> List[Tuple[Number, Number]]: - """Get 'corner' coordinates for a rectangle within the boundary of the - data. 
+ """ + Get 'corner' coordinates for a rectangle within the boundary of the data. Returns List of 2-tuples of the x and y coordinates of each corner, in OSGB projection. @@ -247,10 +278,14 @@ def _post_process_example( # ****************** METHODS THAT MUST BE OVERRIDDEN ********************** # (in addition to the DataSource methods that must be overridden) def open(self) -> None: - # We don't want to _open_data() in __init__. - # If we did that, then we couldn't copy ZarrDataSource - # instances into separate processes. Instead, - # call open() _after_ creating separate processes. + """ + Open the data + + We don't want to _open_data() in __init__. + If we did that, then we couldn't copy ZarrDataSource + instances into separate processes. Instead, + call open() _after_ creating separate processes. + """ raise NotImplementedError() def _open_data(self) -> xr.DataArray: diff --git a/nowcasting_dataset/data_sources/datetime_data_source.py b/nowcasting_dataset/data_sources/datetime_data_source.py index 35a75bd2..4583864a 100644 --- a/nowcasting_dataset/data_sources/datetime_data_source.py +++ b/nowcasting_dataset/data_sources/datetime_data_source.py @@ -1,3 +1,4 @@ +""" Datetime DataSource - add hour and year features """ from nowcasting_dataset.data_sources.data_source import DataSource from nowcasting_dataset.dataset.example import Example from nowcasting_dataset import time as nd_time @@ -9,14 +10,26 @@ @dataclass class DatetimeDataSource(DataSource): - """Add hour_of_day_{sin, cos} and day_of_year_{sin, cos} features.""" + """ Add hour_of_day_{sin, cos} and day_of_year_{sin, cos} features. 
""" def __post_init__(self): + """ Post init """ super().__post_init__() def get_example( self, t0_dt: pd.Timestamp, x_meters_center: Number, y_meters_center: Number ) -> Example: + """ + Get example data + + Args: + t0_dt: list of timestamps + x_meters_center: x center of patches - not needed + y_meters_center: y center of patches - not needed + + Returns: batch data of datetime features + + """ del x_meters_center, y_meters_center start_dt = self._get_start_dt(t0_dt) end_dt = self._get_end_dt(t0_dt) @@ -26,7 +39,9 @@ def get_example( def get_locations_for_batch( self, t0_datetimes: pd.DatetimeIndex ) -> Tuple[List[Number], List[Number]]: + """ This method is not needed for DatetimeDataSource """ raise NotImplementedError() def datetime_index(self) -> pd.DatetimeIndex: + """ This method is not needed for DatetimeDataSource """ raise NotImplementedError() diff --git a/nowcasting_dataset/data_sources/gsp/__init__.py b/nowcasting_dataset/data_sources/gsp/__init__.py index e69de29b..2edd3635 100644 --- a/nowcasting_dataset/data_sources/gsp/__init__.py +++ b/nowcasting_dataset/data_sources/gsp/__init__.py @@ -0,0 +1 @@ +""" GSP data sources and functions """ diff --git a/nowcasting_dataset/data_sources/gsp/eso.py b/nowcasting_dataset/data_sources/gsp/eso.py index 5cb6efdd..b5e6c11f 100644 --- a/nowcasting_dataset/data_sources/gsp/eso.py +++ b/nowcasting_dataset/data_sources/gsp/eso.py @@ -1,5 +1,6 @@ """ This file has a few functions that are used to get GSP (Grid Supply Point) information from National Grid ESO. + ESO - Electricity System Operator. General information can be found here - https://data.nationalgrideso.com/system/gis-boundaries-for-gb-grid-supply-points @@ -41,13 +42,13 @@ def get_gsp_metadata_from_eso(calculate_centroid: bool = True) -> pd.DataFrame: """ Get the metadata for the gsp, from ESO. 
+ Args: calculate_centroid: Load the shape file also, and calculate the Centroid - Returns: + Returns: Dataframe of ESO Metadata """ - logger.debug("Getting GSP shape file") # call ESO website. There is a possibility that this API will be replaced and its unclear if this original API will @@ -83,6 +84,7 @@ def get_gsp_shape_from_eso( ) -> gpd.GeoDataFrame: """ Get the the gsp shape file from ESO (or a local file) + Args: join_duplicates: If True, any RegionIDs which have multiple entries, will be joined together to give one entry load_local_file: Load from a local file, not from ESO @@ -90,7 +92,6 @@ def get_gsp_shape_from_eso( Returns: Geo Pandas dataframe of GSP shape data """ - logger.debug("Loading GSP shape file") local_file = f"{os.path.dirname(os.path.realpath(__file__))}/gsp_shape" @@ -178,7 +179,6 @@ def get_list_of_gsp_ids(maximum_number_of_gsp: Optional[int] = None) -> List[int Returns: list of gsp ids """ - # get a lit of gsp ids metadata = get_gsp_metadata_from_eso(calculate_centroid=False) diff --git a/nowcasting_dataset/data_sources/gsp/gsp_data_source.py b/nowcasting_dataset/data_sources/gsp/gsp_data_source.py index 10325943..c62a01ae 100644 --- a/nowcasting_dataset/data_sources/gsp/gsp_data_source.py +++ b/nowcasting_dataset/data_sources/gsp/gsp_data_source.py @@ -1,3 +1,7 @@ +""" GSP Data Source. GSP - Grid Supply Points + +Read more https://data.nationalgrideso.com/system/gis-boundaries-for-gb-grid-supply-points +""" import logging import xarray as xr @@ -72,7 +76,6 @@ def load(self): """ Load the meta data and load the GSP power data """ - # load metadata self.metadata = get_gsp_metadata_from_eso() @@ -108,11 +111,16 @@ def get_locations_for_batch( ) -> Tuple[List[Number], List[Number]]: """ Get x and y locations for a batch. Assume that all data is available for all GSP. + Random GSP are taken, and the locations of them are returned. 
This is useful as other datasources need to know which x,y locations to get + + Args: + t0_datetimes: list of datetimes that the batches locations have data for + Returns: list of x and y locations - """ + """ logger.debug("Getting locations for the batch") # Pick a random GSP for each t0_datetime, and then grab @@ -230,6 +238,7 @@ def _get_central_gsp_id( ) -> int: """ Get the GSP id of the central GSP from coordinates + Args: x_meters_center: the location of the gsp (x) y_meters_center: the location of the gsp (y) @@ -237,7 +246,6 @@ def _get_central_gsp_id( Returns: GSP id """ - logger.debug("Getting Central GSP") # If x_meters_center and y_meters_center have been chosen @@ -279,6 +287,7 @@ def _get_gsp_ids_in_roi( ) -> pd.Int64Index: """ Find the GSP IDs for all the GSP within the geospatial region of interest, defined by self.square. + Args: x_meters_center: center of area of interest (x coords) y_meters_center: center of area of interest (y coords) @@ -287,7 +296,6 @@ def _get_gsp_ids_in_roi( Returns: list of GSP ids that are in area of interest """ - logger.debug("Getting all gsp in ROI") # creating bounding box @@ -311,13 +319,14 @@ def _get_gsp_ids_in_roi( def _get_time_slice(self, t0_dt: pd.Timestamp) -> [pd.DataFrame]: """ Get time slice of GSP power data for give time. + Note the time is extended backwards by history lenght and forward by prediction time + Args: t0_dt: timestamp of interest Returns: pandas data frame of GSP power data """ - logger.debug(f"Getting power slice for {t0_dt}") # get start and end datetime, takening into account history and forecast length. 
@@ -338,6 +347,7 @@ def _get_time_slice(self, t0_dt: pd.Timestamp) -> [pd.DataFrame]: def drop_gsp_by_threshold(gsp_power: pd.DataFrame, meta_data: pd.DataFrame, threshold_mw: int = 20): """ Drop GSP where the max power is below a certain threshold + Args: gsp_power: GSP power data meta_data: the GSP meta data @@ -372,10 +382,9 @@ def load_solar_gsp_data( start_dt: the start datetime, which to trim the data to end_dt: the end datetime, which to trim the data to - Returns:dataframe of pv data + Returns: dataframe of pv data """ - logger.debug(f"Loading Solar GSP Data from GCS {filename} from {start_dt} to {end_dt}") # Open data - it may be quicker to open byte file first, but decided just to keep it like this at the moment gsp_power = xr.open_dataset(filename, engine="zarr") diff --git a/nowcasting_dataset/data_sources/gsp/pvlive.py b/nowcasting_dataset/data_sources/gsp/pvlive.py index 7648f0af..112d2f61 100644 --- a/nowcasting_dataset/data_sources/gsp/pvlive.py +++ b/nowcasting_dataset/data_sources/gsp/pvlive.py @@ -1,3 +1,4 @@ +""" Functions used to query the PVlive api """ from datetime import datetime, timedelta import logging import pandas as pd @@ -15,6 +16,7 @@ def load_pv_gsp_raw_data_from_pvlive( ) -> pd.DataFrame: """ Load raw pv gsp data from pvlive. Note that each gsp is loaded separately. Also the data is loaded in 30 day chunks. + Args: start: the start date for gsp data to load end: the end date for gsp data to load @@ -23,7 +25,6 @@ def load_pv_gsp_raw_data_from_pvlive( Returns: Data frame of time series of gsp data. 
Shows PV data for each GSP from {start} to {end} """ - # get a lit of gsp ids gsp_ids = get_list_of_gsp_ids(maximum_number_of_gsp=number_of_gsp) diff --git a/nowcasting_dataset/data_sources/nwp_data_source.py b/nowcasting_dataset/data_sources/nwp_data_source.py index ca24782d..b2ff3eea 100644 --- a/nowcasting_dataset/data_sources/nwp_data_source.py +++ b/nowcasting_dataset/data_sources/nwp_data_source.py @@ -1,3 +1,4 @@ +""" NWP Data Source """ from nowcasting_dataset.data_sources.data_source import ZarrDataSource from nowcasting_dataset.dataset.example import Example, to_numpy from nowcasting_dataset import utils @@ -57,6 +58,8 @@ @dataclass class NWPDataSource(ZarrDataSource): """ + NWP Data Source (Numerical Weather Predictions) + Args (for init): filename: The base path in which we find '2018_1-6', etc. @@ -86,6 +89,14 @@ class NWPDataSource(ZarrDataSource): meters_per_pixel: InitVar[int] = 2_000 def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): + """ + Post init + + Args: + image_size_pixels: number of pixels in image + meters_per_pixel: how many meters for each pixel + + """ super().__post_init__(image_size_pixels, meters_per_pixel) n_channels = len(self.channels) self._shape_of_example = ( @@ -96,10 +107,14 @@ def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): ) def open(self) -> None: - # We don't want to open_sat_data in __init__. - # If we did that, then we couldn't copy NWPDataSource - # instances into separate processes. Instead, - # call open() _after_ creating separate processes. + """ + Open NWP data + + We don't want to call _open_data() in __init__. + If we did that, then we couldn't copy NWPDataSource + instances into separate processes. Instead, + call open() _after_ creating separate processes.
+ """ data = self._open_data() self._data = data["UKV"].sel(variable=list(self.channels)) @@ -109,7 +124,17 @@ def get_batch( x_locations: Iterable[Number], y_locations: Iterable[Number], ) -> List[Example]: + """ + Get batch data + + Args: + t0_datetimes: list of timestamps + x_locations: list of x locations, where the batch data is for + y_locations: list of y locations, where the batch data is for + + Returns: batch data + """ # Lazily select time slices. selections = [] for t0_dt in t0_datetimes[: self.n_timesteps_per_batch]: @@ -168,12 +193,20 @@ def _put_data_into_example(self, selected_data: xr.DataArray) -> Example: ) def _get_time_slice(self, t0_dt: pd.Timestamp) -> xr.DataArray: - """Select the numerical weather predictions for a single time slice. + """ + Select the numerical weather predictions for a single time slice. Note that this function does *not* resample from hourly to 5 minutely. Resampling would be very expensive if done on the whole geographical extent of the NWP data! So resampling is done in - _post_process_example().""" + _post_process_example(). + + Args: + t0_dt: the time slice is around t0_dt. + + Returns: Slice of data + + """ start_dt = self._get_start_dt(t0_dt) end_dt = self._get_end_dt(t0_dt) @@ -222,8 +255,14 @@ def datetime_index(self) -> pd.DatetimeIndex: def open_nwp(filename: str, consolidated: bool) -> xr.Dataset: """ + Open The NWP data + Args: - filename must start with 'gs://' if it's on GCP. + filename: filename must start with 'gs://' if it's on GCP. + consolidated: consolidate the zarr file?
+ + Returns: nwp data + + """ _LOG.debug("Opening NWP data: %s", filename) utils.set_fsspec_for_multiprocess() diff --git a/nowcasting_dataset/data_sources/pv_data_source.py b/nowcasting_dataset/data_sources/pv_data_source.py index 32af2120..c25b1226 100644 --- a/nowcasting_dataset/data_sources/pv_data_source.py +++ b/nowcasting_dataset/data_sources/pv_data_source.py @@ -1,3 +1,4 @@ +""" PV Data Source """ from nowcasting_dataset.consts import ( PV_SYSTEM_ID, PV_SYSTEM_ROW_NUMBER, @@ -35,6 +36,8 @@ @dataclass class PVDataSource(ImageDataSource): + """ PV Data Source """ + filename: Union[str, Path] metadata_filename: Union[str, Path] start_dt: Optional[datetime.datetime] = None @@ -48,12 +51,16 @@ class PVDataSource(ImageDataSource): get_center: bool = True def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): + """ Post Init """ super().__post_init__(image_size_pixels, meters_per_pixel) seed = torch.initial_seed() self.rng = np.random.default_rng(seed=seed) self.load() def load(self): + """ + Load metadata and pv power + """ self._load_metadata() self._load_pv_power() if self.load_azimuth_and_elevation: @@ -180,9 +187,12 @@ def _get_all_pv_system_ids_in_roi( y_meters_center: Number, pv_system_ids_with_data_for_timeslice: pd.Int64Index, ) -> pd.Int64Index: - """Find the PV system IDs for all the PV systems within the geospatial - region of interest, defined by self.square.""" + """ + Find the PV system IDs. + This is for all the PV systems within the geospatial + region of interest, defined by self.square.
+ """ logger.debug(f"Getting PV example data for {x_meters_center} and {y_meters_center}") bounding_box = self._square.bounding_box_centered_on( @@ -205,7 +215,18 @@ def _get_all_pv_system_ids_in_roi( def get_example( self, t0_dt: pd.Timestamp, x_meters_center: Number, y_meters_center: Number ) -> Example: + """ + Get Example data for PV data + + Args: + t0_dt: list of timestamps for the datetime of the batches. The batch will also include data + for historic and future depending on 'history_minutes' and 'future_minutes'. + x_meters_center: x center batch locations + y_meters_center: y center batch locations + Returns: Example data + + """ logger.debug("Getting PV example data") ( @@ -288,7 +309,6 @@ def get_locations_for_batch( Returns: x_locations, y_locations. Each has one entry per t0_datetime. Locations are in OSGB coordinates. """ - # Set this up as a separate function, so we can cache the result! @functools.cache # functools.cache requires Python >= 3.9 def _get_pv_system_ids(t0_datetime: pd.Timestamp) -> pd.Int64Index: @@ -323,7 +343,6 @@ def _calculate_azimuth_and_elevation(self): """ Calculate the azimuth and elevation angles for each datestamp, for each pv system. """ - logger.debug("Calculating azimuth and elevation angles") self.pv_azimuth, self.pv_elevation = calculate_azimuth_and_elevation_all_pv_systems( @@ -336,8 +355,14 @@ def calculate_azimuth_and_elevation_all_pv_systems( ) -> (pd.Series, pd.Series): """ Calculate the azimuth and elevation angles for each datestamp, for each pv system. 
- """ + Args: + datestamps: list of timestamps for which to collect data + pv_metadata: pv metadata, so we know where to collect data for + + Returns: Azimuth and Elevations data + + """ logger.debug( f"Will be calculating for {len(datestamps)} datestamps and {len(pv_metadata)} pv systems" ) @@ -395,12 +420,16 @@ def load_solar_pv_data_from_gcs( from_gcs: bool = True, ) -> pd.DataFrame: """ - Load solar pv data from gcs (althought there is an option to load from loca - for testing) - @param filename: filename of file to be loaded - @param start_dt: the start datetime, which to trim the data to - @param end_dt: the end datetime, which to trim the data to - @param from_gcs: option to laod from gcs, or form local file - @return: dataframe of pv data + Load solar pv data from gcs (although there is an option to load from local - for testing) + + Args: + filename: filename of file to be loaded + start_dt: the start datetime, which to trim the data to + end_dt: the end datetime, which to trim the data to + from_gcs: option to load from gcs, or from local file + + Returns: Solar PV data + """ gcs = gcsfs.GCSFileSystem(access="read_only") diff --git a/nowcasting_dataset/data_sources/satellite_data_source.py b/nowcasting_dataset/data_sources/satellite_data_source.py index e1910448..77650a52 100644 --- a/nowcasting_dataset/data_sources/satellite_data_source.py +++ b/nowcasting_dataset/data_sources/satellite_data_source.py @@ -1,3 +1,4 @@ +""" Satellite Data Source """ from nowcasting_dataset.data_sources.data_source import ZarrDataSource from nowcasting_dataset.dataset.example import Example, to_numpy from nowcasting_dataset import utils @@ -62,8 +63,9 @@ @dataclass class SatelliteDataSource(ZarrDataSource): """ - Args: - filename: Must start with 'gs://' if on GCP. + Satellite Data Source + + filename: Must start with 'gs://' if on GCP.
""" filename: str = None @@ -73,6 +75,7 @@ class SatelliteDataSource(ZarrDataSource): normalise: bool = True def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): + """ Post Init """ super().__post_init__(image_size_pixels, meters_per_pixel) self._cache = {} n_channels = len(self.channels) @@ -84,10 +87,14 @@ def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): ) def open(self) -> None: - # We don't want to open_sat_data in __init__. - # If we did that, then we couldn't copy SatelliteDataSource - # instances into separate processes. Instead, - # call open() _after_ creating separate processes. + """ + Open Satellite data + + We don't want to open_sat_data in __init__. + If we did that, then we couldn't copy SatelliteDataSource + instances into separate processes. Instead, + call open() _after_ creating separate processes. + """ self._data = self._open_data() self._data = self._data.sel(variable=list(self.channels)) @@ -100,6 +107,24 @@ def get_batch( x_locations: Iterable[Number], y_locations: Iterable[Number], ) -> List[Example]: + """ + Get batch data + + Load the first _n_timesteps_per_batch concurrently. This + loads the timesteps from disk concurrently, and fills the + cache. If we try loading all examples + concurrently, then SatelliteDataSource will try reading from + empty caches, and things are much slower! + + Args: + t0_datetimes: list of timestamps for the datetime of the batches. The batch will also include data + for historic and future depending on 'history_minutes' and 'future_minutes'. + x_locations: x center batch locations + y_locations: y center batch locations + + Returns: Batch data + + """ # Load the first _n_timesteps_per_batch concurrently. This # loads the timesteps from disk concurrently, and fills the # cache. 
If we try loading all examples diff --git a/nowcasting_dataset/data_sources/topographic_data_source.py b/nowcasting_dataset/data_sources/topographic_data_source.py index 39940943..20902e27 100644 --- a/nowcasting_dataset/data_sources/topographic_data_source.py +++ b/nowcasting_dataset/data_sources/topographic_data_source.py @@ -1,3 +1,4 @@ +""" Topographic DataSource """ from nowcasting_dataset.data_sources.data_source import ImageDataSource from nowcasting_dataset.dataset.example import Example from nowcasting_dataset.consts import TOPOGRAPHIC_DATA @@ -41,6 +42,7 @@ class TopographicDataSource(ImageDataSource): normalize: bool = True def __post_init__(self, image_size_pixels: int, meters_per_pixel: int): + """ Post init """ super().__post_init__(image_size_pixels, meters_per_pixel) self._shape_of_example = ( image_size_pixels, @@ -70,7 +72,6 @@ def get_example( Returns: Example containing topographic data for the selected area """ - bounding_box = self._square.bounding_box_centered_on( x_meters_center=x_meters_center, y_meters_center=y_meters_center ) @@ -126,8 +127,7 @@ def _post_process_example( self, selected_data: xr.DataArray, t0_dt: pd.Timestamp ) -> xr.DataArray: """ - Post process the topographical data, removing an extra dim and optionally - normalizing + Post process the topographical data, removing an extra dim and optionally normalizing Args: selected_data: DataArray containing the topographic data diff --git a/nowcasting_dataset/dataset/__init__.py b/nowcasting_dataset/dataset/__init__.py index e69de29b..5e8bd110 100644 --- a/nowcasting_dataset/dataset/__init__.py +++ b/nowcasting_dataset/dataset/__init__.py @@ -0,0 +1 @@ +""" Data objects """ diff --git a/nowcasting_dataset/dataset/batch.py b/nowcasting_dataset/dataset/batch.py index fcd47f7d..977d88e9 100644 --- a/nowcasting_dataset/dataset/batch.py +++ b/nowcasting_dataset/dataset/batch.py @@ -1,3 +1,4 @@ +""" batch functions """ from typing import List, Optional, Union import logging @@ -27,6
+28,7 @@ def write_batch_locally(batch: List[Example], batch_i: int, path: Path): """ Write a batch to a locally file + Args: batch: A batch of data batch_i: The number of the batch @@ -196,8 +198,13 @@ def batch_to_dataset(batch: List[Example]) -> xr.Dataset: def coord_to_range( da: xr.DataArray, dim: str, prefix: Optional[str], dtype=np.int32 ) -> xr.DataArray: - # TODO: Actually, I think this is over-complicated? I think we can - # just strip off the 'coord' from the dimension. + """ + TODO + + TODO: Actually, I think this is over-complicated? I think we can + just strip off the 'coord' from the dimension. + + """ coord = da[dim] da[dim] = np.arange(len(coord), dtype=dtype) if prefix is not None: diff --git a/nowcasting_dataset/dataset/datamodule.py b/nowcasting_dataset/dataset/datamodule.py index 2b8efadd..0817f79b 100644 --- a/nowcasting_dataset/dataset/datamodule.py +++ b/nowcasting_dataset/dataset/datamodule.py @@ -1,3 +1,4 @@ +""" Data Modules """ from typing import Union, Optional, Iterable, Dict, Callable from pathlib import Path import pandas as pd @@ -24,6 +25,8 @@ @dataclass class NowcastingDataModule(pl.LightningDataModule): """ + Nowcasting Data Module, used to make batches + Attributes (additional to the dataclass attributes): pv_data_source: PVDataSource sat_data_source: SatelliteDataSource @@ -79,6 +82,7 @@ class NowcastingDataModule(pl.LightningDataModule): skip_n_test_batches: int = 0 # number of test batches to skip def __post_init__(self): + """ Post Init """ super().__init__() self.history_len_30_minutes = self.history_minutes // 30 @@ -97,7 +101,7 @@ def __post_init__(self): self.prefetch_factor = 2 # Set to default when not using multiprocessing. 
def prepare_data(self) -> None: - # Satellite data + """ Prepare all datasources """ n_timesteps_per_batch = self.batch_size // self.n_samples_per_timestep self.sat_data_source = data_sources.SatelliteDataSource( @@ -286,7 +290,6 @@ def _n_batches_per_epoch_per_worker(self, n_batches_per_epoch: int) -> int: def _split_data(self): """Sets self.train_t0_datetimes and self.val_t0_datetimes.""" - logger.debug("Going to split data") self._check_has_prepared_data() @@ -304,15 +307,19 @@ def _split_data(self): ) def train_dataloader(self) -> torch.utils.data.DataLoader: + """ Train dataloader """ return torch.utils.data.DataLoader(self.train_dataset, **self._common_dataloader_params()) def val_dataloader(self) -> torch.utils.data.DataLoader: + """ Validation dataloader """ return torch.utils.data.DataLoader(self.val_dataset, **self._common_dataloader_params()) def test_dataloader(self) -> torch.utils.data.DataLoader: + """ Test dataloader """ return torch.utils.data.DataLoader(self.test_dataset, **self._common_dataloader_params()) def contiguous_dataloader(self) -> torch.utils.data.DataLoader: + """ Get contiguous dataloader TODO this is not needed anymore?""" if self.contiguous_dataset is None: pv_data_source = deepcopy(self.pv_data_source) pv_data_source.random_pv_system_for_given_location = False @@ -351,7 +358,8 @@ def _common_dataloader_params(self) -> Dict: def _get_datetimes( self, interpolate_for_30_minute_data: bool = False, adjust_for_sequence_length: bool = True ) -> pd.DatetimeIndex: - """Compute the datetime index. + """ + Compute the datetime index. interpolate_for_30_minute_data: If True, 1. all datetimes from source will be interpolated to 5 min intervals, @@ -364,7 +372,8 @@ def _get_datetimes( This deals with a mixture of data sources that have 5 mins and 30 min datatime. Returns the intersection of the datetime indicies of all the - data_sources, filtered by daylight hours.""" + data_sources, filtered by daylight hours.
+ """ logger.debug("Get the datetimes") self._check_has_prepared_data() diff --git a/nowcasting_dataset/dataset/datasets.py b/nowcasting_dataset/dataset/datasets.py index 8220233c..5c58ef32 100644 --- a/nowcasting_dataset/dataset/datasets.py +++ b/nowcasting_dataset/dataset/datasets.py @@ -1,3 +1,4 @@ +""" Dataset and functions""" import pandas as pd from numbers import Number from typing import List, Tuple, Callable, Union, Optional @@ -89,7 +90,9 @@ class NetCDFDataset(torch.utils.data.Dataset): - """Loads data saved by the `prepare_ml_training_data.py` script. + """ + Loads data saved by the `prepare_ml_training_data.py` script. + Moved from predict_pv_yield """ @@ -105,6 +108,7 @@ def __init__( forecast_minutes: Optional[int] = None, ): """ + Netcdf Dataset Args: n_batches: Number of batches available on disk. @@ -117,8 +121,8 @@ def __init__( history_minutes: How many past minutes of data to use, if subsetting the batch forecast_minutes: How many future minutes of data to use, if reducing the amount of forecast time configuration: configuration object + cloud: which cloud is used, can be "gcp", "aws" or "local". 
""" - self.n_batches = n_batches self.src_path = src_path self.tmp_path = tmp_path @@ -157,12 +161,14 @@ def __init__( os.mkdir(self.tmp_path) def per_worker_init(self, worker_id: int): + """ Function called by a worker """ if self.cloud == "gcp": self.gcs = gcsfs.GCSFileSystem() elif self.cloud == "aws": self.s3_resource = boto3.resource("s3") def __len__(self): + """ Length of dataset """ return self.n_batches def __getitem__(self, batch_idx: int) -> example.Example: @@ -248,6 +254,7 @@ class NowcastingDataset(torch.utils.data.IterableDataset): batch_index: int = 0 def __post_init__(self): + """ Post Init """ super().__init__() self._per_worker_init_has_run = False self._n_timesteps_per_batch = self.batch_size // self.n_samples_per_timestep @@ -266,8 +273,11 @@ def __post_init__(self): _LOG.warning(f"Will be skipping {self.skip_batch_index}, is this correct?") def per_worker_init(self, worker_id: int) -> None: - """Called by worker_init_fn on each copy of NowcastingDataset after - the worker process has been spawned.""" + """ + Called by worker_init_fn on each copy of NowcastingDataset + + This happens after the worker process has been spawned. + """ # Each worker must have a different seed for its random number gen. # Otherwise all the workers will output exactly the same data! 
self.worker_id = worker_id @@ -402,7 +412,6 @@ def subselect_data( Returns: Example with only data between [t0 - history_minutes, t0 + forecast_minutes] remaining """ - _LOG.debug( f"Select sub data with new historic minutes of {history_minutes} " f"and forecast minutes if {forecast_minutes}" diff --git a/nowcasting_dataset/dataset/example.py b/nowcasting_dataset/dataset/example.py index 0fee3032..b49bb7eb 100644 --- a/nowcasting_dataset/dataset/example.py +++ b/nowcasting_dataset/dataset/example.py @@ -1,3 +1,4 @@ +""" Example Data Class """ from typing import TypedDict, List import pandas as pd @@ -106,7 +107,6 @@ def xr_to_example(batch_xr: xr.core.dataset.Dataset, required_keys: List[str]) - Returns: Example object of the xarray data """ - batch = Example( sat_datetime_index=batch_xr.sat_time_coords, nwp_target_time=batch_xr.nwp_time_coords, @@ -121,6 +121,9 @@ def xr_to_example(batch_xr: xr.core.dataset.Dataset, required_keys: List[str]) - def to_numpy(example: Example) -> Example: + """ + Change items in Example to numpy objects + """ for key, value in example.items(): if isinstance(value, xr.DataArray): # TODO: Use to_numpy() or as_numpy(), introduced in xarray v0.19? 
diff --git a/nowcasting_dataset/dataset/split/method.py b/nowcasting_dataset/dataset/split/method.py index df8164b8..66eca857 100644 --- a/nowcasting_dataset/dataset/split/method.py +++ b/nowcasting_dataset/dataset/split/method.py @@ -1,3 +1,4 @@ +""" Methods for splitting data into train, validation and test """ from typing import List, Tuple import numpy as np @@ -45,7 +46,6 @@ def split_method( Returns: train, validation and test datetimes """ - # find all the unique periods (dates, weeks, e.t.c) datetimes_period = pd.to_datetime(datetimes.to_period(freq).to_timestamp()) unique_periods_in_dataset = datetimes_period.unique() diff --git a/nowcasting_dataset/dataset/split/model.py b/nowcasting_dataset/dataset/split/model.py index c633245c..1f319c23 100644 --- a/nowcasting_dataset/dataset/split/model.py +++ b/nowcasting_dataset/dataset/split/model.py @@ -1,15 +1,19 @@ +""" Model for splitting data """ from typing import List from pydantic import BaseModel, validator class TrainValidationTestSpecific(BaseModel): + """ Class on how to specifically split the data into train, validation and test. 
""" + train: List[str] validation: List[str] test: List[str] @validator("train") def train_validation_test(cls, v, values): + """ Make sure there is no overlap for the train data """ for vv in ["test", "validation"]: if vv in values.keys(): overlap = [period for period in v if period in values[vv]] @@ -20,6 +24,7 @@ def train_validation_test(cls, v, values): @validator("validation") def validation_overlap(cls, v, values): + """ Make sure there is no overlap for the validation data """ for vv in ["test", "train"]: if vv in values.keys(): overlap = [period for period in v if period in values[vv]] @@ -30,6 +35,7 @@ def validation_overlap(cls, v, values): @validator("test") def test_overlap(cls, v, values): + """ Make sure there is no overlap for the test data """ for vv in ["validation", "train"]: if vv in values.keys(): overlap = [period for period in v if period in values[vv]] diff --git a/nowcasting_dataset/dataset/split/split.py b/nowcasting_dataset/dataset/split/split.py index 1fc3f859..1252aadf 100644 --- a/nowcasting_dataset/dataset/split/split.py +++ b/nowcasting_dataset/dataset/split/split.py @@ -16,6 +16,8 @@ class SplitMethod(Enum): + """ Different split methods """ + DAY = "day" DAY_RANDOM = "day_random" DAY_SPECIFIC = "day_specific" @@ -47,7 +49,6 @@ def split_data( Returns: train, validation and test dataset """ - logger.info(f"Splitting data with method {method}") datetimes = pd.DatetimeIndex(datetimes) diff --git a/nowcasting_dataset/dataset/validate.py b/nowcasting_dataset/dataset/validate.py index 7d4c68b1..fde00c0f 100644 --- a/nowcasting_dataset/dataset/validate.py +++ b/nowcasting_dataset/dataset/validate.py @@ -1,6 +1,4 @@ -""" -A class to validate the prepare ml dataset -""" +""" A class to validate the prepare ml dataset """ from typing import Union import numpy as np @@ -51,7 +49,6 @@ def __init__( batches: Dataset that needs validating configuration: Configuration file """ - self.batches = batches self.configuration = configuration @@ -102,7 
+99,13 @@ class FakeDataset(torch.utils.data.Dataset): """Fake dataset.""" def __init__(self, configuration: Configuration, length: int = 10): + """ + Init + Args: + configuration: configuration object + length: length of dataset + """ self.batch_size = configuration.process.batch_size self.seq_length_5 = ( configuration.process.seq_len_5_minutes @@ -117,13 +120,23 @@ def __init__(self, configuration: Configuration, length: int = 10): self.length = length def __len__(self): + """ Number of pieces of data """ return self.length def per_worker_init(self, worker_id: int): + """ Not needed """ pass def __getitem__(self, idx): + """ + Get item, use for iter and next method + + Args: + idx: batch index + + Returns: Dictionary of random data + """ x = { "sat_data": torch.randn( self.batch_size, @@ -216,6 +229,7 @@ def validate_example( ): """ Validate the size and shape of the data + Args: data: Typed dictionary of the data seq_len_30_minutes: the length of the sequence for 30 minutely data @@ -228,7 +242,6 @@ def validate_example( n_gsp_per_example: the number gsp systems with nan padding batch: if this example class is a batch or not """ - n_gsp_id = data[GSP_ID].shape[-1] assert ( n_gsp_id == n_gsp_per_example @@ -345,7 +358,6 @@ def validate_batch_from_configuration(data: Example, configuration: Configuratio configuration: confgiruation of the data """ - validate_example( data=data, seq_len_30_minutes=configuration.process.seq_len_30_minutes, diff --git a/nowcasting_dataset/geospatial.py b/nowcasting_dataset/geospatial.py index 00745191..3955602a 100644 --- a/nowcasting_dataset/geospatial.py +++ b/nowcasting_dataset/geospatial.py @@ -1,3 +1,4 @@ +""" Geospatial functions """ import pandas as pd import pyproj from numbers import Number @@ -21,27 +22,35 @@ class Transformers: """ - Class to store transformation from one Grid to another. Its good to make this only once, but need the + Class to store transformation from one Grid to another. 
+ + It's good to make this only once, but need the option of updating them, due to out of data grids. """ def __init__(self): - + """ Init """ self._osgb_to_lat_lon = None self._lat_lon_to_osgb = None self.make_transformers() def make_transformers(self): - # Nice to only make these once, as it makes calling the functions below quicker + """ + Make transformers + + Nice to only make these once, as it makes calling the functions below quicker + """ self._osgb_to_lat_lon = pyproj.Transformer.from_crs(crs_from=OSGB, crs_to=WGS84) self._lat_lon_to_osgb = pyproj.Transformer.from_crs(crs_from=WGS84, crs_to=OSGB) @property def osgb_to_lat_lon(self): + """ OSGB to lat-lon property """ return self._osgb_to_lat_lon @property def lat_lon_to_osgb(self): + """ lat-lon to OSGB property """ return self._lat_lon_to_osgb @@ -50,10 +59,7 @@ def lat_lon_to_osgb(self): def download_grids(): - """ - The transformer grid sometimes need updating - """ - + """ The transformer grids sometimes need updating """ pyproj.transformer.TransformerGroup(crs_from=OSGB, crs_to=WGS84).download_grids(verbose=True) pyproj.transformer.TransformerGroup(crs_from=WGS84, crs_to=OSGB).download_grids(verbose=True) @@ -61,20 +67,29 @@ def download_grids(): def osgb_to_lat_lon(x: Number, y: Number) -> Tuple[Number, Number]: - """Returns 2-tuple of latitude (north-south), longitude (east-west). + """ + Change OSGB coordinates to lat, lon Args: - x, y: Location in Ordnance Survey GB 1936, also known as - British National Grid, coordinates. + x: osgb east-west + y: osgb north-south + + Returns: 2-tuple of latitude (north-south), longitude (east-west). + """ return transformers.osgb_to_lat_lon.transform(x, y) def lat_lon_to_osgb(lat: Number, lon: Number) -> Tuple[Number, Number]: - """Returns 2-tuple of x (east-west), y (north-south). + """ + Change lat, lon to OSGB coordinates Args: - lat, lon: Location is WGS84 coordinates. + lat: latitude + lon: longitude + + Returns: 2-tuple of x (east-west), y (north-south).
+ """ return transformers.lat_lon_to_osgb.transform(lat, lon) @@ -96,7 +111,6 @@ def calculate_azimuth_and_elevation_angle( have been calculate. """ - # get the solor position solpos = pvlib.solarposition.get_solarposition(datestamps, latitude, longitude) diff --git a/nowcasting_dataset/square.py b/nowcasting_dataset/square.py index b8c77fc0..2624261e 100644 --- a/nowcasting_dataset/square.py +++ b/nowcasting_dataset/square.py @@ -1,3 +1,4 @@ +""" Square objects """ from typing import NamedTuple, Union from numbers import Number @@ -5,6 +6,8 @@ class BoundingBox(NamedTuple): + """ Bounding box tuple """ + top: Union[Number, float] bottom: Union[Number, float] left: Union[Number, float] @@ -12,9 +15,16 @@ class BoundingBox(NamedTuple): class Square: - """"Class for computing bounding box for satellite imagery.""" + """ Class for computing bounding box for satellite imagery. """ def __init__(self, size_pixels: int, meters_per_pixel: Number): + """ + Init + + Args: + size_pixels: number of pixels + meters_per_pixel: how many meters for each pixel + """ self.size_pixels = size_pixels size_meters = size_pixels * meters_per_pixel self._half_size_meters = size_meters / 2 @@ -22,6 +32,16 @@ def __init__(self, size_pixels: int, meters_per_pixel: Number): def bounding_box_centered_on( self, x_meters_center: Number, y_meters_center: Number ) -> BoundingBox: + """ + Get bounding box from a centre + + Args: + x_meters_center: x center of the bounding box + y_meters_center: y center of the bounding box + + Returns: Bounding box + + """ return BoundingBox( top=y_meters_center + self._half_size_meters, bottom=y_meters_center - self._half_size_meters, @@ -33,6 +53,7 @@ def bounding_box_centered_on( def get_bounding_box_mask(bounding_box: BoundingBox, x: Array, y: Array) -> Array: """ Get boundary box mask from x and y locations. 
I.e are the x,y coords in the boundaring box + Args: bounding_box: Bounding box x: x coordinates diff --git a/nowcasting_dataset/time.py b/nowcasting_dataset/time.py index 29984e86..0d63ad7a 100644 --- a/nowcasting_dataset/time.py +++ b/nowcasting_dataset/time.py @@ -1,3 +1,4 @@ +""" Time functions """ import pandas as pd import numpy as np from typing import Iterable, Tuple, List @@ -18,16 +19,18 @@ def select_daylight_datetimes( datetimes: pd.DatetimeIndex, locations: Iterable[Tuple[float, float]], ghi_threshold: float = 10 ) -> pd.DatetimeIndex: - """Returns datetimes for which the global horizontal irradiance - (GHI) is above ghi_threshold across all locations. + """ + Select only the day time datetimes Args: - dt_index: DatetimeIndex to filter. - locations: List of Tuples of x, y coordinates in OSGB projection. + datetimes: DatetimeIndex to filter. + locations: List of Tuples of x, y coordinates in OSGB projection. For example, use the four corners of the satellite imagery. - ghi_threshold: Global horizontal irradiance threshold. + ghi_threshold: Global horizontal irradiance threshold. (Watts per square meter?) + Returns: datetimes for which the global horizontal irradiance (GHI) is above ghi_threshold across all locations. + """ ghi_for_all_locations = [] for x, y in locations: @@ -50,6 +53,7 @@ def select_daylight_datetimes( def intersection_of_datetimeindexes(indexes: List[pd.DatetimeIndex]) -> pd.DatetimeIndex: + """ Get intersections of datetime indexes """ assert len(indexes) > 0 intersection = indexes[0] for index in indexes[1:]: @@ -113,6 +117,7 @@ def get_t0_datetimes( ) -> pd.DatetimeIndex: """ Get datetimes for ML learning batches. T0 refers to the time 'now'. + Args: datetimes: list of datetimes when data is available total_seq_len: total sequence length of data for ml model @@ -123,7 +128,6 @@ def get_t0_datetimes( Returns: Datetimes that ml learning data can be built around. 
""" - logger.debug("Getting t0 datetimes") start_datetimes = get_start_datetimes( @@ -138,11 +142,21 @@ def get_t0_datetimes( def timesteps_to_duration(n_timesteps: int, minute_delta: int = 5) -> pd.Timedelta: + """ Change timesteps to a time duration """ assert n_timesteps >= 0 return pd.Timedelta(n_timesteps * minute_delta, unit="minutes") def datetime_features(index: pd.DatetimeIndex) -> pd.DataFrame: + """ + Make datetime features, hour_of_day and day_of_year + + Args: + index: index of datestamps + + Returns: Example data with datetime features + + """ features = {} features["hour_of_day"] = index.hour + (index.minute / 60) features["day_of_year"] = index.day_of_year @@ -150,6 +164,15 @@ def datetime_features(index: pd.DatetimeIndex) -> pd.DataFrame: def datetime_features_in_example(index: pd.DatetimeIndex) -> Example: + """ + Make datetime features with sin and cos + + Args: + index: index of datestamps + + Returns: Example data with datetime features + + """ dt_features = datetime_features(index) dt_features["hour_of_day"] /= 24 dt_features["day_of_year"] /= 365 @@ -164,7 +187,6 @@ def fill_30_minutes_timestamps_to_5_minutes(index: pd.DatetimeIndex) -> pd.Datet """ Fill a 30 minute index with 5 minute timestamps too. Note any gaps in 30 mins are not filled """ - # resample index to 5 mins index_5 = pd.Series(0, index=index).resample("5T") diff --git a/nowcasting_dataset/utils.py b/nowcasting_dataset/utils.py index f75b765a..bb0179af 100644 --- a/nowcasting_dataset/utils.py +++ b/nowcasting_dataset/utils.py @@ -1,3 +1,4 @@ +""" utils functions """ import logging import numpy as np import pandas as pd @@ -12,15 +13,20 @@ def set_fsspec_for_multiprocess() -> None: - """Clear reference to the loop and thread. This is necessary otherwise + """ + Clear reference to the loop and thread. + + This is necessary otherwise gcsfs hangs in the ML training loop. 
Only required for fsspec >= 0.9.0 See https://github.com/dask/gcsfs/issues/379#issuecomment-839929801 - TODO: Try deleting this two lines to make sure this is still relevant.""" + TODO: Try deleting this two lines to make sure this is still relevant. + """ fsspec.asyn.iothread[0] = None fsspec.asyn.loop[0] = None def is_monotonically_increasing(a: Array) -> bool: + """ Check the array is monotonically increasing """ # TODO: Can probably replace with pd.Index.is_monotonic_increasing() assert a is not None assert len(a) > 0 @@ -31,6 +37,7 @@ def is_monotonically_increasing(a: Array) -> bool: def is_unique(a: Array) -> bool: + """ Check array has unique values """ # TODO: Can probably replace with pd.Index.is_unique() return len(a) == len(np.unique(a)) @@ -45,7 +52,8 @@ def scale_to_0_to_1(a: Array) -> Array: def sin_and_cos(df: pd.DataFrame) -> pd.DataFrame: - """For every column in df, creates cols for sin and cos of that col. + """ + For every column in df, creates cols for sin and cos of that col. Args: df: Input DataFrame. The values must be in the range [0, 1]. @@ -56,7 +64,8 @@ def sin_and_cos(df: pd.DataFrame) -> pd.DataFrame: Returns: A new DataFrame, with twice the number of columns as the input df. For each col in df, the output DataFrame will have a