diff --git a/nowcasting_dataset/config/README.md b/nowcasting_dataset/config/README.md index 7fa4f249..c92b7a92 100644 --- a/nowcasting_dataset/config/README.md +++ b/nowcasting_dataset/config/README.md @@ -9,6 +9,11 @@ See `model.py` for documentation of the expected configuration fields. See either `gcp.yaml` or `on_premises.yaml` for example config files. +All paths must include the protocol prefix. For local files, +it's sufficient to just start with a '/'. For AWS, start with 's3://', +for GCP start with 'gs://'. + + # Example ```python diff --git a/nowcasting_dataset/config/load.py b/nowcasting_dataset/config/load.py index 23f46b9b..dc230c35 100644 --- a/nowcasting_dataset/config/load.py +++ b/nowcasting_dataset/config/load.py @@ -1,11 +1,8 @@ """ Loading configuration functions """ -import io import logging -import os from typing import Union import fsspec -import gcsfs import yaml from pathy import Pathy @@ -33,36 +30,3 @@ def load_yaml_configuration(filename: Union[str, Pathy]) -> Configuration: configuration = Configuration(**configuration) return configuration - - -def load_configuration_from_gcs( - gcp_dir: str, bucket: str = "solar-pv-nowcasting-data", filename: str = "configuration.yaml" -) -> Configuration: - """ - Load configuration from gcs - - gcp_dir: the directory where the configruation is saved - bucket: the gcs bucket to load from - filename: the filename that will be loaded - - Returns: configuration class - """ - logger.info("Loading configuration from gcs") - - bucket_and_dir = os.path.join(f"gs://{bucket}", gcp_dir) - filename = os.path.join(bucket_and_dir, filename) - logger.debug(f"Will be opening {filename}") - - # set up gcs - gcs = gcsfs.GCSFileSystem(access="read_only") - - # load the file into bytes - with gcs.open(filename, mode="rb") as file: - file_bytes = file.read() - - # load the bytes to yaml - with io.BytesIO(file_bytes) as file: - data = yaml.load(file) - - # put into pydantic class and returns - return 
Configuration(**data) diff --git a/nowcasting_dataset/config/model.py b/nowcasting_dataset/config/model.py index 22c89704..0580c219 100644 --- a/nowcasting_dataset/config/model.py +++ b/nowcasting_dataset/config/model.py @@ -1,4 +1,9 @@ -""" Configuration model for the dataset """ +""" Configuration model for the dataset + +All paths must include the protocol prefix. For local files, +it's sufficient to just start with a '/'. For AWS, start with 's3://', +for GCP start with 'gs://'. +""" from datetime import datetime from typing import Optional @@ -152,11 +157,7 @@ class Sun(DataSourceMixin): class InputData(BaseModel): """ - Input data model - - All paths must include the protocol prefix. For local files, - it's sufficient to just start with a '/'. For aws, start with 's3://', - for gcp start with 'gs://'. + Input data model. """ pv: PV = PV() diff --git a/scripts/validate_ml_data.py b/scripts/validate_ml_data.py index 74c960b4..fa291cda 100644 --- a/scripts/validate_ml_data.py +++ b/scripts/validate_ml_data.py @@ -4,7 +4,7 @@ import nowcasting_dataset import torch -from nowcasting_dataset.config.load import load_configuration_from_gcs, load_yaml_configuration +from nowcasting_dataset.config.load import load_yaml_configuration from nowcasting_dataset.dataset.datasets import NetCDFDataset, worker_init_fn from nowcasting_dataset.dataset.validate import ValidatorDataset from nowcasting_dataset.cloud.utils import get_maximum_batch_id @@ -17,7 +17,6 @@ # load configuration, this can be changed to a different filename as needed filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "gcp.yaml") -config = load_configuration_from_gcs(gcp_dir="prepared_ML_training_data/v5/") config = load_yaml_configuration(filename=filename) DST_NETCDF4_PATH = config.output_data.filepath diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 66afe7d2..15571b93 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -7,7 
+7,7 @@ import pytest import nowcasting_dataset -from nowcasting_dataset.config.load import load_yaml_configuration, load_configuration_from_gcs +from nowcasting_dataset.config.load import load_yaml_configuration from nowcasting_dataset.config.model import Configuration, set_git_commit from nowcasting_dataset.config.save import save_yaml_configuration @@ -87,7 +87,9 @@ def test_load_to_gcs(): """ Check that configuration can be loaded to gcs """ - config = load_configuration_from_gcs(gcp_dir="prepared_ML_training_data/v-default") + config = load_yaml_configuration( + filename="gs://solar-pv-nowcasting-data/prepared_ML_training_data/v-default/configuration.yaml" + ) assert isinstance(config, Configuration)