diff --git a/python_dwd/additionals/helpers.py b/python_dwd/additionals/helpers.py
index a061088c3..394375c9c 100644
--- a/python_dwd/additionals/helpers.py
+++ b/python_dwd/additionals/helpers.py
@@ -303,3 +303,17 @@ def create_stationdata_dtype_mapping(columns: List[str]) -> dict:
             stationdata_dtype_mapping[column] = float
 
     return stationdata_dtype_mapping
+
+
+def convert_datetime_hourly(value):
+    """
+    Parse DWD timestamps of "hourly" resolution, formatted as %Y%m%d%H.
+
+    Args:
+        value: a single timestamp or a whole pandas.Series of them; a Series
+            is converted in one vectorized call
+
+    Returns:
+        the parsed timestamp, or a Series of parsed timestamps
+    """
+    return pd.to_datetime(value, format='%Y%m%d%H')
diff --git a/python_dwd/data_collection.py b/python_dwd/data_collection.py
index 8ac21015d..b5ec8dfd6 100644
--- a/python_dwd/data_collection.py
+++ b/python_dwd/data_collection.py
@@ -79,7 +79,7 @@ def collect_dwd_data(station_ids: List[int],
 
         filenames_and_files = download_dwd_data(remote_files, parallel_download)
 
-        station_data = parse_dwd_data(filenames_and_files)
+        station_data = parse_dwd_data(filenames_and_files, time_resolution)
 
         if write_file:
             store_dwd_data(
diff --git a/python_dwd/parsing_data/parse_data_from_files.py b/python_dwd/parsing_data/parse_data_from_files.py
index 6f5da00b0..82e8f7bb1 100644
--- a/python_dwd/parsing_data/parse_data_from_files.py
+++ b/python_dwd/parsing_data/parse_data_from_files.py
@@ -1,17 +1,19 @@
 """ function to read data from dwd server """
 import logging
-from typing import List, Tuple
+from typing import List, Tuple, Union
 from io import BytesIO
 import pandas as pd
 
-from python_dwd.additionals.helpers import create_stationdata_dtype_mapping
+from python_dwd.additionals.helpers import create_stationdata_dtype_mapping, convert_datetime_hourly
 from python_dwd.constants.column_name_mapping import GERMAN_TO_ENGLISH_COLUMNS_MAPPING
 from python_dwd.constants.metadata import NA_STRING, STATIONDATA_SEP
+from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
+from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
 
 log = logging.getLogger(__name__)
 
 
-def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]]) -> pd.DataFrame:
+def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]], time_resolution: Union[TimeResolution, str]) -> pd.DataFrame:
     """
     This function is used to read the station data from given bytes object.
     The filename is required to defined if and where an error happened.
@@ -23,9 +25,12 @@ def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]]) -> pd.DataFra
     Returns:
         pandas.DataFrame with requested data, for different station ids the data is still put into one DataFrame
     """
+
+    time_resolution = TimeResolution(time_resolution)
+
     data = []
     for filename_and_file in filenames_and_files:
-        data.append(_parse_dwd_data(filename_and_file))
+        data.append(_parse_dwd_data(filename_and_file, time_resolution))
 
     try:
         data = pd.concat(data).reset_index(drop=True)
@@ -38,7 +43,7 @@ def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]]) -> pd.DataFra
     return data
 
 
-def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO]) -> pd.DataFrame:
+def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO], time_resolution: TimeResolution) -> pd.DataFrame:
     """
     A wrapping function that only handles data for one station id. The files passed to it are thus related to this id.
     This is important for storing the data locally as the DataFrame that is stored should obviously only handle one
@@ -78,6 +83,10 @@ def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO]) -> pd.DataFrame:
     # Assign meaningful column names (baseline).
     data = data.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)
 
+    # Parse timestamps from "hourly" resolution; the whole column is converted in one vectorized call.
+    if time_resolution == TimeResolution.HOURLY:
+        data[DWDMetaColumns.DATE.value] = convert_datetime_hourly(data[DWDMetaColumns.DATE.value])
+
     # Coerce the data types appropriately.
     data = data.astype(create_stationdata_dtype_mapping(data.columns))
 
diff --git a/tests/parsing_data/test_parse_data_from_files.py b/tests/parsing_data/test_parse_data_from_files.py
index 59ec11d8c..bd286316c 100644
--- a/tests/parsing_data/test_parse_data_from_files.py
+++ b/tests/parsing_data/test_parse_data_from_files.py
@@ -4,6 +4,7 @@ from pathlib import Path
 
 import pandas as pd
 
+from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
 from python_dwd.parsing_data.parse_data_from_files import parse_dwd_data
 
 fixtures_dir = Path(__file__, "../..").resolve().absolute() / "fixtures"
@@ -18,6 +19,8 @@ def test_parse_dwd_data():
     file_in_bytes.seek(0)
 
     station_data = parse_dwd_data(
-        filenames_and_files=[(filename, file_in_bytes)])
+        filenames_and_files=[(filename, file_in_bytes)],
+        time_resolution=TimeResolution.DAILY
+    )
 
     station_data.equals(station_data_original)
diff --git a/tests/test_data_collection.py b/tests/test_data_collection.py
index f3ea50d3d..65a585ba9 100644
--- a/tests/test_data_collection.py
+++ b/tests/test_data_collection.py
@@ -119,7 +119,7 @@ def test_collect_dwd_data_empty():
 
 
 @pytest.mark.remote
-def test_fetch_and_parse_dwd_data_vanilla_columns():
+def test_collect_daily_vanilla():
     """ Test for data collection with real data """
 
     data = collect_dwd_data(
@@ -152,7 +152,7 @@ def test_fetch_and_parse_dwd_data_vanilla_columns():
 
 
 @pytest.mark.remote
-def test_fetch_and_parse_dwd_data_humanized_columns():
+def test_collect_daily_humanized():
     """ Test for data collection with real data and humanized column names """
 
     data = collect_dwd_data(
@@ -183,3 +183,23 @@ def test_fetch_and_parse_dwd_data_humanized_columns():
         'TEMPERATURE_MIN_200',
         'TEMPERATURE_MIN_005',
     ]
+
+
+@pytest.mark.remote
+def test_collect_hourly_vanilla():
+    """ Test for data collection with real data """
+
+    data = collect_dwd_data(
+        station_ids=[1048],
+        parameter=Parameter.TEMPERATURE_AIR,
+        time_resolution=TimeResolution.HOURLY,
+        period_type=PeriodType.RECENT
+    )
+
+    assert list(data.columns.values) == [
+        'STATION_ID',
+        'DATE',
+        'QN_9',
+        'TT_TU',
+        'RF_TU',
+    ]