From e82f0729976e4d54cf8d970bee52eed34d8bfa9b Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 17 Jun 2020 02:00:47 +0200 Subject: [PATCH] Improve "hourly" data acquisition This accounts for another two anomalies within hourly/solar data. - Skip field "MESS_DATUM_WOZ" on import. - Grok timestamp field values like "2001010100:03". Notwithstanding the above, obtaining timestamps from the command line has been improved to say things like "--date=2020-06-15T12" in order to properly address the "hour" slot when filtering. --- python_dwd/additionals/helpers.py | 4 --- python_dwd/additionals/time_handling.py | 29 +++++++++++++++++++ python_dwd/cli.py | 11 ++++--- .../parsing_data/parse_data_from_files.py | 7 ++++- tests/additionals/test_time_handling.py | 16 +++++++++- 5 files changed, 57 insertions(+), 10 deletions(-) diff --git a/python_dwd/additionals/helpers.py b/python_dwd/additionals/helpers.py index cbc814014..8e32961a8 100644 --- a/python_dwd/additionals/helpers.py +++ b/python_dwd/additionals/helpers.py @@ -308,7 +308,3 @@ def create_stationdata_dtype_mapping(columns: List[str]) -> dict: stationdata_dtype_mapping[column] = float return stationdata_dtype_mapping - - -def convert_datetime_hourly(value): - return pd.to_datetime(value, format='%Y%m%d%H') diff --git a/python_dwd/additionals/time_handling.py b/python_dwd/additionals/time_handling.py index 2c5eb9aec..0c67bc721 100644 --- a/python_dwd/additionals/time_handling.py +++ b/python_dwd/additionals/time_handling.py @@ -1,5 +1,6 @@ """ date time handling functions """ from datetime import datetime +from dateparser import parse as parsedate from typing import Optional, Tuple, Union import pandas as pd from pandas import Timestamp @@ -25,6 +26,10 @@ def parse_date(date_string: str) -> Optional[Timestamp]: return date +def parse_datetime(date_string: str) -> datetime: + return parsedate(date_string, date_formats=['%Y-%m-%dT%H']) + + def mktimerange(time_resolution: TimeResolution, date_from: Union[datetime, str], date_to: Union[datetime, str] = None) -> Tuple[Timestamp, Timestamp]: @@ -57,3 +62,27 @@ def mktimerange(time_resolution: TimeResolution, raise NotImplementedError("mktimerange only implemented for annual and monthly time ranges") return date_from, date_to + + +def convert_datetime_hourly(date_string: str) -> Timestamp: + """ + Data from the hourly time resolution has a timestamp format + of e.g. "2018121300". So, let's parse it using the custom + timestamp pattern %Y%m%d%H. + + There's also an anomaly for hourly/solar observations, + where the timestamp seems to also include minutes, + like "2001010100:03" or "2001011508:09". For them, + we consider it to be safe to drop the minute part + right away by flooring it to "00". + + :param date_string: + :return: + """ + + pattern = '%Y%m%d%H' + + if ':' in date_string: + pattern = '%Y%m%d%H:%M' + + return pd.to_datetime(date_string, format=pattern).replace(minute=00) diff --git a/python_dwd/cli.py b/python_dwd/cli.py index baee84b01..0c55fdb1b 100644 --- a/python_dwd/cli.py +++ b/python_dwd/cli.py @@ -7,7 +7,7 @@ import pandas as pd from python_dwd import __version__, metadata_for_dwd_data -from python_dwd.additionals.time_handling import mktimerange +from python_dwd.additionals.time_handling import mktimerange, parse_datetime from python_dwd.additionals.util import normalize_options, setup_logging, read_list from python_dwd.dwd_station_request import DWDStationRequest from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns @@ -76,6 +76,9 @@ def run(): # Acquire annual data from 2010 to 2020 dwd readings --station=44,1048 --parameter=kl --resolution=annual --period=recent,historical --date=2010/2020 + # Acquire hourly data + dwd readings --station=44,1048 --parameter=air_temperature --resolution=hourly --period=recent --date=2020-06-15T12 + """ # Read command line options. @@ -129,8 +132,8 @@ def run(): # Filter by time interval. if '/' in options.date: date_from, date_to = options.date.split('/') - date_from = parsedate(date_from) - date_to = parsedate(date_to) + date_from = parse_datetime(date_from) + date_to = parse_datetime(date_to) if request.time_resolution in (TimeResolution.ANNUAL, TimeResolution.MONTHLY): date_from, date_to = mktimerange(request.time_resolution, date_from, date_to) expression = (date_from <= df[DWDMetaColumns.FROM_DATE.value]) & (df[DWDMetaColumns.TO_DATE.value] <= date_to) @@ -140,7 +143,7 @@ def run(): # Filter by date. else: - date = parsedate(options.date) + date = parse_datetime(options.date) if request.time_resolution in (TimeResolution.ANNUAL, TimeResolution.MONTHLY): date_from, date_to = mktimerange(request.time_resolution, date) expression = (date_from <= df[DWDMetaColumns.FROM_DATE.value]) & (df[DWDMetaColumns.TO_DATE.value] <= date_to) diff --git a/python_dwd/parsing_data/parse_data_from_files.py b/python_dwd/parsing_data/parse_data_from_files.py index 82e8f7bb1..a96e4f62c 100644 --- a/python_dwd/parsing_data/parse_data_from_files.py +++ b/python_dwd/parsing_data/parse_data_from_files.py @@ -4,7 +4,8 @@ from io import BytesIO import pandas as pd -from python_dwd.additionals.helpers import create_stationdata_dtype_mapping, convert_datetime_hourly +from python_dwd.additionals.helpers import create_stationdata_dtype_mapping +from python_dwd.additionals.time_handling import convert_datetime_hourly from python_dwd.constants.column_name_mapping import GERMAN_TO_ENGLISH_COLUMNS_MAPPING from python_dwd.constants.metadata import NA_STRING, STATIONDATA_SEP from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns @@ -80,6 +81,10 @@ def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO], time_resolution: Tim # End of record (EOR) has no value, so drop it right away. data = data.drop(columns='EOR', errors='ignore') + # Skip "interval in local true solar time" for hourly/solar + # observations, we will only import the UTC field "MESS_DATUM". + data = data.drop(columns='MESS_DATUM_WOZ', errors='ignore') + # Assign meaningful column names (baseline). data = data.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING) diff --git a/tests/additionals/test_time_handling.py b/tests/additionals/test_time_handling.py index 09d5d5be6..3bdcae4b1 100644 --- a/tests/additionals/test_time_handling.py +++ b/tests/additionals/test_time_handling.py @@ -1,11 +1,19 @@ +from datetime import datetime + import pytest from dateparser import parse as parsedate from pandas import Timestamp -from python_dwd.additionals.time_handling import mktimerange +from python_dwd.additionals.time_handling import mktimerange, parse_datetime, convert_datetime_hourly from python_dwd.enumerations.time_resolution_enumeration import TimeResolution +def test_parse_datetime(): + assert parse_datetime('2020-05-01') == datetime(2020, 5, 1, 0, 0) + assert parse_datetime('2020-05-01T13:14:15') == datetime(2020, 5, 1, 13, 14, 15) + assert parse_datetime('2020-05-01T13') == datetime(2020, 5, 1, 13, 0) + + def test_mktimerange_annual(): assert mktimerange(TimeResolution.ANNUAL, parsedate('2019')) == \ @@ -28,3 +36,9 @@ def test_mktimerange_invalid(): with pytest.raises(NotImplementedError): mktimerange(TimeResolution.DAILY, parsedate('2020-05-01')) + + +def test_convert_datetime_hourly(): + + assert convert_datetime_hourly('2018121308') == Timestamp('2018-12-13 08:00:00') + assert convert_datetime_hourly('2001010112:03') == Timestamp('2001-01-01 12:00:00')