Skip to content

Commit

Permalink
Improve "hourly" data acquisition
Browse files Browse the repository at this point in the history
This accounts for another two anomalies within hourly/solar data.
- Skip field "MESS_DATUM_WOZ" on import.
- Grok timestamp field values like "2001010100:03".

In addition to the above, parsing timestamps from the command
line has been improved to accept values like "--date=2020-06-15T12"
in order to properly address the "hour" slot when filtering.
  • Loading branch information
amotl committed Jun 17, 2020
1 parent b87975c commit e82f072
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 10 deletions.
4 changes: 0 additions & 4 deletions python_dwd/additionals/helpers.py
Expand Up @@ -308,7 +308,3 @@ def create_stationdata_dtype_mapping(columns: List[str]) -> dict:
stationdata_dtype_mapping[column] = float

return stationdata_dtype_mapping


def convert_datetime_hourly(value):
    """
    Parse a timestamp of the hourly time resolution.

    :param value: timestamp in the form "%Y%m%d%H", e.g. "2018121300"
    :return: whatever ``pd.to_datetime`` yields for the given value
    """
    hourly_pattern = '%Y%m%d%H'
    return pd.to_datetime(value, format=hourly_pattern)
29 changes: 29 additions & 0 deletions python_dwd/additionals/time_handling.py
@@ -1,5 +1,6 @@
""" date time handling functions """
from datetime import datetime
from dateparser import parse as parsedate
from typing import Optional, Tuple, Union
import pandas as pd
from pandas import Timestamp
Expand All @@ -25,6 +26,10 @@ def parse_date(date_string: str) -> Optional[Timestamp]:
return date


def parse_datetime(date_string: str) -> Optional[datetime]:
    """
    Parse a date/time string, e.g. as obtained from the command line.

    The string is handed to ``dateparser.parse`` together with the
    explicit format "%Y-%m-%dT%H", so that hour-precision values
    like "2020-06-15T12" are understood.

    :param date_string: textual date/time, e.g. "2020-05-01",
        "2020-05-01T13" or "2020-05-01T13:14:15"
    :return: the parsed ``datetime``, or ``None`` when dateparser
        is not able to make sense of the input. Callers should
        check for ``None`` before comparing the result.
    """
    return parsedate(date_string, date_formats=['%Y-%m-%dT%H'])


def mktimerange(time_resolution: TimeResolution,
date_from: Union[datetime, str],
date_to: Union[datetime, str] = None) -> Tuple[Timestamp, Timestamp]:
Expand Down Expand Up @@ -57,3 +62,27 @@ def mktimerange(time_resolution: TimeResolution,
raise NotImplementedError("mktimerange only implemented for annual and monthly time ranges")

return date_from, date_to


def convert_datetime_hourly(date_string: str) -> Timestamp:
    """
    Convert a timestamp string of the hourly time resolution.

    Regular values look like "2018121300" and are parsed with the
    pattern %Y%m%d%H.

    Hourly/solar observations are an exception: their timestamps
    may carry a minute part separated by a colon, for example
    "2001010100:03" or "2001011508:09". Those are parsed with the
    pattern %Y%m%d%H:%M, and the minute part is then discarded by
    resetting it to "00".

    :param date_string: raw timestamp, with or without ":MM" suffix
    :return: parsed timestamp with its minute set to zero
    """

    # A colon signals the hourly/solar anomaly carrying minutes.
    carries_minutes = ':' in date_string
    timestamp_format = '%Y%m%d%H:%M' if carries_minutes else '%Y%m%d%H'

    parsed = pd.to_datetime(date_string, format=timestamp_format)

    # Floor to the full hour by zeroing the minute field.
    return parsed.replace(minute=0)
11 changes: 7 additions & 4 deletions python_dwd/cli.py
Expand Up @@ -7,7 +7,7 @@
import pandas as pd

from python_dwd import __version__, metadata_for_dwd_data
from python_dwd.additionals.time_handling import mktimerange
from python_dwd.additionals.time_handling import mktimerange, parse_datetime
from python_dwd.additionals.util import normalize_options, setup_logging, read_list
from python_dwd.dwd_station_request import DWDStationRequest
from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
Expand Down Expand Up @@ -76,6 +76,9 @@ def run():
# Acquire annual data from 2010 to 2020
dwd readings --station=44,1048 --parameter=kl --resolution=annual --period=recent,historical --date=2010/2020
# Acquire hourly data
dwd readings --station=44,1048 --parameter=air_temperature --resolution=hourly --period=recent --date=2020-06-15T12
"""

# Read command line options.
Expand Down Expand Up @@ -129,8 +132,8 @@ def run():
# Filter by time interval.
if '/' in options.date:
date_from, date_to = options.date.split('/')
date_from = parsedate(date_from)
date_to = parsedate(date_to)
date_from = parse_datetime(date_from)
date_to = parse_datetime(date_to)
if request.time_resolution in (TimeResolution.ANNUAL, TimeResolution.MONTHLY):
date_from, date_to = mktimerange(request.time_resolution, date_from, date_to)
expression = (date_from <= df[DWDMetaColumns.FROM_DATE.value]) & (df[DWDMetaColumns.TO_DATE.value] <= date_to)
Expand All @@ -140,7 +143,7 @@ def run():

# Filter by date.
else:
date = parsedate(options.date)
date = parse_datetime(options.date)
if request.time_resolution in (TimeResolution.ANNUAL, TimeResolution.MONTHLY):
date_from, date_to = mktimerange(request.time_resolution, date)
expression = (date_from <= df[DWDMetaColumns.FROM_DATE.value]) & (df[DWDMetaColumns.TO_DATE.value] <= date_to)
Expand Down
7 changes: 6 additions & 1 deletion python_dwd/parsing_data/parse_data_from_files.py
Expand Up @@ -4,7 +4,8 @@
from io import BytesIO
import pandas as pd

from python_dwd.additionals.helpers import create_stationdata_dtype_mapping, convert_datetime_hourly
from python_dwd.additionals.helpers import create_stationdata_dtype_mapping
from python_dwd.additionals.time_handling import convert_datetime_hourly
from python_dwd.constants.column_name_mapping import GERMAN_TO_ENGLISH_COLUMNS_MAPPING
from python_dwd.constants.metadata import NA_STRING, STATIONDATA_SEP
from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
Expand Down Expand Up @@ -80,6 +81,10 @@ def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO], time_resolution: Tim
# End of record (EOR) has no value, so drop it right away.
data = data.drop(columns='EOR', errors='ignore')

# Skip "interval in local true solar time" for hourly/solar
# observations, we will only import the UTC field "MESS_DATUM".
data = data.drop(columns='MESS_DATUM_WOZ', errors='ignore')

# Assign meaningful column names (baseline).
data = data.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

Expand Down
16 changes: 15 additions & 1 deletion tests/additionals/test_time_handling.py
@@ -1,11 +1,19 @@
from datetime import datetime

import pytest
from dateparser import parse as parsedate
from pandas import Timestamp

from python_dwd.additionals.time_handling import mktimerange
from python_dwd.additionals.time_handling import mktimerange, parse_datetime, convert_datetime_hourly
from python_dwd.enumerations.time_resolution_enumeration import TimeResolution


def test_parse_datetime():
    """ parse_datetime accepts date-only, hour-only and full timestamps """
    expectations = [
        ('2020-05-01', datetime(2020, 5, 1, 0, 0)),
        ('2020-05-01T13:14:15', datetime(2020, 5, 1, 13, 14, 15)),
        ('2020-05-01T13', datetime(2020, 5, 1, 13, 0)),
    ]
    for text, expected in expectations:
        assert parse_datetime(text) == expected


def test_mktimerange_annual():

assert mktimerange(TimeResolution.ANNUAL, parsedate('2019')) == \
Expand All @@ -28,3 +36,9 @@ def test_mktimerange_invalid():

with pytest.raises(NotImplementedError):
mktimerange(TimeResolution.DAILY, parsedate('2020-05-01'))


def test_convert_datetime_hourly():
    """ Hourly timestamps parse with and without a minute suffix """
    cases = (
        # Regular hourly timestamp.
        ('2018121308', Timestamp('2018-12-13 08:00:00')),
        # Hourly/solar anomaly: minute part is expected to be dropped.
        ('2001010112:03', Timestamp('2001-01-01 12:00:00')),
    )
    for raw, expected in cases:
        assert convert_datetime_hourly(raw) == expected

0 comments on commit e82f072

Please sign in to comment.