Skip to content

Commit

Permalink
Merge pull request earthobservations#63 from panodata/fix-hourly
Browse files Browse the repository at this point in the history
Properly handle timestamps from "hourly" resolution data set
  • Loading branch information
gutzbenj committed Jun 16, 2020
2 parents 408a039 + 79b6ebd commit 8d6f9fb
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 9 deletions.
4 changes: 4 additions & 0 deletions python_dwd/additionals/helpers.py
Expand Up @@ -303,3 +303,7 @@ def create_stationdata_dtype_mapping(columns: List[str]) -> dict:
stationdata_dtype_mapping[column] = float

return stationdata_dtype_mapping


def convert_datetime_hourly(value):
    """
    Parse a timestamp given in the compact "hourly" notation.

    The input is expected to consist of a four-digit year, two-digit month,
    two-digit day and two-digit hour without separators (``YYYYMMDDHH``).

    Args:
        value: the value to convert; it is handed to ``pd.to_datetime`` as is

    Returns:
        the result of ``pd.to_datetime`` for the given value
    """
    # Year, month, day and hour, written back to back.
    hourly_format = '%Y%m%d%H'
    return pd.to_datetime(value, format=hourly_format)
2 changes: 1 addition & 1 deletion python_dwd/data_collection.py
Expand Up @@ -79,7 +79,7 @@ def collect_dwd_data(station_ids: List[int],

filenames_and_files = download_dwd_data(remote_files, parallel_download)

station_data = parse_dwd_data(filenames_and_files)
station_data = parse_dwd_data(filenames_and_files, time_resolution)

if write_file:
store_dwd_data(
Expand Down
19 changes: 14 additions & 5 deletions python_dwd/parsing_data/parse_data_from_files.py
@@ -1,17 +1,19 @@
""" function to read data from dwd server """
import logging
from typing import List, Tuple
from typing import List, Tuple, Union
from io import BytesIO
import pandas as pd

from python_dwd.additionals.helpers import create_stationdata_dtype_mapping
from python_dwd.additionals.helpers import create_stationdata_dtype_mapping, convert_datetime_hourly
from python_dwd.constants.column_name_mapping import GERMAN_TO_ENGLISH_COLUMNS_MAPPING
from python_dwd.constants.metadata import NA_STRING, STATIONDATA_SEP
from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
from python_dwd.enumerations.time_resolution_enumeration import TimeResolution

log = logging.getLogger(__name__)


def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]]) -> pd.DataFrame:
def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]], time_resolution: Union[TimeResolution, str]) -> pd.DataFrame:
"""
This function is used to read the station data from given bytes object.
The filename is required to determine if and where an error happened.
Expand All @@ -23,9 +25,12 @@ def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]]) -> pd.DataFra
Returns:
pandas.DataFrame with requested data, for different station ids the data is still put into one DataFrame
"""

time_resolution = TimeResolution(time_resolution)

data = []
for filename_and_file in filenames_and_files:
data.append(_parse_dwd_data(filename_and_file))
data.append(_parse_dwd_data(filename_and_file, time_resolution))

try:
data = pd.concat(data).reset_index(drop=True)
Expand All @@ -38,7 +43,7 @@ def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]]) -> pd.DataFra
return data


def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO]) -> pd.DataFrame:
def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO], time_resolution: TimeResolution) -> pd.DataFrame:
"""
A wrapping function that only handles data for one station id. The files passed to it are thus related to this id.
This is important for storing the data locally as the DataFrame that is stored should obviously only handle one
Expand Down Expand Up @@ -78,6 +83,10 @@ def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO]) -> pd.DataFrame:
# Assign meaningful column names (baseline).
data = data.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

# Properly handle timestamps from "hourly" resolution.
if time_resolution == TimeResolution.HOURLY:
data[DWDMetaColumns.DATE.value] = data[DWDMetaColumns.DATE.value].apply(convert_datetime_hourly)

# Coerce the data types appropriately.
data = data.astype(create_stationdata_dtype_mapping(data.columns))

Expand Down
5 changes: 4 additions & 1 deletion tests/parsing_data/test_parse_data_from_files.py
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
import pandas as pd

from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
from python_dwd.parsing_data.parse_data_from_files import parse_dwd_data

fixtures_dir = Path(__file__, "../..").resolve().absolute() / "fixtures"
Expand All @@ -18,6 +19,8 @@ def test_parse_dwd_data():
file_in_bytes.seek(0)

station_data = parse_dwd_data(
filenames_and_files=[(filename, file_in_bytes)])
filenames_and_files=[(filename, file_in_bytes)],
time_resolution=TimeResolution.DAILY
)

station_data.equals(station_data_original)
24 changes: 22 additions & 2 deletions tests/test_data_collection.py
Expand Up @@ -119,7 +119,7 @@ def test_collect_dwd_data_empty():


@pytest.mark.remote
def test_fetch_and_parse_dwd_data_vanilla_columns():
def test_collect_daily_vanilla():
""" Test for data collection with real data """

data = collect_dwd_data(
Expand Down Expand Up @@ -152,7 +152,7 @@ def test_fetch_and_parse_dwd_data_vanilla_columns():


@pytest.mark.remote
def test_fetch_and_parse_dwd_data_humanized_columns():
def test_collect_daily_humanized():
""" Test for data collection with real data and humanized column names """

data = collect_dwd_data(
Expand Down Expand Up @@ -183,3 +183,23 @@ def test_fetch_and_parse_dwd_data_humanized_columns():
'TEMPERATURE_MIN_200',
'TEMPERATURE_MIN_005',
]


@pytest.mark.remote
def test_collect_hourly_vanilla():
    """ Collect real "hourly" air temperature data and verify the column names """

    # Column names expected in the collected frame, in order.
    expected_columns = [
        'STATION_ID',
        'DATE',
        'QN_9',
        'TT_TU',
        'RF_TU',
    ]

    hourly_data = collect_dwd_data(
        station_ids=[1048],
        parameter=Parameter.TEMPERATURE_AIR,
        time_resolution=TimeResolution.HOURLY,
        period_type=PeriodType.RECENT
    )

    assert list(hourly_data.columns.values) == expected_columns

0 comments on commit 8d6f9fb

Please sign in to comment.