Skip to content

Commit

Permalink
Use meaningful column names, starting with daily climate summary data
Browse files Browse the repository at this point in the history
From "column_name_mapping.py" and "column_names_enumeration.py",
we can see the intention to map short meteorological identifiers
to more meaningful English names.

This commit complements the current implementation by providing
mappings for daily climate summary data (kl) and adds a
corresponding test case.

While at it, we discovered that neither "_parse_dwd_data" nor
"collect_dwd_data" actually propagated the column names,
so we adjusted the data frame handling in a few places.

Trivia:
- The test case has been marked as "remote" so that fixture-based unit
  tests and full integration tests can be told apart.
- When massaging the data frame after parsing data from CSV,
  the "EOR" column gets dropped right away, as it has no real
  value for downstream processing.
  • Loading branch information
amotl committed Jun 10, 2020
1 parent 969902f commit 750fef7
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 4 deletions.
4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Custom markers used by the test suite are declared here.
# "remote" tags tests that access the internet, so they can be told
# apart from tests that run purely on local fixtures.
[pytest]

markers =
remote: Tests accessing the internet.
18 changes: 17 additions & 1 deletion python_dwd/constants/column_name_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,23 @@
DWDOrigColumns.LONGITUDE.value: DWDColumns.LONGITUDE.value,
DWDOrigColumns.LONGITUDE_ALTERNATIVE.value: DWDColumns.LONGITUDE.value,
DWDOrigColumns.STATIONNAME.value: DWDColumns.STATIONNAME.value,
DWDOrigColumns.STATE.value: DWDColumns.STATE.value
DWDOrigColumns.STATE.value: DWDColumns.STATE.value,

# Daily climate summary
DWDOrigColumns.FX.value: DWDColumns.FX.value,
DWDOrigColumns.FM.value: DWDColumns.FM.value,
DWDOrigColumns.RSK.value: DWDColumns.RSK.value,
DWDOrigColumns.RSKF.value: DWDColumns.RSKF.value,
DWDOrigColumns.SDK.value: DWDColumns.SDK.value,
DWDOrigColumns.SHK_TAG.value: DWDColumns.SHK_TAG.value,
DWDOrigColumns.NM.value: DWDColumns.NM.value,
DWDOrigColumns.VPM.value: DWDColumns.VPM.value,
DWDOrigColumns.PM.value: DWDColumns.PM.value,
DWDOrigColumns.TMK.value: DWDColumns.TMK.value,
DWDOrigColumns.UPM.value: DWDColumns.UPM.value,
DWDOrigColumns.TXK.value: DWDColumns.TXK.value,
DWDOrigColumns.TNK.value: DWDColumns.TNK.value,
DWDOrigColumns.TGK.value: DWDColumns.TGK.value,
}

METADATA_DTYPE_MAPPING = {
Expand Down
2 changes: 1 addition & 1 deletion python_dwd/data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,4 @@ def collect_dwd_data(station_ids: List[int],

data.append(station_data)

return pd.concat(data, axis=1, ignore_index=True)
return pd.concat(data)
32 changes: 32 additions & 0 deletions python_dwd/enumerations/column_names_enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,22 @@ class DWDOrigColumns(Enum):
STATIONNAME = "STATIONSNAME"
STATE = "BUNDESLAND"

# Daily climate summary
FX = "FX"
FM = "FM"
RSK = "RSK"
RSKF = "RSKF"
SDK = "SDK"
SHK_TAG = "SHK_TAG"
NM = "NM"
VPM = "VPM"
PM = "PM"
TMK = "TMK"
UPM = "UPM"
TXK = "TXK"
TNK = "TNK"
TGK = "TGK"


class DWDColumns(Enum):
""" Overhauled column names for the library """
Expand All @@ -41,3 +57,19 @@ class DWDColumns(Enum):
FILENAME = "FILENAME"
HAS_FILE = "HAS_FILE"
FILEID = "FILEID"

# Daily climate summary
FX = "WIND_GUST_MAX"
FM = "WIND_VELOCITY"
RSK = "PRECIPITATION_HEIGHT"
RSKF = "PRECIPITATION_FORM"
SDK = "SUNSHINE_DURATION"
SHK_TAG = "SNOW_DEPTH"
NM = "CLOUD_COVER"
VPM = "VAPOR_PRESSURE"
PM = "PRESSURE"
TMK = "TEMPERATURE"
UPM = "HUMIDITY"
TXK = "TEMPERATURE_MAX_200"
TNK = "TEMPERATURE_MIN_200"
TGK = "TEMPERATURE_MIN_005"
17 changes: 15 additions & 2 deletions python_dwd/parsing_data/parse_data_from_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO]) -> pd.DataFrame:
log.warning(f"The file representing {filename} is None and is skipped.")
return pd.DataFrame()

data = data.rename(columns=str.upper).rename(GERMAN_TO_ENGLISH_COLUMNS_MAPPING)
# Column names contain spaces, so strip them away.
data.rename(columns=str.strip, inplace=True)

return data.astype(create_stationdata_dtype_mapping(data.columns))
# Make column names uppercase.
data.rename(columns=str.upper, inplace=True)

# End of record (EOR) has no value, so drop it right away.
data.drop(columns='EOR', inplace=True)

# Assign meaningful column names.
data.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING, inplace=True)

# Get the data types right.
data = data.astype(create_stationdata_dtype_mapping(data.columns))

return data
37 changes: 37 additions & 0 deletions tests/parsing_data/test_fetch_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pytest

from python_dwd import collect_dwd_data
from python_dwd.enumerations.parameter_enumeration import Parameter
from python_dwd.enumerations.period_type_enumeration import PeriodType
from python_dwd.enumerations.time_resolution_enumeration import TimeResolution


@pytest.mark.remote
def test_fetch_and_parse_dwd_data():
    """Collect recent daily climate summary data and check its column names.

    The test requests station 1048 through ``collect_dwd_data`` and verifies
    that the resulting data frame exposes exactly the expected column names,
    in order. It carries the ``remote`` marker.
    """
    # The QN_3 / QN_4 columns are expected to keep their original identifiers.
    expected_columns = [
        'STATION_ID',
        'DATE',
        'QN_3',
        'WIND_GUST_MAX',
        'WIND_VELOCITY',
        'QN_4',
        'PRECIPITATION_HEIGHT',
        'PRECIPITATION_FORM',
        'SUNSHINE_DURATION',
        'SNOW_DEPTH',
        'CLOUD_COVER',
        'VAPOR_PRESSURE',
        'PRESSURE',
        'TEMPERATURE',
        'HUMIDITY',
        'TEMPERATURE_MAX_200',
        'TEMPERATURE_MIN_200',
        'TEMPERATURE_MIN_005',
    ]

    frame = collect_dwd_data(
        station_ids=[1048],
        parameter=Parameter.CLIMATE_SUMMARY,
        time_resolution=TimeResolution.DAILY,
        period_type=PeriodType.RECENT,
    )

    actual_columns = list(frame.columns.values)
    assert actual_columns == expected_columns

0 comments on commit 750fef7

Please sign in to comment.