Skip to content

Commit

Permalink
Merge pull request earthobservations#115 from earthobservations/exten…
Browse files Browse the repository at this point in the history
…d-coerce-column-types

Dtype conversion is extended to integer fields and string fields
  • Loading branch information
gutzbenj committed Jul 22, 2020
2 parents a545bd5 + a17d0f0 commit eea55dc
Show file tree
Hide file tree
Showing 7 changed files with 266 additions and 38 deletions.
25 changes: 24 additions & 1 deletion tests/additionals/test_functions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import pytest
import numpy as np
import pandas as pd

from wetterdienst.additionals.functions import check_parameters, retrieve_time_resolution_from_filename, \
retrieve_parameter_from_filename, retrieve_period_type_from_filename, determine_parameters, \
cast_to_list, parse_enumeration_from_template, create_humanized_column_names_mapping
cast_to_list, parse_enumeration_from_template, create_humanized_column_names_mapping, coerce_field_types
from wetterdienst.enumerations.period_type_enumeration import PeriodType
from wetterdienst.enumerations.time_resolution_enumeration import TimeResolution
from wetterdienst.enumerations.parameter_enumeration import Parameter
Expand Down Expand Up @@ -54,6 +56,27 @@ def test_parse_enumeration_from_template():
parse_enumeration_from_template("climate", Parameter)


def test_coerce_field_types():
    """ Test for function to coerce raw string fields to integer, datetime and string values """
    # Raw input: every field arrives as a string
    raw_columns = {
        "QN": ["1"],
        "RS_IND_01": ["1"],
        "DATE": ["1970010100"],
        "END_OF_INTERVAL": ["1970010100:00"],
        "V_VV_I": ["P"],
    }

    # Both date fields are expected to resolve to the same point in time
    epoch = pd.Timestamp("1970-01-01")
    expected_columns = {
        "QN": pd.Series([1], dtype=np.int32),
        "RS_IND_01": pd.Series([1], dtype=np.int32),
        "DATE": [epoch],
        "END_OF_INTERVAL": [epoch],
        "V_VV_I": ["P"],
    }

    coerced_rows = coerce_field_types(
        pd.DataFrame(raw_columns), TimeResolution.HOURLY).values.tolist()
    expected_rows = pd.DataFrame(expected_columns).values.tolist()

    # Comparison is done on plain Python values, row by row
    assert coerced_rows == expected_rows


def test_create_humanized_column_names_mapping():
""" Test for function to create a mapping to humanized column names """
hcnm = create_humanized_column_names_mapping(TimeResolution.DAILY, Parameter.CLIMATE_SUMMARY)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_data_storing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import mock
from shutil import rmtree

from wetterdienst.additionals.functions import coerce_column_types
from wetterdienst.additionals.functions import coerce_field_types
from wetterdienst.enumerations.parameter_enumeration import Parameter
from wetterdienst.enumerations.time_resolution_enumeration import TimeResolution
from wetterdienst.enumerations.period_type_enumeration import PeriodType
Expand All @@ -31,7 +31,7 @@

# Loading test data
file = pd.read_json(fixtures_dir / "FIXED_STATIONDATA.JSON")
file = coerce_column_types(file, time_resolution)
file = coerce_field_types(file, time_resolution)

# Prepare csv for regular "downloading" test
csv_file = StringIO()
Expand Down
224 changes: 209 additions & 15 deletions wetterdienst/additionals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,207 @@
'row': PeriodType.RECENT # files with row are also classified as "recent" by DWD
}

# Date columns that coerce_field_types parses with pd.to_datetime, using the format
# looked up in TIME_RESOLUTION_TO_DATETIME_FORMAT_MAPPING for the given time resolution.
DATE_FIELDS_REGULAR = (
DWDMetaColumns.DATE.value,
DWDMetaColumns.FROM_DATE.value,
DWDMetaColumns.TO_DATE.value
)

# Date columns (hourly solar data) that coerce_field_types always parses with
# DatetimeFormat.YMDH_COLUMN_M, independent of the requested time resolution.
DATE_FIELDS_IRREGULAR = (
DWDDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value,
DWDDataColumns.HOURLY.SOLAR.TRUE_LOCAL_TIME.value,
)

# Quality (QN*) columns of all time resolutions and parameters, grouped below by
# time resolution and parameter. coerce_field_types casts these columns to int.
# NOTE(review): several entries presumably resolve to the same column name
# (e.g. "QN_4"); harmless for membership tests, but confirm against the enumeration.
QUALITY_FIELDS = (
# 1_minute
# precipitation
DWDOrigDataColumns.MINUTE_1.PRECIPITATION.QN.value,
# 10_minutes
# temperature_air
DWDOrigDataColumns.MINUTES_10.TEMPERATURE_AIR.QN.value,
# temperature_extreme
DWDOrigDataColumns.MINUTES_10.TEMPERATURE_EXTREME.QN.value,
# wind_extreme
DWDOrigDataColumns.MINUTES_10.WIND_EXTREME.QN.value,
# precipitation
DWDOrigDataColumns.MINUTES_10.PRECIPITATION.QN.value,
# solar
DWDOrigDataColumns.MINUTES_10.SOLAR.QN.value,
# wind
DWDOrigDataColumns.MINUTES_10.WIND.QN.value,
# hourly
# temperature_air
DWDOrigDataColumns.HOURLY.TEMPERATURE_AIR.QN_9.value,
# cloud_type
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.QN_8.value,
# cloudiness
DWDOrigDataColumns.HOURLY.CLOUDINESS.QN_8.value,
# dew_point
DWDOrigDataColumns.HOURLY.DEW_POINT.QN_8.value,
# precipitation
DWDOrigDataColumns.HOURLY.PRECIPITATION.QN_8.value,
# pressure
DWDOrigDataColumns.HOURLY.PRESSURE.QN_8.value,
# soil_temperature
DWDOrigDataColumns.HOURLY.TEMPERATURE_SOIL.QN_2.value,
# solar
DWDOrigDataColumns.HOURLY.SOLAR.QN_592.value,
# sun
DWDOrigDataColumns.HOURLY.SUN.QN_7.value,
# visibility
DWDOrigDataColumns.HOURLY.VISIBILITY.QN_8.value,
# wind
DWDOrigDataColumns.HOURLY.WIND.QN_3.value,
# wind_synop
DWDOrigDataColumns.HOURLY.WIND_SYNOPTIC.QN_8.value,
# subdaily
# air_temperature
DWDOrigDataColumns.SUBDAILY.TEMPERATURE_AIR.QN_4.value,
# cloudiness
DWDOrigDataColumns.SUBDAILY.CLOUDINESS.QN_4.value,
# moisture
DWDOrigDataColumns.SUBDAILY.MOISTURE.QN_4.value,
# pressure
DWDOrigDataColumns.SUBDAILY.PRESSURE.QN_4.value,
# soil
DWDOrigDataColumns.SUBDAILY.SOIL.QN_4.value,
# visibility
DWDOrigDataColumns.SUBDAILY.VISIBILITY.QN_4.value,
# wind
DWDOrigDataColumns.SUBDAILY.WIND.QN_4.value,
# daily
# kl
DWDOrigDataColumns.DAILY.CLIMATE_SUMMARY.QN_3.value,
DWDOrigDataColumns.DAILY.CLIMATE_SUMMARY.QN_4.value,
# more_precip
DWDOrigDataColumns.DAILY.PRECIPITATION_MORE.QN_6.value,
# soil_temperature
DWDOrigDataColumns.DAILY.TEMPERATURE_SOIL.QN_2.value,
# solar
DWDOrigDataColumns.DAILY.SOLAR.QN_592.value,
# water_equiv
DWDOrigDataColumns.DAILY.WATER_EQUIVALENT.QN_6.value,
# weather_phenomena
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.QN_4.value,
# monthly
# kl
DWDOrigDataColumns.MONTHLY.CLIMATE_SUMMARY.QN_4.value,
DWDOrigDataColumns.MONTHLY.CLIMATE_SUMMARY.QN_6.value,
# more_precip
DWDOrigDataColumns.MONTHLY.PRECIPITATION_MORE.QN_6.value,
# weather_phenomena
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.QN_4.value,
# annual
# kl
DWDOrigDataColumns.ANNUAL.CLIMATE_SUMMARY.QN_4.value,
# more_precip
DWDOrigDataColumns.ANNUAL.PRECIPITATION_MORE.QN_6.value,
# weather_phenomena
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.QN_4.value,
)

# Non-quality data columns that coerce_field_types casts to int instead of the
# default float, grouped below by time resolution and parameter.
INTEGER_FIELDS = (
# 1_minute
# precipitation
DWDOrigDataColumns.MINUTE_1.PRECIPITATION.RS_IND_01.value,
# 10_minutes
# wind_extreme
DWDOrigDataColumns.MINUTES_10.WIND_EXTREME.DX_10.value,
# precipitation
DWDOrigDataColumns.MINUTES_10.PRECIPITATION.RWS_IND_10.value,
# hourly
# cloud_type
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_N.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S1_CS.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S1_NS.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S2_CS.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S2_NS.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S3_CS.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S3_NS.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S4_CS.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S4_NS.value,
# cloudiness
DWDOrigDataColumns.HOURLY.CLOUDINESS.V_N.value,
# precipitation
DWDOrigDataColumns.HOURLY.PRECIPITATION.RS_IND.value,
DWDOrigDataColumns.HOURLY.PRECIPITATION.WRTR.value,
# visibility
DWDOrigDataColumns.HOURLY.VISIBILITY.V_VV.value,
# wind
DWDOrigDataColumns.HOURLY.WIND.D.value,
# wind_synop
DWDOrigDataColumns.HOURLY.WIND_SYNOPTIC.DD.value,
# subdaily
# cloudiness
DWDOrigDataColumns.SUBDAILY.CLOUDINESS.N_TER.value,
DWDOrigDataColumns.SUBDAILY.CLOUDINESS.CD_TER.value,
# soil
DWDOrigDataColumns.SUBDAILY.SOIL.EK_TER.value,
# visibility
DWDOrigDataColumns.SUBDAILY.VISIBILITY.VK_TER.value,
# wind
DWDOrigDataColumns.SUBDAILY.WIND.DK_TER.value,
DWDOrigDataColumns.SUBDAILY.WIND.FK_TER.value,
# daily
# more_precip
DWDOrigDataColumns.DAILY.PRECIPITATION_MORE.RSF.value,
DWDOrigDataColumns.DAILY.PRECIPITATION_MORE.SH_TAG.value,
DWDOrigDataColumns.DAILY.PRECIPITATION_MORE.NSH_TAG.value,
# water_equiv
DWDOrigDataColumns.DAILY.WATER_EQUIVALENT.ASH_6.value,
DWDOrigDataColumns.DAILY.WATER_EQUIVALENT.SH_TAG.value,
# weather_phenomena
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.NEBEL.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.GEWITTER.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.STURM_6.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.STURM_8.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.TAU.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.GLATTEIS.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.REIF.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.GRAUPEL.value,
DWDOrigDataColumns.DAILY.WEATHER_PHENOMENA.HAGEL.value,
# monthly
# more_precip
DWDOrigDataColumns.MONTHLY.PRECIPITATION_MORE.MO_NSH.value,
DWDOrigDataColumns.MONTHLY.PRECIPITATION_MORE.MO_SH_S.value,
# weather_phenomena
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_STURM_6.value,
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_STURM_8.value,
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_GEWITTER.value,
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_GLATTEIS.value,
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_GRAUPEL.value,
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_HAGEL.value,
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_NEBEL.value,
DWDOrigDataColumns.MONTHLY.WEATHER_PHENOMENA.MO_TAU.value,
# annual
# more_precip
DWDOrigDataColumns.ANNUAL.PRECIPITATION_MORE.JA_NSH.value,
DWDOrigDataColumns.ANNUAL.PRECIPITATION_MORE.JA_SH_S.value,
# weather_phenomena
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_STURM_6.value,
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_STURM_8.value,
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_GEWITTER.value,
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_GLATTEIS.value,
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_GRAUPEL.value,
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_HAGEL.value,
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_NEBEL.value,
DWDOrigDataColumns.ANNUAL.WEATHER_PHENOMENA.JA_TAU.value,
)

# Hourly data columns that coerce_field_types casts to str. Columns that are in
# none of the field tuples (and are not the station id) fall through to float there.
STRING_FIELDS = (
# hourly
# cloud_type
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_N_I.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S1_CSA.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S2_CSA.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S3_CSA.value,
DWDOrigDataColumns.HOURLY.CLOUD_TYPE.V_S4_CSA.value,
# cloudiness
DWDOrigDataColumns.HOURLY.CLOUDINESS.V_N_I.value,
# visibility
DWDOrigDataColumns.HOURLY.VISIBILITY.V_VV_I.value,
)


def determine_parameters(filename: str) -> Tuple[Parameter, TimeResolution, PeriodType]:
"""
Expand Down Expand Up @@ -188,8 +389,8 @@ def check_parameters(parameter: Parameter,
return True


def coerce_column_types(df: pd.DataFrame,
time_resolution: TimeResolution) -> pd.DataFrame:
def coerce_field_types(df: pd.DataFrame,
time_resolution: TimeResolution) -> pd.DataFrame:
"""
A function used to create a unique dtype mapping for a given list of column names. This function is needed as we
want to ensure the expected dtypes of the returned DataFrame as well as for mapping data after reading it from a
Expand All @@ -204,27 +405,20 @@ def coerce_column_types(df: pd.DataFrame,
"""
""" Possible columns: STATION_ID, DATETIME, EOR, QN_ and other, measured values like rainfall """

regular_date_columns = (
DWDMetaColumns.DATE.value,
DWDMetaColumns.FROM_DATE.value,
DWDMetaColumns.TO_DATE.value
)

irregular_date_columns = (
DWDDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value,
DWDDataColumns.HOURLY.SOLAR.TRUE_LOCAL_TIME.value,
)

for column in df.columns:
# Properly handle timestamps from "hourly" resolution, subdaily also has hour in timestamp
if column == DWDMetaColumns.STATION_ID.value:
df[column] = df[column].astype(int)
elif column in regular_date_columns:
elif column in DATE_FIELDS_REGULAR:
df[column] = pd.to_datetime(
df[column], format=TIME_RESOLUTION_TO_DATETIME_FORMAT_MAPPING[time_resolution])
elif column in irregular_date_columns:
elif column in DATE_FIELDS_IRREGULAR:
df[column] = pd.to_datetime(
df[column], format=DatetimeFormat.YMDH_COLUMN_M.value)
elif column in QUALITY_FIELDS or column in INTEGER_FIELDS:
df[column] = df[column].astype(int)
elif column in STRING_FIELDS:
df[column] = df[column].astype(str)
else:
df[column] = df[column].astype(float)

Expand Down
5 changes: 3 additions & 2 deletions wetterdienst/download/download_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from io import BytesIO
from pathlib import PurePosixPath
from typing import Union
import requests

from wetterdienst.download.https_handling import create_dwd_session
from wetterdienst.file_path_handling.path_handling import build_climate_observations_path
Expand All @@ -17,9 +18,9 @@ def download_file_from_climate_observations(filepath: Union[PurePosixPath, str])
Returns:
bytes of the file
"""
dwd_session = create_dwd_session()
# dwd_session = create_dwd_session()

r = dwd_session.get(build_climate_observations_path(filepath))
r = requests.get(build_climate_observations_path(filepath))
r.raise_for_status()

return BytesIO(r.content)
16 changes: 8 additions & 8 deletions wetterdienst/enumerations/column_names_enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,14 +388,14 @@ class PRECIPITATION_MORE(Enum): # noqa
# weather_phenomena
class WEATHER_PHENOMENA(Enum): # noqa
QN_4 = "QN_4"
JA_STURM_6 = "JA_STURM_6"
JA_STURM_8 = "JA_STURM_8"
JA_GEWITTER = "JA_GEWITTER"
JA_GLATTEIS = "JA_GLATTEIS"
JA_GRAUPEL = "JA_GRAUPEL"
JA_HAGEL = "JA_HAGEL"
JA_NEBEL = "JA_NEBEL"
JA_TAU = "JA_TAU"
JA_STURM_6 = "JA_STURM_6" # int
JA_STURM_8 = "JA_STURM_8" # int
JA_GEWITTER = "JA_GEWITTER" # int
JA_GLATTEIS = "JA_GLATTEIS" # int
JA_GRAUPEL = "JA_GRAUPEL" # int
JA_HAGEL = "JA_HAGEL" # int
JA_NEBEL = "JA_NEBEL" # int
JA_TAU = "JA_TAU" # int


class DWDDataColumns(_DWDDataColumnBase):
Expand Down

0 comments on commit eea55dc

Please sign in to comment.