Skip to content

Commit

Permalink
Setup basis for tidy data reshaping
Browse files Browse the repository at this point in the history
  • Loading branch information
gutzbenj committed Jul 26, 2020
1 parent 7e8a8cf commit f8c71aa
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 24 deletions.
90 changes: 86 additions & 4 deletions wetterdienst/data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
parse_enumeration_from_template,
create_humanized_column_names_mapping,
)
from wetterdienst.enumerations.column_names_enumeration import DWDMetaColumns
from wetterdienst.enumerations.parameter_enumeration import Parameter
from wetterdienst.enumerations.period_type_enumeration import PeriodType
from wetterdienst.enumerations.time_resolution_enumeration import TimeResolution
Expand All @@ -31,6 +32,20 @@
log = logging.getLogger(__name__)


# Identifier columns that may serve as id_vars when melting a DataFrame into
# tidy (long) format; only those actually present in a given DataFrame are
# used (see _tidy_up_data).
POSSIBLE_ID_VARS = (
    DWDMetaColumns.STATION_ID.value,
    DWDMetaColumns.DATE.value,
    DWDMetaColumns.FROM_DATE.value,
    DWDMetaColumns.TO_DATE.value,
)

# Subset of the id columns that hold dates; tracked separately so the date
# columns can be placed explicitly when reordering the tidied DataFrame.
POSSIBLE_DATE_VARS = (
    DWDMetaColumns.DATE.value,
    DWDMetaColumns.FROM_DATE.value,
    DWDMetaColumns.TO_DATE.value,
)


def collect_dwd_data(
station_ids: List[int],
parameter: Union[Parameter, str],
Expand All @@ -39,9 +54,10 @@ def collect_dwd_data(
folder: Union[str, Path] = DWD_FOLDER_MAIN,
prefer_local: bool = False,
write_file: bool = False,
create_new_file_index: bool = False,
tidy_data: bool = True,
humanize_column_names: bool = False,
run_download_only: bool = False,
create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
"""
Function that organizes the complete pipeline of data collection, either
Expand All @@ -56,12 +72,15 @@ def collect_dwd_data(
period_type: period type as enumeration
folder: folder for local file interaction
prefer_local: boolean for if local data should be preferred
write_file: boolean if to write data to local storage
create_new_file_index: boolean if to create a new file index for the
data selection
write_file: boolean to write data to local storage
tidy_data: boolean to tidy up data so that there's only one set of values for
a datetime in a row
e.g. station_id, parameter, element, datetime, value, quality
humanize_column_names: boolean to yield column names better for
human consumption
run_download_only: boolean to run only the download and storing process
create_new_file_index: boolean if to create a new file index for the
data selection
Returns:
a pandas DataFrame with all the data given by the station ids
Expand Down Expand Up @@ -134,9 +153,72 @@ def collect_dwd_data(
except ValueError:
return pd.DataFrame()

if tidy_data:
data = _tidy_up_data(data, parameter)

# Assign meaningful column names (humanized).
if humanize_column_names:
hcnm = create_humanized_column_names_mapping(time_resolution, parameter)
data = data.rename(columns=hcnm)

return data


def _tidy_up_data(df: pd.DataFrame, parameter: Parameter) -> pd.DataFrame:
    """
    Create a tidy DataFrame by melting the wide per-element value columns into
    rows, moving the quality information into a separate column and adding a
    column naming the requested parameter.

    Args:
        df: wide DataFrame to be tidied; expected layout is the DWD one, where
            the id/date columns come first and every "QN*" quality column
            precedes the group of value columns it applies to
        parameter: the parameter written into an extra column so that rows of
            different parameters can be told apart from each other

    Returns:
        the tidied DataFrame with columns
        STATION_ID, PARAMETER, ELEMENT, <date columns>, VALUE, QUALITY
    """
    id_vars = []
    date_vars = []

    # Pick only the id/date columns that are actually present in this frame.
    for column in POSSIBLE_ID_VARS:
        if column in df:
            id_vars.append(column)
            if column in POSSIBLE_DATE_VARS:
                date_vars.append(column)

    # Extract quality: each "QN*" column applies to every value column that
    # follows it, up to the next "QN*" column. Columns before the first QN
    # column (the id/date columns) contribute an empty series.
    # NOTE(review): this assumes all id/date columns appear before the first
    # QN column; otherwise the quality length will not match the melted
    # frame -- confirm against the DWD file layout.
    quality_per_column = []
    column_quality = pd.Series(dtype=object)

    # Iterate over a snapshot of the columns: df.pop() mutates df, and
    # mutating a DataFrame while iterating it can skip columns.
    for column in list(df.columns):
        if column.startswith("QN"):
            column_quality = df.pop(column)
        else:
            quality_per_column.append(column_quality)

    # pd.concat replaces the deprecated/removed Series.append (pandas >= 2.0);
    # empty contributions carry no rows and are dropped up front.
    non_empty_quality = [series for series in quality_per_column if not series.empty]
    if non_empty_quality:
        quality = pd.concat(non_empty_quality)
    else:
        quality = pd.Series(dtype=object)

    # Reshape to one value per row; the former column names become ELEMENT.
    df_tidy = df.melt(
        id_vars=id_vars,
        var_name=DWDMetaColumns.ELEMENT.value,
        value_name=DWDMetaColumns.VALUE.value,
    )

    df_tidy[DWDMetaColumns.PARAMETER.value] = parameter.name

    # melt stacks the value columns in column order, which matches the order
    # in which the per-column quality series were collected above.
    df_tidy[DWDMetaColumns.QUALITY.value] = quality.values

    # Bring the columns into their canonical order.
    df_tidy = df_tidy.reindex(
        columns=[
            DWDMetaColumns.STATION_ID.value,
            DWDMetaColumns.PARAMETER.value,
            DWDMetaColumns.ELEMENT.value,
            *date_vars,
            DWDMetaColumns.VALUE.value,
            DWDMetaColumns.QUALITY.value,
        ]
    )

    return df_tidy
12 changes: 8 additions & 4 deletions wetterdienst/dwd_station_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def __init__(
prefer_local: bool = False,
write_file: bool = False,
folder: Union[str, Path] = DWD_FOLDER_MAIN,
create_new_file_index: bool = False,
tidy_data: bool = True,
humanize_column_names: bool = False,
create_new_file_index: bool = False,
) -> None:
"""
Class with mostly flexible arguments to define a request regarding DWD data.
Expand All @@ -61,8 +62,9 @@ def __init__(
prefer_local: definition if data should rather be taken from a local source
write_file: should data be written to a local file
folder: place where file lists (and station data) are stored
tidy_data: reshape DataFrame to a more tidy, row based version of data
humanize_column_names: replace column names by more meaningful ones
create_new_file_index: definition if the file index should be recreated
humanize_column_names:
"""

if not (period_type or (start_date and end_date)):
Expand Down Expand Up @@ -139,8 +141,9 @@ def __init__(
self.prefer_local = prefer_local
self.write_file = write_file
self.folder = folder
self.create_new_file_index = create_new_file_index
self.tidy_data = tidy_data
self.humanize_column_names = humanize_column_names
self.create_new_file_index = create_new_file_index

def __eq__(self, other):
return [
Expand Down Expand Up @@ -195,8 +198,9 @@ def collect_data(self) -> Generator[pd.DataFrame, None, None]:
folder=self.folder,
prefer_local=self.prefer_local,
write_file=self.write_file,
create_new_file_index=False,
tidy_data=self.tidy_data,
humanize_column_names=self.humanize_column_names,
create_new_file_index=False,
)

# Filter out values which already are in the DataFrame
Expand Down
6 changes: 6 additions & 0 deletions wetterdienst/enumerations/column_names_enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ class DWDMetaColumns(Enum):
FILENAME = "FILENAME"
HAS_FILE = "HAS_FILE"
FILEID = "FILEID"
# Columns used for tidy data
# Column for quality
PARAMETER = "PARAMETER"
ELEMENT = "ELEMENT"
VALUE = "VALUE"
QUALITY = "QUALITY"


# https://stackoverflow.com/questions/33727217/subscriptable-objects-in-class
Expand Down
24 changes: 8 additions & 16 deletions wetterdienst/parsing_data/parse_data_from_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,27 +101,19 @@ def _parse_dwd_data(
# information. Also rename column with true local time to english one
data = data.rename(
columns={
DWDOrigMetaColumns.DATE.value: (
DWDOrigDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value
),
"MESS_DATUM_WOZ": DWDOrigDataColumns.HOURLY.SOLAR.TRUE_LOCAL_TIME.value,
}
)

# Duplicate the end of interval column to create real datetime column
# remove minutes e.g. ":09" at the end of string
data[DWDMetaColumns.DATE.value] = data[
DWDOrigDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value
].str[:-3]
# Duplicate the date column to end of interval column
data[DWDOrigDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value] = data[
DWDOrigMetaColumns.DATE.value
]

# Store columns for later reordering
columns = data.columns.values.tolist()
# Create newly ordered columns, date is inserted while original date was
# renamed above
columns_reordered = [columns[0], columns[-1], *columns[1:-1]]

# Reorder columns to general format
data = data.reindex(columns=columns_reordered)
# Fix real date column by cutting of minutes
data[DWDOrigMetaColumns.DATE.value] = data[DWDOrigMetaColumns.DATE.value].str[
:-3
]

# Assign meaningful column names (baseline).
data = data.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)
Expand Down

0 comments on commit f8c71aa

Please sign in to comment.