Skip to content

Commit

Permalink
Setup basis for tidy data reshaping
Browse files Browse the repository at this point in the history
  • Loading branch information
gutzbenj committed Jul 26, 2020
1 parent 7e8a8cf commit f8c71aa
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 24 deletions.
90 changes: 86 additions & 4 deletions wetterdienst/data_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
parse_enumeration_from_template,
create_humanized_column_names_mapping,
)
from wetterdienst.enumerations.column_names_enumeration import DWDMetaColumns
from wetterdienst.enumerations.parameter_enumeration import Parameter
from wetterdienst.enumerations.period_type_enumeration import PeriodType
from wetterdienst.enumerations.time_resolution_enumeration import TimeResolution
Expand All @@ -31,6 +32,20 @@
log = logging.getLogger(__name__)


# Identifier columns that may serve as id_vars when melting a DataFrame into
# tidy (long) format; only those actually present in a given DataFrame are
# used (see _tidy_up_data).
POSSIBLE_ID_VARS = (
    DWDMetaColumns.STATION_ID.value,
    DWDMetaColumns.DATE.value,
    DWDMetaColumns.FROM_DATE.value,
    DWDMetaColumns.TO_DATE.value,
)

# Subset of the id columns that hold dates; tracked separately so the date
# columns can be placed explicitly when reordering the tidied DataFrame.
POSSIBLE_DATE_VARS = (
    DWDMetaColumns.DATE.value,
    DWDMetaColumns.FROM_DATE.value,
    DWDMetaColumns.TO_DATE.value,
)


def collect_dwd_data(
station_ids: List[int],
parameter: Union[Parameter, str],
Expand All @@ -39,9 +54,10 @@ def collect_dwd_data(
folder: Union[str, Path] = DWD_FOLDER_MAIN,
prefer_local: bool = False,
write_file: bool = False,
create_new_file_index: bool = False,
tidy_data: bool = True,
humanize_column_names: bool = False,
run_download_only: bool = False,
create_new_file_index: bool = False,
) -> Optional[pd.DataFrame]:
"""
Function that organizes the complete pipeline of data collection, either
Expand All @@ -56,12 +72,15 @@ def collect_dwd_data(
period_type: period type as enumeration
folder: folder for local file interaction
prefer_local: boolean for if local data should be preferred
write_file: boolean if to write data to local storage
create_new_file_index: boolean if to create a new file index for the
data selection
write_file: boolean to write data to local storage
tidy_data: boolean to tidy up data so that there's only one set of values for
a datetime in a row
e.g. station_id, parameter, element, datetime, value, quality
humanize_column_names: boolean to yield column names better for
human consumption
run_download_only: boolean to run only the download and storing process
create_new_file_index: boolean if to create a new file index for the
data selection
Returns:
a pandas DataFrame with all the data given by the station ids
Expand Down Expand Up @@ -134,9 +153,72 @@ def collect_dwd_data(
except ValueError:
return pd.DataFrame()

if tidy_data:
data = _tidy_up_data(data, parameter)

# Assign meaningful column names (humanized).
if humanize_column_names:
hcnm = create_humanized_column_names_mapping(time_resolution, parameter)
data = data.rename(columns=hcnm)

return data


def _tidy_up_data(df: pd.DataFrame, parameter: Parameter) -> pd.DataFrame:
    """
    Create a tidy DataFrame by melting the wide per-element value columns into
    rows, moving the quality information into a separate column and adding a
    column naming the requested parameter.

    Args:
        df: wide DataFrame to be tidied; expected layout is the DWD one, where
            the id/date columns come first and every "QN*" quality column
            precedes the group of value columns it applies to
        parameter: the parameter written into an extra column so that rows of
            different parameters can be told apart from each other

    Returns:
        the tidied DataFrame with columns
        STATION_ID, PARAMETER, ELEMENT, <date columns>, VALUE, QUALITY
    """
    id_vars = []
    date_vars = []

    # Pick only the id/date columns that are actually present in this frame.
    for column in POSSIBLE_ID_VARS:
        if column in df:
            id_vars.append(column)
            if column in POSSIBLE_DATE_VARS:
                date_vars.append(column)

    # Extract quality: each "QN*" column applies to every value column that
    # follows it, up to the next "QN*" column. Columns before the first QN
    # column (the id/date columns) contribute an empty series.
    # NOTE(review): this assumes all id/date columns appear before the first
    # QN column; otherwise the quality length will not match the melted
    # frame -- confirm against the DWD file layout.
    quality_per_column = []
    column_quality = pd.Series(dtype=object)

    # Iterate over a snapshot of the columns: df.pop() mutates df, and
    # mutating a DataFrame while iterating it can skip columns.
    for column in list(df.columns):
        if column.startswith("QN"):
            column_quality = df.pop(column)
        else:
            quality_per_column.append(column_quality)

    # pd.concat replaces the deprecated/removed Series.append (pandas >= 2.0);
    # empty contributions carry no rows and are dropped up front.
    non_empty_quality = [series for series in quality_per_column if not series.empty]
    if non_empty_quality:
        quality = pd.concat(non_empty_quality)
    else:
        quality = pd.Series(dtype=object)

    # Reshape to one value per row; the former column names become ELEMENT.
    df_tidy = df.melt(
        id_vars=id_vars,
        var_name=DWDMetaColumns.ELEMENT.value,
        value_name=DWDMetaColumns.VALUE.value,
    )

    df_tidy[DWDMetaColumns.PARAMETER.value] = parameter.name

    # melt stacks the value columns in column order, which matches the order
    # in which the per-column quality series were collected above.
    df_tidy[DWDMetaColumns.QUALITY.value] = quality.values

    # Bring the columns into their canonical order.
    df_tidy = df_tidy.reindex(
        columns=[
            DWDMetaColumns.STATION_ID.value,
            DWDMetaColumns.PARAMETER.value,
            DWDMetaColumns.ELEMENT.value,
            *date_vars,
            DWDMetaColumns.VALUE.value,
            DWDMetaColumns.QUALITY.value,
        ]
    )

    return df_tidy
12 changes: 8 additions & 4 deletions wetterdienst/dwd_station_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def __init__(
prefer_local: bool = False,
write_file: bool = False,
folder: Union[str, Path] = DWD_FOLDER_MAIN,
create_new_file_index: bool = False,
tidy_data: bool = True,
humanize_column_names: bool = False,
create_new_file_index: bool = False,
) -> None:
"""
Class with mostly flexible arguments to define a request regarding DWD data.
Expand All @@ -61,8 +62,9 @@ def __init__(
prefer_local: definition if data should rather be taken from a local source
write_file: should data be written to a local file
folder: place where file lists (and station data) are stored
tidy_data: reshape DataFrame to a more tidy, row based version of data
humanize_column_names: replace column names by more meaningful ones
create_new_file_index: definition if the file index should be recreated
humanize_column_names:
"""

if not (period_type or (start_date and end_date)):
Expand Down Expand Up @@ -139,8 +141,9 @@ def __init__(
self.prefer_local = prefer_local
self.write_file = write_file
self.folder = folder
self.create_new_file_index = create_new_file_index
self.tidy_data = tidy_data
self.humanize_column_names = humanize_column_names
self.create_new_file_index = create_new_file_index

def __eq__(self, other):
return [
Expand Down Expand Up @@ -195,8 +198,9 @@ def collect_data(self) -> Generator[pd.DataFrame, None, None]:
folder=self.folder,
prefer_local=self.prefer_local,
write_file=self.write_file,
create_new_file_index=False,
tidy_data=self.tidy_data,
humanize_column_names=self.humanize_column_names,
create_new_file_index=False,
)

# Filter out values which already are in the DataFrame
Expand Down
6 changes: 6 additions & 0 deletions wetterdienst/enumerations/column_names_enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ class DWDMetaColumns(Enum):
FILENAME = "FILENAME"
HAS_FILE = "HAS_FILE"
FILEID = "FILEID"
# Columns used for tidy data
# Column for quality
PARAMETER = "PARAMETER"
ELEMENT = "ELEMENT"
VALUE = "VALUE"
QUALITY = "QUALITY"


# https://stackoverflow.com/questions/33727217/subscriptable-objects-in-class
Expand Down
24 changes: 8 additions & 16 deletions wetterdienst/parsing_data/parse_data_from_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,27 +101,19 @@ def _parse_dwd_data(
# information. Also rename column with true local time to english one
data = data.rename(
columns={
DWDOrigMetaColumns.DATE.value: (
DWDOrigDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value
),
"MESS_DATUM_WOZ": DWDOrigDataColumns.HOURLY.SOLAR.TRUE_LOCAL_TIME.value,
}
)

# Duplicate the end of interval column to create real datetime column
# remove minutes e.g. ":09" at the end of string
data[DWDMetaColumns.DATE.value] = data[
DWDOrigDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value
].str[:-3]
# Duplicate the date column to end of interval column
data[DWDOrigDataColumns.HOURLY.SOLAR.END_OF_INTERVAL.value] = data[
DWDOrigMetaColumns.DATE.value
]

# Store columns for later reordering
columns = data.columns.values.tolist()
# Create newly ordered columns, date is inserted while original date was
# renamed above
columns_reordered = [columns[0], columns[-1], *columns[1:-1]]

# Reorder columns to general format
data = data.reindex(columns=columns_reordered)
# Fix real date column by cutting of minutes
data[DWDOrigMetaColumns.DATE.value] = data[DWDOrigMetaColumns.DATE.value].str[
:-3
]

# Assign meaningful column names (baseline).
data = data.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)
Expand Down

0 comments on commit f8c71aa

Please sign in to comment.