Finish implementation of new methods plus testing
gutzbenj committed Jun 10, 2020
1 parent 527ba8f commit 0c7dded
Showing 6 changed files with 200 additions and 81 deletions.
2 changes: 2 additions & 0 deletions python_dwd/data_collection.py
@@ -58,6 +58,8 @@ def collect_dwd_data(station_ids: List[int],
     remote_files = create_file_list_for_dwd_server(
         station_ids, parameter, time_resolution, period_type, folder, create_new_filelist)
 
+    print(remote_files)
+
     filenames_and_files = download_dwd_data(remote_files, parallel_download)
 
     station_data = parse_dwd_data(filenames_and_files)
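For orientation, here is a minimal usage sketch of the pipeline this hunk belongs to (file list, then download, then parse). The argument values are the ones exercised by the tests further down; the folder path is illustrative:

from python_dwd.data_collection import collect_dwd_data
from python_dwd.enumerations.parameter_enumeration import Parameter
from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
from python_dwd.enumerations.period_type_enumeration import PeriodType

# Builds the remote file list, downloads the archives, parses them into one frame.
station_data = collect_dwd_data(
    station_ids=[1],
    parameter=Parameter.CLIMATE_SUMMARY,
    time_resolution=TimeResolution.DAILY,
    period_type=PeriodType.HISTORICAL,
    folder="./dwd_data",        # illustrative working directory
    prefer_local=False,         # skip the local store, download fresh data
    parallel_download=False,
    write_file=False,
    create_new_filelist=False,
)
print(station_data.head())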
15 changes: 7 additions & 8 deletions python_dwd/data_storing.py
@@ -1,6 +1,6 @@
 """ Data storing/restoring methods"""
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Union
 import pandas as pd
 
 from python_dwd.additionals.helpers import create_stationdata_dtype_mapping
@@ -80,11 +80,10 @@ def restore_dwd_data(station_id: int,
                      f"{STATIONDATA_NAME}{H5_FORMAT}"
 
     try:
-        station_data = pd.DataFrame(
-            pd.read_hdf(
-                path_or_buf=local_filepath,
-                key=request_string
-            )
+        # typing required, as pandas.read_hdf is annotated to return object
+        station_data: Union[object, pd.DataFrame] = pd.read_hdf(
+            path_or_buf=local_filepath,
+            key=request_string
         )
     except FileNotFoundError:
         print(f"Error: There seems to be no file at {str(local_filepath)}. "
@@ -99,7 +98,7 @@ def _build_local_store_key(station_id: int,
     return True, station_data.astype(create_stationdata_dtype_mapping(station_data.columns))
 
 
-def _build_local_store_key(station_id: int,
+def _build_local_store_key(station_id: Union[str, int],
                            parameter: Parameter,
                            time_resolution: TimeResolution,
                            period_type: PeriodType) -> str:
@@ -116,6 +115,6 @@ def _build_local_store_key(station_id: Union[str, int],
         a string building a key that is used to identify the request
     """
     request_string = f"{parameter.value}/{time_resolution.value}/" \
-                     f"{period_type.value}/{station_id}"
+                     f"{period_type.value}/station_id_{int(station_id)}"
 
     return request_string
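To make the new key format concrete, a small standalone sketch that mirrors _build_local_store_key; the concrete enum values named in the final comment are assumptions for illustration, not taken from this diff:

from python_dwd.enumerations.parameter_enumeration import Parameter
from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
from python_dwd.enumerations.period_type_enumeration import PeriodType

def build_key(station_id, parameter, time_resolution, period_type) -> str:
    # Same construction as _build_local_store_key: int() folds "00001",
    # "1" and 1 into one canonical id, so each station maps to one HDF node.
    return (f"{parameter.value}/{time_resolution.value}/"
            f"{period_type.value}/station_id_{int(station_id)}")

key = build_key("00001", Parameter.CLIMATE_SUMMARY,
                TimeResolution.DAILY, PeriodType.HISTORICAL)
# e.g. "kl/daily/historical/station_id_1", assuming enum values
# "kl", "daily" and "historical"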
47 changes: 26 additions & 21 deletions python_dwd/file_path_handling/file_list_creation.py
@@ -1,9 +1,9 @@
 """ file list creation for requested files """
 from pathlib import Path
-from typing import List, Union
+from typing import List
 import pandas as pd
 
-from python_dwd.additionals.functions import check_parameters, cast_to_list
+from python_dwd.additionals.functions import check_parameters
 from python_dwd.file_path_handling.path_handling import correct_folder_path
 from python_dwd.additionals.helpers import create_fileindex
 from python_dwd.constants.access_credentials import DWD_FOLDER_MAIN, DWD_FOLDER_METADATA
@@ -14,7 +14,7 @@
 from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
 
 
-def create_file_list_for_dwd_server(station_id: Union[str, int, List[int]],
+def create_file_list_for_dwd_server(station_ids: List[int],
                                     parameter: Parameter,
                                     time_resolution: TimeResolution,
                                     period_type: PeriodType,
@@ -27,7 +27,7 @@ def create_file_list_for_dwd_server(station_id: Union[str, int, List[int]],
     available online.
     Args:
-        station_id: id(s) for the weather station to ask for data
+        station_ids: id(s) for the weather station to ask for data
         parameter: observation measure
         time_resolution: frequency/granularity of measurement interval
         period_type: recent or historical files
@@ -38,30 +38,35 @@ def create_file_list_for_dwd_server(station_ids: List[int],
         List of paths to files
     """
     # Ensure integers
-    station_id = [int(s) for s in cast_to_list(station_id)]
-    # Check type of function parameters
-    check_parameters(parameter=parameter, time_resolution=time_resolution, period_type=period_type)
+    station_ids = [int(statid) for statid in station_ids]
+
+    # Check for the combination of requested parameters
+    check_parameters(parameter=parameter,
+                     time_resolution=time_resolution,
+                     period_type=period_type)
 
-    # Create name of fileslist file
+    # Create name of fileslistfile
     filelist_local = f'{FILELIST_NAME}_{parameter.value}_' \
-                     f'{time_resolution.value}_{period_type.value}{DATA_FORMAT}'
+                     f'{time_resolution.value}_{period_type.value}'
 
     # Create filepath to filelist in folder
-    filelist_local_path = Path(folder, DWD_FOLDER_METADATA, filelist_local)
+    filelist_local_path = Path(folder,
+                               DWD_FOLDER_METADATA,
+                               filelist_local)
+
+    filelist_local_path = f"{filelist_local_path}{DATA_FORMAT}"
 
     if create_new_filelist or not Path(filelist_local_path).is_file():
-        create_fileindex(parameter, time_resolution, period_type, folder)
+        create_fileindex(parameter=parameter,
+                         time_resolution=time_resolution,
+                         period_type=period_type,
+                         folder=folder)
 
-    filelist = pd.read_csv(
-        filepath_or_buffer=filelist_local_path,
-        sep=",",
-        dtype={
-            DWDColumns.FILEID.value: int,
-            DWDColumns.STATION_ID.value: int,
-            DWDColumns.FILENAME.value: str
-        }
-    )
+    filelist = pd.read_csv(filepath_or_buffer=filelist_local_path,
+                           sep=",",
+                           dtype={DWDColumns.FILEID.value: int,
+                                  DWDColumns.STATION_ID.value: int,
+                                  DWDColumns.FILENAME.value: str})
 
-    return filelist.loc[filelist[DWDColumns.STATION_ID.value].isin(station_id), :]
+    return filelist.loc[filelist[DWDColumns.STATION_ID.value].isin(station_ids), :]
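The returned frame is simply the file index filtered with pandas isin. A toy sketch of that final step, with literal column names standing in for the DWDColumns enumeration values and made-up filenames:

import pandas as pd

# Toy stand-in for the downloaded file index (column names and filenames
# are placeholders for illustration).
filelist = pd.DataFrame({
    "FILEID": [0, 1, 2],
    "STATION_ID": [1, 2, 44],
    "FILENAME": ["kl_00001_hist.zip", "kl_00002_hist.zip", "kl_00044_hist.zip"],
})

station_ids = [int(statid) for statid in ["00001", 44]]  # same cast as above
print(filelist.loc[filelist["STATION_ID"].isin(station_ids), :])
# keeps the rows for stations 1 and 44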
45 changes: 8 additions & 37 deletions tests/parsing_data/test_parse_data_from_files.py
@@ -1,52 +1,23 @@
-from io import StringIO
-from shutil import rmtree
+""" Tests for parse_dwd_data function """
+from typing import Union
+from io import StringIO, BytesIO
 from pathlib import Path
 import pandas as pd
 
-from python_dwd import parse_dwd_data
+from python_dwd.parsing_data.parse_data_from_files import parse_dwd_data
 
 fixtures_dir = Path(__file__, "../..").resolve().absolute() / "fixtures"
 
 
 def test_parse_dwd_data():
     filename = "tageswerte_KL_00001_19370101_19860630_hist.zip"
 
-    file = pd.read_json(fixtures_dir / "FIXED_STATIONDATA.JSON")
-    file_in_bytes = StringIO()
-    file.to_csv(file_in_bytes, sep=";")
+    station_data_original = pd.read_json(fixtures_dir / "FIXED_STATIONDATA.JSON")
+    file_in_bytes: Union[StringIO, BytesIO] = StringIO()
+    station_data_original.to_csv(file_in_bytes, sep=";")
     file_in_bytes.seek(0)
 
     station_data = parse_dwd_data(
         filenames_and_files=[(filename, file_in_bytes)])
 
-    stationdata_local = parse_dwd_data(
-        filenames_and_files=[(filename, file_in_bytes)],
-        prefer_local=True,
-        folder=Path(__file__).parent.absolute() / "dwd_data"
-    )
-
-    # 1. Compare freshly loaded data with data read from hdf and assure it's identical
-    assert stationdata_online.equals(stationdata_local)
-
-    # 2. Check functioning for filename only, that is used for parameter definition to parse data from local hdf file
-    assert stationdata_online.equals(
-        parse_dwd_data(
-            filenames_and_files=[filename],
-            prefer_local=True,
-            folder=Path(__file__).parent.absolute() / "dwd_data",
-            write_file=False
-        )
-    )
-
-    # 3. Check for only giving filename but no valid filepath
-    assert parse_dwd_data(
-        filenames_and_files=[filename],
-        prefer_local=True,
-        folder="wrong/folder/name",
-        write_file=False
-    ).empty
-
-    rmtree(Path(Path(__file__).parent.absolute() / "dwd_data"))
-
-    # To ensure files are deleted with the above execution
-    assert True
+    assert station_data.equals(station_data_original)
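The rewritten test feeds parse_dwd_data an in-memory CSV instead of a real download. The same round-trip trick in isolation, with toy columns and assuming default pandas round-trip behaviour:

from io import StringIO
import pandas as pd

df = pd.DataFrame({"STATION_ID": [1], "TMK": [7.5]})  # toy station data

buf = StringIO()           # in-memory stand-in for a downloaded file
df.to_csv(buf, sep=";")
buf.seek(0)                # rewind so the parser reads from the start

round_tripped = pd.read_csv(buf, sep=";", index_col=0)
assert round_tripped.equals(df)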
89 changes: 75 additions & 14 deletions tests/test_data_collection.py
@@ -4,55 +4,116 @@
 from pathlib import Path
 import pandas as pd
 from io import StringIO
+from shutil import rmtree
 
 from python_dwd.enumerations.column_names_enumeration import DWDColumns
 from python_dwd.enumerations.parameter_enumeration import Parameter
 from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
 from python_dwd.enumerations.period_type_enumeration import PeriodType
 from python_dwd.data_collection import collect_dwd_data
 
 
 fixtures_dir = Path(__file__, "../").resolve().absolute() / "fixtures"
 
-# Setting parameters
+# Setting parameters for tests
 station_ids = [1]
 parameter = Parameter.CLIMATE_SUMMARY
 time_resolution = TimeResolution.DAILY
 period_type = PeriodType.HISTORICAL
 folder = ""
 parallel_download = False
 write_file = False
 create_new_filelist = False
 
 # Set filename for mock
 filename = "tageswerte_KL_00001_19370101_19860630_hist.zip"
 
 # Loading test data
 file = pd.read_json(fixtures_dir / "FIXED_STATIONDATA.JSON")
 file_in_bytes = StringIO()
 file.to_csv(file_in_bytes, sep=";")
 file_in_bytes.seek(0)
 
+# Prepare csv for regular "downloading" test
+csv_file = StringIO()
+file.to_csv(csv_file, sep=";")
+csv_file.seek(0)
 
 
 @patch(
-    "python_dwd.file_path_handling.file_list_creation.create_file_list_for_dwd_server",
+    "python_dwd.data_collection.create_file_list_for_dwd_server",
     mock.MagicMock(return_value=pd.DataFrame({DWDColumns.FILENAME.value: [filename]}))
 )
 @patch(
-    "python_dwd.download.download.download_dwd_data",
-    mock.MagicMock(return_value=(filename, file_in_bytes))
+    "python_dwd.data_collection.download_dwd_data",
+    mock.MagicMock(return_value=[(filename, csv_file)])
 )
-def test_collect_dwd_data_online():
-    # test for no local interaction
-    collect_dwd_data(
+def test_collect_dwd_data():
+    """ Test for data collection """
+
+    # Create folder for storage test
+    test_folder = Path(Path(__file__).parent.absolute() / "dwd_data")
+    test_folder.mkdir(parents=True, exist_ok=True)
+
+    # 1. Scenario: collect fresh data and write it to the given folder, so the
+    # second scenario can restore exactly this data instead of stale files.
+    assert collect_dwd_data(
         station_ids=station_ids,
         parameter=parameter,
         time_resolution=time_resolution,
         period_type=period_type,
-        folder=folder,
+        folder=test_folder,
         prefer_local=False,
         parallel_download=parallel_download,
-        write_file=write_file,
+        write_file=True,
         create_new_filelist=create_new_filelist
     ).equals(file)
 
+    # 2. Scenario: restore the data from the folder where the first scenario
+    # stored it.
+    assert collect_dwd_data(
+        station_ids=station_ids,
+        parameter=parameter,
+        time_resolution=time_resolution,
+        period_type=period_type,
+        folder=test_folder,
+        prefer_local=True,
+        parallel_download=parallel_download,
+        write_file=True,
+        create_new_filelist=create_new_filelist
+    ).equals(file)
 
-    # @todo implement further tests
+    # Remove storage folder
+    rmtree(test_folder)
 
 
+@patch(
+    "python_dwd.data_collection.restore_dwd_data",
+    mock.MagicMock(return_value=(False, pd.DataFrame()))
+)
+@patch(
+    "python_dwd.data_collection.create_file_list_for_dwd_server",
+    mock.MagicMock(return_value=pd.DataFrame(None, columns=[DWDColumns.FILENAME.value]))
+)
+def test_collect_dwd_data_empty():
+    """ Test for a request where no data is available """
+
+    assert collect_dwd_data(
+        station_ids=station_ids,
+        parameter=parameter,
+        time_resolution=time_resolution,
+        period_type=period_type,
+        folder="",
+        prefer_local=True,
+        parallel_download=parallel_download,
+        write_file=False,
+        create_new_filelist=create_new_filelist
+    ).empty
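One detail worth noting in the decorators above: the patch targets moved from the defining modules (e.g. python_dwd.download.download) to python_dwd.data_collection, i.e. the names are now patched where they are looked up. A self-contained sketch of why that matters; the producer/consumer modules here are fabricated purely for illustration:

import sys
import types
from unittest import mock

# Fabricate two tiny modules standing in for download.py / data_collection.py.
producer = types.ModuleType("producer")
producer.fetch = lambda: "real"
sys.modules["producer"] = producer

consumer = types.ModuleType("consumer")
consumer.fetch = producer.fetch           # mimics `from producer import fetch`
consumer.run = lambda: consumer.fetch()   # looks the name up at call time
sys.modules["consumer"] = consumer

# Patching the defining module does NOT affect the already-imported reference...
with mock.patch("producer.fetch", return_value="mocked"):
    assert consumer.run() == "real"

# ...while patching where the name is looked up does.
with mock.patch("consumer.fetch", return_value="mocked"):
    assert consumer.run() == "mocked"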
