From 95aae9aceb63ce2d454cc32a7ff6fed3dfb61e9d Mon Sep 17 00:00:00 2001
From: Benjamin Gutzmann <gutzemann@gmail.com>
Date: Sun, 21 Jun 2020 11:09:07 +0200
Subject: [PATCH] Overhaul file index creation based on caching

---
 README.md                                     |   6 +-
 python_dwd/__init__.py                        |   3 +
 python_dwd/data_collection.py                 |  25 ++--
 python_dwd/dwd_station_request.py             |  13 ++-
 .../file_path_handling/file_index_creation.py |  84 ++++++++++++++
 .../file_path_handling/file_list_creation.py  |  61 +++-------
 python_dwd/metadata_dwd.py                    | 109 +++---------------
 .../parsing_data/parse_data_from_files.py     |   8 +-
 python_dwd/version.py                         |   1 +
 setup.py                                      |   3 +-
 .../test_file_index_creation.py               |  27 +++++
 11 files changed, 178 insertions(+), 162 deletions(-)
 create mode 100644 python_dwd/file_path_handling/file_index_creation.py
 create mode 100644 python_dwd/version.py
 create mode 100644 tests/file_path_handling/test_file_index_creation.py

diff --git a/README.md b/README.md
index 658693735..bfcbd40ee 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ Third those variables are also available in different tenses, which are:
 The toolset provides different functions which are:
 
 - metadata_for_dwd_data
-- create_file_list_for_dwd_server
+- create_file_list_for_dwd_server (+ reset_file_index_cache)
 - download_dwd_data
 - parse_dwd_data
 - get_nearest_station
@@ -46,7 +46,9 @@ All those functions have one same argument which is **folder**. It can be used t
 
 **metadata_for_dwd_data** is used to discover what data for a set of parameters (**var**, **res**, **per**) is available, specificly which stations can be found for the requested variable, resolution and period. Also it can be defined by **write_file**, if the resulting **DataFrame** should be written as **csv** to the given folder. **write_file** is a boolean value. Furthermore with **create_new_filelist**, by default set to **False**, the function can be forced to retrieve a new list of files from the ftp server, which is usually avoided if there's already a file existing in the explicit folder.
 
-**create_file_list_for_dwd_server** is used with the help of the **information of the metadata** to retrieve filelinks to files that represent a **set of parameters** in combination with the requested **statid**. Here also **create_new_filelist** can be set to **True**, if the user is sure that the **file to a certain statid** is available but somehow the old **filelist** doesn't contain a corresponding information.
+**create_file_list_for_dwd_server** uses the **information from the metadata** to retrieve links to the files that represent a **set of parameters** in combination with the requested **station_id**. **create_new_file_index** can be set to **True** if the user is sure that the **file for a certain station id** is available but the cached **file index** does not yet contain it.
+
+**reset_file_index_cache** can be used to remove the old (cached) file index in order to get the latest list of files from the remote server.
 
 **download_dwd_data** is used with the created filelinks of select_dwd to **download and store** the data in the given folder. Therefor it connects with the ftp and writes the corresponding file to the harddisk as defined. Furthermore it returns the local filelink or to be clear the link where the file is saved on the harddrive.
 
diff --git a/python_dwd/__init__.py b/python_dwd/__init__.py
index 0383b4e3b..beabe94fa 100644
--- a/python_dwd/__init__.py
+++ b/python_dwd/__init__.py
@@ -1,6 +1,9 @@
+from python_dwd.version import __version__
+
 from python_dwd.metadata_dwd import metadata_for_dwd_data
 from python_dwd.file_path_handling.file_list_creation import \
     create_file_list_for_dwd_server
+from python_dwd.file_path_handling.file_index_creation import reset_file_index_cache
 from python_dwd.download.download import download_dwd_data
 from python_dwd.parsing_data.parse_data_from_files import parse_dwd_data
 from python_dwd.additionals.geo_location import get_nearest_station
diff --git a/python_dwd/data_collection.py b/python_dwd/data_collection.py
index f281fa305..9fe11b6cf 100644
--- a/python_dwd/data_collection.py
+++ b/python_dwd/data_collection.py
@@ -1,7 +1,7 @@
 """ Data collection pipeline """
 import logging
 from pathlib import Path
-from typing import List, Union, Optional
+from typing import List, Union
 import pandas as pd
 
 from python_dwd.constants.column_name_mapping import GERMAN_TO_ENGLISH_COLUMNS_MAPPING_HUMANIZED
@@ -9,6 +9,7 @@
 from python_dwd.enumerations.period_type_enumeration import PeriodType
 from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
 from python_dwd.constants.access_credentials import DWD_FOLDER_MAIN
+from python_dwd.file_path_handling.file_index_creation import reset_file_index_cache
 from python_dwd.file_path_handling.file_list_creation import create_file_list_for_dwd_server
 from python_dwd.download.download import download_dwd_data
 from python_dwd.parsing_data.parse_data_from_files import parse_dwd_data
@@ -25,16 +26,14 @@ def collect_dwd_data(station_ids: List[int],
                      prefer_local: bool = False,
                      parallel_download: bool = False,
                      write_file: bool = False,
-                     create_new_filelist: bool = False,
-                     humanize_column_names: bool = False,
-                     run_download_only: bool = False) -> Optional[pd.DataFrame]:
+                     create_new_file_index: bool = False,
+                     humanize_column_names: bool = False) -> pd.DataFrame:
     """
     Function that organizes the complete pipeline of data collection, either
     from the internet or from a local file. It therefor goes through every given
     station id and, given by the parameters, either tries to get data from local
     store and/or if fails tries to get data from the internet. Finally if wanted
     it will try to store the data in a hdf file.
-
     Args:
         station_ids: station ids that are trying to be loaded
         parameter: parameter as enumeration
@@ -44,17 +43,20 @@ def collect_dwd_data(station_ids: List[int],
         prefer_local: boolean for if local data should be preferred
         parallel_download: boolean if to use parallel download when downloading files
         write_file: boolean if to write data to local storage
-        create_new_filelist: boolean if to create a new filelist for the data selection
+        create_new_file_index: boolean if to create a new file index for the data selection
         humanize_column_names: boolean to yield column names better for human consumption
-        run_download_only: boolean to run only the download and storing process
-
     Returns:
         a pandas DataFrame with all the data given by the station ids
     """
+    if create_new_file_index:
+        reset_file_index_cache()
+
     parameter = Parameter(parameter)
     time_resolution = TimeResolution(time_resolution)
     period_type = PeriodType(period_type)
 
+    # todo check parameters and if combination not existing, print something and return empty DataFrame
+
     # List for collected pandas DataFrames per each station id
     data = []
     for station_id in set(station_ids):
@@ -77,7 +79,7 @@ def collect_dwd_data(station_ids: List[int],
         log.info(f"Data for {request_string} will be collected from internet.")
 
         remote_files = create_file_list_for_dwd_server(
-            [station_id], parameter, time_resolution, period_type, folder, create_new_filelist)
+            station_id, parameter, time_resolution, period_type)
 
         filenames_and_files = download_dwd_data(remote_files, parallel_download)
 
@@ -88,10 +90,7 @@ def collect_dwd_data(station_ids: List[int],
                 station_data, station_id, parameter, time_resolution, period_type, folder)
 
         data.append(station_data)
-        
-    if run_download_only: 
-        return None
-    
+
     data = pd.concat(data)
 
     # Assign meaningful column names (humanized).
diff --git a/python_dwd/dwd_station_request.py b/python_dwd/dwd_station_request.py
index 3a3d138d0..2dfd06f1d 100644
--- a/python_dwd/dwd_station_request.py
+++ b/python_dwd/dwd_station_request.py
@@ -13,9 +13,9 @@
 from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
 from python_dwd.additionals.functions import check_parameters, cast_to_list
 from python_dwd.exceptions.start_date_end_date_exception import StartDateEndDateError
-
 from python_dwd.constants.access_credentials import DWD_FOLDER_MAIN
 from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
+from python_dwd.file_path_handling.file_index_creation import reset_file_index_cache
 
 log = logging.getLogger(__name__)
 
@@ -117,7 +117,7 @@ def collect_data(self,
                      write_file: bool = False,
                      folder: Union[str, Path] = DWD_FOLDER_MAIN,
                      parallel_download: bool = False,
-                     create_new_filelist: bool = False) -> Generator[pd.DataFrame, None, None]:
+                     create_new_file_index: bool = False) -> Generator[pd.DataFrame, None, None]:
         """
         Method to collect data for a defined request. The function is build as generator in
         order to not cloak the memory thus if the user wants the data as one pandas DataFrame
@@ -127,13 +127,16 @@ def collect_data(self,
         Args:
             prefer_local: definition if data should rather be taken from a local source
             write_file: should data be written to a local file
-            folder: place where filelists (and station data) are stored
+            folder: place where file lists (and station data) are stored
             parallel_download: definition if data is downloaded in parallel
-            create_new_filelist: definition if the fileindex should be recreated
+            create_new_file_index: definition if the file index should be recreated
 
         Returns:
             via a generator per station a pandas.DataFrame
         """
+        if create_new_file_index:
+            reset_file_index_cache()
+
         for station_id in self.station_ids:
             df_of_station_id = pd.DataFrame()
 
@@ -147,7 +150,7 @@ def collect_data(self,
                     prefer_local=prefer_local,
                     parallel_download=parallel_download,
                     write_file=write_file,
-                    create_new_filelist=create_new_filelist,
+                    create_new_file_index=False,
                     humanize_column_names=self.humanize_column_names
                 )
 
diff --git a/python_dwd/file_path_handling/file_index_creation.py b/python_dwd/file_path_handling/file_index_creation.py
new file mode 100644
index 000000000..455809ab2
--- /dev/null
+++ b/python_dwd/file_path_handling/file_index_creation.py
@@ -0,0 +1,84 @@
+""" file index creation for available DWD station data """
+from pathlib import PurePosixPath
+import re
+from functools import lru_cache
+import ftplib
+import pandas as pd
+
+from python_dwd.constants.access_credentials import DWD_PATH, DWD_SERVER
+from python_dwd.constants.metadata import ARCHIVE_FORMAT, STATID_REGEX
+from python_dwd.download.ftp_handling import FTP
+from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
+from python_dwd.enumerations.parameter_enumeration import Parameter
+from python_dwd.enumerations.period_type_enumeration import PeriodType
+from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
+
+
+@lru_cache(maxsize=None)
+def create_file_index_for_dwd_server(parameter: Parameter,
+                                     time_resolution: TimeResolution,
+                                     period_type: PeriodType) -> pd.DataFrame:
+    """
+    Function (cached) to create a file index of the DWD station data. The file index
+    is created for an individual set of parameters.
+    Args:
+        parameter: parameter of Parameter enumeration
+        time_resolution: time resolution of TimeResolution enumeration
+        period_type: period type of PeriodType enumeration
+    Returns:
+        file index in a pandas.DataFrame with sets of parameters and station id
+    Raises:
+        the ftplib.all_errors type hit while listing the remote files, re-raised with a message
+    """
+    server_path = PurePosixPath(DWD_PATH) / time_resolution.value / parameter.value / period_type.value
+
+    # todo: replace with global requests.Session creating the index
+    try:
+        with FTP(DWD_SERVER) as ftp:
+            ftp.login()
+            files_server = ftp.list_files(
+                remote_path=str(server_path), also_subfolders=True)
+
+    except ftplib.all_errors as e:
+        # e is an instance (not callable): re-raise its type with a message, keeping the cause
+        raise type(e)("Creating file index currently not possible.") from e
+
+    files_server = pd.DataFrame(
+        files_server, columns=[DWDMetaColumns.FILENAME.value], dtype='str')
+
+    # Add parameters
+    files_server[DWDMetaColumns.PARAMETER.value] = parameter.value
+    files_server[DWDMetaColumns.TIME_RESOLUTION.value] = time_resolution.value
+    files_server[DWDMetaColumns.PERIOD_TYPE.value] = period_type.value
+
+    # Filter for .zip files; copy so later column assignments don't hit a slice of the full frame
+    files_server = files_server[
+        files_server[DWDMetaColumns.FILENAME.value].str.endswith(ARCHIVE_FORMAT)].copy()
+
+    # Make file names relative to DWD_PATH (literal replacement, not a regex)
+    files_server[DWDMetaColumns.FILENAME.value] = files_server[DWDMetaColumns.FILENAME.value].\
+        str.replace(DWD_PATH + '/', '', regex=False)
+
+    file_names = files_server[DWDMetaColumns.FILENAME.value].str.split("/").str[-1]
+
+    # Station id is the first match of STATID_REGEX in the bare file name, stored as int
+    files_server[DWDMetaColumns.STATION_ID.value] = file_names.apply(
+        lambda x: re.findall(STATID_REGEX, x).pop(0)).astype(int)
+
+    files_server = files_server.sort_values(
+        by=[DWDMetaColumns.STATION_ID.value, DWDMetaColumns.FILENAME.value])
+
+    selected_file_index_columns = [
+        DWDMetaColumns.PARAMETER.value,
+        DWDMetaColumns.TIME_RESOLUTION.value,
+        DWDMetaColumns.PERIOD_TYPE.value,
+        DWDMetaColumns.STATION_ID.value,
+        DWDMetaColumns.FILENAME.value
+    ]
+
+    return files_server.loc[:, selected_file_index_columns]
+
+
+def reset_file_index_cache():
+    """ Clear the lru_cache of create_file_index_for_dwd_server (all cached parameter sets at once) """
+    create_file_index_for_dwd_server.cache_clear()
diff --git a/python_dwd/file_path_handling/file_list_creation.py b/python_dwd/file_path_handling/file_list_creation.py
index eccca0dd0..19f7ebc10 100644
--- a/python_dwd/file_path_handling/file_list_creation.py
+++ b/python_dwd/file_path_handling/file_list_creation.py
@@ -1,75 +1,46 @@
 """ file list creation for requested files """
-from pathlib import Path
-from typing import List, Union
+from typing import Union
 import pandas as pd
 
-from python_dwd.additionals.functions import check_parameters
-from python_dwd.additionals.helpers import create_fileindex
-from python_dwd.constants.access_credentials import DWD_FOLDER_MAIN, DWD_FOLDER_METADATA
-from python_dwd.constants.metadata import FILELIST_NAME, DATA_FORMAT
 from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
 from python_dwd.enumerations.parameter_enumeration import Parameter
 from python_dwd.enumerations.period_type_enumeration import PeriodType
 from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
+from python_dwd.file_path_handling.file_index_creation import create_file_index_for_dwd_server, \
+    reset_file_index_cache
 
 
-def create_file_list_for_dwd_server(station_ids: List[int],
+def create_file_list_for_dwd_server(station_id: Union[str, int],
                                     parameter: Union[Parameter, str],
                                     time_resolution: Union[TimeResolution, str],
                                     period_type: Union[PeriodType, str],
-                                    folder: str = DWD_FOLDER_MAIN,
-                                    create_new_filelist=False) -> pd.DataFrame:
+                                    create_new_file_index: bool = False) -> pd.DataFrame:
     """
     Function for selecting datafiles (links to archives) for given
     station_ids, parameter, time_resolution and period_type under consideration of a
     created list of files that are
     available online.
-
     Args:
-        station_ids: id(s) for the weather station to ask for data
+        station_id: id for the weather station to ask for data
         parameter: observation measure
         time_resolution: frequency/granularity of measurement interval
         period_type: recent or historical files
-        folder:
-        create_new_filelist: boolean for checking existing file list or not
-
+        create_new_file_index: set if new file index is created
     Returns:
         List of path's to file
-
     """
+    if create_new_file_index:
+        reset_file_index_cache()
+
     parameter = Parameter(parameter)
     time_resolution = TimeResolution(time_resolution)
     period_type = PeriodType(period_type)
 
-    # Check type of function parameters
-    station_ids = [int(statid) for statid in station_ids]
-
-    # Check for the combination of requested parameters
-    check_parameters(parameter=parameter,
-                     time_resolution=time_resolution,
-                     period_type=period_type)
-
-    # Create name of fileslistfile
-    filelist_local = f'{FILELIST_NAME}_{parameter.value}_' \
-                     f'{time_resolution.value}_{period_type.value}'
-
-    # Create filepath to filelist in folder
-    filelist_local_path = Path(folder,
-                               DWD_FOLDER_METADATA,
-                               filelist_local)
-
-    filelist_local_path = f"{filelist_local_path}{DATA_FORMAT}"
-
-    if create_new_filelist or not Path(filelist_local_path).is_file():
-        create_fileindex(parameter=parameter,
-                         time_resolution=time_resolution,
-                         period_type=period_type,
-                         folder=folder)
+    file_index = create_file_index_for_dwd_server(
+        parameter, time_resolution, period_type)
 
-    filelist = pd.read_csv(filepath_or_buffer=filelist_local_path,
-                           sep=",",
-                           dtype={DWDMetaColumns.FILEID.value: int,
-                                  DWDMetaColumns.STATION_ID.value: int,
-                                  DWDMetaColumns.FILENAME.value: str})
+    file_index = file_index[
+        file_index[DWDMetaColumns.STATION_ID.value] == int(station_id)
+    ]
 
-    return filelist.loc[filelist[DWDMetaColumns.STATION_ID.value].isin(station_ids), :]
+    return file_index.loc[:, [DWDMetaColumns.FILENAME.value]]
diff --git a/python_dwd/metadata_dwd.py b/python_dwd/metadata_dwd.py
index 0eb1b052a..c6aee7987 100644
--- a/python_dwd/metadata_dwd.py
+++ b/python_dwd/metadata_dwd.py
@@ -1,64 +1,37 @@
 """ Meta data handling """
-from pathlib import Path
 from typing import Union
 import pandas as pd
 
-from python_dwd.additionals.functions import check_parameters
-from python_dwd.additionals.helpers import create_fileindex, check_file_exist
 from python_dwd.additionals.helpers import metaindex_for_1minute_data, create_metaindex
 from python_dwd.enumerations.column_names_enumeration import DWDMetaColumns
-from python_dwd.constants.access_credentials import DWD_FOLDER_MAIN, \
-    DWD_FOLDER_METADATA
-from python_dwd.constants.metadata import METADATA_NAME, DATA_FORMAT
 from python_dwd.enumerations.parameter_enumeration import Parameter
 from python_dwd.enumerations.period_type_enumeration import PeriodType
 from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
-from python_dwd.file_path_handling.file_list_creation import \
-    create_file_list_for_dwd_server
-from python_dwd.file_path_handling.path_handling import remove_old_file, create_folder
+from python_dwd.file_path_handling.file_index_creation import create_file_index_for_dwd_server, \
+    reset_file_index_cache
 
 
 def add_filepresence(metainfo: pd.DataFrame,
                      parameter: Parameter,
                      time_resolution: TimeResolution,
-                     period_type: PeriodType,
-                     folder: str,
-                     create_new_filelist: bool) -> pd.DataFrame:
+                     period_type: PeriodType) -> pd.DataFrame:
     """
     updates the metainfo
-
     Args:
         metainfo: meta info about the weather data
         parameter: observation measure
         time_resolution: frequency/granularity of measurement interval
         period_type: recent or historical files
-        folder: local folder to store meta info file
-        create_new_filelist: if true: a new file_list for metadata will
-         be created
-
     Returns:
         updated meta info
     """
-    if not isinstance(metainfo, pd.DataFrame):
-        raise TypeError("Error: metainfo is not of type pandas.DataFrame.")
-
-    if create_new_filelist:
-        create_fileindex(parameter=parameter,
-                         time_resolution=time_resolution,
-                         period_type=period_type,
-                         folder=folder)
-
     metainfo[DWDMetaColumns.HAS_FILE.value] = False
 
-    filelist = create_file_list_for_dwd_server(
-        station_ids=metainfo.iloc[:, 0].to_list(),
-        parameter=parameter,
-        time_resolution=time_resolution,
-        period_type=period_type,
-        folder=folder)
+    file_index = create_file_index_for_dwd_server(
+        parameter, time_resolution, period_type)
 
     metainfo.loc[metainfo.iloc[:, 0].isin(
-        filelist[DWDMetaColumns.STATION_ID.value]), DWDMetaColumns.HAS_FILE.value] = True
+        file_index[DWDMetaColumns.STATION_ID.value]), DWDMetaColumns.HAS_FILE.value] = True
 
     return metainfo
 
@@ -66,49 +39,31 @@ def add_filepresence(metainfo: pd.DataFrame,
 def metadata_for_dwd_data(parameter: Union[Parameter, str],
                           time_resolution: Union[TimeResolution, str],
                           period_type: Union[PeriodType, str],
-                          folder: str = DWD_FOLDER_MAIN,
-                          write_file: bool = True,
-                          create_new_filelist: bool = False) -> pd.DataFrame:
+                          create_new_file_index: bool = False) -> pd.DataFrame:
     """
     A main function to retrieve metadata for a set of parameters that creates a
         corresponding csv.
-
     STATE information is added to metadata for cases where there's no such named
-    column (e.g. STATE) in the dataframe.
+    column (e.g. STATE) in the pandas.DataFrame.
     For this purpose we use daily precipitation data. That has two reasons:
      - daily precipitation data has a STATE information combined with a city
      - daily precipitation data is the most common data served by the DWD
-
-
     Args:
         parameter: observation measure
         time_resolution: frequency/granularity of measurement interval
         period_type: recent or historical files
-        folder: local file system folder where files should be stored
-        write_file: writes the meta data file to the local file system
-        create_new_filelist: if true: a new file_list for metadata will
+        create_new_file_index: if true: a new file_list for metadata will
          be created
-
     Returns:
-
+        pandas.DataFrame with metadata for selected parameters
     """
+    if create_new_file_index:
+        reset_file_index_cache()
+
     parameter = Parameter(parameter)
     time_resolution = TimeResolution(time_resolution)
     period_type = PeriodType(period_type)
 
-    check_parameters(parameter=parameter,
-                     time_resolution=time_resolution,
-                     period_type=period_type)
-
-    file_path = create_metainfo_fpath(folder,
-                                      parameter,
-                                      period_type,
-                                      time_resolution)
-
-    if check_file_exist(file_path) and not create_new_filelist:
-        metainfo = pd.read_csv(filepath_or_buffer=file_path)
-        return metainfo
-
     if time_resolution == TimeResolution.MINUTE_1:
         metainfo = metaindex_for_1minute_data(parameter=parameter,
                                               time_resolution=time_resolution)
@@ -122,9 +77,7 @@ def metadata_for_dwd_data(parameter: Union[Parameter, str],
         mdp = metadata_for_dwd_data(Parameter.PRECIPITATION_MORE,
                                     TimeResolution.DAILY,
                                     PeriodType.HISTORICAL,
-                                    folder=folder,
-                                    write_file=False,
-                                    create_new_filelist=False)
+                                    create_new_file_index=False)
 
         stateinfo = pd.merge(metainfo[DWDMetaColumns.STATION_ID],
                              mdp.loc[:, [DWDMetaColumns.STATION_ID.value, DWDMetaColumns.STATE.value]],
@@ -135,38 +88,6 @@ def metadata_for_dwd_data(parameter: Union[Parameter, str],
     metainfo = add_filepresence(metainfo=metainfo,
                                 parameter=parameter,
                                 time_resolution=time_resolution,
-                                period_type=period_type,
-                                folder=folder,
-                                create_new_filelist=create_new_filelist)
-
-    if write_file and not check_file_exist(file_path) and not \
-            create_new_filelist:
-        remove_old_file(file_type=METADATA_NAME,
-                        file_postfix=DATA_FORMAT,
-                        parameter=parameter,
-                        time_resolution=time_resolution,
-                        period_type=period_type,
-                        folder=folder,
-                        subfolder=DWD_FOLDER_METADATA)
-
-        metainfo.to_csv(path_or_buf=file_path,
-                        header=True,
-                        index=False)
+                                period_type=period_type)
 
     return metainfo
-
-
-def create_metainfo_fpath(folder: str,
-                          parameter: Parameter,
-                          period_type: PeriodType,
-                          time_resolution: TimeResolution) -> Path:
-    """ checks if the file behind the path exists """
-    # folder = correct_folder_path(folder)
-
-    create_folder(subfolder=DWD_FOLDER_METADATA,
-                  folder=folder)
-    return Path(folder,
-                DWD_FOLDER_METADATA,
-                f"{METADATA_NAME}_{parameter.value}_"
-                f"{time_resolution.value}_{period_type.value}"
-                f"{DATA_FORMAT}")
diff --git a/python_dwd/parsing_data/parse_data_from_files.py b/python_dwd/parsing_data/parse_data_from_files.py
index 82e8f7bb1..185f9c4cb 100644
--- a/python_dwd/parsing_data/parse_data_from_files.py
+++ b/python_dwd/parsing_data/parse_data_from_files.py
@@ -13,7 +13,8 @@
 log = logging.getLogger(__name__)
 
 
-def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]], time_resolution: Union[TimeResolution, str]) -> pd.DataFrame:
+def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]],
+                   time_resolution: Union[TimeResolution, str]) -> pd.DataFrame:
     """
     This function is used to read the station data from given bytes object.
     The filename is required to defined if and where an error happened.
@@ -21,6 +22,7 @@ def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]], time_resoluti
     Args:
         filenames_and_files: list of tuples of a filename and its local stored file
         that should be read
+        time_resolution: enumeration of time resolution used to correctly parse the date field
 
     Returns:
         pandas.DataFrame with requested data, for different station ids the data is still put into one DataFrame
@@ -43,7 +45,8 @@ def parse_dwd_data(filenames_and_files: List[Tuple[str, BytesIO]], time_resoluti
     return data
 
 
-def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO], time_resolution: TimeResolution) -> pd.DataFrame:
+def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO],
+                    time_resolution: TimeResolution) -> pd.DataFrame:
     """
     A wrapping function that only handles data for one station id. The files passed to it are thus related to this id.
     This is important for storing the data locally as the DataFrame that is stored should obviously only handle one
@@ -51,6 +54,7 @@ def _parse_dwd_data(filename_and_file: Tuple[str, BytesIO], time_resolution: Tim
 
     Args:
         filename_and_file: the files belonging to one station
+        time_resolution: enumeration of time resolution used to correctly parse the date field
     Returns:
         pandas.DataFrame with data from that station, acn be empty if no data is provided or local file is not found
     or has no data in it
diff --git a/python_dwd/version.py b/python_dwd/version.py
new file mode 100644
index 000000000..5becc17c0
--- /dev/null
+++ b/python_dwd/version.py
@@ -0,0 +1 @@
+__version__ = "1.0.0"
diff --git a/setup.py b/setup.py
index 0dc5139d0..46780a4bd 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,12 @@
 from setuptools import setup, find_packages
+from python_dwd.version import __version__
 
 with open("README.md", 'r') as f:
     long_description = f.read()
 
 setup(
     name='python_dwd',
-    version="1.0.0",
+    version=__version__,
     description='A module for accessing data of the german weather service',
     license='MIT',
     long_description=long_description,
diff --git a/tests/file_path_handling/test_file_index_creation.py b/tests/file_path_handling/test_file_index_creation.py
new file mode 100644
index 000000000..1b67b902a
--- /dev/null
+++ b/tests/file_path_handling/test_file_index_creation.py
@@ -0,0 +1,27 @@
+""" tests for file index creation """
+import pytest
+
+from python_dwd.file_path_handling.file_index_creation import create_file_index_for_dwd_server, \
+    reset_file_index_cache
+from python_dwd.enumerations.parameter_enumeration import Parameter
+from python_dwd.enumerations.time_resolution_enumeration import TimeResolution
+from python_dwd.enumerations.period_type_enumeration import PeriodType
+
+
+@pytest.mark.remote
+def test_file_index_creation():
+    """ file index is cached per parameter set and the cache can be reset """
+    # Start from an empty cache so the size checks do not depend on test order
+    reset_file_index_cache()
+
+    file_index = create_file_index_for_dwd_server(
+        Parameter.CLIMATE_SUMMARY, TimeResolution.DAILY, PeriodType.HISTORICAL)
+    assert not file_index.empty
+    assert create_file_index_for_dwd_server.cache_info().currsize == 1
+
+    reset_file_index_cache()
+    assert create_file_index_for_dwd_server.cache_info().currsize == 0
+
+    file_index2 = create_file_index_for_dwd_server(
+        Parameter.CLIMATE_SUMMARY, TimeResolution.DAILY, PeriodType.HISTORICAL)
+    assert file_index.equals(file_index2)