optimized delay between downloads #65

Merged: 5 commits on Sep 23, 2021
4 changes: 2 additions & 2 deletions tests/test_cache.py
@@ -30,10 +30,10 @@ def test_cache_file(self):
if os.path.isfile(expected_path):
os.remove(expected_path)

# on the first execution the file will be downloaded from the internet
# on the first execution the file will be downloaded from the internet; no delay before the first download
time_stamp: float = time.time()
self.assertEqual(cache.cache_file(test_url), expected_path)
self.assertGreaterEqual(time.time() - time_stamp, delay / 1000)
self.assertLess(time.time() - time_stamp, delay / 1000)

# on the second execution the file path will be returned
time_stamp = time.time()
43 changes: 38 additions & 5 deletions xbrl/cache.py
Expand Up @@ -4,6 +4,7 @@
import re
import os
import zipfile
from pathlib import Path

from xbrl.helper.connection_manager import ConnectionManager

@@ -34,7 +35,6 @@ def __init__(self, cache_dir: str, delay: int = 500, verify_https: bool = True):
# check if the cache_dir ends with a /
if not cache_dir.endswith('/'): cache_dir += '/'
self.cache_dir: str = cache_dir
self.delay: int = delay
self.headers: dict or None = None
self.connection_manager = ConnectionManager(delay, verify_https=verify_https)

@@ -58,10 +58,9 @@ def set_connection_params(self, delay: int = 500, retries: int = 5, backoff_fact
{backoff factor} * (2 ** ({number of total retries} - 1))
:return:
"""
self.connection_manager._delay = delay
self.connection_manager._delay_ms = delay
self.connection_manager._retries = retries
self.connection_manager._backoff_factor = backoff_factor
self.connection_manager._delay = delay
self.connection_manager.logs = logs

def cache_file(self, file_url: str) -> str:
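As a side note on the backoff formula quoted in the set_connection_params docstring above, a quick sketch of the wait times it yields; the backoff_factor value here is purely illustrative, not a library default:

backoff_factor = 0.5 # illustrative value
for total_retries in range(1, 6):
    wait_s = backoff_factor * (2 ** (total_retries - 1))
    print(f"after retry {total_retries}: wait {wait_s} s")
# prints 0.5 s, 1.0 s, 2.0 s, 4.0 s, 8.0 s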
@@ -119,7 +118,7 @@ def url_to_path(self, url: str) -> str:
"""
return self.cache_dir + re.sub("https?://", "", url)

def cache_edgar_enclosure(self, enclosure_url: str) -> None:
def cache_edgar_enclosure(self, enclosure_url: str) -> str:
"""
The SEC provides zip folders that contain all xbrl related files for a given submission.
These files include, e.g., the Instance Document, Extension Taxonomy and Linkbases.
@@ -130,7 +129,7 @@ def cache_edgar_enclosure(self, enclosure_url: str) -> None:
One way to get the zip enclosure url is through the Structured Disclosure RSS Feeds provided by the SEC:
https://www.sec.gov/structureddata/rss-feeds-submitted-filings
:param enclosure_url: url to the zip folder.
:return:
:return: relative path to the extracted zip's contents
"""
if not enclosure_url.endswith('.zip'):
raise Exception("This is not a valid zip folder")
@@ -141,3 +140,37 @@
with zipfile.ZipFile(enclosure_path, "r") as zip_ref:
zip_ref.extractall(submission_dir_path)
zip_ref.close()
return submission_dir_path

def find_entry_file(self, dir: str) -> str:
""" Find the most likelly entry file in provided filling directory """

# filter for files of interest
valid_files = []
for ext in '.htm .xml .xsd'.split(): # valid extensions in priority order
for f in os.listdir(dir):
f_full = os.path.join(dir,f)
if os.path.isfile(f_full) and f.lower().endswith(ext):
valid_files.append(f_full)

# collect files that are not referenced by any other candidate file
entryCandidates = []
for file1 in valid_files:
fdir, file_nm = os.path.split(file1)
# for each file, check all others for inclusion
foundInOther = False
for file2 in valid_files:
if file1 != file2:
if file_nm in Path(file2).read_text():
foundInOther = True
break

if not foundInOther:
entryCandidates.append((file1, os.path.getsize(file1)))

# if there are multiple candidates, choose the biggest file
entryCandidates.sort(key=lambda tup: tup[1], reverse=True)
if len(entryCandidates) > 0:
file_path, size = entryCandidates[0]
return file_path
return None
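For context, a minimal usage sketch of the two methods above. The cache directory and enclosure URL are hypothetical, and the cache class is assumed to be xbrl.cache.HttpCache:

from xbrl.cache import HttpCache # assumed name of the cache class shown in this diff

cache = HttpCache('./cache/') # hypothetical cache directory
# hypothetical SEC enclosure URL; cache_edgar_enclosure now returns the extraction directory
filing_dir = cache.cache_edgar_enclosure('https://www.sec.gov/Archives/edgar/data/.../example-xbrl.zip')
entry_file = cache.find_entry_file(filing_dir) # best guess at the entry document, or None
print(entry_file)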
14 changes: 11 additions & 3 deletions xbrl/helper/connection_manager.py
@@ -28,22 +28,30 @@ def __init__(self, delay: int = 500, retries: int = 5, backoff_factor: float = 0
The formula used is {backoff factor} * (2 ** ({number of total retries} - 1))
@param headers: Headers to use in http request.
"""
self._delay = delay
self._delay_ms = delay # minimum delay in ms enforced between downloads
self._retries = retries
self._backoff_factor = backoff_factor
self._headers = headers
self._session = self._create_session()
self.logs = logs
self.verify_https = verify_https
self.next_try_systime_ms = self._get_systime_ms() # earliest time in ms at which the next download may start

if verify_https is False:
requests.packages.urllib3.disable_warnings()

def _get_systime_ms(self):
return int(time.time() * 1000)

def download(self, url: str, headers: str):
# make sure the delay since the previous download has elapsed, to rate-limit API usage
time.sleep(max(0, self.next_try_systime_ms - self._get_systime_ms()) / 1000)

response = self._session.get(url, headers=headers, allow_redirects=True, verify=self.verify_https)
if self.logs: logger.info(str(response.status_code) + " " + url)
# sleep for the configured delay so that we do not get blocked for making too many requests
time.sleep(self._delay / 1000)

# schedule the earliest start of the next download; no actual sleep after the last one
self.next_try_systime_ms = self._get_systime_ms() + self._delay_ms

return response

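The change above replaces the fixed sleep after every request with a "sleep only if the next request comes too soon" scheme, so the last download in a batch no longer pays the delay. A standalone sketch of that pattern, using illustrative names rather than the library's API:

import time

class DelayBetweenDownloads:
    """Sleep before a request only if the previous one finished less than delay_ms ago."""

    def __init__(self, delay_ms: int = 500):
        self.delay_ms = delay_ms
        self.next_try_ms = int(time.time() * 1000)

    def before_request(self) -> None:
        # wait until the earliest allowed start time, if it is still in the future
        time.sleep(max(0, self.next_try_ms - int(time.time() * 1000)) / 1000)

    def after_request(self) -> None:
        # schedule the next allowed start; no sleep is paid after the final request
        self.next_try_ms = int(time.time() * 1000) + self.delay_ms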