From 166ad4e5188b05fc3d8a0562658b93f2ab4cd6e1 Mon Sep 17 00:00:00 2001 From: Arpad Toth Date: Fri, 13 Aug 2021 10:29:48 +0100 Subject: [PATCH 1/4] cache_edgar_enclosure returns dir path --- xbrl/cache.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xbrl/cache.py b/xbrl/cache.py index 4226e73..3b8c352 100644 --- a/xbrl/cache.py +++ b/xbrl/cache.py @@ -119,7 +119,7 @@ def url_to_path(self, url: str) -> str: """ return self.cache_dir + re.sub("https?://", "", url) - def cache_edgar_enclosure(self, enclosure_url: str) -> None: + def cache_edgar_enclosure(self, enclosure_url: str) -> str: """ The SEC provides zip folders that contain all xbrl related files for a given submission. These files are i.e: Instance Document, Extension Taxonomy, Linkbases. @@ -130,7 +130,7 @@ def cache_edgar_enclosure(self, enclosure_url: str) -> None: One way to get the zip enclosure url is through the Structured Disclosure RSS Feeds provided by the SEC: https://www.sec.gov/structureddata/rss-feeds-submitted-filings :param enclosure_url: url to the zip folder. 
- :return: + :return: relative path to extracted zip's content """ if not enclosure_url.endswith('.zip'): raise Exception("This is not a valid zip folder") @@ -141,3 +141,4 @@ def cache_edgar_enclosure(self, enclosure_url: str) -> None: with zipfile.ZipFile(enclosure_path, "r") as zip_ref: zip_ref.extractall(submission_dir_path) zip_ref.close() + return submission_dir_path From b2afc3185ffd6b7047f339e79daaf54b441c0eaa Mon Sep 17 00:00:00 2001 From: Arpad Toth Date: Fri, 13 Aug 2021 10:51:34 +0100 Subject: [PATCH 2/4] added find_entry_file for filing dir --- xbrl/cache.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/xbrl/cache.py b/xbrl/cache.py index 3b8c352..f7798d9 100644 --- a/xbrl/cache.py +++ b/xbrl/cache.py @@ -4,6 +4,7 @@ import re import os import zipfile +from pathlib import Path from xbrl.helper.connection_manager import ConnectionManager @@ -142,3 +143,36 @@ def cache_edgar_enclosure(self, enclosure_url: str) -> str: zip_ref.extractall(submission_dir_path) zip_ref.close() return submission_dir_path + + def find_entry_file(self, dir: str) -> str: + """ Find the most likely entry file in the provided filing directory """ + + # filter for files of interest + valid_files = [] + for ext in '.htm .xml .xsd'.split(): # valid extensions in priority + for f in os.listdir(dir): + f_full = os.path.join(dir,f) + if os.path.isfile(f_full) and f.lower().endswith(ext): + valid_files.append(f_full) + + # find first file which is not included by others + entryCandidates = [] + for file1 in valid_files: + fdir, file_nm = os.path.split(file1) + # for each file, check all others for inclusion + foundInOther = False + for file2 in valid_files: + if file1!=file2: + if file_nm in Path(file2).read_text(): + foundInOther = True + break + + if foundInOther == False: + entryCandidates.append((file1, os.path.getsize(file1))) + + # if multiple choose biggest + entryCandidates.sort(key=lambda tup: tup[1], reverse=True) + if len(entryCandidates) > 
0: + file_path, size = entryCandidates[0] + return file_path + return None From a76f905c12b0a065e37902dcc31b2dd9a2547e3a Mon Sep 17 00:00:00 2001 From: mrx23dot Date: Wed, 18 Aug 2021 15:41:06 +0100 Subject: [PATCH 3/4] optimized delay between downloads --- xbrl/cache.py | 4 +--- xbrl/helper/connection_manager.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/xbrl/cache.py b/xbrl/cache.py index f7798d9..7d79674 100644 --- a/xbrl/cache.py +++ b/xbrl/cache.py @@ -35,7 +35,6 @@ def __init__(self, cache_dir: str, delay: int = 500, verify_https: bool = True): # check if the cache_dir ends with a / if not cache_dir.endswith('/'): cache_dir += '/' self.cache_dir: str = cache_dir - self.delay: int = delay self.headers: dict or None = None self.connection_manager = ConnectionManager(delay, verify_https=verify_https) @@ -59,10 +58,9 @@ def set_connection_params(self, delay: int = 500, retries: int = 5, backoff_fact {backoff factor} * (2 ** ({number of total retries} - 1)) :return: """ - self.connection_manager._delay = delay + self.connection_manager._delay_ms = delay self.connection_manager._retries = retries self.connection_manager._backoff_factor = backoff_factor - self.connection_manager._delay = delay self.connection_manager.logs = logs def cache_file(self, file_url: str) -> str: diff --git a/xbrl/helper/connection_manager.py b/xbrl/helper/connection_manager.py index 57b4de2..08971bf 100644 --- a/xbrl/helper/connection_manager.py +++ b/xbrl/helper/connection_manager.py @@ -28,22 +28,30 @@ def __init__(self, delay: int = 500, retries: int = 5, backoff_factor: float = 0 The formula used is {backoff factor} * (2 ** ({number of total retries} - 1)) @param headers: Headers to use in http request. 
""" - self._delay = delay + self._delay_ms = delay # post delay after download self._retries = retries self._backoff_factor = backoff_factor self._headers = headers self._session = self._create_session() self.logs = logs self.verify_https = verify_https + self.next_try_systime_ms = self._get_systime_ms() # when can we try next download if verify_https is False: requests.packages.urllib3.disable_warnings() + def _get_systime_ms(self): + return int(time.time() * 1000) + def download(self, url: str, headers: str): + # make sure last post-delay elapsed, to rate limit API usage + time.sleep(max(0, self.next_try_systime_ms - self._get_systime_ms()) / 1000) + response = self._session.get(url, headers=headers, allow_redirects=True, verify=self.verify_https) if self.logs: logger.info(str(response.status_code) + " " + url) - # Set a timeout, so that we do not get blocked by the for making to many requests - time.sleep(self._delay / 1000) + + # no actual delay after last download + self.next_try_systime_ms = self._get_systime_ms() + self._delay_ms return response From 44d28abae9d6fe5b1ddc7073e495853535f1c7d0 Mon Sep 17 00:00:00 2001 From: mrx23dot Date: Wed, 18 Aug 2021 16:04:48 +0100 Subject: [PATCH 4/4] fixed unit test --- tests/test_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cache.py b/tests/test_cache.py index adcfd86..c03a73c 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -30,10 +30,10 @@ def test_cache_file(self): if os.path.isfile(expected_path): os.remove(expected_path) - # on the first execution the file will be downloaded from the internet + # on the first execution the file will be downloaded from the internet, no delay for first download time_stamp: float = time.time() self.assertEqual(cache.cache_file(test_url), expected_path) - self.assertGreaterEqual(time.time() - time_stamp, delay / 1000) + self.assertLess(time.time() - time_stamp, delay / 1000) # on the second execution the file path will be 
returned time_stamp = time.time()