Merge pull request #65 from mrx23dot/delay
optimized delay between downloads
manusimidt committed Sep 23, 2021
2 parents 08fb2d8 + 44d28ab commit 1e0f83a
Showing 3 changed files with 51 additions and 10 deletions.
4 changes: 2 additions & 2 deletions tests/test_cache.py
@@ -30,10 +30,10 @@ def test_cache_file(self):
         if os.path.isfile(expected_path):
             os.remove(expected_path)

-        # on the first execution the file will be downloaded from the internet
+        # on the first execution the file will be downloaded from the internet, no delay for first download
         time_stamp: float = time.time()
         self.assertEqual(cache.cache_file(test_url), expected_path)
-        self.assertGreaterEqual(time.time() - time_stamp, delay / 1000)
+        self.assertLess(time.time() - time_stamp, delay / 1000)

         # on the second execution the file path will be returned
         time_stamp = time.time()
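The flipped assertion captures the point of this PR: the delay is no longer slept unconditionally after every download, so the first download returns in well under delay milliseconds. A minimal timing sketch of the intended behaviour (assuming the cache class is HttpCache from xbrl.cache; the URLs are placeholders):

import time
from xbrl.cache import HttpCache

cache = HttpCache('./cache/', delay=500)  # delay is given in milliseconds

start = time.time()
cache.cache_file('https://www.example.com/a.xml')  # first download: returns right after the transfer
print(time.time() - start)                         # network time only, no 0.5 s penalty

start = time.time()
cache.cache_file('https://www.example.com/b.xml')  # second download: sleeps off the rest of the 500 ms window first
print(time.time() - start)                         # ~0.5 s after the first download finished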
43 changes: 38 additions & 5 deletions xbrl/cache.py
@@ -4,6 +4,7 @@
 import re
 import os
 import zipfile
+from pathlib import Path

 from xbrl.helper.connection_manager import ConnectionManager

@@ -34,7 +35,6 @@ def __init__(self, cache_dir: str, delay: int = 500, verify_https: bool = True):
         # check if the cache_dir ends with a /
         if not cache_dir.endswith('/'): cache_dir += '/'
         self.cache_dir: str = cache_dir
-        self.delay: int = delay
         self.headers: dict or None = None
         self.connection_manager = ConnectionManager(delay, verify_https=verify_https)

@@ -58,10 +58,9 @@ def set_connection_params(self, delay: int = 500, retries: int = 5, backoff_fact
         {backoff factor} * (2 ** ({number of total retries} - 1))
         :return:
         """
-        self.connection_manager._delay = delay
+        self.connection_manager._delay_ms = delay
         self.connection_manager._retries = retries
         self.connection_manager._backoff_factor = backoff_factor
-        self.connection_manager._delay = delay
         self.connection_manager.logs = logs

     def cache_file(self, file_url: str) -> str:
@@ -119,7 +118,7 @@ def url_to_path(self, url: str) -> str:
"""
return self.cache_dir + re.sub("https?://", "", url)

def cache_edgar_enclosure(self, enclosure_url: str) -> None:
def cache_edgar_enclosure(self, enclosure_url: str) -> str:
"""
The SEC provides zip folders that contain all xbrl related files for a given submission.
These files are i.e: Instance Document, Extension Taxonomy, Linkbases.
@@ -130,7 +129,7 @@ def cache_edgar_enclosure(self, enclosure_url: str) -> None:
         One way to get the zip enclosure url is through the Structured Disclosure RSS Feeds provided by the SEC:
         https://www.sec.gov/structureddata/rss-feeds-submitted-filings
         :param enclosure_url: url to the zip folder.
-        :return:
+        :return: relative path to the extracted zip's content
         """
         if not enclosure_url.endswith('.zip'):
             raise Exception("This is not a valid zip folder")
@@ -141,3 +140,37 @@
         with zipfile.ZipFile(enclosure_path, "r") as zip_ref:
             zip_ref.extractall(submission_dir_path)
             zip_ref.close()
+        return submission_dir_path
+
+    def find_entry_file(self, dir: str) -> str:
+        """ Find the most likely entry file in the provided filing directory """
+
+        # filter for the files of interest
+        valid_files = []
+        for ext in '.htm .xml .xsd'.split():  # valid extensions in priority order
+            for f in os.listdir(dir):
+                f_full = os.path.join(dir, f)
+                if os.path.isfile(f_full) and f.lower().endswith(ext):
+                    valid_files.append(f_full)
+
+        # find the files that are not referenced by any other file
+        entry_candidates = []
+        for file1 in valid_files:
+            fdir, file_nm = os.path.split(file1)
+            # for each file, check all the others for inclusion
+            found_in_other = False
+            for file2 in valid_files:
+                if file1 != file2:
+                    if file_nm in Path(file2).read_text():
+                        found_in_other = True
+                        break
+
+            if not found_in_other:
+                entry_candidates.append((file1, os.path.getsize(file1)))
+
+        # if there are multiple candidates, choose the biggest file
+        entry_candidates.sort(key=lambda tup: tup[1], reverse=True)
+        if len(entry_candidates) > 0:
+            file_path, size = entry_candidates[0]
+            return file_path
+        return None
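With cache_edgar_enclosure() now returning the extraction directory, the two new pieces compose directly. A hedged usage sketch (the enclosure URL is a placeholder, and HttpCache as the class name is an assumption about this module):

from xbrl.cache import HttpCache

cache = HttpCache('./cache/')

# hypothetical enclosure URL taken from the SEC structured-disclosure RSS feed
url = 'https://www.sec.gov/Archives/edgar/data/.../submission-xbrl.zip'

submission_dir = cache.cache_edgar_enclosure(url)   # downloads and extracts the zip
entry_file = cache.find_entry_file(submission_dir)  # biggest candidate not referenced by any other file
print(entry_file)                                   # e.g. an .htm instance document, or None if nothing qualified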
14 changes: 11 additions & 3 deletions xbrl/helper/connection_manager.py
@@ -29,22 +29,30 @@ def __init__(self, delay: int = 500, retries: int = 5, backoff_factor: float = 0
         The formula used is {backoff factor} * (2 ** ({number of total retries} - 1))
         @param headers: Headers to use in http request.
         """
-        self._delay = delay
+        self._delay_ms = delay  # post-delay applied after each download
         self._retries = retries
         self._backoff_factor = backoff_factor
         self._headers = headers
         self._session = self._create_session()
         self.logs = logs
         self.verify_https = verify_https
+        self.next_try_systime_ms = self._get_systime_ms()  # when the next download may start

         if verify_https is False:
             requests.packages.urllib3.disable_warnings()
+
+    def _get_systime_ms(self):
+        return int(time.time() * 1000)

     def download(self, url: str, headers: str):
+        # make sure the last post-delay has elapsed, to rate-limit API usage
+        time.sleep(max(0, self.next_try_systime_ms - self._get_systime_ms()) / 1000)
+
         response = self._session.get(url, headers=headers, allow_redirects=True, verify=self.verify_https)
         if self.logs: logger.info(str(response.status_code) + " " + url)
-        # Set a timeout, so that we do not get blocked by the for making to many requests
-        time.sleep(self._delay / 1000)
+
+        # arm the delay for the next call; no time is wasted after the last download
+        self.next_try_systime_ms = self._get_systime_ms() + self._delay_ms

         return response

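The rate-limiting pattern introduced here is general: record the earliest permitted start time of the next request instead of sleeping right after the current one, so the delay is only ever paid when another request actually follows. A standalone sketch of the same idea, independent of this codebase:

import time

def _now_ms() -> int:
    return int(time.time() * 1000)

class RateLimiter:
    """ Defers the post-request delay until the next request actually needs it """

    def __init__(self, delay_ms: int = 500):
        self._delay_ms = delay_ms
        self._next_try_ms = _now_ms()  # the first request may start immediately

    def run(self, request):
        # sleep only for whatever remains of the previous delay window
        time.sleep(max(0, self._next_try_ms - _now_ms()) / 1000)
        result = request()
        # arm the window for the next call; nothing sleeps after the last request
        self._next_try_ms = _now_ms() + self._delay_ms
        return result

limiter = RateLimiter(500)
for url in ('https://example.com/a', 'https://example.com/b'):
    limiter.run(lambda: print('GET', url))  # stand-in for the real HTTP call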
