optimized delay between downloads #65

Merged: 5 commits on Sep 23, 2021
4 changes: 2 additions & 2 deletions tests/test_cache.py
@@ -30,10 +30,10 @@ def test_cache_file(self):
if os.path.isfile(expected_path):
os.remove(expected_path)

# on the first execution the file will be downloaded from the internet
# on the first execution the file will be downloaded from the internet; no delay before the first download
time_stamp: float = time.time()
self.assertEqual(cache.cache_file(test_url), expected_path)
self.assertGreaterEqual(time.time() - time_stamp, delay / 1000)
self.assertLess(time.time() - time_stamp, delay / 1000)

# on the second execution the file path will be returned
time_stamp = time.time()
43 changes: 38 additions & 5 deletions xbrl/cache.py
Expand Up @@ -4,6 +4,7 @@
import re
import os
import zipfile
from pathlib import Path

from xbrl.helper.connection_manager import ConnectionManager

@@ -34,7 +35,6 @@ def __init__(self, cache_dir: str, delay: int = 500, verify_https: bool = True):
# check if the cache_dir ends with a /
if not cache_dir.endswith('/'): cache_dir += '/'
self.cache_dir: str = cache_dir
self.delay: int = delay
self.headers: dict or None = None
self.connection_manager = ConnectionManager(delay, verify_https=verify_https)

@@ -58,10 +58,9 @@ def set_connection_params(self, delay: int = 500, retries: int = 5, backoff_fact
{backoff factor} * (2 ** ({number of total retries} - 1))
:return:
"""
self.connection_manager._delay = delay
self.connection_manager._delay_ms = delay
self.connection_manager._retries = retries
self.connection_manager._backoff_factor = backoff_factor
self.connection_manager._delay = delay
self.connection_manager.logs = logs

def cache_file(self, file_url: str) -> str:
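As a side note on the backoff formula quoted in the set_connection_params docstring above, a quick sketch of the wait times it yields; the backoff_factor value here is purely illustrative, not a library default:

backoff_factor = 0.5 # illustrative value
for total_retries in range(1, 6):
    wait_s = backoff_factor * (2 ** (total_retries - 1))
    print(f"after retry {total_retries}: wait {wait_s} s")
# prints 0.5 s, 1.0 s, 2.0 s, 4.0 s, 8.0 s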
@@ -119,7 +118,7 @@ def url_to_path(self, url: str) -> str:
"""
return self.cache_dir + re.sub("https?://", "", url)

def cache_edgar_enclosure(self, enclosure_url: str) -> None:
def cache_edgar_enclosure(self, enclosure_url: str) -> str:
"""
The SEC provides zip folders that contain all xbrl related files for a given submission.
These files include, e.g., the Instance Document, Extension Taxonomy and Linkbases.
@@ -130,7 +129,7 @@ def cache_edgar_enclosure(self, enclosure_url: str) -> None:
One way to get the zip enclosure url is through the Structured Disclosure RSS Feeds provided by the SEC:
https://www.sec.gov/structureddata/rss-feeds-submitted-filings
:param enclosure_url: url to the zip folder.
:return:
:return: relative path to the extracted zip's contents
"""
if not enclosure_url.endswith('.zip'):
raise Exception("This is not a valid zip folder")
@@ -141,3 +140,37 @@
with zipfile.ZipFile(enclosure_path, "r") as zip_ref:
zip_ref.extractall(submission_dir_path)
zip_ref.close()
return submission_dir_path

def find_entry_file(self, dir: str) -> str:
""" Find the most likelly entry file in provided filling directory """

# filter for files of interest
valid_files = []
for ext in '.htm .xml .xsd'.split(): # valid extensions in priority order
for f in os.listdir(dir):
f_full = os.path.join(dir,f)
if os.path.isfile(f_full) and f.lower().endswith(ext):
valid_files.append(f_full)

# collect files that are not referenced by any other candidate file
entryCandidates = []
for file1 in valid_files:
fdir, file_nm = os.path.split(file1)
# for each file, check all others for inclusion
foundInOther = False
for file2 in valid_files:
if file1 != file2:
if file_nm in Path(file2).read_text():
foundInOther = True
break

if not foundInOther:
entryCandidates.append((file1, os.path.getsize(file1)))

# if there are multiple candidates, choose the biggest file
entryCandidates.sort(key=lambda tup: tup[1], reverse=True)
if len(entryCandidates) > 0:
file_path, size = entryCandidates[0]
return file_path
return None
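For context, a minimal usage sketch of the two methods above. The cache directory and enclosure URL are hypothetical, and the cache class is assumed to be xbrl.cache.HttpCache:

from xbrl.cache import HttpCache # assumed name of the cache class shown in this diff

cache = HttpCache('./cache/') # hypothetical cache directory
# hypothetical SEC enclosure URL; cache_edgar_enclosure now returns the extraction directory
filing_dir = cache.cache_edgar_enclosure('https://www.sec.gov/Archives/edgar/data/.../example-xbrl.zip')
entry_file = cache.find_entry_file(filing_dir) # best guess at the entry document, or None
print(entry_file)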
14 changes: 11 additions & 3 deletions xbrl/helper/connection_manager.py
@@ -28,22 +28,30 @@ def __init__(self, delay: int = 500, retries: int = 5, backoff_factor: float = 0
The formula used is {backoff factor} * (2 ** ({number of total retries} - 1))
@param headers: Headers to use in http request.
"""
self._delay = delay
self._delay_ms = delay # minimum delay in ms enforced between downloads
self._retries = retries
self._backoff_factor = backoff_factor
self._headers = headers
self._session = self._create_session()
self.logs = logs
self.verify_https = verify_https
self.next_try_systime_ms = self._get_systime_ms() # earliest time in ms at which the next download may start

if verify_https is False:
requests.packages.urllib3.disable_warnings()

def _get_systime_ms(self):
return int(time.time() * 1000)

def download(self, url: str, headers: str):
# make sure the delay since the previous download has elapsed, to rate-limit API usage
time.sleep(max(0, self.next_try_systime_ms - self._get_systime_ms()) / 1000)

response = self._session.get(url, headers=headers, allow_redirects=True, verify=self.verify_https)
if self.logs: logger.info(str(response.status_code) + " " + url)
# sleep for the configured delay so that we do not get blocked for making too many requests
time.sleep(self._delay / 1000)

# schedule the earliest start of the next download; no actual sleep after the last one
self.next_try_systime_ms = self._get_systime_ms() + self._delay_ms

return response

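The change above replaces the fixed sleep after every request with a "sleep only if the next request comes too soon" scheme, so the last download in a batch no longer pays the delay. A standalone sketch of that pattern, using illustrative names rather than the library's API:

import time

class DelayBetweenDownloads:
    """Sleep before a request only if the previous one finished less than delay_ms ago."""

    def __init__(self, delay_ms: int = 500):
        self.delay_ms = delay_ms
        self.next_try_ms = int(time.time() * 1000)

    def before_request(self) -> None:
        # wait until the earliest allowed start time, if it is still in the future
        time.sleep(max(0, self.next_try_ms - int(time.time() * 1000)) / 1000)

    def after_request(self) -> None:
        # schedule the next allowed start; no sleep is paid after the final request
        self.next_try_ms = int(time.time() * 1000) + self.delay_ms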