Skip to content

Commit

Permalink
updates based on testing
Browse files: browse the repository at this point in the history
  • Loading branch information
shawnmjones committed Jul 20, 2018
1 parent 3afeb64 commit c5f2425
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 7 deletions.
2 changes: 1 addition & 1 deletion aiu/timemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,4 @@ def process_local_dict(local_dict, working_dict):

process_local_dict(local_dict, dict_timemap)

return dict_timemap
return dict_timemap
4 changes: 2 additions & 2 deletions aiu/version.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
__name__ = "aiu"
__version__ = "0.1.0a1"
__version__ = "0.1.1a1"

name = __name__
version = __version__

user_agent_string = "{}/{} - See: https://github.com/shawnmjones/archiveit_utilities".format(name, version)
user_agent_string = "{}/{} - See: https://github.com/shawnmjones/archiveit_utilities".format(name, version)
25 changes: 21 additions & 4 deletions bin/seeds2warc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders

from requests_futures.sessions import FuturesSession
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import aiu

from aiu import ArchiveItCollection
from aiu import convert_LinkTimeMap_to_dict
Expand Down Expand Up @@ -102,8 +106,20 @@ def fetch_mementos_and_write_warcs(timemap_data, working_directory, collection_i
raw_urims.append(raw_urim)

logger.info("Issuing requests for {} raw mementos".format(len(raw_urims)))

with FuturesSession(max_workers=cpu_count) as session:

retry_session = requests.Session()
retry = Retry(
total=10,
read=10,
connect=10,
backoff_factor=0.3,
status_forcelist=(500, 502, 504)
)
adapter = HTTPAdapter(max_retries=retry)
retry_session.mount('http://', adapter)
retry_session.mount('https://', adapter)

with FuturesSession(max_workers=cpu_count, session=retry_session) as session:
futures = get_uri_responses(session, raw_urims)

warcinfo = {
Expand Down Expand Up @@ -152,9 +168,10 @@ def fetch_mementos_and_write_warcs(timemap_data, working_directory, collection_i
response = futures[raw_urim].result()

try:
# TODO: if the original URI used a Link header, it will be overridden
linkdict = convert_LinkTimeMap_to_dict(response.headers["link"])
urir = linkdict["original_uri"]
except KeyError as e:
except (KeyError, aiu.timemap.MalformedLinkFormatTimeMap) as e:
logger.warn("no original relation in the Link header for raw memento at {}".format(raw_urim))

sample_urim = invert_raw_urimdata_mapping[raw_urim][0]
Expand Down Expand Up @@ -281,4 +298,4 @@ if __name__ == '__main__':

logger.info("Data has been written out to {}".format(output_directory))

logger.info("Finished run")
logger.info("Finished run")

0 comments on commit c5f2425

Please sign in to comment.