Skip to content

Commit

Permalink
Cut build again
Browse files Browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 1, 2023
2 parents 3e94f8a + 3f0b4af commit 73652ed
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 17 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# Sphinx extensions loaded for the documentation build.
# Napoleon ships with Sphinx itself as ``sphinx.ext.napoleon``; the
# standalone ``sphinxcontrib.napoleon`` package is deprecated, so only
# the built-in extension is listed here.
extensions = [
    "myst_parser",
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
    "sphinx_click",
]
# Directories (relative to this file) searched for custom page templates.
templates_path = ["_templates"]
Expand Down
17 changes: 14 additions & 3 deletions docs/python.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,35 @@

# Python Usage

First import the library into your Python code.
## Import the library

First, import the library into your Python code.

```python
import savepagenow
```

Capture a URL.
## Capture a URL

```python
archive_url = savepagenow.capture("http://www.example.com/")
```

See where it's stored.
## See where the capture is stored

```python
print(archive_url)
```

## Capture a URL with authentication

By default, savepagenow runs without authentication, which limits the tool to four captures per minute.
If you'd like to run authenticated Wayback saves, which allow you to do 12 captures per minute,
set the local environment variables ``access_key`` and ``secret`` to your [Internet Archive credentials](https://archive.org/account/s3.php).
Then you can run `capture()` with the `authenticate` flag set to ``True``, like so:
```python
archive_url = savepagenow.capture("https://www.example.com/", authenticate=True)
```

## CachedPage Exception Handling
If a URL has been recently cached, archive.org may return the URL to that page rather than conduct a new capture. When that happens, the ``capture`` method will raise a ``CachedPage`` exception.

This is likely to happen if you request the same URL twice within a few seconds.
Expand Down
65 changes: 53 additions & 12 deletions savepagenow/api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import typing
from urllib.parse import urljoin

import os
import sys
import click
import requests
from requests.utils import parse_header_links

from .exceptions import (
from exceptions import (
BadGateway,
BlockedByRobots,
CachedPage,
Expand All @@ -20,6 +21,7 @@ def capture(
target_url,
user_agent="savepagenow (https://github.com/pastpages/savepagenow)",
accept_cache=False,
authenticate=False,
):
"""
Archive the provided URL using archive.org's Wayback Machine.
Expand All @@ -31,17 +33,43 @@ def capture(
To silence that exception, pass into True to the ``accept_cache`` keyword
argument.
By default, operates in an anonymous, unauthed way.
Can mark as authenticate. Must have access_key and secret
set as local environment variables to use authenticated requests.
"""
# Put together the URL that will save our request
domain = "https://web.archive.org"
save_url = urljoin(domain, "/save/")
request_url = save_url + target_url

# Send the capture request to archive.org
headers = {
"User-Agent": user_agent,
}
response = requests.get(request_url, headers=headers)
# Access Keys for Internet Archive API
if authenticate:
if "access_key" in os.environ and "secret" in os.environ:
access_key = os.environ["access_key"]
secret = os.environ["secret"]
else:
print(
"You have not set your local environment variables access_key"
"and secret in order to use the authenticate flag"
)
sys.exit(1)
authorization = f"LOW {access_key}:{secret}"
headers = {
"Accept": "application/json",
"User-Agent": user_agent,
"Authorization": authorization,
"Content-Type": "application/x-www-form-urlencoded",
}
response = requests.get(request_url, headers=headers)
if response.status_code == 401:
print("Your archive.org access key and/or secret is not valid")
sys.exit(1)
else:
headers = {
"User-Agent": user_agent,
}
response = requests.get(request_url, headers=headers)

# If it has an error header, raise that.
has_error_header = "X-Archive-Wayback-Runtime-Error" in response.headers
Expand Down Expand Up @@ -101,8 +129,8 @@ def capture_or_cache(
target_url, user_agent="savepagenow (https://github.com/pastpages/savepagenow)"
):
"""
Archive the provided URL using archive.org's Wayback Machine, unless the page has been recently captured.
Archive the provided URL using archive.org's Wayback Machine,
unless the page has been recently captured.
Returns a tuple with the archive.org URL where the capture is stored,
along with a boolean indicating if a new capture was conducted.
Expand All @@ -120,17 +148,30 @@ def capture_or_cache(
@click.argument("url")
@click.option("-ua", "--user-agent", help="User-Agent header for the web request")
@click.option("-c", "--accept-cache", help="Accept and return cached URL", is_flag=True)
def cli(url: str, user_agent: typing.Optional[str] = None, accept_cache: bool = False):
@click.option(
    "-a",
    "--authenticate",
    # Fixed typo in the user-facing help text ("authenication").
    help="Allows you to run saves with authentication",
    is_flag=True,
)
def cli(
    url: str,
    user_agent: typing.Optional[str] = None,
    accept_cache: bool = False,
    authenticate: bool = False,
):
    """
    Archive the provided URL using archive.org's Wayback Machine.

    Raises a CachedPage exception if archive.org declines
    to conduct a new capture and returns a previous snapshot instead.
    """
    # Only forward options the user explicitly set, so capture()'s own
    # defaults (including its default User-Agent) apply otherwise.
    kwargs: typing.Dict[typing.Any, typing.Any] = {}
    if user_agent:
        kwargs["user_agent"] = user_agent
    if accept_cache:
        kwargs["accept_cache"] = accept_cache
    if authenticate:
        kwargs["authenticate"] = authenticate
    # capture() returns the archive.org URL where the snapshot is stored.
    archive_url = capture(url, **kwargs)
    click.echo(archive_url)

Expand Down
9 changes: 8 additions & 1 deletion tests/test_capture.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
import savepagenow


def test_capture():
    """Verify that capturing (or cache-fetching) a URL yields a Wayback URL."""
    target = "https://www.latimes.com/"
    result, was_new_capture = savepagenow.capture_or_cache(target)
    assert result.startswith("https://web.archive.org/")

def test_auth_capture():
    """Test the retrieval of a URL from Wayback with authentication.

    The env variables get pulled when we call capture(). You have to set
    ``access_key`` and ``secret`` to the appropriate values for your
    archive.org account before running this test.
    """
    url = "https://www.latimes.com/"
    # Bug fix: capture() returns a single archive URL string — unlike
    # capture_or_cache(), which returns a (url, captured) tuple — so the
    # previous tuple unpacking would raise a ValueError at runtime.
    archive_url = savepagenow.capture(url, authenticate=True)
    assert archive_url.startswith("https://web.archive.org/")

0 comments on commit 73652ed

Please sign in to comment.