Skip to content

Commit

Permalink
Cut build again
Browse files Browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 1, 2023
2 parents 3e94f8a + 3f0b4af commit 73652ed
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 17 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# Sphinx extensions loaded for the documentation build.
# Napoleon ships with Sphinx itself as ``sphinx.ext.napoleon``; the
# standalone ``sphinxcontrib.napoleon`` package is deprecated, so only
# the built-in extension is listed here.
extensions = [
    "myst_parser",
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
    "sphinx_click",
]
# Directories (relative to this file) searched for custom page templates.
templates_path = ["_templates"]
Expand Down
17 changes: 14 additions & 3 deletions docs/python.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,35 @@

# Python Usage

First import the library into your Python code.
## Import the library

First, import the library into your Python code.

```python
import savepagenow
```

Capture a URL.
## Capture a URL

```python
archive_url = savepagenow.capture("http://www.example.com/")
```

See where it's stored.
## See where the capture is stored

```python
print(archive_url)
```

## Capture a URL with authentication

By default, savepagenow runs without authentication, which limits the tool to four captures per minute.
If you'd like to run authenticated Wayback saves, which allow you to do 12 captures per minute,
set the local environment variables ``access_key`` and ``secret`` to your [Internet Archive credentials](https://archive.org/account/s3.php).
Then you can run `capture()` with the `authenticate` flag set to ``True``, like so:
```python
archive_url = savepagenow.capture("https://www.example.com/", authenticate=True)
```

## CachedPage Exception Handling
If a URL has been recently cached, archive.org may return the URL to that page rather than conduct a new capture. When that happens, the ``capture`` method will raise a ``CachedPage`` exception.

This is likely to happen if you request the same URL twice within a few seconds.
Expand Down
65 changes: 53 additions & 12 deletions savepagenow/api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import typing
from urllib.parse import urljoin

import os
import sys
import click
import requests
from requests.utils import parse_header_links

from .exceptions import (
from exceptions import (
BadGateway,
BlockedByRobots,
CachedPage,
Expand All @@ -20,6 +21,7 @@ def capture(
target_url,
user_agent="savepagenow (https://github.com/pastpages/savepagenow)",
accept_cache=False,
authenticate=False,
):
"""
Archive the provided URL using archive.org's Wayback Machine.
Expand All @@ -31,17 +33,43 @@ def capture(
To silence that exception, pass into True to the ``accept_cache`` keyword
argument.
By default, operates in an anonymous, unauthed way.
Can mark as authenticate. Must have access_key and secret
set as local environment variables to use authenticated requests.
"""
# Put together the URL that will save our request
domain = "https://web.archive.org"
save_url = urljoin(domain, "/save/")
request_url = save_url + target_url

# Send the capture request to archive.org
headers = {
"User-Agent": user_agent,
}
response = requests.get(request_url, headers=headers)
# Access Keys for Internet Archive API
if authenticate:
if "access_key" in os.environ and "secret" in os.environ:
access_key = os.environ["access_key"]
secret = os.environ["secret"]
else:
print(
"You have not set your local environment variables access_key"
"and secret in order to use the authenticate flag"
)
sys.exit(1)
authorization = f"LOW {access_key}:{secret}"
headers = {
"Accept": "application/json",
"User-Agent": user_agent,
"Authorization": authorization,
"Content-Type": "application/x-www-form-urlencoded",
}
response = requests.get(request_url, headers=headers)
if response.status_code == 401:
print("Your archive.org access key and/or secret is not valid")
sys.exit(1)
else:
headers = {
"User-Agent": user_agent,
}
response = requests.get(request_url, headers=headers)

# If it has an error header, raise that.
has_error_header = "X-Archive-Wayback-Runtime-Error" in response.headers
Expand Down Expand Up @@ -101,8 +129,8 @@ def capture_or_cache(
target_url, user_agent="savepagenow (https://github.com/pastpages/savepagenow)"
):
"""
Archive the provided URL using archive.org's Wayback Machine, unless the page has been recently captured.
Archive the provided URL using archive.org's Wayback Machine,
unless the page has been recently captured.
Returns a tuple with the archive.org URL where the capture is stored,
along with a boolean indicating if a new capture was conducted.
Expand All @@ -120,17 +148,30 @@ def capture_or_cache(
@click.argument("url")
@click.option("-ua", "--user-agent", help="User-Agent header for the web request")
@click.option("-c", "--accept-cache", help="Accept and return cached URL", is_flag=True)
def cli(url: str, user_agent: typing.Optional[str] = None, accept_cache: bool = False):
@click.option(
    "-a",
    "--authenticate",
    # Fixed typo in the user-facing help text ("authenication").
    help="Allows you to run saves with authentication",
    is_flag=True,
)
def cli(
    url: str,
    user_agent: typing.Optional[str] = None,
    accept_cache: bool = False,
    authenticate: bool = False,
):
    """
    Archive the provided URL using archive.org's Wayback Machine.

    Raises a CachedPage exception if archive.org declines
    to conduct a new capture and returns a previous snapshot instead.
    """
    # Only forward options the user explicitly set, so capture()'s own
    # defaults (including its default User-Agent) apply otherwise.
    kwargs: typing.Dict[typing.Any, typing.Any] = {}
    if user_agent:
        kwargs["user_agent"] = user_agent
    if accept_cache:
        kwargs["accept_cache"] = accept_cache
    if authenticate:
        kwargs["authenticate"] = authenticate
    # capture() returns the archive.org URL where the snapshot is stored.
    archive_url = capture(url, **kwargs)
    click.echo(archive_url)

Expand Down
9 changes: 8 additions & 1 deletion tests/test_capture.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
import savepagenow


def test_capture():
    """Verify that capturing (or cache-fetching) a URL yields a Wayback URL."""
    target = "https://www.latimes.com/"
    result, was_new_capture = savepagenow.capture_or_cache(target)
    assert result.startswith("https://web.archive.org/")

def test_auth_capture():
    """Test the retrieval of a URL from Wayback with authentication.

    The env variables get pulled when we call capture(). You have to set
    ``access_key`` and ``secret`` to the appropriate values for your
    archive.org account before running this test.
    """
    url = "https://www.latimes.com/"
    # Bug fix: capture() returns a single archive URL string — unlike
    # capture_or_cache(), which returns a (url, captured) tuple — so the
    # previous tuple unpacking would raise a ValueError at runtime.
    archive_url = savepagenow.capture(url, authenticate=True)
    assert archive_url.startswith("https://web.archive.org/")

0 comments on commit 73652ed

Please sign in to comment.