Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revisit check-url behavior and provide User-Agent a custom default value #229

Merged
merged 4 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed

- Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
- User-Agent now has a default value (#228)
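- Spacing between the User-Agent, the `--userAgentSuffix` value and `--adminEmail` has changed: the parts are now joined with single spaces, and the default suffix no longer carries a trailing space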
- Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227)

## [1.5.3] - 2023-10-02

Expand Down
7 changes: 5 additions & 2 deletions test/integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,11 @@ def test_user_agent():
if record.rec_type == "request":
print(record.http_headers)
ua = record.http_headers.get_header("User-Agent")
if ua:
assert "Pixel" in ua
# remove 'and ua != "undefined"' once
# https://github.com/webrecorder/browsertrix-crawler/pull/420 is
# released / used by us
if ua and ua != "undefined":
assert "Mozilla" in ua
assert ua.endswith(" +Zimit test@example.com")
found = True

Expand Down
33 changes: 18 additions & 15 deletions zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from warc2zim.main import warc2zim
from zimscraperlib.uri import rebuild_uri

DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"

class ProgressFileWatcher:
def __init__(self, output_dir, stats_path):
Expand Down Expand Up @@ -226,14 +227,15 @@ def zimit(args=None):

parser.add_argument(
"--userAgent",
help="Override user-agent with specified",
help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
default=DEFAULT_USER_AGENT
)

parser.add_argument(
"--userAgentSuffix",
help="Append suffix to existing browser user-agent "
"(ex: +MyCrawler, info@example.com)",
default="+Zimit ",
default="+Zimit",
)

parser.add_argument(
Expand Down Expand Up @@ -344,8 +346,14 @@ def zimit(args=None):

url = zimit_args.url

user_agent = zimit_args.userAgent
if zimit_args.userAgentSuffix:
user_agent += f" {zimit_args.userAgentSuffix}"
if zimit_args.adminEmail:
user_agent += f" {zimit_args.adminEmail}"

if url:
url = check_url(url, zimit_args.scopeType)
url = check_url(url, user_agent, zimit_args.scopeType)
warc2zim_args.append("--url")
warc2zim_args.append(url)

Expand Down Expand Up @@ -394,12 +402,8 @@ def cleanup():
cmd_args.append("--url")
cmd_args.append(url)

user_agent_suffix = zimit_args.userAgentSuffix
if zimit_args.adminEmail:
user_agent_suffix += zimit_args.adminEmail

cmd_args.append("--userAgentSuffix")
cmd_args.append(user_agent_suffix)
cmd_args.append("--userAgent")
cmd_args.append(user_agent)

cmd_args.append("--cwd")
cmd_args.append(str(temp_root_dir))
Expand Down Expand Up @@ -445,13 +449,13 @@ def cleanup():
return warc2zim(warc2zim_args)


def check_url(url, scope=None):
def check_url(url, user_agent, scope=None):
url = urllib.parse.urlparse(url)
try:
resp = requests.head(
url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27)
)
resp.raise_for_status()
with requests.get(
url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27), headers={"User-Agent": user_agent}
) as resp:
resp.raise_for_status()
except requests.exceptions.RequestException as exc:
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
raise SystemExit(1)
Expand Down Expand Up @@ -505,7 +509,6 @@ def get_node_cmd_line(args):
"allowHashUrls",
"lang",
"mobileDevice",
"userAgent",
"useSitemap",
"behaviors",
"behaviorTimeout",
Expand Down