Merge pull request #82 from mediacloud/feature-mc-user-agent

central storage for User-Agent to use across MC projects
mediacloud · Feb 15, 2024 · ed79cee · ed79cee
2 parents b67c651 + 6f2b8f9
commit ed79cee
Showing 1 changed file with 12 additions and 1 deletion.
diff --git a/mcmetadata/webpages.py b/mcmetadata/webpages.py
@@ -6,10 +6,21 @@
 DEFAULT_TIMEOUT_SECS = 3  # wait only this many seconds for a server to respond with content
 
 # pretend to be this kind of browser
-DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
+DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; automated fetch; TBD domain)'
+
+# central storage for use by Media Cloud projects, follows industry norms for bots
+MEDIA_CLOUD_USER_AGENT = 'Mozilla/5.0 (compatible; mediacloud academic archive; mediacloud.org)'
 
 
 def fetch(url: str, user_agent: str = None, timeout: int = None, fix_encoding: bool = True) -> tuple:
+    """
+    Simple helper to fetch a webpage and return the HTML content and the response object.
+    @param url: the URL to fetch
+    @param user_agent: the user agent to use (defaults to generic DEFAULT_USER_AGENT)
+    @param timeout: how long to wait before giving up (defaults to DEFAULT_TIMEOUT_SECS)
+    @param fix_encoding: encodings are terribly inconsistent; we found it helps to fix obvious errors  (default True)
+    @return: a tuple with the HTML text content and the `requests` response object
+    """
     custom_user_agent = user_agent or DEFAULT_USER_AGENT
     custom_timeout = timeout or DEFAULT_TIMEOUT_SECS
     # grab HTML only once so each library doesn't have to do it