Skip to content

Commit

Permalink
Merge pull request #82 from mediacloud/feature-mc-user-agent
Browse files Browse the repository at this point in the history
central storage for User-Agent to use across MC projects
  • Loading branch information
rahulbot committed Feb 15, 2024
2 parents b67c651 + 6f2b8f9 commit ed79cee
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion mcmetadata/webpages.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,21 @@
DEFAULT_TIMEOUT_SECS = 3 # wait only this many seconds for a server to respond with content

# generic User-Agent sent when the caller does not supply one
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; automated fetch; TBD domain)'

# shared User-Agent string for Media Cloud projects to reuse; follows the common convention for identifying bots
MEDIA_CLOUD_USER_AGENT = 'Mozilla/5.0 (compatible; mediacloud academic archive; mediacloud.org)'


def fetch(url: str, user_agent: str = None, timeout: int = None, fix_encoding: bool = True) -> tuple:
"""
Simple helper to fetch a webpage and return the HTML content and the response object.
@param url: the URL to fetch
@param user_agent: the user agent to use (defaults to generic DEFAULT_USER_AGENT)
@param timeout: how long to wait before giving up (defaults to DEFAULT_TIMEOUT_SECS)
@param fix_encoding: encodings are terribly inconsistent; we found it helps to fix obvious errors (default True)
@return: a tuple with the HTML text content and the `requests` response object
"""
custom_user_agent = user_agent or DEFAULT_USER_AGENT
custom_timeout = timeout or DEFAULT_TIMEOUT_SECS
# grab HTML only once so each library doesn't have to do it
Expand Down

0 comments on commit ed79cee

Please sign in to comment.