# Finding the owner of the webiste

To find who owns the website, the WHOIS protocol can be used to see who is the registered owner of the domain name.

In [None]:
import whois

print(whois.whois("https://utahavalanchecenter.org"))

# Download web page

When download a web page we might find errors that are byond our control, such as the requested page may no longer exist. So, bellow is a robust version to chatch these excepetions.

In [None]:
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

In [None]:
import urllib.request

url = ("http://httpstat.us/500")
url1 = "http://www.abola.pt"
def download(url):
    return urllib.request.urlopen(url).read()

In [None]:
def download(url):
    print("Downloading:", url)
    try:
        html = urllib.request.urlopen(url).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print("Download error:", e.reason)
        html = None
    return html

In [None]:
download(url)

In [None]:
# This script ensure that when downloading a web page, when a download or error is eountere, the 
# exception is caught and the function returns none. Often, the errors when downloading are temporary 
# like when the web serve is overloaded and returns a 503 Service Unavailable error, so for that we can 
# try the download after a short time. However if the server returns 4040 Not Found, then try again is unlikely to produce.

def download(url, num_retries=2):
    print("Downloading:", url)
    try:
        html = urllib.request.urlopen(url).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500  <= e.code < 600:
                return download(url, num_retries - 1)
    return html



# Setting a user agent

In [None]:
def download(url, user_agent ="wswp", num_retries=2):
    print("Downloading:", url)
    request = urllib.request.Request (url)
    request.add_header("User-agent", user_agent)
    try:
        html = urllib.request.urlopen(url).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500  <= e.code < 600:
                return download(url, num_retries - 1)
    return html

In [None]:
my_url = "http://example.webscraping.com/sitemap.xml"

In [21]:
import urllib.request
import re
import gzip

from urllib.error import URLError, HTTPError, ContentTooShortError


def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        resp = urllib.request.urlopen(request)
        
        if resp.info().get('Content-Encoding') == 'gzip':
            compressed_data = resp.read()
            html = gzip.decompress(compressed_data).decode(charset)
        else:
            cs = resp.headers.get_content_charset()
            if not cs:
                cs = charset
            html = resp.read().decode(cs)
            
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1)
    return html


def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<td>(.*?)</td>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here

In [24]:
html = download("https://en.wikipedia.org/wiki/List_of_largest_banks")
html

Downloading: https://en.wikipedia.org/wiki/List_of_largest_banks


'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-night-mode-clientpref-0 vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>List of largest banks - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-di

In [22]:
crawl_sitemap("https://en.wikipedia.org/wiki/List_of_largest_banks")

Downloading: https://en.wikipedia.org/wiki/List_of_largest_banks
Downloading: 1


ValueError: unknown url type: '1'