# Utilizando a lib "urllib.request" para fazer requisições

In [1]:
from urllib.request import urlopen

In [44]:
# Open the page inside a context manager so the HTTP response is closed
# as soon as its body has been printed.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    # read() returns the raw response body as bytes
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


# Utilizando a lib BeautifulSoup:

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [8]:
# Fetch the page and parse it while the response is still open; the context
# manager closes the connection once parsing is done.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Print the <h1> element of the parsed document
print(bs.h1)

<h1>An Interesting Title</h1>


# Tratando exceções no scraping:

In [9]:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

In [16]:
# The URL below has no valid host name, so urlopen is expected to fail.
try:
    html = urlopen('https://pythonscraping')
except HTTPError:
    # Must come before URLError: HTTPError is a subclass of URLError.
    print('The server returned an HTTP error')
except URLError:
    print('The server could not be found!')
else:
    # Only reached when the request succeeded; close the response after reading.
    with html:
        print(html.read())

The server could not be found!


# Utilizando funções para tratar exceções:

In [17]:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

In [26]:
def getTitle(url: str):
    """Return the text of the <h1> inside <body> of the page at ``url``.

    Returns None when the page cannot be fetched (HTTP error status,
    unreachable or unknown server) or when the document has no <body> or
    no <h1> inside it.
    """
    try:
        # The context manager closes the response once its body has been read.
        with urlopen(url) as response:
            content = response.read()
    except URLError:
        # URLError also covers HTTPError, which is a subclass of it.
        return None
    try:
        bs = BeautifulSoup(content, 'html.parser')
        # bs.body or bs.body.h1 is None when the tag is missing, so the
        # attribute access below raises AttributeError.
        title = bs.body.h1.text
    except AttributeError:
        return None
    return title

# Fetch the heading of the sample page; getTitle signals failure with None.
title = getTitle('http://www.pythonscraping.com/pages/page1.html')

# Compare against None by identity, not equality.
if title is None:
    print('Title could not be found!')
else:
    print(title)

An Interesting Title


### Considerações sobre o tratamento de exceções acima:
* O primeiro "try" trata o caso em que o servidor responde com um erro HTTP (por exemplo, página não encontrada): a função retorna None e, com isso, a variável "title" recebe None;
* O segundo "try" trata o caso em que uma tag esperada ("body" ou "h1") não existe no HTML: o acesso a um atributo de None gera "AttributeError" e a função retorna None.