Source tutorial: https://realpython.com/python-web-scraping-practical-introduction/

In [1]:
# urlopen() opens a URL and returns a file-like response object.
from urllib.request import urlopen
# Profile page that the first string-based scraping examples work on.
url = "http://olympus.realpython.org/profiles/aphrodite"

In [15]:
# Open the profile URL. The bare `page` on the last line displays the
# response object (an http.client.HTTPResponse) as the cell output.
# NOTE(review): no explicit close() of this response is visible in the notebook.
page = urlopen(url)
page

<http.client.HTTPResponse at 0x2af6949d348>

In [16]:
# read() returns the response body as bytes. Indexing a bytes object yields
# an int, so html_bytes[0] is 60 — the byte value of "<".
html_bytes = page.read()
html_bytes[0], html_bytes

(60,
 b'<html>\n<head>\n<title>Profile: Aphrodite</title>\n</head>\n<body bgcolor="yellow">\n<center>\n<br><br>\n<img src="/static/aphrodite.gif" />\n<h2>Name: Aphrodite</h2>\n<br><br>\nFavorite animal: Dove\n<br><br>\nFavorite color: Red\n<br><br>\nHometown: Mount Olympus\n</center>\n</body>\n</html>\n')

In [19]:
# Decode the raw bytes to text. Unlike bytes, indexing a str yields
# one-character strings: html[0] is "<" and html[6] is the newline that
# follows "<html>".
# NOTE(review): UTF-8 is assumed here rather than read from the response headers.
html = html_bytes.decode("utf-8")
type(html), html[0], html[6], html

(str,
 '<',
 '\n',
 '<html>\n<head>\n<title>Profile: Aphrodite</title>\n</head>\n<body bgcolor="yellow">\n<center>\n<br><br>\n<img src="/static/aphrodite.gif" />\n<h2>Name: Aphrodite</h2>\n<br><br>\nFavorite animal: Dove\n<br><br>\nFavorite color: Red\n<br><br>\nHometown: Mount Olympus\n</center>\n</body>\n</html>\n')

In [18]:
# print() renders the "\n" escapes as real line breaks, making the markup readable.
print(html)

<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



In [20]:
# Locate the opening <title> tag, then step past the tag itself so that
# start_index points at the first character of the title text.
opening_tag = "<title>"
title_index = html.find(opening_tag)
start_index = len(opening_tag) + title_index
(title_index, start_index)

(14, 21)

In [21]:
# The title text ends where the closing tag begins; slicing between the two
# indexes extracts it.
end_index = html.find("</title>")
title = html[start_index:end_index]
title

'Profile: Aphrodite'

In [22]:
# Repeat the same find()-and-slice extraction on another profile page.
# It yields garbage here: this page writes its opening tag as "<title >"
# (with a space), so find("<title>") returns -1 and start_index becomes
# -1 + len("<title>") == 6. The slice therefore starts right after "<html>"
# instead of at the title text.
url = "http://olympus.realpython.org/profiles/poseidon"
page = urlopen(url)
html = page.read().decode("utf-8")
start_index = html.find("<title>") + len("<title>")
end_index = html.find("</title>")
title = html[start_index:end_index]
title

'\n<head>\n<title >Profile: Poseidon'

In [24]:
import re
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"
page = urlopen(url)
html = page.read().decode("utf-8")

# Tolerant title matcher: "<title.*?>" allows extra characters inside the
# opening tag, the middle ".*?" lazily captures the text, and "</title.*?>"
# allows extra characters inside the closing tag. re.IGNORECASE makes the
# match accept "<TITLE" as well.
pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html, re.IGNORECASE)
# re.search() returns None when nothing matches, in which case .group()
# would raise AttributeError; this page does contain a title.
title = match_results.group()

# The matched text still includes the surrounding tags.
print(title)

<TITLE >Profile: Dionysus</title  / >


In [25]:
# "<.*?>" lazily matches one tag at a time; replacing every match with ""
# leaves only the text that sat between the tags.
title = re.sub("<.*?>", "", title) # Remove HTML tags
title

'Profile: Dionysus'

In [None]:
# Uncomment to install Beautiful Soup into the kernel's environment:
# %pip install beautifulsoup4

In [26]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"
page = urlopen(url)
html = page.read().decode("utf-8")
# Parse the downloaded markup; "html.parser" selects Python's built-in HTML parser.
soup = BeautifulSoup(html, "html.parser")

In [27]:
# get_text() returns the document's text with the tags stripped out; whitespace
# between tags is kept, hence the blank lines in the output.
print(soup.get_text())




Profile: Dionysus





Name: Dionysus

Hometown: Mount Olympus

Favorite animal: Leopard 

Favorite Color: Wine






In [28]:
# find_all() returns a list-like collection of every matching tag — here, every <img>.
soup.find_all("img")

[<img src="/static/dionysus.jpg"/>, <img src="/static/grapes.png"/>]

In [30]:
# Unpacking into two names works only because this page has exactly two
# <img> tags; any other count would raise ValueError.
# Tag attributes are read with dict-style indexing.
image1, image2 = soup.find_all("img")
image1["src"]

'/static/dionysus.jpg'

In [31]:
# Tags can be reached as attributes of the soup: .title is the <title> tag,
# shown here with its markup normalised.
soup.title

<title>Profile: Dionysus</title>

In [32]:
# .string gives the text inside the tag, without the surrounding markup.
soup.title.string

'Profile: Dionysus'

In [33]:
# Keyword arguments filter on attribute values: only <img> tags whose src
# equals the given path are returned.
soup.find_all("img", src="/static/dionysus.jpg")

[<img src="/static/dionysus.jpg"/>]

In [35]:
# Uncomment to install MechanicalSoup into the kernel's environment:
# %pip install MechanicalSoup

In [36]:
import mechanicalsoup
# The Browser object is what issues requests in the cells that follow.
browser = mechanicalsoup.Browser()

In [37]:
# Fetch the login page; the response exposes the parsed HTML as `page.soup`.
url = "http://olympus.realpython.org/login"
page = browser.get(url)

In [38]:
# The parsed HTML is an ordinary BeautifulSoup object.
type(page.soup)

bs4.BeautifulSoup

In [39]:
# Displaying the soup shows the login form and its "user" and "pwd" inputs.
page.soup

<html>
<head>
<title>Log In</title>
</head>
<body bgcolor="yellow">
<center>
<br/><br/>
<h2>Please log in to access Mount Olympus:</h2>
<br/><br/>
<form action="/login" method="post" name="login">
Username: <input name="user" type="text"/><br/>
Password: <input name="pwd" type="password"/><br/><br/>
<input type="submit" value="Submit"/>
</form>
</center>
</body>
</html>

In [40]:
import mechanicalsoup

# 1. Fetch the login page and keep its parsed HTML.
browser = mechanicalsoup.Browser()
url = "http://olympus.realpython.org/login"
login_page = browser.get(url)
login_html = login_page.soup

# 2. Fill in the form. The inputs are addressed by position: in the page
#    source, index 0 is the "user" field and index 1 is the "pwd" field.
# NOTE(review): the credentials are hardcoded; presumably the tutorial's
# public demo account — confirm before reusing this pattern elsewhere.
form = login_html.select("form")[0]
form.select("input")[0]["value"] = "zeus"
form.select("input")[1]["value"] = "ThunderDude"

# 3. Submit the filled-in form to the login URL; the result is the page
#    returned after logging in.
profiles_page = browser.submit(form, login_page.url)

In [41]:
# The response URL is /profiles rather than /login, so the login went through.
profiles_page.url

'http://olympus.realpython.org/profiles'

In [43]:
# select("a") returns every <a> tag on the profiles page.
links = profiles_page.soup.select("a")
links

[<a href="/profiles/aphrodite">Aphrodite</a>,
 <a href="/profiles/poseidon">Poseidon</a>,
 <a href="/profiles/dionysus">Dionysus</a>]

In [44]:
# Show each profile link as "<link text>: <href>"; the hrefs are site-relative.
for link in links:
    text, address = link.text, link["href"]
    print(f"{text}: {address}")

Aphrodite: /profiles/aphrodite
Poseidon: /profiles/poseidon
Dionysus: /profiles/dionysus


In [45]:
from urllib.parse import urljoin

# The hrefs on the profiles page are site-relative, so resolve each one
# against the site root. urljoin() is used instead of plain string
# concatenation because it also copes with hrefs that are already absolute
# or that lack a leading slash.
base_url = "http://olympus.realpython.org"
for link in links:
    address = urljoin(base_url, link["href"])
    text = link.text
    print(f"{text}: {address}")

Aphrodite: http://olympus.realpython.org/profiles/aphrodite
Poseidon: http://olympus.realpython.org/profiles/poseidon
Dionysus: http://olympus.realpython.org/profiles/dionysus


In [47]:
import mechanicalsoup

browser = mechanicalsoup.Browser()
page = browser.get("http://olympus.realpython.org/dice")
# "#result" is a CSS id selector; select() returns a list, so take the
# first (index 0) match and read its text.
tag = page.soup.select("#result")[0]
result = tag.text

print(f"The result of your dice roll is: {result}")

The result of your dice roll is: 6


In [48]:
import time
import mechanicalsoup

browser = mechanicalsoup.Browser()

# Request the dice page four times, printing the roll each time. The page is
# fetched again on every iteration rather than reused.
for i in range(4):
    page = browser.get("http://olympus.realpython.org/dice")
    tag = page.soup.select("#result")[0]
    result = tag.text
    print(f"The result of your dice roll is: {result}")
    # Pause one second between requests (this also runs after the final one).
    time.sleep(1)

The result of your dice roll is: 3
The result of your dice roll is: 2
The result of your dice roll is: 5
The result of your dice roll is: 4


In [49]:
import time
import mechanicalsoup

# Poll the dice page a fixed number of times, pausing between requests so
# the server is not hit in a tight loop. Both knobs are named so the loop
# bound, the "last request" check and the comment cannot drift apart.
NUM_ROLLS = 4
DELAY_SECONDS = 3

browser = mechanicalsoup.Browser()

for i in range(NUM_ROLLS):
    page = browser.get("http://olympus.realpython.org/dice")
    tag = page.soup.select("#result")[0]
    result = tag.text
    print(f"The result of your dice roll is: {result}")

    # Wait DELAY_SECONDS before the next request; skip the pause after the last one
    if i < NUM_ROLLS - 1:
        time.sleep(DELAY_SECONDS)

The result of your dice roll is: 2
The result of your dice roll is: 2
The result of your dice roll is: 3
The result of your dice roll is: 4
