## A Practical Introduction to Web Scraping in Python

https://realpython.com/python-web-scraping-practical-introduction/

In [1]:
# raw html manipulation
from urllib.request import urlopen

# regular expressions
import re



In [10]:
url = "http://olympus.realpython.org/profiles/aphrodite"
page = urlopen(url)
page

<http.client.HTTPResponse at 0x1078db670>

In [11]:
html_bytes = page.read()
html = html_bytes.decode("utf-8")

In [13]:
print(html)

<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



In [15]:
tit = "<title>"
title_index = html.find(tit)
title_index

14

In [17]:
start_idx = title_index + len(tit)
start_idx

21

In [20]:
end_tit = "</title>"
end_idx = html.find(end_tit)
end_idx

39

In [21]:
title = html[start_idx:end_idx]
title

'Profile: Aphrodite'

-------
### Messier HTML Website

In [22]:
url2 = "http://olympus.realpython.org/profiles/poseidon"

def get_title(url):
    """Return the text between the first "<title>" and "</title>" in the page at ``url``.

    This is deliberately naive string searching: only the exact, lower-case
    tag text is recognized. When "<title>" is absent (for example the page
    writes "<title >"), ``str.find`` returns -1, the slice starts at the wrong
    offset, and the result contains leftover markup instead of a clean title.

    Args:
        url: Address of the page to download; the body is decoded as UTF-8.

    Returns:
        The sliced substring of the page's HTML.
    """
    opening_tag = "<title>"
    # Close the HTTP response once the body is read instead of leaking it.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    start_index = html.find(opening_tag) + len(opening_tag)
    end_index = html.find("</title>")
    return html[start_index:end_index]

In [25]:
title2 = get_title(url2)
title2

'\n<head>\n<title >Profile: Poseidon'

Oh no! `<title >` --> messy messy

We, can do, better!

----
## Regular Expressions
- **metacharacters**: special characters that denote different patterns.
    - `*`: zero or more instances of what comes before the asterisk


In [27]:
re.findall("ab*c", "ac")


['ac']

"ab*c" --> matches:
- begins with "a"
- ends with "c"
- has zero or more instances of "b" between the two

In [35]:
re.findall("ab*c", "abcd")

['abc']

In [36]:
re.findall("ab*c", "acc")

['ac']

In [37]:
re.findall("ab*c", "abcac")

['abc', 'ac']

In [38]:
re.findall("ab*c", "abdc")

[]

#### Case sensitive
Matching is case sensitive by default. Pass `re.IGNORECASE` as the third argument to make it case insensitive.

In [44]:
re.findall("ab*c", "ABC")

[]

In [45]:
re.findall("ab*c", "ABC", re.IGNORECASE)

['ABC']

#### (.) period --> stands for any \*one\* single character

In [62]:
re.findall("a.c", "abc")

['abc']

In [51]:
re.findall("a.c", "abbc")

[]

In [52]:
re.findall("a.c", "ac")

[]

In [53]:
re.findall("a.c", "acc")

['acc']

#### .* --> any character, repeated any number of times (including zero)

In [58]:
re.findall("a.*c", "abc")

['abc']

In [59]:
re.findall("a.*c", "abbc")

['abbc']

In [60]:
re.findall("a.*c", "ac")

['ac']

In [61]:
re.findall("a.*c", "acc")

['acc']

#### re.search()
- More complicated than re.findall()
- re.search() --> returns a Match object with different groups of data (there could be multiple matches inside of other matches), or `None` when nothing matches
- You probably just need the first and most inclusive result

In [66]:
match_results = re.search("ab*c", "ABC", re.IGNORECASE)
match_results.group()

'ABC'

#### re.sub()
- substitute: replace text in a string that matches a regular expression with new text (kind of like `.replace()`)
- Pass reg expression, replacement text, string

In [68]:
string = "Everything is <replaced> if it's in <tags>."
string = re.sub("<.*>", "ELEPHANTS", string)
string

'Everything is ELEPHANTS.'

--> Regular expressions are greedy: when `*` is used they find the longest possible match, so everything from the first `<` to the last `>` was replaced.

#### `*?` --> match shortest possible string of text


In [69]:
string = "Everything is <replaced> if it's in <tags>."
string = re.sub("<.*?>", "ELEPHANTS", string)
string

"Everything is ELEPHANTS if it's in ELEPHANTS."

-----
### Extract Text from HTML With RegEx
Messy title part 2: `<TITLE >Profile: Dionysus</title  / >`


In [81]:
import re
from urllib.request import urlopen

def messy_regex_html(url="http://olympus.realpython.org/profiles/dionysus"):
    """Print the <title> of a page whose title tags are sloppily written.

    Three lines are printed: the ``re.Match`` object, the raw matched text
    (tags included), and the title text with the tags stripped.

    Args:
        url: Page to fetch; the body is decoded as UTF-8. Defaults to the
            Dionysus profile, whose title is written as
            ``<TITLE >Profile: Dionysus</title  / >``.

    Raises:
        ValueError: If the page has no title element the pattern can match.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")

    # Non-greedy ".*?" tolerates junk inside the tags ("<TITLE >", "</title  / >");
    # re.IGNORECASE tolerates the upper-case tag name.
    pattern = "<title.*?>.*?</title.*?>"
    match_results = re.search(pattern, html, re.IGNORECASE)
    print(match_results)
    if match_results is None:
        # re.search returns None on no match; fail with a clear message rather
        # than an AttributeError from calling .group() on None.
        raise ValueError(f"no <title> element found in {url}")

    title = match_results.group()
    print(title)

    title = re.sub("<.*?>", "", title)  # Remove HTML tags
    print(title)

In [82]:
messy_regex_html()

<re.Match object; span=(14, 51), match='<TITLE >Profile: Dionysus</title  / >'>
<TITLE >Profile: Dionysus</title  / >
Profile: Dionysus


----
Quiz Time!


In [97]:
from urllib.request import urlopen

url = "http://olympus.realpython.org/profiles/dionysus"

# Read the body inside a `with` block so the HTTP response is closed afterwards.
with urlopen(url) as page:
    html_pre = page.read()
# Raw bytes first: prints the b'...' literal with newlines shown as \n.
print(html_pre)

# Decoded text second, preceded by a blank line to separate the two dumps.
html = html_pre.decode('utf-8')
print(f'\n{html}')

b'<html>\n<head>\n<TITLE >Profile: Dionysus</title  / >\n</head>\n<body bgcolor="yellow">\n<center>\n<br><br>\n<img src="/static/dionysus.jpg" />\n<h2>Name: Dionysus</h2>\n<img src="/static/grapes.png"><br><br>\nHometown: Mount Olympus\n<br><br>\nFavorite animal: Leopard <br>\n<br>\nFavorite Color: Wine\n</center>\n</body>\n</html>\n'

<html>
<head>
<TITLE >Profile: Dionysus</title  / >
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/dionysus.jpg" />
<h2>Name: Dionysus</h2>
<img src="/static/grapes.png"><br><br>
Hometown: Mount Olympus
<br><br>
Favorite animal: Leopard <br>
<br>
Favorite Color: Wine
</center>
</body>
</html>



#### Melissa's solution lol
get name and favorite color

In [92]:
# html.find("<h2>.*?</h2>.*?<br><br>.*?<br>.<br>")
name_start = html.find("Name")
name_start
name_end = html.find("</h2")

In [98]:
html[name_start:name_end]

'Name: Dionysus'

In [107]:
fav_start = html.find("Favorite Color")
fav_start
fav_end = html.find("<", fav_start)

In [108]:
html[fav_start:fav_end].strip()

'Favorite Color: Wine'

#### Solution


In [103]:
# read in the same
from urllib.request import urlopen
url = "http://olympus.realpython.org/profiles/dionysus"
html_page = urlopen(url)
html_text = html_page.read().decode("utf-8")

In [104]:
# Reference solution: print only the value that follows each label,
# dropping the label itself.
for label in ("Name: ", "Favorite Color:"):
    # Step past the label so the slice begins at the value.
    value_start = html_text.find(label) + len(label)

    # The value ends where the next HTML tag after it begins.
    tag_offset = html_text[value_start:].find("<")
    value_end = value_start + tag_offset

    # Trim surrounding spaces, carriage returns, newlines and tabs.
    print(html_text[value_start:value_end].strip(" \r\n\t"))


Dionysus
Wine


#### nifty strip too

----

## Use an HTML Parser for Web Scraping in Python



In [109]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [110]:
url = "http://olympus.realpython.org/profiles/dionysus"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")

In [113]:
# Sometimes it's easier to get the raw text
soup.get_text()

'\n\nProfile: Dionysus\n\n\n\n\n\nName: Dionysus\n\nHometown: Mount Olympus\n\nFavorite animal: Leopard \n\nFavorite Color: Wine\n\n\n\n'

In [114]:
# Other times it's more useful to keep the HTML tags
# These are not strings; they are instances of the Tag object
soup.find_all("img")

[<img src="/static/dionysus.jpg"/>, <img src="/static/grapes.png"/>]

In [115]:
img1, img2 = soup.find_all("img")

In [116]:
img1.name

'img'

In [117]:
img1['src']

'/static/dionysus.jpg'

In [121]:
# Beautiful Soup will clean up TITLE --> title
soup.title

<title>Profile: Dionysus</title>

In [122]:
soup.title.string

'Profile: Dionysus'

In [124]:
# Get specific tag
soup.find_all('img', src='/static/dionysus.jpg')

[<img src="/static/dionysus.jpg"/>]

#### Quiz Time!


In [127]:
page = urlopen("http://olympus.realpython.org/profiles")
html = page.read().decode('utf-8')
soupy = BeautifulSoup(html, 'html.parser')

In [132]:
soupy.find_all('a')

[<a href="/profiles/aphrodite">Aphrodite</a>,
 <a href="/profiles/poseidon">Poseidon</a>,
 <a href="/profiles/dionysus">Dionysus</a>]

In [135]:
# Solution
# A Tag supports dict-style attribute access, so anchor["href"] yields the link target.
base = "base"  # literal prefix printed in front of every href
for anchor in soupy.find_all("a"):
    print(base + anchor["href"])

base/profiles/aphrodite
base/profiles/poseidon
base/profiles/dionysus


-----
## Interact With HTML Forms
What if I need to click buttons to get what I need?!?

#### Mechanical Soup
Installs a **headless browser**: a web browser with no graphical user interface.




In [2]:
import mechanicalsoup
browser = mechanicalsoup.Browser(soup_config={'features': 'lxml'})

In [6]:
url = "http://olympus.realpython.org/login"

# 200 --> success
# 404 --> URL doesn't exist
# 500 --> server error when making the request
page = browser.get(url)
page

<Response [200]>

In [7]:
type(page.soup)

bs4.BeautifulSoup

In [8]:
page.soup

<html>
<head>
<title>Log In</title>
</head>
<body bgcolor="yellow">
<center>
<br/><br/>
<h2>Please log in to access Mount Olympus:</h2>
<br/><br/>
<form action="/login" method="post" name="login">
Username: <input name="user" type="text"/><br/>
Password: <input name="pwd" type="password"/><br/><br/>
<input type="submit" value="Submit"/>
</form>
</center>
</body>
</html>

In [9]:
# 1
browser = mechanicalsoup.Browser()
url = "http://olympus.realpython.org/login"
login_page = browser.get(url)
login_html = login_page.soup

In [10]:
# 2
form = login_html.select("form")[0]
form

<form action="/login" method="post" name="login">
Username: <input name="user" type="text"/><br/>
Password: <input name="pwd" type="password"/><br/><br/>
<input type="submit" value="Submit"/>
</form>

In [18]:
form.select("input")[0]["value"] = "zeus"
form.select("input")[1]["value"] = "ThunderDude"
form

<form action="/login" method="post" name="login">
Username: <input name="user" type="text" value="zeus"/><br/>
Password: <input name="pwd" type="password" value="ThunderDude"/><br/><br/>
<input type="submit" value="Submit"/>
</form>

In [19]:
# 3
profiles_page = browser.submit(form, login_page.url)
profiles_page, profiles_page.url


(<Response [200]>, 'http://olympus.realpython.org/profiles')

#### Woo! We authenticated
Now time to get the URL for each link on the /profiles page


In [24]:
links = profiles_page.soup.select("a")
links

[<a href="/profiles/aphrodite">Aphrodite</a>,
 <a href="/profiles/poseidon">Poseidon</a>,
 <a href="/profiles/dionysus">Dionysus</a>]

In [26]:
base_url = "http://olympus.realpython.org"
for link in links:
    address = base_url + link['href']
    text = link.text
    print(f"{text}: {address}")

Aphrodite: http://olympus.realpython.org/profiles/aphrodite
Poseidon: http://olympus.realpython.org/profiles/poseidon
Dionysus: http://olympus.realpython.org/profiles/dionysus


#### Quiz Time!


In [27]:
# I cheated, I'm tired
# It was asking like.. the same info except
print(profiles_page.soup.title)

<title>All Profiles</title>


## Interact With Websites in Real Time
Almost done, so close! Then to the gym with me

Fetch real-time data from website that offers updated info (**I want dis**)


Look by **id** for the **h2 header**
* **#** means **id value**

In [34]:
import mechanicalsoup
import time

url = "http://olympus.realpython.org/dice"

In [30]:
browser = mechanicalsoup.Browser()
page = browser.get(url)

tag = page.soup.select("#result")[0]
tag

<Response [200]>


<h2 id="result">6</h2>

In [33]:
result = tag.text
print(f"The result of your dice roll is: {result}")


The result of your dice roll is: 6


In [40]:
def loop(sleep, loops):
    """Fetch the dice page ``loops`` times and print each roll.

    The page address is read from the module-level ``url``.

    Args:
        sleep: Seconds to wait between consecutive requests (passed to
            ``time.sleep``); there is no wait after the final request.
        loops: Number of requests to make.
    """
    # One browser serves every request; building a fresh one on each
    # iteration was loop-invariant work.
    browser = mechanicalsoup.Browser()
    for i in range(loops):
        page = browser.get(url)
        # "#result" is a CSS id selector; the roll is that element's text.
        tag = page.soup.select("#result")[0]
        result = tag.text
        print(f"The result of your dice roll is: {result}. Loop: {i}")
        # Skip the pause once the last roll has been printed.
        if i + 1 < loops:
            time.sleep(sleep)



In [41]:
loop(5, 3)

The result of your dice roll is: 1. Loop: 0
The result of your dice roll is: 2. Loop: 1
The result of your dice roll is: 1. Loop: 2


#### Suspicious
Don't request the same page many times in a row; rapid repeated requests can look suspicious to the server.
Check the footer for the site's terms of service.

## Additional Resources

* [Beautiful Soup: Build a Web Scraper With Python](https://realpython.com/beautiful-soup-web-scraper-python/)
* [API Integration in Python](https://realpython.com/api-integration-in-python/)
* [Python & APIs: A Winning Combo for Reading Public Data](https://realpython.com/python-api/)

