Python for Everybody
## Chapter 12. Networked Programs



In [1]:
# 12.1 HyperText Transport Protocol - HTTP
# Read the textbook

### You can skip the coding samples in the following sections
- 12.2 The World's Simplest Web Browser
- 12.3 Retrieving an Image over HTTP



In [2]:
# 12.4 Retrieving web pages with urllib

import urllib.request                 # import the urllib.request module
                                      # Using this module, you can 'talk' to web servers.
counts = dict()                       # maps each word to the number of times it appears

# urlopen() is like open(), except the file is on a remote server.
# The 'with' statement closes the connection when the block ends, even if
# an error occurs while the file is being read.
with urllib.request.urlopen('http://www.py4inf.com/code/romeo.txt') as fhand:
    for line in fhand:                # each line of the response is a 'bytes' object
        words = line.split()          # split on whitespace; each word is also 'bytes'
        for word in words:
            # get(word, 0) returns 0 the first time a word is seen
            counts[word] = counts.get(word, 0) + 1
print(counts)                         # the keys print with a b'...' prefix because they are bytes

{b'sun': 2, b'fair': 1, b'and': 3, b'But': 1, b'grief': 1, b'east': 1, b'light': 1, b'is': 3, b'pale': 1, b'window': 1, b'sick': 1, b'soft': 1, b'what': 1, b'through': 1, b'already': 1, b'the': 3, b'Arise': 1, b'Juliet': 1, b'with': 1, b'It': 1, b'yonder': 1, b'breaks': 1, b'Who': 1, b'envious': 1, b'kill': 1, b'moon': 1}


In [3]:
# 12.5 & 12.6 Parsing HTML and Scraping the Web Using Regular Expressions

import urllib.request
import re

url = "http://www.cmu.edu"
# Open the URL. The 'with' statement closes the connection once the page
# has been read, even if reading fails.
with urllib.request.urlopen(url) as fhand:
    # print(type(fhand))                   # fhand is a 'http.client.HTTPResponse' object
    html = fhand.read()                    # read the whole page
# print(type(html))                        # http.client.HTTPResponse.read() returns a 'bytes' object
htmlStr = html.decode('utf-8', 'ignore')   # decode the bytes object into a Unicode string, and
                                           # 'ignore' any bytes that cannot be decoded.

# Find all the absolute http:// URLs (links) on this page. The non-greedy
# '.*?' stops at the first closing double quote, so each match is the value
# of a single href attribute.
links = re.findall('href="(http://.*?)"', htmlStr)
for link in links:
    print(link)
    

http://www.cmu.edu/news/feeds/news.rss
http://www.cmu.edu/
http://www.library.cmu.edu/
http://admission.enrollment.cmu.edu/
http://www.cmu.edu/graduate/admissions/
http://www.cmu.edu/leadership/
http://athletics.cmu.edu
http://www.giving.cmu.edu/s/1410/giving/16/landing.aspx?sid=1410&amp;gid=1&amp;pgid=7010
http://www.cmu.edu/news/stories/archives/2016/october/historymakers.html
http://cmtoday.cmu.edu/artsculture_entertainment/artistic-double-play/
http://cmtoday.cmu.edu/publicpolicy_innovation/improving-global-health-one-shot-at-a-time/
http://www.cmu.edu/news/stories/archives/2016/october/lackner-wins-200th%20.html
http://www.cmu.edu/news/stories/archives/2016/october/frontiers-post.html
http://www.cmu.edu/news/stories/archives/2016/october/colbert-obama.html
http://www.cmu.edu/news
http://www.cmu.edu/leadership/pres-fellow-scholar/
http://www.cmu.edu/brain
http://www.cmu.edu/energy
http://www.cmu.edu/simon
http://www.cmu.edu/strategic-plan/index.html
http://www.cmu.edu/news/stories/

***

## Install BeautifulSoup by executing the following command in your <u>virtual environment</u>:

pip install bs4

***

### Commonly used BeautifulSoup methods

#### soup methods
```
soup.find(id="xxxx")         # find the tag with a specific id. (An id is meant to be unique within an HTML page.)
soup('a')                    # list all the 'a' tags in the page.
soup.find_all("div", { "class" : "topstories" })   # find all instances of a specific tag (e.g., <div>)
                                                   # with a specific class (e.g., "topstories")
```

#### tag methods
```
tag.get('href', None)        # get the tag's attribute (e.g., 'href'). If there is no attribute with the
                             # specified name, return None.
tag.text                     # returns the tag's text content
```                               

In [6]:
import urllib.request
from bs4 import BeautifulSoup as bs

url = "http://www.oocities.org/rainforest/1035/"
# Download the page. The 'with' statement closes the connection after reading.
with urllib.request.urlopen(url) as fhand:
    html = fhand.read()

soup = bs(html, "html.parser")

# Retrieve all the anchor tags
tags = soup('a')
for tag in tags:
    ## Look at the parts of a tag
    print("---")
    print('TAG:' + str(tag))
    print('URL:' + str(tag.get('href', None)))
    # An anchor with nothing inside it (e.g., <a name="top"></a>) has an empty
    # 'contents' list, so indexing [0] directly would raise an IndexError.
    firstChild = tag.contents[0] if tag.contents else None
    print('Content:' + str(firstChild))
    print('Attrs:' + str(tag.attrs))
    print('Text:' + str(tag.text))
    
## Note: Unlike in the book, the 'tag' types in BeautifulSoup cannot be implicitly converted to string.

---
TAG:<a href="com.htm"><img border="0" src="icon2a.gif">Come in!</img></a>
URL:com.htm
Content:<img border="0" src="icon2a.gif">Come in!</img>
Attrs:{'href': 'com.htm'}
Text:Come in!
---
TAG:<a href="history.htm"><img border="0" src="icon5a.gif">Our history page</img></a>
URL:history.htm
Content:<img border="0" src="icon5a.gif">Our history page</img>
Attrs:{'href': 'history.htm'}
Text:Our history page
---
TAG:<a href="what.htm"><img border="0" src="icon4a.gif"> Whats` New</img></a>
URL:what.htm
Content:<img border="0" src="icon4a.gif"> Whats` New</img>
Attrs:{'href': 'what.htm'}
Text: Whats` New
---
TAG:<a href="yuv_home.htm"> y.</a>
URL:yuv_home.htm
Content: y.
Attrs:{'href': 'yuv_home.htm'}
Text: y.
---
TAG:<a href="guest.htm">guest book</a>
URL:guest.htm
Content:guest book
Attrs:{'href': 'guest.htm'}
Text:guest book
---
TAG:<a href="mailto:mossad@eindor.org.il">mossad@eindor.org.il</a>
URL:mailto:mossad@eindor.org.il
Content:mossad@eindor.org.il
Attrs:{'href': 'mailto:mossad@eind

***
### The following program visits the Post-Gazette's Opinion page (http://www.post-gazette.com/opinion) and finds a list of URLs for the current top stories. Then, the program visits each of the top stories, retrieves the story, and saves it as a text file in the `text/` folder.

In [7]:
import os
import re
import urllib.request
from bs4 import BeautifulSoup as bs
# ------------------------------------------------------------
# Part I - Create a list of URLs for the current top stories in 
# the Opinions Section
# ------------------------------------------------------------
url = "http://www.post-gazette.com/opinion"    # URL
with urllib.request.urlopen(url) as fhand:     # open the URL ('with' closes it for us)
    html = fhand.read()                        # read the page
soup = bs(html, "html.parser")                 # create a beautifulsoup object

topStories = soup.find_all("div", { "class" : "topstories" })  # find all the <div> tags, where class = top stories.

# Find all of the anchor ('a') tags in the <div>, and put them in the 'links' list.
links = list()                           # Initialize 'links' with an empty list.
if topStories:                           # Only index [0] when at least one <div> was found.
    tags = topStories[0].find_all('a')   # Find all the 'a' tags within the top stories <div>
    for tag in tags:                     # For each 'a' tag in a list of 'a' tags
        # We know that if the 'a' tag has a class attribute, it is not a story.
        if tag.get('class') is None:     # no class attr (i.e., one of the top story links)
            href = tag.get('href')       # get the URL attribute in the 'a' tag (None if missing).
            # Keep absolute web links only; relative links and mailto: links are skipped.
            if href is not None and href.startswith(("http://", "https://")):
                links.append(href)
else:
    print("No <div class=\"topstories\"> found on {}".format(url))

# ------------------------------------------------------------
# Part II - Visit each top story page, retrieve the content, and 
# save its title and paragraphs (story) as a text file.
# ------------------------------------------------------------
os.makedirs("text", exist_ok=True)           # make sure the output folder exists before writing to it

for url in links:                            # for each URL in the list of URLs (top stories),
    print("URL: {}\n".format(url))           # print the URL (just check to make sure it looks right)
    with urllib.request.urlopen(url) as fhand:   # open the URL
        html = fhand.read()                      # read the page
    # Use the same built-in parser as Part I; "lxml" needs a separate package
    # that 'pip install bs4' does not install.
    soup = bs(html, "html.parser")           # create a beautifulsoup object
    story = soup.find(id="story")            # find an element (tag) with id="story"
    if story is None:                        # not a story page (e.g., a cartoon); skip it
        continue

    header = story.find_next("h1")                                  # find the <h1> tag.
    body   = story.find_next("div", { "class" : "thisStory" })      # find <div> with class="thisStory"
    if header is None or body is None:       # skip pages without both a title and a body
        continue

    title = header.text.strip()              # drop whitespace around the title
    # Construct a filename from the title: spaces become dashes, and characters
    # that are not allowed in file names (or would create sub-folders) are removed.
    name = re.sub(r'[\\/:*?"<>|\r\n\t]', "", title.replace(" ", "-")) or "untitled"
    filename = "text/{}.txt".format(name)

    # Open a new file to write. An explicit encoding avoids errors on non-ASCII
    # characters, and 'with' closes the file even if a write fails.
    with open(filename, "w", encoding="utf-8") as fout:
        fout.write("{}\n\n".format(title))           # write the article's title
        for p in body.find_all('p'):                 # find all the <p> tags (i.e., paragraphs)
            fout.write("{}\n\n".format(p.text))      # write each paragraph to the file.
    

URL: http://www.post-gazette.com/opinion/editorials/2016/10/24/China-gears-up-As-plenary-session-opens-internal-issues-rule/stories/201610220039

URL: http://www.post-gazette.com/opinion/editorials/2016/10/24/Euro-Mars-angst-But-the-lander-s-flub-was-just-a-bump-on-its-mission/stories/201610240023

URL: http://www.post-gazette.com/opinion/editorials/2016/10/24/Walk-on-the-mild-side-Stressed-Angry-Breathe-take-a-mellow-stroll/stories/201610310050

URL: http://www.post-gazette.com/opinion/editorials/2016/10/23/Harrisburg-fiddles-Lawmakers-are-busy-avoiding-the-big-topics/stories/201610310046

URL: http://www.post-gazette.com/opinion/editorials/2016/10/23/Ukraine-cont-Let-the-Europeans-hash-out-the-mess-with-Russia/stories/201610220034

URL: http://blogs.post-gazette.com/opinion/rob-rogers-cartoons/47689-worthless-id

URL: http://blogs.post-gazette.com/opinion/rob-rogers-cartoons/47676-transfer-of-power

URL: http://blogs.post-gazette.com/opinion/rob-rogers-cartoons/47659-supreme-election