Python for Everybody
## Chapter 12. Networked Programs



In [1]:
# 12.1 HyperText Transport Protocol - HTTP
# Read the textbook

### You can skip the coding samples in the following sections
- 12.2 The World's Simplest Web Browser
- 12.3 Retrieving an Image over HTTP



In [6]:
# 12.4 Retrieving web pages with urllib

import urllib.request                 # import the urllib.request module
                                      # Using this module, you can'talk' to web servers.

# utlopen() is like open, except the file is on a remote server.
http_resp = urllib.request.urlopen('http://www.py4inf.com/code/romeo.txt')   
print(http_resp)
counts = dict()                       
for line in http_resp:
    words = line.split()       # line is a sequence of bytes.
    for word in words:
        counts[word] = counts.get(word,0) + 1
print(counts)
http_resp.close()

print("---------------")
http_resp = urllib.request.urlopen('http://www.py4inf.com/code/romeo.txt')   
counts = dict()
for line in http_resp:
    line = line.decode('utf-8')       # convert the line to a unicode string
    words = line.split()
    for word in words:
        counts[word] = counts.get(word,0) + 1
print(counts)
http_resp.close()

<http.client.HTTPResponse object at 0x1079f9f28>
{b'But': 1, b'soft': 1, b'what': 1, b'light': 1, b'through': 1, b'yonder': 1, b'window': 1, b'breaks': 1, b'It': 1, b'is': 3, b'the': 3, b'east': 1, b'and': 3, b'Juliet': 1, b'sun': 2, b'Arise': 1, b'fair': 1, b'kill': 1, b'envious': 1, b'moon': 1, b'Who': 1, b'already': 1, b'sick': 1, b'pale': 1, b'with': 1, b'grief': 1}
---------------
{'But': 1, 'soft': 1, 'what': 1, 'light': 1, 'through': 1, 'yonder': 1, 'window': 1, 'breaks': 1, 'It': 1, 'is': 3, 'the': 3, 'east': 1, 'and': 3, 'Juliet': 1, 'sun': 2, 'Arise': 1, 'fair': 1, 'kill': 1, 'envious': 1, 'moon': 1, 'Who': 1, 'already': 1, 'sick': 1, 'pale': 1, 'with': 1, 'grief': 1}


In [3]:
# 12.5 & 12.5 Parsing HTML and Scraping the Web Using Regular Expressions

import urllib.request
import re

url = "http://www.cmu.edu"
fhand = urllib.request.urlopen(url)        # open a URL
# print(type(fhand))                       # fhand is a 'http.client.HTTPResponse' object
html  = fhand.read()
# print(type(html))                        # http.client.HTTPResponse.read() returns a seqquence of'bytes'
htmlStr = html.decode('utf-8', 'ignore')   # decode the bytes object into a UNICODE string, and
                                           # 'ignore' any erros.

links = re.findall('href="(http://.*?)"', htmlStr)    # find all the URLs (links) on this page
for link in links:
    print(link)
    

http://www.cmu.edu/news
http://www.cmu.edu/diversity
http://www.cmu.edu/strategic-plan/index.html
http://miller-ica.cmu.edu/exhibitions/#48/upcoming-paradox-the-body-in-the-age-of-ai
http://www.art.cmu.edu/event/zoe-leonard-with-rhea-anastas/
http://music.cmu.edu/events/1055
http://www.alumni.cmu.edu/s/1410/alumni/index-social.aspx?sid=1410&amp;gid=1&amp;pgid=10156&amp;content_id=8922
http://www.cmu.edu/events/
http://www.cmu.edu/jobs/
http://www.cmu.edu/global/presence/
http://www.cmu.edu/news/
http://www.cmu.edu/title-ix/
http://www.alumni.cmu.edu/s/1410/alumni/start.aspx
http://www.facebook.com/carnegiemellonu
http://www.twitter.com/carnegiemellon
http://www.youtube.com/carnegiemellonu
http://www.library.cmu.edu/
http://admission.enrollment.cmu.edu/
http://www.cmu.edu/graduate/admissions/
http://www.cmu.edu/leadership/
http://www.cmu.edu/diversity
http://athletics.cmu.edu
http://www.giving.cmu.edu/s/1410/giving/16/landing.aspx?sid=1410&amp;gid=1&amp;pgid=7010


***

## Install BeautifulSoup by executing the following command in your <u>virtual environment</u>:

pip install bs4

***

### Commonly used BeautifulSoup methods

#### soup methods
```
soup.find(id="xxxx")         # find a tag with a specific id. (There is only 1 tag with a specific ID in HTML.)
soup('a')                    # list all the 'a' tags in the page.
soup.find_all("div", { "class" : "topstories" })   # find all instances of a specific tag (e.g., <div>)
                                                   # with a specific class (e.g., "topstories")
```

#### tag methods
```
tag.get('href', None)        # get the tag's attribute (e.g., 'href'. If there is no attribute with the 
                             # specified name, return None.
tag.text                     # returns the tag's text content
```                               

In [4]:
import urllib.request
from bs4 import BeautifulSoup as bs

url = "https://www.cmu.edu/dietrich/english/"
html = urllib.request.urlopen(url).read()

soup = bs(html, "html.parser")

# Retrieve all the anchor tags
tags = soup('a')
for tag in tags:
    ## Look at the parts of a tag
    print("---")
    print('TAG:'+ str(tag))
    print('URL:' + str(tag.get('href', None)))
    print('Content:' + str(tag.contents[0]))
    print('Attrs:' + str(tag.attrs))
    print('Text:' + str(tag.text))
    
## Note: Unlike in the book, the 'tag' types in BeautifulSoup cannot be implicitly converted to string.

---
TAG:<a class="wordmark" href="//www.cmu.edu/">Carnegie Mellon University</a>
URL://www.cmu.edu/
Content:Carnegie Mellon University
Attrs:{'class': ['wordmark'], 'href': '//www.cmu.edu/'}
Text:Carnegie Mellon University
---
TAG:<a aria-haspopup="true" aria-label="Menu" class="menu-btn" href="#nav"><b>—</b><b>—</b><b>—</b></a>
URL:#nav
Content:<b>—</b>
Attrs:{'aria-haspopup': 'true', 'aria-label': 'Menu', 'class': ['menu-btn'], 'href': '#nav'}
Text:———
---
TAG:<a href="index.html">Department of English</a>
URL:index.html
Content:Department of English
Attrs:{'href': 'index.html'}
Text:Department of English
---
TAG:<a href="https://www.cmu.edu/dietrich/index.html" target="_blank">Dietrich College of Humanities and Social Sciences</a>
URL:https://www.cmu.edu/dietrich/index.html
Content:Dietrich College of Humanities and Social Sciences
Attrs:{'href': 'https://www.cmu.edu/dietrich/index.html', 'target': '_blank'}
Text:Dietrich College of Humanities and Social Sciences
---
TAG:<a class="b

IndexError: list index out of range

***
### The following program visits the CMU's News Stories page (https://www.cmu.edu/news/stories/index.html) and finds a list of URLs for the current top stories. Then, the program visits each of the top stories, retrieves the story, and saves it as a text file.

#### About Robot Exclusion Standard
robots.txt file is used by the owner of the website to indicate which sections of their website may or may not be accessed by computer programs. If you are intested, the following web sites provide you with more information: 
- http://www.robotstxt.org/orig.html
- https://en.wikipedia.org/wiki/Robots_exclusion_standard

Try opening the following file on the CMU's website: https://www.cmu.edu/robots.txt


In [8]:
import urllib.request
from bs4 import BeautifulSoup as bs
# ------------------------------------------------------------
# Part I - Create a list of URLs for the current top stories in 
# the Opinions Section
# ------------------------------------------------------------
url = "https://www.cmu.edu/news/stories/index.html"    # URL
url_hand = urllib.request.urlopen(url)                 # open the URL
html = url_hand.read()                                 # read the page
soup = bs(html, "html.parser")                         # create a beautifulsoup object

# find the <div> tags, where claass = news. We assume that there is only one.
topStories = soup.find("div", { "class" : "news" })
#print(type(topStories))
# Find all of the anchor ('a') tags in the <div>, and put it in the 'links' list.
links = list()                           # Initialize 'links' with an empty list.
tags = topStories.findAll('a')           # Find all the 'a' tags within the top stories <div>

for tag in tags[:10]:                     # For each 'a' tag in a list of 'a' tags
    url = tag.get('href', None)          # get the URL attribute in the 'a' tag.
    if url != None and url.startswith("archive"):   # if url is not None, and starts with "http://"
        links.append("https://www.cmu.edu/news/stories/" + url)
url_hand.close()
#print(links)

# ------------------------------------------------------------
# Part II - Visit each top story page, retrieve the content, and 
# save its title and paragraphs (story) as a text file.
# ------------------------------------------------------------
for url in links:                            # for each URL in the list of URLs (top stories),
    print("URL: {}\n".format(url))           # print the URL (just check to make sure it looks right)
    fhand = urllib.request.urlopen(url)      # open the URL
    html = fhand.read()                      # read the page
    soup = bs(html, "lxml")                  # create a beautifulsoup object
    content = soup.find("div", { "class" : "content" })  # find the div tag, where class = content.
    if content != None:                      # if 'story' is not None
        header = content.findNext("h1")      # find a <h1> tag.
        body   = content.findNext("div")     # find a <div> tag.
#         print("header =", header)
#         print("body =", body)
        if header != None and body != None:  # if header and body are both valid (!= None)
            filename = "text/{}.txt".format(header.text.replace(" ", "-"))   # construct a filename
            fout = open(filename, "w")                  # open a new file to write            
            fout.write("{}\n\n".format(header.text))    # write the article's title

            paragraphs = body.findAll('p')              # find all the <p> tags (i.e., paragraphs)
            for p in paragraphs:                        # print each paragraph to the file.
                fout.write("{}\n\n".format(p.text))
            fout.close()
    fhand.close()
    

URL: https://www.cmu.edu/news/stories/archives/2018/october/durand-nsf-grant.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/warhol-exhibit.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/machine-automation-degree.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/patient-records-access.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/grace-hopper-wrapup.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/edgar-mendoza.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/solar-racing.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/ai-medical-images.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/nae-fellows.html

URL: https://www.cmu.edu/news/stories/archives/2018/october/president-inauguration.html

