# Python Programming 4-Day Class
## Web Scraping with Python

Author: Paul Yang
Date: June, 2016 






### Retrieving HTML data

In [None]:
# Fetch a public page over HTTP and print its raw body.
# Importing the submodule explicitly also binds the name `urllib`; a bare
# `import urllib` does not by itself guarantee `urllib.request` is loaded.
import urllib.request
from urllib.request import urlopen

# The context manager closes the connection once the body has been read.
with urlopen("http://google.com") as html:
    # read() returns the undecoded response body.
    print(html.read())


In [None]:
# Fetch the root page of the local demo server and print its raw body.
# Importing the submodule explicitly also binds the name `urllib`; a bare
# `import urllib` does not by itself guarantee `urllib.request` is loaded.
import urllib.request
from urllib.request import urlopen

# The context manager closes the connection once the body has been read.
with urlopen("http://localhost:5000/") as html:
    print(html.read())

### BeautifulSoup

In [None]:
# Download demo1.html, parse it with BeautifulSoup and print its first <h1>.
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs

# Call the name this cell actually imports, and close the connection as soon
# as the body has been handed to the parser.
with urlopen("http://localhost:5000/static/demo1.html") as html:
    bsObj = bs(html.read(), "html.parser")

# Attribute-style access returns the first tag with that name.
print(bsObj.h1)

In [None]:
# Show the Python type of the object that attribute-style access (`bsObj.h1`) yields.
type(bsObj.h1)

In [None]:
# Fetch the page again and show the raw, unparsed response body.
html = urllib.request.urlopen("http://localhost:5000/static/demo1.html")
# Left as a bare expression so the notebook displays the value of read().
html.read()

### Connecting Reliably

In [None]:
# Situation 1: the server answers, but with an HTTP error status.
from urllib.error import HTTPError

try:
    htmlConn = urlopen("http://localhost:5000/static/demo1.html")
except HTTPError as http_err:
    # A real scraper would return, break, or switch to a "Plan B" here.
    print(http_err)
else:
    # Reached only when urlopen() raised nothing. If the except branch
    # returned or broke out, this code would not need to live under "else".
    page_source = htmlConn.read()
    bsObj = bs(page_source, "html.parser")
    print(bsObj.h1)

In [None]:
# second situation (the page is missing, or the server cannot be reached at all)
# urlopen() never returns None: it raises HTTPError when the server answers
# with an error status and URLError when no connection can be made.
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

try:
    htmlConn = urlopen("http://localhost:5000/static/de1.html")
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be handled first.
    print("URL is not found:", e)
except URLError as e:
    # No response at all, e.g. the server is down or the host name is wrong.
    print("Server could not be reached:", e.reason)
else:
    #program continues
    pass

In [None]:
## the third situation: the tag doesn't exist, so the lookup yields a None object
html = urlopen("http://localhost:5000/static/demo1.html")
# Parse the response that was just opened (`html`), not a connection left
# over from an earlier cell.
bsObj = bs(html.read(), "html.parser")
# Asking for a tag that is absent from the document gives None instead of
# raising (assuming demo1.html contains no <fooTag>).
print(bsObj.fooTag)
# Going one level deeper fails on purpose: None has no attributes, so this
# line raises AttributeError.
print(bsObj.fooTag.someTag)

In [None]:
# Guard against both failure modes: a missing parent tag (attribute access on
# None raises AttributeError) and a missing child tag (the lookup gives None).
try:
    badContent = bsObj.foo.anotherTag
except AttributeError:
    print("Tag was not found")
else:
    # Identity test: None is a singleton, and `is` does not go through the
    # object's own equality method.
    if badContent is None:
        print("Tag was not found")
    else:
        print(badContent)

### More on scraping

In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup as bs

# Download demo2.html and hand the open response straight to the parser.
htmlConn = urlopen("http://localhost:5000/static/demo2.html")
bsObj = bs(htmlConn, "html.parser")

# Collect every <span> whose class attribute is "green".
name_list = bsObj.findAll("span", attrs={"class": "green"})


In [None]:
# Check the type of the collection currently held in name_list.
type(name_list)

In [None]:
# Display the contents of name_list.
name_list

In [None]:
# For each match, show the type of the tag itself, of its .string, and of
# the value returned by get_text().
for span in name_list:
    for value in (span, span.string, span.get_text()):
        print(type(value))

In [None]:
# Print the text content of each matched tag, one per line.
for tag in name_list:
    tag_text = tag.get_text()
    print(tag_text)

In [None]:
# Count the strings in the document that are exactly "the prince".
# `string=` is the current name of the search argument formerly called
# `text=` (renamed in Beautiful Soup 4.4.0).
nameList = bsObj.findAll(string="the prince")
print(len(nameList))

In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup as bs

htmlConn = urlopen("http://localhost:5000/static/demo1.html")
bsObj = bs(htmlConn, "html.parser")

# Passing a collection of tag names matches any tag whose name is in it.
heading_names = {'h1', 'h2', 'h3'}
name_list = bsObj.findAll(heading_names)
print(name_list)

### DEMOS of children and other descendants and siblings, parents

In [None]:
# Load demo3.html and keep the direct children of the table with id "giftList".
htmlConn = urlopen("http://localhost:5000/static/demo3.html")
bsObj = bs(htmlConn, "html.parser")
gift_table = bsObj.find('table', attrs={'id': "giftList"})
name_list = gift_table.children


In [None]:
# Look at the object behind .children of the gift table, without iterating over it.
bsObj.find('table',{'id':"giftList"}).children

In [None]:
# Print the type and the content of each direct child of the gift table.
gift_table = bsObj.find('table', {'id': "giftList"})
for child in gift_table.children:
    print(type(child), child, sep="\n")

In [None]:
# Print the type and the content of every node nested anywhere inside the
# gift table, not just its direct children.
gift_table = bsObj.find('table', {'id': "giftList"})
for node in gift_table.descendants:
    print(type(node), node, sep="\n")

In [None]:
# Print everything that follows the gift table's first <tr> at the same level.
first_row = bsObj.find("table", {"id": "giftList"}).tr
for following in first_row.next_siblings:
    print(following)

In [None]:
htmlConn = urlopen("http://localhost:5000/static/demo3.html")
bsObj = bs(htmlConn, "html.parser")

# Start at a specific image, step up to its parent, then sideways to the
# node just before that parent, and print that node's text.
img_tag = bsObj.find("img", {"src": "./demo3_files/img1.jpg"})
node_before_parent = img_tag.parent.previous_sibling
print(node_before_parent.get_text())


In [None]:
import re

html = urlopen("http://localhost:5000/static/demo3.html")
bsObj = bs(html, "html.parser")

# Select <img> tags whose src contains "img" followed somewhere by ".jpg".
# The pattern is a raw string so backslashes reach the regex engine
# untouched, and only the literal dot is escaped: a backslash in front of an
# ordinary letter such as "i" is rejected by `re` as a bad escape.
jpg_pattern = re.compile(r".*img.*\.jpg")
images = bsObj.findAll("img", {"src": jpg_pattern})
for image in images:
    print(image["src"])