# Python Programming 4-days class
## Web Scraping with Python
Author: Paul Yang
Date: June, 2016 

### Exercise 14

In [None]:
from urllib.request import urlopen
html = urlopen("http://google.com/")
print(html.read())

### Exercise 15  easy and organized way to handle exception

In [1]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup as bs

def getTitle(url):
    try:
        htmlConn = urlopen(url)
    except HTTPError as e:
        #print("HTTP error happened ", e, sep=":") #logging error 
        return(None)
    try:
        bsObj = bs(htmlConn.read(),"html.parser") #if the server did not exist, html would be a None object, and html.read() would throw an AttributeError
        title = bsObj.body.h1
    except AttributeError as e:
        #print("Tag was not found due to %s" %(e)) #logging error
        return None
    return title

title = getTitle("http://localhost:5000/static/demo1.html")
if title == None:
    print("Title could not be found")
else:
    print(title)

<h1>An Interesting Title1</h1>


### Exercise 16

In [2]:
htmlConn = urlopen("http://localhost:5000/static/demo2.html")
bsObj = bs(htmlConn,"html.parser")
name_list = bsObj.findAll('span',{'class':'red','class':'blue','class':'green'})
for n in name_list:
    print(n.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
What answer did Novosiltsev get? None.
The English have not understood and cannot understand the
self-abnegation of our Emperor who wants nothing for himself, but only
desires the good of mankind.
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


### Exercise 17 - scaping all media files

In [3]:
#The following will download all internal files, linked to by any tag’s src attribute
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
downloadDirectory = "download"

def retrieve_absoluteURL(baseUrl, source):
    print("baseUrl:", baseUrl)
    print("source: ", source)
    #incase of incosistent
    if source.startswith("/"):
        source = source[1:]
        print("source: ", source)
    if source.startswith("http://www."):
        url = "http://"+source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = source[4:]
        url = "http://"+source
    elif source.startswith("https://"):
        url = "http://"+source[8:]
    elif source.startswith("https://www."):
        url = "http://"+source[12:]
    else:
        url = baseUrl+"/"+source
    if baseUrl not in url:
        return None
    print("url2: ", url)
    return url



In [4]:
def create_localFilePath(baseUrl, absoluteUrl, downloadDirectory):
    path = absoluteUrl.replace("www.", "")
    print("1.path: ", path)
    path = re.sub('\?', '_', path)
    print("2.path: ", path)
    
    path = path.replace(baseUrl, "")
    
    print("3.path: ", path)
    path = downloadDirectory+path
    directory = os.path.dirname(path)
    print("4.directory: ", directory)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path

def retrieve_mediaDownloadPath(baseUrl):
    html = urlopen(baseUrl)
    bsObj = BeautifulSoup(html,'html.parser')
    mediaDownloadList = bsObj.findAll(src=True)
    return mediaDownloadList

In [7]:
baseUrl = "http://www.tutorialspoint.com".replace("www.", "")
downloadList = retrieve_mediaDownloadPath(baseUrl)
for download in downloadList:
    fileUrl = retrieve_absoluteURL(baseUrl, download["src"])
    print("fileUrl:",fileUrl)
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, create_localFilePath(baseUrl, fileUrl, downloadDirectory))

baseUrl: http://tutorialspoint.com
source:  /theme/js/script-min-v4.js
source:  theme/js/script-min-v4.js
url2:  http://tutorialspoint.com/theme/js/script-min-v4.js
fileUrl: http://tutorialspoint.com/theme/js/script-min-v4.js
http://tutorialspoint.com/theme/js/script-min-v4.js
1.path:  http://tutorialspoint.com/theme/js/script-min-v4.js
2.path:  http://tutorialspoint.com/theme/js/script-min-v4.js
3.path:  /theme/js/script-min-v4.js
4.directory:  download/theme/js
baseUrl: http://tutorialspoint.com
source:  /images/loading-cg.gif
source:  images/loading-cg.gif
url2:  http://tutorialspoint.com/images/loading-cg.gif
fileUrl: http://tutorialspoint.com/images/loading-cg.gif
http://tutorialspoint.com/images/loading-cg.gif
1.path:  http://tutorialspoint.com/images/loading-cg.gif
2.path:  http://tutorialspoint.com/images/loading-cg.gif
3.path:  /images/loading-cg.gif
4.directory:  download/images
baseUrl: http://tutorialspoint.com
source:  /green/images/logo.png
source:  green/images/logo.png


### Exercise 18

In [8]:
wiki_url = 'https://zh.wikipedia.org/wiki/%E5%B7%B2%E9%96%8B%E7%99%BC%E5%9C%8B%E5%AE%B6'
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
filename = "countries1.csv"
html = urlopen(wiki_url)
bsObj = BeautifulSoup(html,"html.parser")
#The main comparison table is currently the first table on the page
table = bsObj.findAll("table",{"class":"wikitable"})[0]
rows = table.findAll("tr")
csvFile = open(filename, 'w', encoding = 'utf8')
writer = csv.writer(csvFile,lineterminator='\n')
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['th','td']):
            csvRow.append(cell.get_text())
            print(csvRow[-1])
            #print(cell.get_text())
        #writer.writerow([r.encode('utf-8') for r in csvRow])
        writer.writerow(csvRow)
finally:
    csvFile.close()

成為發達經濟體年份
成為高收入經濟體年份
國家或地區
區域
人口[28]
面積（km²）[29]
人口密度
1990年以前
1987–
 美國
美洲
323,140,000
9,629,091
33.6
1990年以前
1987–
 加拿大
美洲
35,873,000
9,984,670
3.59
不適用[註 1]
1987–
 百慕大
美洲
66,364
54
1220
2016年[註 2]
1989, 2002–
 波多黎各
美洲
3,725,789
9,104
409
1990年以前
1987–
 澳大利亚
大洋洲
23,957,000
7,692,024
3.11
1990年以前
1987–
 新西蘭
大洋洲
4,648,500
270,534
17.2
1990年以前
1987–
 日本
亞洲
126,810,000
377,972
335
1997年
1995–1997, 2001–
 韩国
亞洲
50,464,000
99,600
507
1997年
1987–
 新加坡
亞洲
5,488,500
716
7665
1997年
1987–
 臺灣
亞洲
23,492,074
36,197
649
1997年
1987–
 香港
亞洲
7,304,100
1,105.6
6606
1997年
1987–
 以色列
亞洲
8,424,400
20,770
406
2016年[註 2]
1994–
 澳門
亞洲
649,100
30.4
21352
不適用[註 1]
1990–
 安道尔
歐洲
79,403
468
170
1990年以前
1987–
 奥地利
歐洲
8,441,500
83,858
101
1990年以前
1987–
 比利時
歐洲
11,160,000
30,528
366
2001年
1988–
 賽普勒斯
歐洲
1,203,900
9,251
130
2009年
2006–
 捷克
歐洲
10,460,000
78,866
133
1990年以前
1987–
 丹麥
歐洲
5,648,000
43,094
131
2011年
2006–
 爱沙尼亚
歐洲
1,303,600
45,100
28.9
1990年以前
1987–
 芬兰
歐洲
5,491,500
338,145
16.2
1990年以前
1987–
 法国
歐洲
67,0

### Execise19 - Inserting/Reading images

In [17]:
#1. create db and table
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3 as lite
import sys

con = lite.connect('test2.db')

with con:
    
    cur = con.cursor()    
    cur.execute("CREATE TABLE Images(Id INTEGER PRIMARY KEY, Data BLOB)")

In [18]:
import sqlite3 as lite
import sys


def readImage():

    try:
        fin = open("google_play.jpg", "rb")
        img = fin.read()
        return img
        
    except IOError as e:

        print("Error %d: %s" % (e.args[0],e.args[1]))
        sys.exit(1)

    finally:
        
        if fin:
            fin.close()

In [19]:
#Inserting images
try:
    con = lite.connect('test2.db')
    
    cur = con.cursor()
    data = readImage()
    binary = lite.Binary(data)
    cur.execute("INSERT INTO Images(Data) VALUES (?)", (binary,) )
    con.commit()    
    
except lite.Error as e:
    
    if con:
        con.rollback()
        
    print("Error %s:" % e.args[0])
    sys.exit(1)
    
finally:
    
    if con:
        con.close()  

In [21]:
import sqlite3 as lite
import sys


def writeImage(data):
    
    try:
        fout = open('google_play_clone.jpg','wb')
        fout.write(data)
    
    except IOError as e:    
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
        
    finally:
        
        if fout:
            fout.close()       

In [22]:
try:
    con = lite.connect('test2.db')
    
    cur = con.cursor()    
    cur.execute("SELECT Data FROM Images LIMIT 1")
    data = cur.fetchone()[0]
    
    writeImage(data)

    
except lite.Error as e:
    
    print("Error %s:" % e.args[0])
    sys.exit(1)
    
finally:
    
    if con:
        con.close()      