In [1]:
import requests
import records
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag
from sqlalchemy.exc import IntegrityError



In [3]:
# p164
# Crawl state is kept in a SQLite file next to the notebook.
db = records.Database("sqlite:///wikipedia.db")

# Every URL found while crawling is joined against this prefix, and only
# URLs that start with it are followed.
base_url = "https://en.wikipedia.org/wiki/"

# Schema: `pages` holds one row per known URL, `links` holds one row per
# (source URL, target URL) pair. Both statements are safe to re-run.
_CREATE_TABLE_STATEMENTS = (
    """CREATE TABLE IF NOT EXISTS pages(
            url text PRIMARY KEY,
            page_title text NULL,
            created_at datetime,
            visited_at datetime NULL)""",
    """CREATE TABLE IF NOT EXISTS links(
            url text, url_to text,
            PRIMARY KEY (url, url_to))""",
)
for _statement in _CREATE_TABLE_STATEMENTS:
    db.query(_statement)

def store_page(url):
    """Insert ``url`` into the ``pages`` table, stamped with the current time.

    An ``IntegrityError`` raised by the insert is swallowed, so calling this
    for a URL that is already stored leaves the existing row untouched.
    """
    insert_sql = """INSERT INTO pages (url, created_at)
                    VALUES (:url, CURRENT_TIMESTAMP)"""
    try:
        db.query(insert_sql, url=url)
    except IntegrityError:
        # This page already exists; nothing more to do.
        return
    
def store_link(url, url_to):
    """Insert the pair (``url``, ``url_to``) into the ``links`` table.

    An ``IntegrityError`` raised by the insert is swallowed, so recording
    the same link twice is a no-op.
    """
    insert_sql = """INSERT INTO links (url, url_to)
                    VALUES (:url, :url_to)"""
    try:
        db.query(insert_sql, url=url, url_to=url_to)
    except IntegrityError:
        # This link already exists; nothing more to do.
        return
    
def set_visited(url):
    """Set ``visited_at`` to the current timestamp on the ``pages`` row for ``url``."""
    update_sql = """UPDATE pages SET visited_at=CURRENT_TIMESTAMP
                WHERE url=:url"""
    db.query(update_sql, url=url)
    
def set_title(url, page_title):
    """Store ``page_title`` on the ``pages`` row whose URL is ``url``."""
    update_sql = "UPDATE pages SET page_title =:page_title WHERE url=:url"
    db.query(update_sql, url=url, page_title=page_title)
    
# 165P
def get_random_unvisited_page():
    """Return the URL of one randomly chosen page that has not been visited.

    Returns:
        The ``url`` column of a random ``pages`` row whose ``visited_at`` is
        NULL, or ``None`` when the query yields no rows.
    """
    unvisited = db.query("""SELECT * FROM pages
    WHERE visited_at IS NULL
    ORDER BY RANDOM() LIMIT 1""")
    # The book's version called first() here; this code indexes instead.
    # Indexing an empty result raises IndexError rather than yielding None,
    # so translate that into the None the caller's loop uses to stop.
    try:
        link = unvisited[0]
    except IndexError:
        return None
    return None if link is None else link.url

def visit(url, timeout=10):
    """Download ``url``, record its title and in-wiki links, and mark it visited.

    The page title is the text of the element with id ``firstHeading``, or
    an empty string when the page has no such element. Every ``<a href>``
    that resolves to a URL under ``base_url`` is stored both as a link from
    ``url`` and as a page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled connection raises instead of blocking the crawl forever.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (including timeouts).
    """
    print("Now visiting:", url)
    html = requests.get(url, timeout=timeout).text
    html_soup = BeautifulSoup(html, "html.parser")

    heading = html_soup.find(id="firstHeading")
    page_title = heading.text if heading else ""
    print("page title:", page_title)
    set_title(url, page_title)

    # Pages repeat the same href many times; remembering what this call has
    # already stored avoids issuing INSERTs that are certain to be rejected.
    stored = set()
    for link in html_soup.find_all("a"):
        link_url = link.get("href")
        if link_url is None:
            # No href attribute, so skip this anchor.
            continue
        # Resolve relative links and drop the fragment identifier.
        full_url = urldefrag(urljoin(base_url, link_url))[0]
        if not full_url.startswith(base_url):
            # External link, so skip it.
            continue
        if full_url in stored:
            continue
        stored.add(full_url)
        store_link(url, full_url)
        store_page(full_url)
    set_visited(url)
    
# Seed the crawl with the wiki root, then keep visiting random unvisited
# pages until none are left.
store_page(base_url)
while True:
    url_to_visit = get_random_unvisited_page()
    if url_to_visit is None:
        break
    visit(url_to_visit)
    
    
    

Now visiting: https://en.wikipedia.org/wiki/
page title: Main Page
Now visiting: https://en.wikipedia.org/wiki/Peter_Lehel
page title: Peter Lehel
Now visiting: https://en.wikipedia.org/wiki/Port_Huron,_Michigan
page title: Port Huron, Michigan
Now visiting: https://en.wikipedia.org/wiki/File:Fort_Gratiot_Lighthouse_postcard_-_Port_Huron_Michigan.jpg
page title: File:Fort Gratiot Lighthouse postcard - Port Huron Michigan.jpg
Now visiting: https://en.wikipedia.org/wiki/List_of_municipalities_in_Michigan_(by_population)
page title: List of municipalities in Michigan (by population)
Now visiting: https://en.wikipedia.org/wiki/John_Swainson
page title: John Swainson
Now visiting: https://en.wikipedia.org/wiki/Garfield_Township,_Grand_Traverse_County,_Michigan
page title: Garfield Township, Grand Traverse County, Michigan
Now visiting: https://en.wikipedia.org/wiki/Flint,_MI
page title: Flint, Michigan


OperationalError: (sqlite3.OperationalError) disk I/O error
(Background on this error at: http://sqlalche.me/e/e3q8)