## wikiからリンクを取得する

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
#bsオブジェクトを作成してhref属性のものだけ抜き出す
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])
#無駄なものも多く存在することに気づく

/wiki/Wikipedia:Protection_policy#semi
#mw-head
#p-search
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_SDCC_2014.jpg
/wiki/Philadelphia
/wiki/Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
http://baconbros.com/
#cite_note-1
#cite_note-actor-2
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/The_Guardian
/wiki/Academy_Award
#cite_note-3
/wiki/Hollywood_Walk_of_Fame
#cite_note-4
/wiki/Social_networks
/wiki/Six_Degrees_of_Kevin_Bacon
/wiki/SixDegrees.org
#cite_note-walk-5
#Early_life_and_education
#Acting_career
#Early_work
#1980s
#1990s
#2000s
#2010s
#Advertising_work
#Personal_life
#Six_Degrees_of_Kevin_Bacon
#Music
#Awards_and_nom

## Retrieving Articles Only
上記を工夫して記事のリンクだけ抜き出すことに挑戦する。

**リンクの共通点**
1. divの中にあり、idがbodyContentに設定っされている
2. URLの中に:がない
3. URLは/wiki/から始まる

In [2]:
from urllib.request import urlopen 
from bs4 import BeautifulSoup 
import re

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
# Kevin_Baconのページのbsオブジェクトを作る。

for link in bs.find('div', {'id':'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')):
    # wikiのリンクを探す。
    if 'href' in link.attrs:
        print(link.attrs['href'])
        #リンクの内容を表示

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia
/wiki/Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/The_Guardian
/wiki/Academy_Award
/wiki/Hollywood_Walk_of_Fame
/wiki/Social_networks
/wiki/Six_Degrees_of_Kevin_Bacon
/wiki/SixDegrees.org
/wiki/Philadelphia
/wiki/Edmund_Bacon_(architect)
/wiki/Pennsylvania_Governor%27s_School_for_the_Arts
/wiki/Bucknell_University
/wiki/Glory_Van_Scott
/wiki/Circle_in_the_Square
/wiki/Nancy_Mills
/wiki/Cosmopolitan_(magazine)
/wiki/Fraternities_and_sororities
/wiki/Animal_House
/wiki/Search_for_Tomorrow
/wiki/Guiding_Light
/wiki/Friday_the_13th_(1980_film)

## Random Walk

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    '''
    上記のwikiのリンクを取得するコードを関数化したもの
    '''
    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

# ランダムにリンクを選んで、そのページの中の記事をまたランダムに選ぶことを繰り返す処理
links = getLinks('/wiki/Kevin_Bacon')
end_cnt=0
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
    end_cnt+=1
    if end_cnt>10:
        #終了用に自分で追加した
        break

/wiki/Michael_Greif
/wiki/The_New_York_Times
/wiki/Jim_Hoagland
/wiki/Anthony_Shadid
/wiki/Westview_Press
/wiki/Textbook
/wiki/Kirtsaeng_v._John_Wiley_%26_Sons,_Inc.
/wiki/2d_Cir.
/wiki/John_F._Keenan
/wiki/Frederick_Scullin
/wiki/Lewis_A._Kaplan


## 3.2 サイト全体をクローリングする
## Recursively crawling an entire site
サイト全体を知るために再帰的にクローリングする

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set() #ここですでに訪れたサイトの履歴を保持しておく
def getLinks(pageUrl):
    global pages
    #自分で追加した強制終了条件
    if len(pages)>5:
        return 0
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage) #訪れたページは格納しておく
                getLinks(newPage) #再帰処理で再びクローリング
                #深さ優先探索みたいなことを行っている
getLinks('') #トップページからはじめる

/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:Protection_policy#template
/wiki/Wikipedia:Lists_of_protected_pages
/wiki/Wikipedia:Perennial_proposals
/wiki/Wikipedia:WPPP
/wiki/Wikipedia:Child_protection
/wiki/Wikipedia:WikiProject_Protected_areas
/wiki/Wikipedia:Policies_and_guidelines
/wiki/Wikipedia:What_%22Ignore_all_rules%22_means#Use_common_sense
/wiki/Wikipedia:Consensus
/wiki/Wikipedia:Shortcut
/wiki/Wikipedia:Edit_requests
/wiki/Wikipedia:ADMIN
/wiki/Wikipedia:Administrator_intervention_against_vandalism
/wiki/Wikipedia:Username_policy
/wiki/Wikipedia:Usernames_for_administrator_attention
/wiki/Wikipedia:Appealing_a_block
/wiki/Wikipedia:Edit_warring
/wiki/Wikipedia:3RR
/wiki/Wikipedia:Administrators%27_noticeboard/Edit_warring
/wiki/Wikipedia:BLOCK
/wiki/Wikipedia:Administrators%27_noticeboard/Incidents
/wiki/Wikipedia:Enforcement
/wiki/Wikipedia:Administrators
/wiki

## 3.2.1 サイト全体でデータを収集する
## Collecting Data Across an Entire Site

先程はページから他のページに飛ぶだけだったが、今度は情報を表示しながら飛び回る。

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages

    # 自分で追加した終了条件
    if len(pages)>10:
        return

    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print('title is', pageUrl)
        #ここでh1を取得することを試みている。
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except AttributeError:
        #h1がない場合errorになるので、exceptを用意している。
        print('This page is missing something! Continuing.')
    
    #でまた深さ優先探索的な
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
#                 print('-'*20)
#                 print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('') 

title is 
Main Page
<p><b><a href="/wiki/Pitta" title="Pitta">Pittas</a></b> (Pittidae) are a <a href="/wiki/Family_(biology)" title="Family (biology)">family</a> of <a href="/wiki/Bird" title="Bird">birds</a> found in Asia, Australasia and Africa. There are numerous species in three <a href="/wiki/Genus" title="Genus">genera</a>, <i><a href="/wiki/Pitta_(genus)" title="Pitta (genus)">Pitta</a></i>, <i><a href="/wiki/Erythropitta" title="Erythropitta">Erythropitta</a></i> and <i><a href="/wiki/Hydrornis" title="Hydrornis">Hydrornis</a></i>, all similar in general appearance and habits. They are Old World <a href="/wiki/Tyranni" title="Tyranni">suboscines</a>, closely related to the <a href="/wiki/Broadbill" title="Broadbill">broadbills</a>. Pittas are medium-sized by <a href="/wiki/Passerine" title="Passerine">passerine</a> standards, at 15 to 25 cm (5.9–9.8 in) in length, and stocky, with strong, longish legs and long feet. They have very short <a href="/wiki/Tail" title="Tail">tails<

## 3.3 インターネットをクローリング
## Crawling across the Internet

単一ドメインだけでなく外部のサイトもクローリングする。
自分の目的とは沿わないので飛ばす

In [6]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
            
#Retrieves a list of all external links found on a page
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" that do
    #not contain the current URL
    for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,
                                    len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
    
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)
            
followExternalOnly('http://oreilly.com')

Random external link is: http://twitter.com/oreillymedia


KeyboardInterrupt: 

## Collect all External Links from a Site

In [8]:
# Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()


def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                              urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)


allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

https://www.oreilly.com
http://www.oreilly.com/ideas
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav
http://www.oreilly.com/conferences/
http://shop.oreilly.com/
http://members.oreilly.com
https://www.oreilly.com/topics
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now
https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170203+homepage+sign+in
https://www.safaribooksonline.com/live-training/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+take+a+live+online+course
https://www.safaribooksonline.com/learning-paths/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+follow+a+path
https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_c

KeyboardInterrupt: 