# Web scraping multiple pages

Scrape at least 3 of these sites.

- Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'
- Find the number of titles that have changed in the United States Code since its last release point: url = 'http://uscode.house.gov/download/download.shtml'
- Create a Python list with the top ten FBI's Most Wanted names: url = 'https://www.fbi.gov/wanted/topten'
- Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: url = 'https://emsc-csem.org/Earthquake_information/#1'
- List all language names and number of related articles in the order they appear in wikipedia.org: url = 'https://www.wikipedia.org/'
- A list with the different kind of datasets available in data.gov.uk: url = 'https://data.gov.uk/'
- Display the top 10 languages by number of native speakers stored in a pandas dataframe: url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'

In [2]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Display the top 10 languages by number of native speakers stored in a pandas dataframe

In [3]:
# find url and store it in a variable
url = "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"

In [4]:
# download html with a get request
response = requests.get(url)

In [5]:
response.status_code # 200 status code means OK!

200

In [9]:
# response.content

In [7]:
# parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [10]:
# check that the html code looks like it should
# soup

In [21]:
# retrieve/extract the desired info 
soup.select("a")['title']

TypeError: list indices must be integers or slices, not str

In [37]:
soup.select("table")[0]

In [26]:
prestab = soup.select("table")[0]
prestab
prestab.select("tr td a")

[<a class="mw-redirect" href="/wiki/ISO_639:cmn" title="ISO 639:cmn">Mandarin Chinese</a>,
 <a href="/wiki/Sino-Tibetan_languages" title="Sino-Tibetan languages">Sino-Tibetan</a>,
 <a href="/wiki/Sinitic_languages" title="Sinitic languages">Sinitic</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:spa" title="ISO 639:spa">Spanish</a>,
 <a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>,
 <a href="/wiki/Romance_languages" title="Romance languages">Romance</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:eng" title="ISO 639:eng">English</a>,
 <a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>,
 <a href="/wiki/Germanic_languages" title="Germanic languages">Germanic</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:hin" title="ISO 639:hin">Hindi</a>,
 <a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>,
 <a href="/wiki/Indo-Aryan_languages" title="Indo-Aryan languages">Indo-Ary

In [52]:
prestab.select("tr td a")

[<a class="mw-redirect" href="/wiki/ISO_639:cmn" title="ISO 639:cmn">Mandarin Chinese</a>,
 <a href="/wiki/Sino-Tibetan_languages" title="Sino-Tibetan languages">Sino-Tibetan</a>,
 <a href="/wiki/Sinitic_languages" title="Sinitic languages">Sinitic</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:spa" title="ISO 639:spa">Spanish</a>,
 <a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>,
 <a href="/wiki/Romance_languages" title="Romance languages">Romance</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:eng" title="ISO 639:eng">English</a>,
 <a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>,
 <a href="/wiki/Germanic_languages" title="Germanic languages">Germanic</a>,
 <a class="mw-redirect" href="/wiki/ISO_639:hin" title="ISO 639:hin">Hindi</a>,
 <a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>,
 <a href="/wiki/Indo-Aryan_languages" title="Indo-Aryan languages">Indo-Ary

In [48]:
pre_list=[]
for i in prestab.select("tr td a"):
    pre_list.append(i.get_text())

In [49]:
pre_list

['Mandarin Chinese',
 'Sino-Tibetan',
 'Sinitic',
 'Spanish',
 'Indo-European',
 'Romance',
 'English',
 'Indo-European',
 'Germanic',
 'Hindi',
 'Indo-European',
 'Indo-Aryan',
 'Portuguese',
 'Indo-European',
 'Romance',
 'Bengali',
 'Indo-European',
 'Indo-Aryan',
 'Russian',
 'Indo-European',
 'Balto-Slavic',
 'Japanese',
 'Japonic',
 'Japanese',
 'Yue Chinese',
 'Sino-Tibetan',
 'Sinitic',
 'Vietnamese',
 'Austroasiatic',
 'Vietic',
 'Turkish',
 'Turkic',
 'Oghuz',
 'Wu Chinese',
 'Sino-Tibetan',
 'Sinitic',
 'Marathi',
 'Indo-European',
 'Indo-Aryan',
 'Telugu',
 'Dravidian',
 'Korean',
 'Koreanic',
 'French',
 'Indo-European',
 'Romance',
 'Tamil',
 'Dravidian',
 'Egyptian Arabic',
 'Afroasiatic',
 'Semitic',
 'Standard German',
 'Indo-European',
 'Germanic',
 'Urdu',
 'Indo-European',
 'Indo-Aryan',
 'Javanese',
 'Austronesian',
 'Malayo-Polynesian',
 'Western Punjabi',
 'Indo-European',
 'Indo-Aryan',
 'Italian',
 'Indo-European',
 'Romance',
 'Gujarati',
 'Indo-European',
 'I

In [51]:
languages = pre_list[::3]
languages

['Mandarin Chinese',
 'Spanish',
 'English',
 'Hindi',
 'Portuguese',
 'Bengali',
 'Russian',
 'Japanese',
 'Yue Chinese',
 'Vietnamese',
 'Turkish',
 'Wu Chinese',
 'Marathi',
 'Telugu',
 'Koreanic',
 'Romance',
 'Egyptian Arabic',
 'Standard German',
 'Urdu',
 'Javanese',
 'Western Punjabi',
 'Italian',
 'Gujarati',
 'Iranian Persian',
 'Bhojpuri',
 'Hausa']

In [56]:
top10_languages = languages[:10]

In [57]:
len(top10_languages)

10

In [1]:
# soup.select("table")[0]

In [2]:
# prestab.select("td")

In [65]:
pre_list_numb_speak=[]
for i in prestab.select("td"):
    pre_list_numb_speak.append(i.get_text().strip())

In [66]:
number_speakers = pre_list_numb_speak[1::4]
number_speakers

['939',
 '485',
 '380',
 '345',
 '236',
 '234',
 '147',
 '123',
 '86.1',
 '85.0',
 '84.0',
 '83.4',
 '83.2',
 '83.0',
 '81.7',
 '80.8',
 '78.6',
 '77.4',
 '75.3',
 '70.6',
 '68.3',
 '66.7',
 '64.6',
 '57.1',
 '57.2',
 '52.3',
 '51.7']

In [68]:
top10_number_speakers = number_speakers[:10]

In [69]:
# each list becomes a column
top_10_speaked_languages = pd.DataFrame({"language":top10_languages,
                           "number_of_speakers":top10_number_speakers
                          })

In [70]:
top_10_speaked_languages

Unnamed: 0,language,number_of_speakers
0,Mandarin Chinese,939.0
1,Spanish,485.0
2,English,380.0
3,Hindi,345.0
4,Portuguese,236.0
5,Bengali,234.0
6,Russian,147.0
7,Japanese,123.0
8,Yue Chinese,86.1
9,Vietnamese,85.0


### Display 4 pages of latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe

In [71]:
# url: we start with the 'first' page. Show that you can start whenever you want
url2 = "https://emsc-csem.org/Earthquake_information/#1"

In [72]:
# download html with a get request
response2 = requests.get(url2)
response2.status_code # 200 status code means OK!

200

In [73]:
# parse html (create the 'soup')
soup2 = BeautifulSoup(response2.content, "html.parser")

In [75]:
# check that the html code looks like it should
# soup

In [80]:
range(1,4)

range(1, 4)

In [98]:
from random import randint
from time import sleep
pages = []

for i in range(1,4):
    # assemble the url:
    start_at= str(i)
    url3 = "https://emsc-csem.org/Earthquake_information/#"+str(i)

    # download html with a get request:
    response3 = requests.get(url3)
    #response = requests.get(url, headers = {"Accept-Language": "en-US"}) to get responses in English

    # monitor the process by printing the status code
    print("Status code: " + str(response3.status_code))

    # store response into "pages" list
    pages.append(response3)

    # respectful nap:
    wait_time = randint(1,4000)
    print("I will sleep for " + str(wait_time/1000) + " seconds.")
    sleep(wait_time/1000)

Status code: 200
I will sleep for 1.646 seconds.
Status code: 200
I will sleep for 0.025 seconds.
Status code: 200
I will sleep for 0.051 seconds.


In [118]:
# Parse just the first page, for testing purposes
soup3 = BeautifulSoup(pages[1].content, "html.parser")

In [3]:
# soup3

In [115]:
prestab3 = soup3.select("table")[0]
prestab3
prestab3.select("a")

[]

In [117]:
soup3.select("div.content > div.htab > table > tbody > tr.lilist.e_1623460")

[]