In [1]:
import pandas as pd  # keywords are highlighted in green, other strings in red, etc.
import numpy as np
from bs4 import BeautifulSoup
import requests



# Making beautiful soups into beautiful tables and Dataframe

In [2]:
url= "https://www.fbi.gov/wanted/topten"

In [3]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [4]:
soup = BeautifulSoup(response.content, "html.parser")

In [5]:
#soup

In [6]:
soup.select("#query-results-0f737222c5054a81a120bce207b0446a > ul > li:nth-child(1) > h3 > a")

[<a href="https://www.fbi.gov/wanted/topten/omar-alexander-cardenas">OMAR ALEXANDER CARDENAS</a>]

In [7]:
(soup.select("h3 a"))

[<a href="https://www.fbi.gov/wanted/topten/omar-alexander-cardenas">OMAR ALEXANDER CARDENAS</a>,
 <a href="https://www.fbi.gov/wanted/topten/alexis-flores">ALEXIS FLORES</a>,
 <a href="https://www.fbi.gov/wanted/topten/bhadreshkumar-chetanbhai-patel">BHADRESHKUMAR CHETANBHAI PATEL</a>,
 <a href="https://www.fbi.gov/wanted/topten/alejandro-castillo">ALEJANDRO ROSALES CASTILLO</a>,
 <a href="https://www.fbi.gov/wanted/topten/yulan-adonay-archaga-carias">YULAN ADONAY ARCHAGA CARIAS</a>,
 <a href="https://www.fbi.gov/wanted/topten/ruja-ignatova">RUJA IGNATOVA</a>,
 <a href="https://www.fbi.gov/wanted/topten/arnoldo-jimenez">ARNOLDO JIMENEZ</a>,
 <a href="https://www.fbi.gov/wanted/topten/michael-james-pratt">MICHAEL JAMES PRATT</a>,
 <a href="https://www.fbi.gov/wanted/topten/jose-rodolfo-villarreal-hernandez">JOSE RODOLFO VILLARREAL-HERNANDEZ</a>,
 <a href="https://www.fbi.gov/wanted/topten/rafael-caro-quintero">RAFAEL CARO-QUINTERO</a>]

In [8]:
soup.select("h3 a")[0].get_text()

'OMAR ALEXANDER CARDENAS'

In [9]:
#initialize empty lists
names = []



# define the number of iterations of our for loop
# by checking how many elements are in the retrieved result set
# (this is equivalent but more robust than just explicitly defining 250 iterations)
num_iter = len(soup.select("h3 a"))

name_list = soup.select("h3 a")
# iterate through the result set and retrive all the data
for i in range(num_iter):
    names.append(name_list[i].get_text())
    

print(names)




['OMAR ALEXANDER CARDENAS', 'ALEXIS FLORES', 'BHADRESHKUMAR CHETANBHAI PATEL', 'ALEJANDRO ROSALES CASTILLO', 'YULAN ADONAY ARCHAGA CARIAS', 'RUJA IGNATOVA', 'ARNOLDO JIMENEZ', 'MICHAEL JAMES PRATT', 'JOSE RODOLFO VILLARREAL-HERNANDEZ', 'RAFAEL CARO-QUINTERO']


In [10]:
FBI = pd.DataFrame({"names":names})

In [11]:
FBI

Unnamed: 0,names
0,OMAR ALEXANDER CARDENAS
1,ALEXIS FLORES
2,BHADRESHKUMAR CHETANBHAI PATEL
3,ALEJANDRO ROSALES CASTILLO
4,YULAN ADONAY ARCHAGA CARIAS
5,RUJA IGNATOVA
6,ARNOLDO JIMENEZ
7,MICHAEL JAMES PRATT
8,JOSE RODOLFO VILLARREAL-HERNANDEZ
9,RAFAEL CARO-QUINTERO


# Display the top 10 languages by number of native speakers stored in a pandas dataframe

In [12]:
url= "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"

In [13]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [14]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [15]:
soup.select("#mw-content-text > div.mw-parser-output > table.wikitable.sortable.static-row-numbers.jquery-tablesorter > thead > tr > th:nth-child(1)")

[]

In [16]:
prestab = soup.select("table")[0]
prestab.select("tr")[1:10]

[<tr>
 <td><a class="mw-redirect" href="/wiki/ISO_639:cmn" title="ISO 639:cmn">Mandarin Chinese</a><br/>(incl. <a href="/wiki/Standard_Chinese" title="Standard Chinese">Standard Chinese</a>, but excl. <a href="/wiki/Varieties_of_Chinese" title="Varieties of Chinese">other varieties</a>)
 </td>
 <td>939
 </td>
 <td><a href="/wiki/Sino-Tibetan_languages" title="Sino-Tibetan languages">Sino-Tibetan</a>
 </td>
 <td><a href="/wiki/Varieties_of_Chinese" title="Varieties of Chinese">Sinitic</a>
 </td></tr>,
 <tr>
 <td><a class="mw-redirect" href="/wiki/ISO_639:spa" title="ISO 639:spa">Spanish</a>
 </td>
 <td>485
 </td>
 <td><a href="/wiki/Indo-European_languages" title="Indo-European languages">Indo-European</a>
 </td>
 <td><a href="/wiki/Romance_languages" title="Romance languages">Romance</a>
 </td></tr>,
 <tr>
 <td><a class="mw-redirect" href="/wiki/ISO_639:eng" title="ISO 639:eng">English</a>
 </td>
 <td>380
 </td>
 <td><a href="/wiki/Indo-European_languages" title="Indo-European language

In [17]:
language = []
speakers = []
for lan in prestab.select("tr")[1:11]:
    language.append(lan("td")[0].get_text())
    speakers.append(lan("td")[1].get_text())
    
    
#print(lan("td")[0].get_text())
#print(lan("td")[1].get_text())

In [18]:
Native= pd.DataFrame({"language":language,
               "native speakers":speakers})

In [19]:
Native

Unnamed: 0,language,native speakers
0,"Mandarin Chinese(incl. Standard Chinese, but e...",939\n
1,Spanish\n,485\n
2,English\n,380\n
3,"Hindi(excl. Urdu, and other languages)\n",345\n
4,Portuguese\n,236\n
5,Bengali\n,234\n
6,Russian\n,147\n
7,Japanese\n,123\n
8,Yue Chinese(incl. Cantonese)\n,86.1\n
9,Vietnamese\n,85.0\n


# Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that pag

In [20]:
url= "https://en.wikipedia.org/wiki/Python"

In [21]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [22]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [23]:
soup.select("#mw-content-text > div.mw-parser-output > ul a")

[<a href="/wiki/Pythonidae" title="Pythonidae">Pythonidae</a>,
 <a href="/wiki/Python_(genus)" title="Python (genus)"><i>Python</i> (genus)</a>,
 <a href="/wiki/Python_(mythology)" title="Python (mythology)">Python (mythology)</a>,
 <a href="/wiki/Python_(programming_language)" title="Python (programming language)">Python (programming language)</a>,
 <a href="/wiki/CMU_Common_Lisp" title="CMU Common Lisp">CMU Common Lisp</a>,
 <a href="/wiki/PERQ#PERQ_3" title="PERQ">PERQ 3</a>,
 <a href="/wiki/Python_of_Aenus" title="Python of Aenus">Python of Aenus</a>,
 <a href="/wiki/Python_(painter)" title="Python (painter)">Python (painter)</a>,
 <a href="/wiki/Python_of_Byzantium" title="Python of Byzantium">Python of Byzantium</a>,
 <a href="/wiki/Python_of_Catana" title="Python of Catana">Python of Catana</a>,
 <a href="/wiki/Python_Anghelo" title="Python Anghelo">Python Anghelo</a>,
 <a href="/wiki/Python_(Efteling)" title="Python (Efteling)">Python (Efteling)</a>,
 <a href="/wiki/Python_(Bus

In [30]:
python_link = []
for pr in soup.select("#mw-content-text > div.mw-parser-output > ul a"):
    link = pr.get("href")
    url = "https://en.wikipedia.org" + link
    python_link   .append(url) 

In [31]:
python_link

['https://en.wikipedia.org/wiki/Pythonidae',
 'https://en.wikipedia.org/wiki/Python_(genus)',
 'https://en.wikipedia.org/wiki/Python_(mythology)',
 'https://en.wikipedia.org/wiki/Python_(programming_language)',
 'https://en.wikipedia.org/wiki/CMU_Common_Lisp',
 'https://en.wikipedia.org/wiki/PERQ#PERQ_3',
 'https://en.wikipedia.org/wiki/Python_of_Aenus',
 'https://en.wikipedia.org/wiki/Python_(painter)',
 'https://en.wikipedia.org/wiki/Python_of_Byzantium',
 'https://en.wikipedia.org/wiki/Python_of_Catana',
 'https://en.wikipedia.org/wiki/Python_Anghelo',
 'https://en.wikipedia.org/wiki/Python_(Efteling)',
 'https://en.wikipedia.org/wiki/Python_(Busch_Gardens_Tampa_Bay)',
 'https://en.wikipedia.org/wiki/Python_(Coney_Island,_Cincinnati,_Ohio)',
 'https://en.wikipedia.org/wiki/Python_(automobile_maker)',
 'https://en.wikipedia.org/wiki/Python_(Ford_prototype)',
 'https://en.wikipedia.org/wiki/Python_(missile)',
 'https://en.wikipedia.org/wiki/Python_(nuclear_primary)',
 'https://en.wiki