# Extracting Data from HTML with BeautifulSoup

In [7]:
# importing the libraries
from bs4 import BeautifulSoup
import requests

url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"

# Make a GET request to fetch the raw HTML content.
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# silently parsing an error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the html content
soup = BeautifulSoup(html_content, "html.parser")
print(soup.prettify())  # print the parsed HTML, re-indented by BeautifulSoup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of countries by GDP (nominal) - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9febc81c-d12e-4675-b615-72546ceb2ee5","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_by_GDP_(nominal)","wgTitle":"List of countries by GDP (nominal)","wgCurRevisionId":978859236,"wgRevisionId":978859236,"wgArticleId":380845,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages using the EasyTimeline extension","Wikipedia indefinitely semi-protected pages","Ar

In [8]:
# Show the page's <title> element (tag included).
soup.title

<title>List of countries by GDP (nominal) - Wikipedia</title>

In [9]:
# Show only the text inside the <title> element.
soup.title.text

'List of countries by GDP (nominal) - Wikipedia'

## Now, let's get all the links in the page along with their attributes, such as href and title, and their inner text.



In [11]:
# Walk over every <a> element on the page and print three lines for it:
# its inner text, its "title" attribute and its "href" attribute
# (an attribute that is absent is printed as None).
for link in soup.find_all("a"):
    fields = (
        ("Inner Text: {}", link.text),
        ("Title: {}", link.get("title")),
        ("href: {}", link.get("href")),
    )
    for template, value in fields:
        print(template.format(value))

Inner Text: 
Title: None
href: None
Inner Text: 
Title: This article is semi-protected.
href: /wiki/Wikipedia:Protection_policy#semi
Inner Text: Jump to navigation
Title: None
href: #mw-head
Inner Text: Jump to search
Title: None
href: #searchInput
Inner Text: List of countries by GDP (PPP)
Title: List of countries by GDP (PPP)
href: /wiki/List_of_countries_by_GDP_(PPP)
Inner Text: improve it
Title: None
href: https://en.wikipedia.org/w/index.php?title=List_of_countries_by_GDP_(nominal)&action=edit
Inner Text: talk page
Title: Talk:List of countries by GDP (nominal)
href: /wiki/Talk:List_of_countries_by_GDP_(nominal)
Inner Text: Learn how and when to remove these template messages
Title: Help:Maintenance template removal
href: /wiki/Help:Maintenance_template_removal
Inner Text: confusing or unclear
Title: Wikipedia:Vagueness
href: /wiki/Wikipedia:Vagueness
Inner Text: clarify the article
Title: Wikipedia:Please clarify
href: /wiki/Wikipedia:Please_clarify
Inner Text: the talk page
Titl

Title: Togo
href: /wiki/Togo
Inner Text: Barbados
Title: Barbados
href: /wiki/Barbados
Inner Text: Eswatini
Title: Eswatini
href: /wiki/Eswatini
Inner Text: Sierra Leone
Title: Sierra Leone
href: /wiki/Sierra_Leone
Inner Text: Guyana
Title: Guyana
href: /wiki/Guyana
Inner Text: Suriname
Title: Suriname
href: /wiki/Suriname
Inner Text: Burundi
Title: Burundi
href: /wiki/Burundi
Inner Text: Andorra
Title: Andorra
href: /wiki/Andorra
Inner Text: Aruba
Title: Aruba
href: /wiki/Aruba
Inner Text: Curaçao
Title: Curaçao
href: /wiki/Cura%C3%A7ao
Inner Text: Greenland
Title: Greenland
href: /wiki/Greenland
Inner Text: Djibouti
Title: Djibouti
href: /wiki/Djibouti
Inner Text: Bhutan
Title: Bhutan
href: /wiki/Bhutan
Inner Text: Lesotho
Title: Lesotho
href: /wiki/Lesotho
Inner Text: Timor-Leste
Title: East Timor
href: /wiki/East_Timor
Inner Text: Central African Republic
Title: Central African Republic
href: /wiki/Central_African_Republic
Inner Text: Liberia
Title: Liberia
href: /wiki/Liberia
Inne

In [12]:
# Locate the first table carrying the "wikitable" class; it wraps the
# per-source GDP tables.
gdp_table = soup.find("table", attrs={"class": "wikitable"})
if gdp_table is None or gdp_table.tbody is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on None further down.
    raise ValueError("Could not find a 'wikitable' table with a <tbody> on the page")

# Only the outer table's own rows. recursive=False keeps the rows of the
# nested tables out of this list, so it holds just the heading row ([0])
# and the row that contains the nested tables ([1]).
gdp_table_data = gdp_table.tbody.find_all("tr", recursive=False)

# Get all the headings of Lists
headings = []
for td in gdp_table_data[0].find_all("td"):
    # Prefer the bold caption; fall back to the whole cell if it has no <b>.
    label = td.b if td.b is not None else td
    # remove any newlines and extra spaces from left and right
    headings.append(label.text.replace('\n', ' ').strip())

print(headings)

['Per the International Monetary Fund (2019 estimates)', 'Per the World Bank (2019)', 'Per the United Nations (2018)']


In [13]:
# Build {heading: [row dict, ...]} with one entry per nested table found in
# the second row of the outer table; tables and headings are paired by position.
data = {}
nested_tables = gdp_table_data[1].find_all("table")
for table, heading in zip(nested_tables, headings):
    # Column names of this table, i.e. Rank, Country/Territory, GDP(US$million);
    # newlines become spaces and surrounding whitespace is trimmed.
    t_headers = [th.text.replace('\n', ' ').strip() for th in table.find_all("th")]

    # One dict per <tr> in the table's tbody, shaped like
    # {'Rank': '', 'Country/Territory': '', 'GDP(US$million)': ''}.
    # Cells are matched to the column names by position; a row that has no
    # <td> cells (such as the header row) yields an empty dict.
    table_data = [
        {
            th: td.text.replace('\n', '').strip()
            for td, th in zip(tr.find_all("td"), t_headers)
        }
        for tr in table.tbody.find_all("tr")
    ]

    # File the rows of this table under its heading.
    data[heading] = table_data

print(data)

{'Per the International Monetary Fund (2019 estimates)': [{}, {'Rank': '', 'Country/Territory': 'World[19]', 'GDP(US$million)': '87,265,226'}, {'Rank': '1', 'Country/Territory': 'United States', 'GDP(US$million)': '21,439,453'}, {'Rank': '—', 'Country/Territory': 'European Union[22][n 1]', 'GDP(US$million)': '18,705,132'}, {'Rank': '2', 'Country/Territory': 'China[n 2]', 'GDP(US$million)': '14,140,163'}, {'Rank': '3', 'Country/Territory': 'Japan', 'GDP(US$million)': '5,154,475'}, {'Rank': '4', 'Country/Territory': 'Germany', 'GDP(US$million)': '3,863,344'}, {'Rank': '5', 'Country/Territory': 'India', 'GDP(US$million)': '2,935,570'}, {'Rank': '6', 'Country/Territory': 'United Kingdom', 'GDP(US$million)': '2,743,586'}, {'Rank': '7', 'Country/Territory': 'France', 'GDP(US$million)': '2,707,074'}, {'Rank': '8', 'Country/Territory': 'Italy', 'GDP(US$million)': '1,988,636'}, {'Rank': '9', 'Country/Territory': 'Brazil', 'GDP(US$million)': '1,847,020'}, {'Rank': '10', 'Country/Territory': 'Can

In [14]:
import csv

# All three tables share these columns (the same names as t_headers, written
# in this column order). Defined once, since it does not change per table.
headers = [
    "Country/Territory",
    "GDP(US$million)",
    "Rank"
]

for topic, table in data.items():
    # Create one csv file per table, named after the table's heading.
    # newline='' hands line-ending control to the csv module (without it,
    # blank rows appear between records on Windows). The explicit UTF-8
    # encoding keeps non-ASCII text such as '—' identical on every platform
    # and matches the default encoding pandas.read_csv expects.
    with open(f"{topic}.csv", 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, headers)
        # write the header
        writer.writeheader()
        # Skip empty rows (e.g. the {} entry at the start of each table).
        writer.writerows(row for row in table if row)

In [17]:
import pandas as pd 
# Read the IMF table back from its CSV file (one file per heading is written
# by the csv export cell) and display it as a DataFrame.
df=pd.read_csv("Per the International Monetary Fund (2019 estimates).csv")
df

Unnamed: 0,Country/Territory,GDP(US$million),Rank
0,World[19],87265226,
1,United States,21439453,1
2,European Union[22][n 1],18705132,—
3,China[n 2],14140163,2
4,Japan,5154475,3
...,...,...,...
189,Palau,291,182
190,Marshall Islands,220,183
191,Kiribati,184,184
192,Nauru,108,185


In [18]:
# (rows, columns) of the DataFrame held in df.
df.shape

(194, 3)

In [19]:
# Read the United Nations table back from its CSV file and display it.
Per_UN=pd.read_csv("Per the United Nations (2018).csv")
Per_UN

Unnamed: 0,Country/Territory,GDP(US$million),Rank
0,World[24],85085189,
1,United States,20580223,1
2,China[n 5],13608152,2
3,Japan,4971323,3
4,Germany,3949549,4
...,...,...,...
209,Marshall Islands,214,190
210,Kiribati,189,191
211,Nauru,127,192
212,Montserrat,64,—


In [20]:
# (rows, columns) of the United Nations DataFrame.
Per_UN.shape

(214, 3)

In [21]:
# Read the World Bank table back from its CSV file and display it.
per_WB=pd.read_csv("Per the World Bank (2019).csv")
per_WB

Unnamed: 0,Country/Territory,GDP(US$million),Rank
0,World,87751541,
1,United States,21427700,1
2,China[n 5],14342903,2
3,Japan,5081770,3
4,Germany,3845630,4
...,...,...,...
186,Palau (2018),284,181
187,Marshall Islands (2018),221,182
188,Kiribati,195,183
189,Nauru,118,184
