In [1]:
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the HTML of the page of interest using the requests library
web_url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
# A timeout keeps the cell from hanging indefinitely on a stalled connection
response = requests.get(web_url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page downstream
response.raise_for_status()

In [3]:
# Parse the HTML response with BeautifulSoup to pick the right table, then read
# that table into a DataFrame using pandas
soup = BeautifulSoup(response.text, "html.parser")
tables = soup.find("table", {'class': "wikitable sortable"})
if tables is None:
    # find() returns None when nothing matches; str(None) would otherwise be
    # handed to read_html as if it were markup
    raise ValueError('No table with class "wikitable sortable" found in the fetched page')
# Wrap the markup in a file-like object: passing literal HTML strings to
# read_html is deprecated in recent pandas releases
table_df = pd.read_html(StringIO(str(tables)))[0]  # create df using pandas library
table_df.head(10)

Unnamed: 0,Rank,Country/Territory,GDP(US$million)
0,,World[19],83844988
1,1.0,United States,20807269
2,2.0,China[n 2][n 3],14860775
3,3.0,Japan,4910580
4,4.0,Germany,3780553
5,5.0,United Kingdom,2638296
6,6.0,India,2592583
7,7.0,France,2551451
8,8.0,Italy,1848222
9,9.0,Canada,1600264


In [4]:
# Clean-up: keep only ranked countries with a reported GDP, then re-rank them
gdp_col = "GDP(US$million)"

# Rows with a missing Rank (e.g. the "World" aggregate row) are removed
ranked_rows = table_df.dropna(axis=0, subset=["Rank"])

# Syria's nominal GDP wasn't reported ("N/a"), so that row is dropped. The scraped
# Rank column is discarded because a fresh one is computed below. The explicit
# .copy() yields an independent frame, so the column assignments that follow do
# not trigger SettingWithCopyWarning.
has_gdp = ranked_rows[gdp_col] != "N/a"
table_df = ranked_rows.loc[has_gdp].drop(columns="Rank").copy()

# Convert GDP(US$million) to a numeric dtype with a single vectorized call
table_df[gdp_col] = pd.to_numeric(table_df[gdp_col])

# Create a new rank for each country from its nominal GDP (largest GDP gets rank 1);
# method='max' assigns tied values the highest rank number of their group
table_df["Rank"] = table_df[gdp_col].rank(method="max", ascending=False)

# A few entries in the Country/Territory column carry reference markers imported from Wikipedia (e.g. "[n 2]"), which should be removed
# Function to clean up the Country/Territory column

def country_name(country):
    """Remove a trailing Wikipedia reference marker from a country name.

    Everything from the first ``[`` onwards is cut off, along with any
    whitespace left dangling before it, e.g. ``"China[n 2][n 3]"`` becomes
    ``"China"``.

    Parameters
    ----------
    country : str or any
        Raw cell value from the Country/Territory column.

    Returns
    -------
    str or any
        The cleaned name. Names without a ``[`` are returned unchanged, and
        non-string values (such as NaN for a missing cell) are passed
        through untouched instead of raising ``TypeError``.
    """
    # Missing cells are not strings; leave them as they are
    if not isinstance(country, str):
        return country

    # Search once and reuse the match object for the cut position
    marker = re.search('\\[.*', country)
    if marker is None:
        # No reference marker present: nothing to clean
        return country

    # Keep only the text before the marker, without trailing whitespace
    return country[:marker.start()].rstrip()

# Run every Country/Territory entry through the clean-up helper, store the
# result back in the table_df dataframe, and preview the first rows
cleaned_names = table_df['Country/Territory'].map(country_name)
table_df['Country/Territory'] = cleaned_names
table_df.head(5)

Unnamed: 0,Country/Territory,GDP(US$million),Rank
1,United States,20807269,1.0
2,China,14860775,2.0
3,Japan,4910580,3.0
4,Germany,3780553,4.0
5,United Kingdom,2638296,5.0
