1. Create a python file that webscrapes [GDP by
   country](https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal))
   and plots a stacked interactive bar plot using plotly. Stack
   countries within regions using the IMF numbers. Please include this in your ipython notebook
   and output your plot to an html file containing the plot.
2. Look at the [chapter on interactive
   graphics](https://smart-stats.github.io/ds4bio_book/book/_build/html/interactive.html)
   and, specifically, the code to display a subject's MRICloud data as
   a sunburst plot. Do the following. Display this subject's data as a
   [Sankey
   diagram](https://plotly.com/python/sankey-diagram/). Display as
   many levels as you can (at least 3) for Type = 1, starting from the
   intracranial volume.
3. Create a simple webpage containing the Sankey graphic and host it on
   github pages. **Do not- host this off of your assignment repo from
   github classroom, since this is not public**. Instead, you'll have to
   create a new public repo from your regular github account and add
   this file. Put the link to your live web page in a markdown cell of
   your `hw5.ipynb` file as a text block.


In [1]:
import requests as rq
import bs4
import pandas as pd

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
page = rq.get(url)
page.text[0:99]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-l'

In [3]:
#make HTML searchable
bs4page = bs4.BeautifulSoup(page.text, 'html.parser')

#find table
tables = bs4page.find_all('table',{'class':"wikitable"})

In [4]:
from io import StringIO

In [5]:
gdp = pd.read_html(StringIO(str(tables[0])))[0]
gdp = gdp.dropna()
gdp.head()

Unnamed: 0_level_0,Country/Territory,IMF[1][12],IMF[1][12],World Bank[13],World Bank[13],United Nations[14],United Nations[14]
Unnamed: 0_level_1,Country/Territory,Forecast,Year,Estimate,Year,Estimate,Year
0,World,115494312,2025,105435540,2023,100834796,2022
1,United States,30338000,2025,27360935,2023,25744100,2022
2,China,19535000,[n 1]2025,17794782,[n 3]2023,17963170,[n 1]2022
3,Germany,4922000,2025,4456081,2023,4076923,2022
4,Japan,4390000,2025,4212945,2023,4232173,2022


In [6]:
import plotly.express as px
import numpy as np
import pycountry
import pycountry_convert as convertcountry

In [7]:
print(gdp.columns.tolist())


[('Country/Territory', 'Country/Territory'), ('IMF[1][12]', 'Forecast'), ('IMF[1][12]', 'Year'), ('World Bank[13]', 'Estimate'), ('World Bank[13]', 'Year'), ('United Nations[14]', 'Estimate'), ('United Nations[14]', 'Year')]


In [8]:
#Select only the IMF Forecast column
gdp_imf = gdp[[('Country/Territory', 'Country/Territory'), ('IMF[1][12]', 'Forecast')]].copy()

#Rename
gdp_imf.columns = ['Country', 'GDP_IMF']

#Drop 'World' row bc we’re only interested in countries
gdp_imf = gdp_imf[gdp_imf['Country'] != 'World']

#Clean GDP values (remove notes and commas, convert to numeric)
gdp_imf['GDP_IMF'] = gdp_imf['GDP_IMF'].astype(str).str.replace(r'\[.*?\]', '', regex=True)
gdp_imf['GDP_IMF'] = gdp_imf['GDP_IMF'].str.replace(',', '')
gdp_imf['GDP_IMF'] = pd.to_numeric(gdp_imf['GDP_IMF'], errors='coerce')

gdp_imf.head()


Unnamed: 0,Country,GDP_IMF
1,United States,30338000.0
2,China,19535000.0
3,Germany,4922000.0
4,Japan,4390000.0
5,India,4270000.0


In [9]:
# Fix common name mismatches manually by Mapping fixes: Wikipedia names → pycountry-compatible names
country_fixes = {
    'United States': 'United States of America',
    'Russia': 'Russian Federation',
    'South Korea': 'Korea, Republic of',
    'North Korea': 'Korea, Democratic People\'s Republic of',
    'Czech Republic': 'Czechia',
    'Ivory Coast': "Côte d'Ivoire",
    'DR Congo': 'Congo, The Democratic Republic of the',
    'Syria': 'Syrian Arab Republic',
    'Iran': 'Iran, Islamic Republic of',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
    'Vietnam': 'Viet Nam',
    'Tanzania': 'Tanzania, United Republic of',
    'Bolivia': 'Bolivia, Plurinational State of',
    'Brunei': 'Brunei Darussalam',
    'Laos': "Lao People's Democratic Republic",
    'Moldova': 'Moldova, Republic of',
    'Palestine': 'Palestine, State of',
    'Macau': 'Macao',
    'Cape Verde': 'Cabo Verde',
    'East Timor': 'Timor-Leste',
    'Micronesia': 'Micronesia, Federated States of',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
    'Swaziland': 'Eswatini',
    'Burma': 'Myanmar',
    'Kosovo': 'Serbia',  # Not officially recognized by pycountry
    'Zanzibar': 'Tanzania, United Republic of',
}

# Replace with fixed names
gdp_imf['Country_fixed'] = gdp_imf['Country'].replace(country_fixes)


In [10]:
country_map = convertcountry.map_countries(cn_name_format="default")

print(country_map)

{'Aruba': {'alpha_2': 'AW', 'alpha_3': 'ABW', 'numeric': '533'}, 'Afghanistan': {'alpha_2': 'AF', 'alpha_3': 'AFG', 'numeric': '004'}, 'Islamic Republic of Afghanistan': {'alpha_2': 'AF', 'alpha_3': 'AFG', 'numeric': '004'}, 'Angola': {'alpha_2': 'AO', 'alpha_3': 'AGO', 'numeric': '024'}, 'Republic of Angola': {'alpha_2': 'AO', 'alpha_3': 'AGO', 'numeric': '024'}, 'Anguilla': {'alpha_2': 'AI', 'alpha_3': 'AIA', 'numeric': '660'}, 'Åland Islands': {'alpha_2': 'AX', 'alpha_3': 'ALA', 'numeric': '248'}, 'Albania': {'alpha_2': 'AL', 'alpha_3': 'ALB', 'numeric': '008'}, 'Republic of Albania': {'alpha_2': 'AL', 'alpha_3': 'ALB', 'numeric': '008'}, 'Andorra': {'alpha_2': 'AD', 'alpha_3': 'AND', 'numeric': '020'}, 'Principality of Andorra': {'alpha_2': 'AD', 'alpha_3': 'AND', 'numeric': '020'}, 'United Arab Emirates': {'alpha_2': 'AE', 'alpha_3': 'ARE', 'numeric': '784'}, 'Argentina': {'alpha_2': 'AR', 'alpha_3': 'ARG', 'numeric': '032'}, 'Argentine Republic': {'alpha_2': 'AR', 'alpha_3': 'ARG

In [11]:
def convert_country_name_to_continent(country_name):
    try:
        country = pycountry.countries.lookup(country_name)
        alpha2 = country.alpha_2
        continent_code = convertcountry.country_alpha2_to_continent_code(alpha2)
        continent_name = convertcountry.convert_continent_code_to_continent_name(continent_code)
        return continent_name
    except:
        return None


In [12]:
# Map region to each country in your GDP data
gdp_imf['Region'] = gdp_imf['Country_fixed'].apply(convert_country_name_to_continent)
gdp_imf.tail()

Unnamed: 0,Country,GDP_IMF,Country_fixed,Region
205,Kiribati,311.0,Kiribati,Oceania
206,Palau,308.0,Palau,Oceania
207,Marshall Islands,305.0,Marshall Islands,Oceania
208,Nauru,161.0,Nauru,Oceania
209,Tuvalu,66.0,Tuvalu,Oceania


In [13]:
print(gdp_imf.shape)
print(gdp_imf['Region'].isna().sum())
print(gdp_imf['Region'].unique())


(209, 4)
3
['North America' 'Asia' 'Europe' 'South America' 'Oceania' None 'Africa']


In [14]:
# Show unmapped countries
unmapped = gdp_imf[gdp_imf['Region'].isna()][['Country', 'Country_fixed']]
print("Unmapped countries (still NaN in Region):")
print(unmapped.sort_values('Country').to_string(index=False))


Unmapped countries (still NaN in Region):
     Country Country_fixed
Sint Maarten  Sint Maarten
 Timor-Leste   Timor-Leste
      Turkey        Turkey


In [15]:
# creating manual fix dictionary
manual_region_fixes = {
    'Sint Maarten': 'North America',
    'Timor-Leste': 'Asia',
    'Turkey': 'Europe'
}

gdp_imf['Region'] = gdp_imf['Region'].fillna(gdp_imf['Country_fixed'].map(manual_region_fixes))


In [16]:
gdp_imf[gdp_imf['Region'].isna()]  # should return an empty DataFrame


Unnamed: 0,Country,GDP_IMF,Country_fixed,Region


In [17]:
fig = px.bar(
    gdp_imf, 
    x= "Region", 
    y= "GDP_IMF", 
    hover_data="Country", 
    title='Stacked Bar Plot of GDP by Region (IMF Forecast)',
    labels={'GDP_IMF': 'GDP (Millions USD)'})
fig.show()

In [18]:
fig.write_html("stacked_gdp_by_region.html")