# Import relevant libraries

In [1]:
import zipfile
import pandas as pd
import plotly.express as px

from pprint import pprint
from ipywidgets import interact, Dropdown, SelectMultiple, Output, VBox
from IPython.display import display

# Some data inspection settings

In [2]:
# Raise pandas' display limits so wide and long frames are shown without truncation
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", None)

------
## Life expectancy dataset

In [3]:
# Unpack the life-expectancy archive into the temp/ folder
with zipfile.ZipFile("temp/API_SP.DYN.LE00.IN_DS2_en_CSV_v2_76065.zip", mode='r') as archive:
    archive.extractall(path="temp/")

In [4]:
# Read the extracted life-expectancy CSV; the first four lines are skipped
# before the header row is parsed.
life_expect_df = pd.read_csv(
    "temp/API_SP.DYN.LE00.IN_DS2_en_CSV_v2_76065.csv",
    sep=",",
    skiprows=4,
    na_values=[''],
    encoding='utf-8-sig',  # tolerates a leading byte order mark
)

# Shape before and after removing columns that contain no values at all
print(life_expect_df.shape)
life_expect_df = life_expect_df.dropna(how='all', axis='columns')
print(life_expect_df.shape)

life_expect_df.sample(n=3)

(266, 69)
(266, 67)


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
102,IBRD only,IBD,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,45.8334,48.685931,52.393595,52.867856,53.390622,53.101684,53.794909,54.284031,54.977101,55.387523,55.93196,56.563379,57.01718,57.682027,58.238987,58.839288,59.385582,59.937195,60.412762,60.909762,61.341548,61.783128,62.241836,62.69093,63.094108,63.510723,64.004278,64.358506,64.658858,65.003806,65.208781,65.4875,65.783948,66.034504,66.303142,66.641841,66.965287,67.338344,67.702994,67.910194,68.250279,68.662133,68.994504,69.262422,69.505518,69.927566,70.283516,70.574323,70.791984,71.169258,71.453081,71.794753,72.140332,72.488461,72.844583,73.144135,73.411428,73.633052,73.924935,74.120581,73.370079,72.126212,72.913491
257,Viet Nam,VNM,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,59.657,60.261,60.131,58.701,59.323,58.692,58.467,58.161,54.07,56.151,56.222,56.749,53.103,56.786,55.83,62.882,64.775,65.049,65.529,66.001,66.195,66.645,67.106,67.148,67.305,67.601,68.013,68.392,68.469,69.076,69.213,69.849,69.958,70.621,70.759,71.384,71.51,71.802,72.107,72.317,72.462,72.647,72.801,72.98,73.135,73.271,73.319,73.436,73.411,73.498,73.513,73.692,73.704,73.775,73.855,73.876,73.938,73.963,73.976,74.093,75.378,73.618,74.58
113,Iraq,IRQ,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,52.618,53.411,54.326,54.948,55.633,56.616,57.348,58.37,58.781,59.32,60.186,60.909,61.388,62.03,60.898,61.464,63.715,64.228,64.401,64.791,61.5,59.331,59.733,60.061,60.219,60.68,60.874,61.176,57.466,58.235,58.435,62.519,66.708,66.897,66.744,66.464,66.391,65.837,65.486,66.182,66.817,67.034,67.084,65.643,65.031,64.844,63.587,63.553,64.942,66.446,67.062,67.659,68.023,68.253,68.914,69.44,68.988,70.413,71.514,71.576,69.123,70.378,71.336


------
# GDP per capita

In [5]:
# Unpack the GDP-per-capita archive into the temp/ folder
with zipfile.ZipFile("temp/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_76317.zip", mode='r') as archive:
    archive.extractall(path="temp/")

In [6]:
# Read the extracted GDP-per-capita CSV; the first four lines are skipped
# before the header row is parsed.
gdppercap = pd.read_csv(
    "temp/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_76317.csv",
    sep=",",
    skiprows=4,
    na_values=[''],
    encoding='utf-8-sig',  # tolerates a leading byte order mark
)

# Shape before and after removing columns that contain no values at all
print(gdppercap.shape)
gdppercap = gdppercap.dropna(how='all', axis='columns')
print(gdppercap.shape)

gdppercap.sample(n=3)

(266, 69)
(266, 68)


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
215,Sub-Saharan Africa (excluding high income),SSA,GDP per capita (current US$),NY.GDP.PCAP.CD,158.078684,160.884658,169.676012,188.266459,182.631051,196.07086,207.107448,202.019784,210.543035,231.472623,251.67494,251.865223,276.281204,342.616805,422.5969,451.080682,474.419492,506.992088,551.450412,630.812794,766.877737,1006.000156,907.531683,791.006403,686.748673,636.8562,579.667944,630.539769,644.961182,638.113189,726.948476,756.185816,662.720392,651.833415,649.431033,804.855409,875.305863,897.049941,875.411918,608.282578,627.981256,585.719813,620.758666,761.062606,919.622807,1061.939152,1218.881466,1373.031875,1514.162216,1407.110708,1619.291457,1770.933553,1790.538098,1851.425794,1878.738715,1627.664043,1445.514085,1535.068876,1604.663747,1610.17142,1472.996712,1612.770038,1686.891375,1621.261481
158,Mali,MLI,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,,45.985445,56.513753,55.022686,57.320787,61.966463,75.052104,85.444088,80.225221,121.497573,134.927575,148.052695,169.123319,216.320734,233.674747,200.054496,169.684645,161.526103,150.200065,166.239053,217.102721,240.823694,245.533347,242.426327,292.184532,290.949914,296.295758,288.912651,208.873284,265.717052,267.161312,253.332747,267.442574,306.457427,256.199555,291.159498,318.042432,371.627016,416.278344,461.45792,493.628706,564.157958,658.493183,662.883388,670.369099,789.062725,733.738935,758.475533,797.154511,704.821646,730.854938,775.536492,835.086711,820.197391,804.343957,862.4674,813.965034,869.270234
83,Ghana,GHA,GDP per capita (current US$),NY.GDP.PCAP.CD,174.858857,181.870011,188.421016,205.037422,224.968154,260.505595,263.334589,211.219297,196.668572,225.772622,248.159869,263.408788,223.802864,309.509552,289.425857,272.976715,260.998296,292.289798,325.598477,346.653404,372.252026,343.799127,319.20691,311.383438,329.998889,329.714605,410.711658,355.153725,355.121845,349.964942,382.542464,418.254121,396.426408,359.800844,320.503903,371.537243,389.271268,377.841936,400.3668,403.037326,253.746936,263.542156,297.461645,358.399271,406.132105,478.611909,906.441028,1050.124615,1182.657613,1047.704301,1263.892518,1507.435732,1543.775242,2294.025645,1953.645136,1721.699644,1913.186733,2012.970258,2196.612906,2186.189241,2196.547893,2445.500818,2240.279451,2260.287413


### Number of years for which no countries or few countries have data.
> Interested in only last 25 to 30 years of data.    
> If a country has fewer years than that, the threshold is `15 years`: any country with less than 15 years of data will not be included.

In [7]:
# Year columns are the ones whose header consists only of digits
gdppc_cols = list(filter(str.isdigit, gdppercap.columns))

# Years for which every row has a value (no missing entries in that column)
complete_years = [year for year in gdppc_cols if not gdppercap[year].isna().any()]

print("Complete year columns:", complete_years)
print("Number of complete years:", len(complete_years))

Complete year columns: []
Number of complete years: 0


> Not a single year has data for every country (the check above is per year column, not per country).

- Prioritising last 21 years of data. Selecting only those countries which have last 21 years of data in the World Bank dataset.
- Life expectancy dataset has data till 2022. The range will be different for Life expectancy.

In [8]:
# Per-row count of year columns that actually hold a value
gdppercap['gdppc_non_null'] = gdppercap[gdppc_cols].count(axis=1)

In [9]:
# Column labels for 2003..2023 (range end is exclusive)
last21 = [str(year) for year in range(2003, 2024)]

# True for rows that have a value in every one of those year columns
gdppercap['last21_complete'] = ~gdppercap[last21].isna().any(axis=1)

# Names of the rows with at least one gap in that window
gdp_has_gap = ~gdppercap['last21_complete']
gdppercap_incomplete_countries = gdppercap.loc[gdp_has_gap, 'Country Name']

In [10]:
# Entries with at least one missing value in the 2003-2023 GDP columns
pprint(list(gdppercap_incomplete_countries))

['American Samoa',
 'Bhutan',
 'Channel Islands',
 'Cuba',
 'Cayman Islands',
 'Eritrea',
 'Gibraltar',
 'Greenland',
 'Guam',
 'Isle of Man',
 'Not classified',
 'Lebanon',
 'Liechtenstein',
 'St. Martin (French part)',
 'Northern Mariana Islands',
 'New Caledonia',
 "Korea, Dem. People's Rep.",
 'French Polynesia',
 'San Marino',
 'South Sudan',
 'Sint Maarten (Dutch part)',
 'Syrian Arab Republic',
 'Tonga',
 'Venezuela, RB',
 'British Virgin Islands',
 'Virgin Islands (U.S.)',
 'Kosovo']


> *Similarly, selecting same number of years and same countries from `life-expectancy` dataset also.*

In [11]:
# Year columns are the ones whose header consists only of digits
leb_cols = list(filter(str.isdigit, life_expect_df.columns))

# Years for which every row has a value (no missing entries in that column)
complete_years = [year for year in leb_cols if not life_expect_df[year].isna().any()]

print("Complete year columns:", complete_years)
print("Number of complete years:", len(complete_years))

Complete year columns: []
Number of complete years: 0


In [12]:
# Per-row count of year columns that actually hold a value
life_expect_df['leb_non_null'] = life_expect_df[leb_cols].count(axis=1)

In [13]:
# Column labels for 2003..2022 (range end is exclusive)
last20 = [str(year) for year in range(2003, 2023)]

# True for rows that have a value in every one of those year columns
life_expect_df['last20_complete'] = ~life_expect_df[last20].isna().any(axis=1)

# Names of the rows with at least one gap in that window
leb_has_gap = ~life_expect_df['last20_complete']
life_expect_df_incomplete_countries = life_expect_df.loc[leb_has_gap, 'Country Name']

In [14]:
# Entries with at least one missing value in the 2003-2022 life-expectancy columns
pprint(list(life_expect_df_incomplete_countries))

['Andorra',
 'American Samoa',
 'Curacao',
 'Cayman Islands',
 'Not classified',
 'Monaco',
 'Northern Mariana Islands',
 'Palau',
 'San Marino']


#### Dropping these countries and the unnecessary years from these 2 datasets.

> Also, I am clear about `Indicator Name` & `Indicator Code`. Let's get rid of these columns also from both datasets.

In [15]:
# Keep only the rows whose name is absent from the matching "incomplete" list
gdp_is_incomplete = gdppercap['Country Name'].isin(gdppercap_incomplete_countries)
gdppercap = gdppercap.loc[~gdp_is_incomplete]

leb_is_incomplete = life_expect_df['Country Name'].isin(life_expect_df_incomplete_countries)
life_expect_df = life_expect_df.loc[~leb_is_incomplete]

In [16]:
# Retain the two identifier columns plus every year column from 2003 onwards;
# all other columns (indicator metadata, helper flags) are dropped here.
kept_id_cols = ['Country Name', 'Country Code']

gdp_recent_years = [label for label in gdppercap.columns
                    if label.isdigit() and int(label) >= 2003]
gdppercap = gdppercap[kept_id_cols + gdp_recent_years]

leb_recent_years = [label for label in life_expect_df.columns
                    if label.isdigit() and int(label) >= 2003]
life_expect_df = life_expect_df[kept_id_cols + leb_recent_years]

In [17]:
# (rows, columns) of each cleaned frame
(gdppercap.shape, life_expect_df.shape)

((239, 23), (257, 22))

In [18]:
# Confirm that no missing values remain anywhere in either frame
print("Any NaNs in gdppercap:", gdppercap.isna().to_numpy().any())
print("Any NaNs in life_expect_df:", life_expect_df.isna().to_numpy().any())

Any NaNs in gdppercap: False
Any NaNs in life_expect_df: False


**Perfecto.!**
> Now, both of these datasets have `year numbers` as columns: **2003, 2004, … 2022** (plus **2023** in the GDP dataset).

In [19]:
# Reshape both frames from wide (one column per year) to long (one row per
# country and year), restricted to the 2003-2022 columns.
melt_ids = ['Country Name', 'Country Code']
melt_years = [str(year) for year in range(2003, 2023)]

gdp = pd.melt(
    gdppercap,
    id_vars=melt_ids,
    value_vars=melt_years,
    var_name='year',
    value_name='gdp_per_capita',
)

leb = pd.melt(
    life_expect_df,
    id_vars=melt_ids,
    value_vars=melt_years,
    var_name='year',
    value_name='life_expectancy',
)

print(gdp.shape, leb.shape)
gdp.sample(n=3)

(4780, 4) (5140, 4)


Unnamed: 0,Country Name,Country Code,year,gdp_per_capita
3667,Greece,GRC,2018,19873.401524
2299,Montenegro,MNE,2012,6586.399703
64,Spain,ESP,2003,21522.017099


In [20]:
# Peek at three random rows of the long-format life-expectancy frame
leb.sample(n=3)

Unnamed: 0,Country Name,Country Code,year,life_expectancy
4093,Uganda,UGA,2018,62.714
3853,Zambia,ZMB,2017,62.12
1806,United Arab Emirates,ARE,2010,78.334


#### Data preparation and cleaning is done.
> Time to merge the data now.

### Merging `GDP per capita` & `Life expectancy` datasets
------------

> - It is usually a good practice to expect the outcome of your code.    
> - My expectation is that the inner join keeps only the countries present in both cleaned datasets (239 in GDP, 257 in life expectancy, 235 in common), with one row per country and year.    
> - Expected number of columns will be 5.
> - Expected number of rows will be 4700 `(235 X 20)`
> - *FYI: 2023 GDP data is also removed because we do not have Life Expectancy for 2023.*

In [21]:
# Inner join: only (country, code, year) combinations present in both frames survive
merged_df = gdp.merge(
    leb,
    how='inner',
    on=['Country Name', 'Country Code', 'year'],
)

# Shape with and without duplicate rows; equal shapes mean no duplicates
(merged_df.shape, merged_df.drop_duplicates().shape)

((4700, 5), (4700, 5))

In [22]:
# Per-column flag: does the column contain any missing value?
merged_df.isnull().any(axis=0)

Country Name       False
Country Code       False
year               False
gdp_per_capita     False
life_expectancy    False
dtype: bool

In [23]:
# Number of distinct values in each column
merged_df.nunique(axis=0)

Country Name        235
Country Code        235
year                 20
gdp_per_capita     4660
life_expectancy    4441
dtype: int64

In [24]:
# Peek at three random rows of the merged frame
merged_df.sample(n=3)

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy
4273,Cabo Verde,CPV,2021,3971.444074,74.052
1352,Post-demographic dividend,PST,2008,40878.795306,79.145118
55,Early-demographic dividend,EAR,2003,1368.161337,65.517524


> Perfect, everything looks under control.

In [25]:
# First rows of the entries whose name contains 'Euro area'
is_euro_area = merged_df['Country Name'].str.contains('Euro area')
merged_df.loc[is_euro_area].head()

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy
61,Euro area,EMU,2003,27017.620907,78.652026
296,Euro area,EMU,2004,30816.540152,79.228237
531,Euro area,EMU,2005,31800.443033,79.370612
766,Euro area,EMU,2006,33671.931401,79.803183
1001,Euro area,EMU,2007,38580.616203,80.043325


## Creating or identifying individual regions
1. Top 5 economies (as per 2022) *e.g. United States, China, Germany, etc.*
2. Income group *e.g. Early-demographic dividend, Pre-demographic dividend, Europe & Central Asia (excluding high income), etc.*
3. Regional group *e.g. Africa, Asia, Europe, etc*.
4. Economic group *e.g. Caribbean small states, Euro area, etc.*

In [26]:
def classify_country(name):
    """Map a ``Country Name`` value to one coarse category label.

    The checks run in a fixed order and the first match wins:

    1. exact match against the top-5 economies      -> ``'is_top5'``
    2. exact match against known sovereign entries
       whose names contain a regional keyword        -> ``'country'``
    3. name contains an income keyword               -> ``'income_group'``
    4. name contains a regional keyword              -> ``'region_group'``
    5. name contains an economic-grouping keyword    -> ``'economic_group'``
    6. anything else                                 -> ``'country'``

    All keyword checks are case-sensitive substring tests.

    Parameters
    ----------
    name : str
        The entry's name exactly as it appears in the ``Country Name`` column.

    Returns
    -------
    str
        One of ``'is_top5'``, ``'income_group'``, ``'region_group'``,
        ``'economic_group'`` or ``'country'``.

    Notes
    -----
    Because the regional check runs before the economic one, names that match
    both (e.g. ``'Euro area'`` via ``'area'``, ``'Caribbean small states'`` via
    ``'Caribbean'``) are labelled ``'region_group'``. That precedence is kept
    unchanged here.
    """
    top5_economies = {'United States', 'China', 'Germany', 'Japan', 'India'}
    if name in top5_economies:
        return 'is_top5'

    # Individual countries/territories whose names contain 'Africa' or
    # 'America'. Without this guard the substring test below would label them
    # as regions and they would never be offered as individual countries.
    region_like_countries = {'South Africa', 'Central African Republic',
                             'American Samoa'}
    if name in region_like_countries:
        return 'country'

    # Income Groups
    if any(keyword in name for keyword in ['income', 'dividend']):
        return 'income_group'

    # Regional Groups
    if any(keyword in name for keyword in ['Africa', 'Asia', 'Europe',
                                         'America', 'Middle East', 'Pacific',
                                         'Caribbean', 'World', 'area']):
        return 'region_group'

    # Economic Groups
    if any(keyword in name for keyword in ['IBRD', 'IDA', 'OECD', 'Fragile',
                                         'small states', 'Euro area', 'EU']):
        return 'economic_group'

    # remaining countries as 'country'
    return 'country'

In [27]:
# Label every row with the category returned by classify_country for its name
merged_df['country_type'] = merged_df['Country Name'].map(classify_country)

In [28]:
# Peek at five random rows, now including the country_type column
merged_df.sample(n=5)

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy,country_type
1359,Sudan,SDN,2008,1527.582397,61.206,country
800,Indonesia,IDN,2006,1558.315628,67.914,country
2813,Vanuatu,VUT,2014,2967.70667,69.465,country
1728,Guyana,GUY,2010,4581.761797,66.743,country
2744,Nepal,NPL,2014,821.455225,68.085,country


## Visualisation in the form of animation
> - Plotly doesn't directly support animated line charts, but there is always a workaround, and I've tried one here.
> - The `line` plot clearly comes out rather wavy and does not show a clean trend.

In [29]:
# The top-5 flag gets its own boolean column because it is a view I wanted
# explicitly; the other groupings already exist as rows in the data and only
# need to be picked out by their classification.
merged_df['is_top5'] = merged_df['country_type'].eq('is_top5')

# Distinct names per category
is_country = merged_df['country_type'].eq('country')
is_income_group = merged_df['country_type'].eq('income_group')
is_region_group = merged_df['country_type'].eq('region_group')

countries = merged_df.loc[is_country, 'Country Name'].unique()
income_groups = merged_df.loc[is_income_group, 'Country Name'].unique()
regions = merged_df.loc[is_region_group, 'Country Name'].unique()

In [30]:
years = sorted(merged_df['year'].unique())

# One slice per animation frame: all rows up to and including that year,
# tagged with the frame's year so each frame draws the line "so far".
cumulative_data = [
    merged_df[merged_df['year'] <= current_year].assign(frame_year=current_year)
    for current_year in years
]

cumulative_df = pd.concat(cumulative_data)

# Output area that the plot is rendered into
plot_output = Output()

# (label shown in the dropdown, internal mode key)
analysis_modes = [
    ('Top 5 Economies', 'top5'),
    ('Income Groups', 'income'),
    ('Regions', 'region'),
    ('Custom Countries', 'custom'),
]

In [31]:
# Mode picker, plus a multi-select that starts out empty and disabled
analysis_dropdown = Dropdown(description='Mode:', options=analysis_modes, value='top5')
selection_widget = SelectMultiple(description='Select:', options=[], disabled=True)

In [32]:
def update_components(change):
    """Sync the selection widget (and the plot) with the chosen analysis mode.

    Clears the output area, then:

    * in ``'top5'`` mode disables and empties the selector and draws the plot;
    * in any other mode enables the selector, fills it with the names whose
      ``country_type`` corresponds to the mode, and pre-selects the first one.

    ``change`` is the event passed by the widget observer; it is not read.
    """
    with plot_output:
        plot_output.clear_output(wait=True)

    mode = analysis_dropdown.value

    # Top-5 mode needs no manual selection: lock the selector and plot directly
    if mode == 'top5':
        selection_widget.disabled = True
        selection_widget.options = []
        update_plot(mode, [])
        return

    selection_widget.disabled = False

    # Any mode other than 'income' / 'region' lists individual countries
    type_for_mode = {'income': 'income_group', 'region': 'region_group'}
    wanted_type = type_for_mode.get(mode, 'country')
    options = merged_df.loc[merged_df['country_type'] == wanted_type, 'Country Name'].unique()

    selection_widget.options = [(name, name) for name in options]
    selection_widget.value = [options[0]] if len(options) > 0 else []

In [33]:
def update_plot(mode, selections):
    """Render the animated GDP-vs-life-expectancy line chart into ``plot_output``.

    Parameters
    ----------
    mode : str
        ``'top5'`` plots the rows flagged ``is_top5``; ``'income'``,
        ``'region'`` and ``'custom'`` plot the rows whose ``Country Name`` is
        in ``selections`` and only differ in the chart title.
    selections : iterable of str
        ``Country Name`` values to plot. Ignored when ``mode == 'top5'``.

    Raises
    ------
    KeyError
        If ``mode`` is not one of the four values above.

    Notes
    -----
    The output area is cleared first. If the filtered data is empty, a hint is
    printed instead of a figure.
    """

    def year_annotation(label):
        # Build the "Year: ..." label. xref/yref must be 'paper' on every
        # frame as well: otherwise x/y are read in axis coordinates and the
        # label leaves the visible area as soon as the animation starts.
        return {
            'x': 0.95,
            'y': 0.05,
            'xref': 'paper',
            'yref': 'paper',
            'text': f'Year: {label}',
            'showarrow': False,
            'font': {'size': 18, 'color': 'darkblue'}
        }

    with plot_output:
        plot_output.clear_output(wait=True)

        if mode == 'top5':
            plot_data = cumulative_df[cumulative_df['is_top5']]
            title = "Top 5 Economies Development Over Time"
        else:
            plot_data = cumulative_df[cumulative_df['Country Name'].isin(selections)]
            title = {
                'income': 'Income Group Trends',
                'region': 'Regional Development',
                'custom': 'Country Comparison'
            }[mode]

        if not plot_data.empty:
            fig = px.line(
                plot_data,
                x='gdp_per_capita',
                y='life_expectancy',
                color='Country Name',
                line_group='Country Name',
                hover_name='Country Name',
                animation_frame='frame_year',
                animation_group='Country Name',
                markers=True,
                title=f'<b>{title}</b>',
                log_x=True,
                # Axis ranges come from the full dataset so they stay fixed
                # across selections and frames.
                range_x=[cumulative_df['gdp_per_capita'].min()*0.8,
                        cumulative_df['gdp_per_capita'].max()*1.2],
                range_y=[cumulative_df['life_expectancy'].min()-5,
                        cumulative_df['life_expectancy'].max()+5],
                template='plotly_white'
            )

            # Custom animation controls (adapted from the plotly animation docs).
            fig.layout.updatemenus = [{
                'type': 'buttons',
                'showactive': False,
                'buttons': [
                    {
                        'label': '▶ Play',
                        'method': 'animate',
                        'args': [None, {'frame': {'duration': 150, 'redraw': True}}]
                    },
                    {
                        # Animating to [None] in immediate mode halts playback
                        'label': '⏸ Pause',
                        'method': 'animate',
                        'args': [[None], {'frame': {'duration': 0, 'redraw': False},
                                          'mode': 'immediate',
                                          'transition': {'duration': 0}}]
                    }
                ]
            }]

            # Initial year label: taken from the first frame rather than
            # hard-coded, falling back to the data when no frames exist.
            if fig.frames:
                first_year = fig.frames[0].name
            else:
                first_year = plot_data['frame_year'].iloc[0]
            fig.update_layout(annotations=[year_annotation(first_year)])

            # Each frame carries its own label so the year updates while playing
            for frame in fig.frames:
                frame.layout.annotations = [year_annotation(frame.name)]

            fig.update_traces(
                mode='lines+markers',
                line=dict(width=2.5),
                marker=dict(size=8, opacity=0.8)
            )

            fig.update_layout(
                xaxis_title='GDP per Capita (USD, Log Scale)',
                yaxis_title='Life Expectancy (Years)',
                hovermode='closest',
                height=600
            )

            fig.show()
        else:
            print("Select items from the dropdown to view data")

In [34]:
def _on_selection_change(change):
    """Redraw the plot for the current mode using the newly selected names."""
    update_plot(analysis_dropdown.value, change.new)


# A mode change refreshes the selector; a selection change redraws the plot
analysis_dropdown.observe(update_components, names='value')
selection_widget.observe(_on_selection_change, names='value')

In [35]:
# Initialise the widgets and plot for the default mode (no change event needed)
update_components(change=None)

In [36]:
# Stack the controls above the output area and render the whole UI
app_layout = VBox(children=[analysis_dropdown, selection_widget, plot_output])
display(app_layout)

VBox(children=(Dropdown(description='Mode:', options=(('Top 5 Economies', 'top5'), ('Income Groups', 'income')…

# Script Complete