# Import relevant libraries

In [1]:
import zipfile
import pandas as pd
import plotly.express as px

from pprint import pprint
from ipywidgets import interact, Dropdown, SelectMultiple, Output, VBox
from IPython.display import display

In [2]:
# these libraries are specific to generating gifs

import matplotlib.pyplot as plt

from matplotlib import colormaps
from matplotlib.animation import FuncAnimation
from matplotlib.colors import rgb2hex

%matplotlib inline

# Some data inspection settings

In [3]:
pd.options.display.max_rows=999
pd.options.display.max_columns=999
pd.options.display.max_colwidth=None

------
## Life expectancy dataset

In [4]:
with zipfile.ZipFile("temp/API_SP.DYN.LE00.IN_DS2_en_CSV_v2_76065.zip", 'r') as zip_ref:
  zip_ref.extractall("temp/")

In [5]:
life_expect_df = pd.read_csv("temp/API_SP.DYN.LE00.IN_DS2_en_CSV_v2_76065.csv", 
                             skiprows=4, 
                             encoding='utf-8-sig', # byte order mark
                             sep=",",
                             na_values=['']
                       )

# checking the shape & dropping columns with all NaNs
print(life_expect_df.shape)
life_expect_df = life_expect_df.dropna(axis=1, how='all')
print(life_expect_df.shape)

life_expect_df.sample(3)

(266, 69)
(266, 67)


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
25,Belarus,BLR,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,69.25461,69.73839,69.462317,69.70978,70.619244,70.597488,70.768927,70.92978,70.81278,70.978024,70.744488,71.011707,70.849317,71.000659,70.854683,70.457098,70.396829,70.289024,70.081732,69.933634,69.927707,69.907512,70.076902,69.794854,69.563195,70.992683,71.549512,70.990244,71.341463,71.482927,70.836585,70.378049,70.021951,68.970732,68.768293,68.460976,68.512195,68.460976,68.407317,67.907317,68.912195,68.507317,68.056098,68.553659,68.956098,68.85122,69.404878,70.207317,70.456098,70.407317,70.404878,70.553659,71.965854,72.470732,72.970732,73.62439,73.826829,74.129268,74.17561,74.226829,72.45722,72.370683,73.111463
217,Sub-Saharan Africa,SSF,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,41.417413,41.734981,42.197081,42.434913,42.888275,43.023923,42.979434,43.222156,43.660286,43.899089,44.358007,44.865162,44.983003,45.712946,45.929501,46.314379,46.987936,47.449958,47.750374,48.210106,48.574648,48.944645,49.189711,48.360862,48.39624,48.649955,49.04503,49.355379,49.024311,49.899604,49.836882,49.706058,49.470813,49.696244,50.011683,50.110809,50.041304,50.207314,49.91711,50.640805,51.263404,51.535231,51.889081,52.361821,52.835105,53.445397,54.196429,54.827885,55.424506,56.192569,56.818006,57.527173,58.107298,58.632535,59.105918,59.523425,60.027155,60.437551,60.826044,61.211033,60.821637,60.219059,60.744131
231,Europe & Central Asia (IDA & IBRD countries),TEC,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,64.815502,65.041466,64.884397,65.45265,65.9352,65.869868,66.040433,65.837088,65.962493,65.625552,66.226273,66.471313,66.641437,66.777333,66.935338,66.747517,66.754008,66.751168,66.817782,66.776506,66.923647,67.237399,67.567647,67.611678,67.502956,67.856313,68.744301,68.893397,68.922698,69.012267,68.916286,68.641314,67.773993,66.982022,66.978945,67.032962,67.596664,68.08522,68.5088,68.190808,68.385486,68.56119,68.655678,68.777056,69.113457,69.200446,69.783722,70.244165,70.547655,71.094457,71.410535,71.881002,72.253734,72.688072,72.901371,73.118919,73.434186,73.86019,74.0284,74.32279,72.681106,71.999408,73.913721


------
# GDP per capita

In [6]:
with zipfile.ZipFile("temp/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_76317.zip", 'r') as zip_ref:
  zip_ref.extractall("temp/")

In [7]:
gdppercap = pd.read_csv("temp/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_76317.csv", 
                        skiprows=4, 
                        encoding='utf-8-sig',
                        sep=",",
                        na_values=['']
                       )
# checking the shape & dropping columns with all NaNs
print(gdppercap.shape)
gdppercap = gdppercap.dropna(axis=1, how='all')
print(gdppercap.shape)

gdppercap.sample(3)

(266, 69)
(266, 68)


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
192,Puerto Rico,PRI,GDP per capita (current US$),NY.GDP.PCAP.CD,717.514843,777.215027,854.744805,931.751666,1006.434446,1110.832691,1207.811824,1335.274112,1480.693176,1661.866885,1852.354673,2044.319906,2246.476714,2432.414219,2614.500941,2738.243153,2946.461953,3208.771356,3567.752629,4024.50942,4502.838428,4920.722937,5115.008514,5217.722729,5730.117009,6008.054486,6455.184461,6980.490143,7595.444656,8033.089441,8652.507492,9064.018517,9659.3389,10212.276763,10876.418824,11579.184997,12173.163689,12817.644962,14304.404987,15220.991342,16192.126972,18123.198702,18731.45939,19557.120249,20988.992333,21959.322697,22935.941159,23664.882349,24898.334586,25768.725888,26435.748786,27278.88305,27944.733894,28513.165735,28981.457331,29763.488301,30627.163402,31108.752751,31615.066792,32916.866801,31427.429114,32619.250417,35268.079102,36779.059491
207,Senegal,SEN,GDP per capita (current US$),NY.GDP.PCAP.CD,300.425115,308.166384,307.092565,308.629849,317.850427,314.318146,314.534344,305.220481,311.196147,287.253149,290.486425,291.088357,341.835352,381.439007,417.4076,547.378149,540.594007,540.126849,589.134045,716.281437,771.53397,682.97207,651.937123,564.369336,536.231765,571.417166,783.940525,916.094841,880.487755,848.555548,957.314337,913.399576,951.20496,877.529003,583.782776,714.994396,723.46995,650.776321,684.588106,677.516339,603.233162,637.213531,669.430447,817.888034,918.10208,979.846102,1017.201967,1188.938778,1398.79218,1308.694043,1275.883575,1374.675919,1326.837988,1379.831966,1398.891376,1218.006136,1266.439572,1356.805184,1452.556126,1431.181289,1461.087204,1598.10677,1564.745221,1706.44218
23,"Bahamas, The",BHS,GDP per capita (current US$),NY.GDP.PCAP.CD,1459.253539,1555.046304,1651.218295,1759.236577,1883.443491,2029.523016,2239.23964,2452.747632,2668.01498,3026.727572,2916.499984,3004.343566,3013.950167,3341.285217,3079.740141,2840.589653,2993.459238,3253.375434,3720.040579,4994.085352,5742.583625,6028.80641,6559.468693,7081.381948,8201.701338,9167.146875,9600.599528,10361.348808,10575.797154,11291.476447,11473.300839,11082.352857,10873.40475,10613.065879,10991.234667,11374.60567,11783.797197,20367.57327,21667.448822,24041.093319,24940.077509,25371.923767,26781.619594,26429.124779,26650.293423,28602.418187,29184.786509,30051.848959,29392.382442,27512.913095,27473.100431,27090.83768,28552.024268,27432.953427,28705.617084,30289.402251,30266.585922,31337.183246,32124.405106,32979.540128,25155.672543,28682.327,33044.388209,35896.505107


### Number of years for which no countries or few countries have data.
> Interested in only last 25 to 30 years of data.    
> If some countries have even less number of years, the threshold is `15 years`, if some country has less than 15 years, will not include it.

In [8]:
# Select the gdppc columns
gdppc_cols = [col for col in gdppercap.columns if col.isdigit()]

# Identify years with complete data (no missing values)
complete_years = [col for col in gdppc_cols if gdppercap[col].notna().all()]

print("Complete year columns:", complete_years)
print("Number of complete years:", len(complete_years))

Complete year columns: []
Number of complete years: 0


> Not even a single country has all years of data

- Prioritising last 21 years of data. Selecting only those countries which have last 21 years of data in the World Bank dataset.
- Life expectancy dataset has data till 2022. The range will be different for Life expectancy.

In [9]:
gdppercap['gdppc_non_null'] = gdppercap[gdppc_cols].notna().sum(axis=1)
# gdppercap[['Country Name', 'gdppc_non_null']]

In [10]:
last21 = [f"{year}" for year in range(2003, 2024)]
gdppercap['last21_complete'] = gdppercap[last21].notna().all(axis=1)
gdppercap_incomplete_countries = gdppercap.loc[~gdppercap['last21_complete'], 'Country Name']

In [11]:
# Countries missing last 21 years of data
pprint(gdppercap_incomplete_countries.tolist())

['American Samoa',
 'Bhutan',
 'Channel Islands',
 'Cuba',
 'Cayman Islands',
 'Eritrea',
 'Gibraltar',
 'Greenland',
 'Guam',
 'Isle of Man',
 'Not classified',
 'Lebanon',
 'Liechtenstein',
 'St. Martin (French part)',
 'Northern Mariana Islands',
 'New Caledonia',
 "Korea, Dem. People's Rep.",
 'French Polynesia',
 'San Marino',
 'South Sudan',
 'Sint Maarten (Dutch part)',
 'Syrian Arab Republic',
 'Tonga',
 'Venezuela, RB',
 'British Virgin Islands',
 'Virgin Islands (U.S.)',
 'Kosovo']


> *Similarly, selecting same number of years and same countries from `life-expectancy` dataset also.*

In [12]:
leb_cols = [col for col in life_expect_df.columns if col.isdigit()]

# Identify years with complete data (no missing values)
complete_years = [col for col in leb_cols if life_expect_df[col].notna().all()]

print("Complete year columns:", complete_years)
print("Number of complete years:", len(complete_years))

Complete year columns: []
Number of complete years: 0


In [13]:
life_expect_df['leb_non_null'] = life_expect_df[leb_cols].notna().sum(axis=1)

In [14]:
last20 = [f"{year}" for year in range(2003, 2023)]
life_expect_df['last20_complete'] = life_expect_df[last20].notna().all(axis=1)
life_expect_df_incomplete_countries = life_expect_df.loc[~life_expect_df['last20_complete'], 'Country Name']

In [15]:
pprint(life_expect_df_incomplete_countries.tolist())

['Andorra',
 'American Samoa',
 'Curacao',
 'Cayman Islands',
 'Not classified',
 'Monaco',
 'Northern Mariana Islands',
 'Palau',
 'San Marino']


#### Dropping these countries and number of unneccessary years from these 2 datasets.

> Also, I am clear about `Indicator Name` & `Indicator Code`. Let's get rid of these columns also from both datasets.

In [16]:
gdppercap = gdppercap[~gdppercap['Country Name'].isin(gdppercap_incomplete_countries)]
life_expect_df = life_expect_df[~life_expect_df['Country Name'].isin(life_expect_df_incomplete_countries)]

In [17]:
gdppercap = gdppercap[['Country Name', 'Country Code'] +
    [col for col in gdppercap.columns 
     if col.isdigit() and int(col) >= 2003]]

life_expect_df = life_expect_df[['Country Name', 'Country Code'] +
    [col for col in life_expect_df.columns 
     if col.isdigit() and int(col) >= 2003]]

In [18]:
gdppercap.shape, life_expect_df.shape

((239, 23), (257, 22))

In [19]:
print("Any NaNs in gdppercap:", gdppercap.isnull().values.any())
print("Any NaNs in life_expect_df:", life_expect_df.isnull().values.any())

Any NaNs in gdppercap: False
Any NaNs in life_expect_df: False


**Perfecto.!**
> Now, both of these datasets have `year numbers` like **2003, 2004.... 2024** and so on.

In [20]:
gdp = gdppercap.melt(
        id_vars=['Country Name', 'Country Code'],
        value_vars=[f"{year}" for year in list(range(2003, 2023))],
        var_name='year',
        value_name='gdp_per_capita')

leb = life_expect_df.melt(
        id_vars=['Country Name', 'Country Code'],
        value_vars=[f'{year}' for year in list(range(2003, 2023))],
        var_name='year',
        value_name='life_expectancy'
    )

print(gdp.shape, leb.shape)
gdp.sample(3)

(4780, 4) (5140, 4)


Unnamed: 0,Country Name,Country Code,year,gdp_per_capita
1871,Small states,SST,2010,10087.51447
3966,Middle income,MIC,2019,5267.927627
1116,Netherlands,NLD,2007,52100.799629


In [21]:
leb.sample(3)

Unnamed: 0,Country Name,Country Code,year,life_expectancy
2363,Czechia,CZE,2012,78.07561
477,Chad,TCD,2004,48.232
2757,Paraguay,PRY,2013,72.757


#### Data preparation and cleaning is done.
> Time to merge the data now.

### Merging `GDP per capita` & `Life expectancy` datasets
------------

> - It is usually a good practice to expect the outcome of your code.    
> - My expectation is that the number of rows will certainly remain the same because there are 235 countries (after cleaning) in both and we are joining on them.    
> - Expected number of columns will be 5.
> - Expected number of rows will be 4700 `(235 X 20)`
> - *FYI: 2023 GDP data is also removed because we do not have Life Expectancy for 2023.*

In [22]:
merged_df = pd.merge(left = gdp,
                     right = leb,
                     on = ['Country Name', 'Country Code', 'year'],
                     how = 'inner')

merged_df.shape, merged_df.drop_duplicates().shape

((4700, 5), (4700, 5))

In [23]:
merged_df.isna().any()

Country Name       False
Country Code       False
year               False
gdp_per_capita     False
life_expectancy    False
dtype: bool

In [24]:
merged_df.nunique()

Country Name        235
Country Code        235
year                 20
gdp_per_capita     4660
life_expectancy    4441
dtype: int64

In [25]:
merged_df.sample(3)

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy
747,Comoros,COM,2006,1154.904223,59.875
1704,Ecuador,ECU,2010,4520.30959,75.43
1146,Togo,TGO,2007,607.588224,56.511


> Perfect, everything looks under control.

In [26]:
merged_df[merged_df['Country Name'].str.contains('Euro area')].head()

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy
61,Euro area,EMU,2003,27017.620907,78.652026
296,Euro area,EMU,2004,30816.540152,79.228237
531,Euro area,EMU,2005,31800.443033,79.370612
766,Euro area,EMU,2006,33671.931401,79.803183
1001,Euro area,EMU,2007,38580.616203,80.043325


## Creating or identifying individual regions
1. Top 5 economies (as per 2022) *e.g. United States, China, Germany, etc.*
2. Income group *e.g. Early-demographic dividend, Pre-demographic dividend, Europe & Central Asia (excluding high income), etc.*
3. Regional group *e.g. Africa, Asia, Europe, etc*.
4. Economic group *e.g. Caribbean small states, Euro area, etc.*

In [27]:
def classify_country(name):
    top5_economies = {'United States', 'China', 'Germany', 'Japan', 'India'}
    if name in top5_economies:
        return 'is_top5'
    
    # Income Groups
    if any(keyword in name for keyword in ['income', 'dividend']):
        return 'income_group'
    
    # Regional Groups
    if any(keyword in name for keyword in ['Africa', 'Asia', 'Europe', 
                                         'America', 'Middle East', 'Pacific',
                                         'Caribbean', 'World', 'area']):
        return 'region_group'
    
    # Economic Groups
    if any(keyword in name for keyword in ['IBRD', 'IDA', 'OECD', 'Fragile',
                                         'small states', 'Euro area', 'EU']):
        return 'economic_group'
    
    # remaining countries as 'country'
    return 'country'

In [28]:
merged_df['country_type'] = merged_df['Country Name'].apply(classify_country)

In [29]:
merged_df.sample(15)

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy,country_type
2995,West Bank and Gaza,PSE,2015,3272.154324,74.406,country
4372,Malta,MLT,2021,38027.380173,82.507317,country
2721,Middle East & North Africa,MEA,2014,8372.742628,72.540657,region_group
3744,Tanzania,TZA,2018,1023.106262,66.535,country
1009,France,FRA,2007,41486.190783,81.112195,country
3356,Fragile and conflict affected situations,FCS,2017,1702.908452,61.615911,economic_group
366,"Macao SAR, China",MAC,2004,23407.199369,81.691,country
1681,China,CHN,2010,4550.473944,75.599,is_top5
461,St. Vincent and the Grenadines,VCT,2004,4873.489477,73.035,country
3590,European Union,EUU,2018,36002.564052,81.029426,region_group


## Visualisation in the form of animation
> - Now plotly doesn't directly support animated line chart, but there are always hack around it and I've tried one here.
> - Can clearly see that the `line` plot is kind of wavy and not showing any trend. 

In [30]:
# first one is created as a separate flag because it is something I wanted to see, rest exist in the data itself
# so, only classification is required.

merged_df['is_top5'] = merged_df['country_type'] == 'is_top5'

countries = merged_df[(merged_df['country_type'] == 'country')|(merged_df['country_type'] == 'is_top5')]['Country Name'].unique()
income_groups = merged_df[merged_df['country_type'] == 'income_group']['Country Name'].unique()
regions = merged_df[merged_df['country_type'] == 'region_group']['Country Name'].unique()

In [31]:
cumulative_data = []
years = sorted(merged_df['year'].unique())

for current_year in years:
    temp_df = merged_df[merged_df['year'] <= current_year].copy()
    temp_df['frame_year'] = current_year
    cumulative_data.append(temp_df)

cumulative_df = pd.concat(cumulative_data)

plot_output = Output()

# widget options (can see these in dropdowns)
analysis_modes = [
    ('Top 5 Economies', 'top5'),
    ('Income Groups', 'income'),
    ('Regions', 'region'),
    ('Custom Countries', 'custom')
]

In [32]:
# Create widgets
analysis_dropdown = Dropdown(options=analysis_modes, value='top5', description='Mode:')
selection_widget = SelectMultiple(options=[], description='Select:', disabled=True)

In [33]:
def update_components(change):
    """The function will make sure that as you are selecting different options,
    the components are updating"""
    with plot_output:
        plot_output.clear_output(wait=True)
    
    mode = analysis_dropdown.value
    if mode == 'top5':
        selection_widget.disabled = True
        selection_widget.options = []
        update_plot(mode, [])
    else:
        selection_widget.disabled = False
        if mode == 'income':
            options = merged_df[merged_df['country_type'] == 'income_group']['Country Name'].unique()
        elif mode == 'region':
            options = merged_df[merged_df['country_type'] == 'region_group']['Country Name'].unique()
        else:
            options = merged_df[(merged_df['country_type'] == 'country')|(merged_df['country_type'] == 'is_top5')]['Country Name'].unique()
        selection_widget.options = [(o, o) for o in options]
        selection_widget.value = [options[0]] if len(options) > 0 else []

In [34]:
def update_plot(mode, selections):
    with plot_output:
        plot_output.clear_output(wait=True)
        
        if mode == 'top5':
            plot_data = cumulative_df[cumulative_df['is_top5']]
            title = "Top 5 Economies Development Over Time"
        else:
            plot_data = cumulative_df[cumulative_df['Country Name'].isin(selections)]
            title = {
                'income': 'Income Group Trends',
                'region': 'Regional Development',
                'custom': 'Country Comparison'
            }[mode]
        
        if not plot_data.empty:
            fig = px.line(
                plot_data,
                x='gdp_per_capita',
                y='life_expectancy',
                color='Country Name',
                line_group='Country Name',
                hover_name='Country Name',
                animation_frame='frame_year',
                animation_group='Country Name',
                markers=True,
                title=f'<b>{title}</b>',
                log_x=True,
                range_x=[cumulative_df['gdp_per_capita'].min()*0.8, 
                        cumulative_df['gdp_per_capita'].max()*1.2],
                range_y=[cumulative_df['life_expectancy'].min()-5, 
                        cumulative_df['life_expectancy'].max()+5],
                template='plotly_white'
            )
            # a lot of this code is copied directly from plotly website
            fig.layout.updatemenus = [{
                'type': 'buttons',
                'showactive': False,
                'buttons': [{
                    'label': '▶ Play',
                    'method': 'animate',
                    'args': [None, {'frame': {'duration': 150, 'redraw': True}}]
                }]
            }]

            # I was kind of tired writing this, but yes, a pause button can also be added.
            
            # dynamic year
            fig.update_layout(
                annotations=[{
                    'x': 0.95,
                    'y': 0.05,
                    'xref': 'paper',
                    'yref': 'paper',
                    'text': 'Year: 2003',
                    'showarrow': False,
                    'font': {'size': 18, 'color': 'darkblue'}
                }]
            )
            
            for i, frame in enumerate(fig.frames):
                frame.layout.annotations = [{
                    'text': f'Year: {frame.name}',
                    'showarrow': False,
                    'x': 0.95,
                    'y': 0.05,
                    'font': {'size': 18, 'color': 'darkblue'}
                }]
            
            fig.update_traces(
                mode='lines+markers',
                line=dict(width=2.5),
                marker=dict(size=8, opacity=0.8)
            )
            
            fig.update_layout(
                xaxis_title='GDP per Capita (USD, Log Scale)',
                yaxis_title='Life Expectancy (Years)',
                hovermode='closest',
                height=600
            )
            
            
            fig.show()
        else:
            print("Select items from the dropdown to view data")

In [35]:
analysis_dropdown.observe(update_components, names='value')
selection_widget.observe(lambda c: update_plot(analysis_dropdown.value, c.new), names='value')

In [36]:
update_components(None)

In [37]:
# finally display it.
display(VBox([analysis_dropdown, selection_widget, plot_output]))

VBox(children=(Dropdown(description='Mode:', options=(('Top 5 Economies', 'top5'), ('Income Groups', 'income')…

## Generating gifs for the Blogpost
### [Beyond static charts](https://nikhilsingh.io/2025/03/17/python-animations-in-action-economic-and-health-trends.html)

### 1. Luxembourg & India (a simple comparison)

In [38]:
def create_animated_scatterplot(filtered_df, 
                                filename='animation.gif',
                                figsize=(12, 7),
                                fps=3,
                                interval=300,
                                colormap='tab10'):
    """
    Create animated GDP vs Life Expectancy plot for multiple countries
    
    Parameters:
    - filtered_df (pd.DataFrame): Must contain columns ['Country Code', 'year', 
                                  'gdp_per_capita', 'life_expectancy']
    - filename (str): Output filename
    - figsize (tuple): Figure dimensions
    - fps (int): Frames per second for animation
    - interval (int): Delay between frames in milliseconds
    - colormap (str): Matplotlib colormap name for country colors
    """
    
    # input df validation
    required_columns = ['Country Code', 'year', 'gdp_per_capita', 'life_expectancy']
    if not all(col in filtered_df.columns for col in required_columns):
        raise ValueError(f"DataFrame must contain columns: {required_columns}")
        
    # data prep for gifs
    df = filtered_df.sort_values(['Country Code', 'year']).copy()
    countries = df['Country Code'].unique()
    years = df['year'].unique()
    
    # color mapping
    # you can use cmap directly, although, it is outdated.
    cmap = colormaps.get_cmap(colormap)  # Changed this line
    country_colors = {
        code: rgb2hex(cmap(i)) 
        for i, code in enumerate(sorted(countries))
    }
    
    # figure setting
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_xscale('log')
    ax.set_xlabel('GDP per Capita (USD) - Log Scale', fontsize=12)
    ax.set_ylabel('Life Expectancy (Years)', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)
    
    # annotations
    lines = {}
    texts = []
    
    # line objects + static country labels
    for code in countries:
        country_data = df[df['Country Code'] == code]
        lines[code], = ax.plot([], [], 
                             color=country_colors[code],
                             marker='o',
                             markersize=8,
                             linewidth=2,
                             label=country_data['Country Name'].iloc[0])
        
        # permanent country label for final
        final_point = country_data.iloc[-1]
        texts.append(ax.text(
            x=final_point['gdp_per_capita'],
            y=final_point['life_expectancy'],
            s=country_data['Country Code'].iloc[0],
            color=country_colors[code],
            fontsize=10,
            visible=False  # Start hidden
        ))
    
    # Year annotation
    year_text = ax.text(0.02, 0.95, '', transform=ax.transAxes, 
                        fontsize=14, weight='bold')
    
    # setting axis limits with buffer
    ax.set_xlim(df['gdp_per_capita'].min() * 0.7, 
                df['gdp_per_capita'].max() * 1.3)
    ax.set_ylim(df['life_expectancy'].min() - 3,
                df['life_expectancy'].max() + 2)
    
    # Legend
    ax.legend(ncol=2, loc='lower right', frameon=True, 
             facecolor='white', framealpha=0.8)

    def init():
        """Initialize animation elements"""
        for line in lines.values():
            line.set_data([], [])
        for text in texts:
            text.set_visible(False)
        year_text.set_text('')
        return list(lines.values()) + texts + [year_text]

    def update(frame):
        """Update animation for each frame"""
        current_year = sorted(years)[frame]
        
        for code in countries:
            country_data = df[(df['Country Code'] == code) & 
                             (df['year'] <= current_year)]
            lines[code].set_data(country_data['gdp_per_capita'],
                                country_data['life_expectancy'])
        
        # year display
        year_text.set_text(f'Year: {current_year}')
        
        # country labels when their data exists
        for i, code in enumerate(countries):
            texts[i].set_visible(current_year >= df[df['Country Code'] == code]['year'].max())
        
        return list(lines.values()) + texts + [year_text]

    # save animations
    ani = FuncAnimation(fig, update, frames=len(years),
                       init_func=init, blit=True, interval=interval)
    
    ani.save(filename, writer='pillow', fps=fps, dpi=100)
    plt.close()
    
    print(f"Animation saved to {filename}")

In [39]:
countries = ['LUX', 'IND']
filtered = merged_df[merged_df['Country Code'].isin(countries)].copy()
print(filtered.shape)

create_animated_scatterplot(filtered, 
                            filename='lux_ind.gif', 
                            figsize=(15, 7),
                            colormap='Dark2')

(40, 7)
Animation saved to lux_ind.gif


### 2. Top 5 economies

In [41]:
filtered = merged_df[merged_df['is_top5']==True].copy()
print(filtered.shape)

create_animated_scatterplot(filtered, 
                            filename='top-5-economies_cm.gif', 
                            figsize=(20, 7),)

(100, 7)
Animation saved to top-5-economies_cm.gif


### 3. All income groups to show pattern

In [44]:
filtered = merged_df[merged_df['country_type']=='income_group'].copy()
print(filtered.shape)

create_animated_scatterplot(filtered, 
                            filename='all_income_groups.gif', 
                            figsize=(20, 7),)

(300, 7)
Animation saved to all_income_groups.gif


# Script Complete