In [1]:
import warnings
import pandas as pd

# Location of the polio case estimates CSV under the Kaggle input directory.
POLIO = '/kaggle/input/new-polio-vaccines/paralytic-polio-cases new.csv'

# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

df = pd.read_csv(POLIO)

# Display the first rows as the cell output.
df.head()

Unnamed: 0,country,Year,Estimated polio cases
0,Afghanistan,1980,6160.0
1,Africa,1980,79660.0
2,Albania,1980,7.0
3,Algeria,1980,812.0
4,Angola,1980,224.0


In [2]:
# Summarise the frame: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7919 entries, 0 to 7918
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country                7919 non-null   object 
 1   Year                   7919 non-null   int64  
 2   Estimated polio cases  7919 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 185.7+ KB


In [3]:
# Count the distinct values in each column.
df.nunique()

country                  207
Year                      44
Estimated polio cases    638
dtype: int64

If we plot every country in a single chart, there are too many lines for it to tell us much.

In [4]:
from plotly import express

# One line per country for the years after 1980, drawn on a log-scaled y-axis.
express.line(
    data_frame=df.loc[df['Year'].gt(1980)],
    x='Year',
    y='Estimated polio cases',
    color='country',
    log_y=True,
    height=800,
)

If we focus on regions (continents + World) we get a simpler story: 
* Polio has almost been eradicated outside of Asia and Africa
* World polio cases are driven by dynamics in Asia early in our time series, and a combination of Asia and Africa later.

In [5]:
# Values of the 'country' column that denote aggregate regions
# (the continents plus the world total) rather than single countries.
REGION = {
    'Asia',
    'Europe',
    'Africa',
    'World',
    'Oceania',
    'North America',
    'South America',
}

# Line chart restricted to the aggregate-region rows, on a log-scaled y-axis.
express.line(
    data_frame=df.loc[df['country'].isin(REGION)],
    x='Year',
    y='Estimated polio cases',
    color='country',
    log_y=True,
    height=800,
)

In [6]:
# Per-country mean and standard deviation of the estimated case counts across
# all years, as a flat frame with columns ['country', 'mean', 'std'].
#
# The statistics are given as an ordered list. Passing them as a set has no
# guaranteed iteration order, so the aggregated columns could come back as
# (std, mean) and then be mislabeled by a positional column rename.
scatter_df = (
    df.groupby(by='country')['Estimated polio cases']
    .agg(['mean', 'std'])
    .reset_index()
)

In [7]:
# `express` is already imported by an earlier cell (`from plotly import express`),
# so it is not imported again here.
#
# Scatter each country's standard deviation against its mean on log-log axes,
# with a LOWESS trendline. The title and axis labels spell out what the bare
# 'mean' / 'std' column names refer to so the figure stands on its own.
express.scatter(
    data_frame=scatter_df,
    x='mean',
    y='std',
    log_x=True,
    log_y=True,
    hover_name='country',
    trendline='lowess',
    labels={
        'mean': 'Mean estimated polio cases',
        'std': 'Standard deviation of estimated polio cases',
    },
    title='Mean vs. standard deviation of estimated polio cases by country',
)

If we collapse the time dimension by taking each country's mean and standard deviation over all years, we see that the two are closely related; their trendline looks very linear on a log-log plot.