In [1]:
import pandas as pd
t1_df = pd.read_csv(filepath_or_buffer='/kaggle/input/mortality-rate-infant-per-1000-live-births/T1.csv')
# The real column labels sit in the first data row: promote that row to the header,
# turning float labels (the years) into ints and leaving every other label untouched.
columns = [int(label) if isinstance(label, float) else label for label in t1_df.iloc[0]]
t1_df = t1_df[1:]
t1_df.columns = columns

# Attach the region / income-group metadata from the second table by country name.
t2_df = pd.read_csv(filepath_or_buffer='/kaggle/input/mortality-rate-infant-per-1000-live-births/T2.csv')
df = t1_df.merge(right=t2_df, left_on='Country Name', right_on='TableName', how='inner')
# These three columns add no value to the analysis.
df = df.drop(columns=['Indicator Name', 'Indicator Code', 'TableName'])

# Per-country summary statistics computed across the yearly columns 1960..2021 only.
year_columns = list(range(1960, 2022, 1))
yearly_rates = df[year_columns]
df['max'] = yearly_rates.max(axis=1)
df['mean'] = yearly_rates.mean(axis=1)
df['min'] = yearly_rates.min(axis=1)
df['stdev'] = yearly_rates.std(axis=1)
df.head()

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2019,2020,2021,Region,IncomeGroup,SpecialNotes,max,mean,min,stdev
0,Australia,AUS,20.3,19.9,19.5,19.1,18.7,18.5,18.3,18.2,...,3.2,3.2,3.2,East Asia & Pacific,High income,The reporting period for national accounts dat...,20.3,9.341935,3.2,5.735754
1,Austria,AUT,37.3,34.9,32.9,31.1,29.6,28.3,27.2,26.3,...,2.9,3.0,3.0,Europe & Central Asia,High income,A simple multiplier is used to convert the nat...,37.3,12.125806,2.9,10.02814
2,Belgium,BEL,29.4,28.1,27.0,26.0,25.0,24.0,23.2,22.5,...,3.4,3.4,3.4,Europe & Central Asia,High income,A simple multiplier is used to convert the nat...,29.4,10.719355,3.3,7.796423
3,Bangladesh,BGD,174.8,170.7,166.9,163.4,160.4,157.8,155.7,154.2,...,25.3,24.1,22.9,South Asia,Lower middle income,The reporting period for national accounts dat...,211.7,98.395161,22.9,52.109533
4,Canada,CAN,27.8,26.9,26.0,25.0,24.0,23.1,22.1,21.2,...,4.5,4.5,4.4,North America,High income,Fiscal year end: March 31; reporting period fo...,27.8,10.220968,4.4,6.964668


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from plotly.express import treemap

NGRAM_RANGE = (1, 1)
TOPN = 100
# Bag-of-words over the free-text notes: English stop words are removed and a term
# must occur in at least three notes (min_df=3) to enter the vocabulary.
model = CountVectorizer(encoding='utf-8', stop_words='english', min_df=3, max_df=1.0, lowercase=True, ngram_range=NGRAM_RANGE)
# CountVectorizer cannot process NaN documents, so a missing note is treated as an empty string.
model_result = model.fit_transform(raw_documents=df['SpecialNotes'].fillna('').tolist())
# One row per vocabulary term with its total count across all notes.
note_df = pd.DataFrame(data={'word': model.get_feature_names_out().tolist(), 'count': model_result.toarray().sum(axis=0).tolist()})
# Flag purely numeric tokens so they can be left out of the chart.
note_df['numeric'] = note_df['word'].str.isnumeric()
print(note_df.shape)
# Select the plotted rows first so the title reports the number of terms actually drawn;
# len(note_df) would also count the numeric tokens that are filtered out here.
top_words = note_df[~note_df['numeric']].sort_values(ascending=False, by='count').head(n=TOPN)
treemap(data_frame=top_words, path=['word'], names='word', values='count', color='count', height=800,
        color_continuous_scale='bluered', title='{} {}-grams by count'.format(len(top_words), NGRAM_RANGE))

(96, 3)


Not surprisingly the notes are all about the methodology used to combine data from different sources.

In [3]:
from plotly.express import choropleth
# One world map per column of interest: the first and last year, the two
# categorical groupings, and the per-country summary statistics.
map_columns = [1960, 2021, 'Region', 'IncomeGroup', 'min', 'max', 'mean', 'stdev']
for color_column in map_columns:
    figure = choropleth(data_frame=df, locations='Country Code', color=color_column)
    figure.show()

Remember that lower values are better here. Because stdev is a magnitude, it doesn't tell us whether the change is an improvement or a decline; for that we need line plots.

In [4]:
from plotly.express import line
# Reshape to one row per year and one column per country; reset_index() exposes
# the years as the 'index' column used for the x axis.
rates_by_year = df.set_index(keys=['Country Name'])[list(range(1960, 2022, 1))].T.reset_index()
line(data_frame=rates_by_year, x='index', y=df['Country Name'].values, height=1000)

This is probably the nut graf; infant mortality has been declining almost everywhere, but there are places where it has not been declining steadily.

In [5]:
from plotly.express import scatter
# Lowest versus highest yearly rate per country, coloured by income group.
scatter(
    data_frame=df,
    x='min',
    y='max',
    color='IncomeGroup',
    hover_name='Country Name',
)

This also tells us what we expect: rich countries (generally) have low infant mortality and low variability in infant mortality, while poor countries (generally) have higher infant mortality and higher variability; Romania, Portugal, Lebanon, and Samoa are the edge cases. However, this chart is harder to understand, because plotting the max against the min and understanding the dispersion as variability is less intuitive than the lines above.

In [6]:
# Mean yearly rate versus its standard deviation per country, coloured by income group.
scatter(
    data_frame=df,
    x='mean',
    y='stdev',
    color='IncomeGroup',
    hover_name='Country Name',
)

Plotting the mean against the stdev tells us essentially the same story and has the same issues.