<a href="https://www.kaggle.com/madeelbadar/child-mortality-and-gdp-per-capita?scriptVersionId=89242289" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px

In [2]:
#import data as csv
df=pd.read_csv('../input/childmortalitygdppercapita/child-mortality-gdp-per-capita.csv')

In [3]:
#show first 30 rows of dataframe
df.head(30)

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,145446-annotations,Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,,Asia
1,Afghanistan,AFG,1957,38.1194,1253.0,,8535157.0,
2,Afghanistan,AFG,1958,37.519,1298.0,,8680097.0,
3,Afghanistan,AFG,1959,36.9361,1307.0,,8833947.0,
4,Afghanistan,AFG,1960,36.37,1326.0,,8996967.0,
5,Afghanistan,AFG,1961,35.75,1309.0,,9169406.0,
6,Afghanistan,AFG,1962,35.15,1302.0,,9351442.0,
7,Afghanistan,AFG,1963,34.55,1298.0,,9543200.0,
8,Afghanistan,AFG,1964,33.97,1291.0,,9744772.0,
9,Afghanistan,AFG,1965,33.39,1290.0,,9956318.0,


In [4]:
#shape of the data frame
df.shape

(59844, 8)

In [5]:
#count N/A values in each column
df.isna().sum()

Entity                                                0
Code                                               2008
Year                                                  0
Child mortality (Select Gapminder, v10) (2017)    46332
GDP per capita                                    39968
145446-annotations                                59823
Population (historical estimates)                  4188
Continent                                         59559
dtype: int64

In [6]:
#drop these too column becuase they have too many missing values
df=df.drop(columns=['145446-annotations','Continent'])

In [7]:
#now drop any row that has any N/A values
df=df.dropna(how='any')

In [8]:
#the new shape of the dataframe
df.shape

(11168, 6)

In [9]:
#create a new dataframe that only has values where year is 2016
df2016=df.loc[df['Year'] == 2016]

In [10]:
#taking another look at the data 
df2016.head()

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,Population (historical estimates)
60,Afghanistan,AFG,2016,7.04,1929.0,35383028.0
558,Albania,ALB,2016,1.35,10342.0,2886427.0
841,Algeria,DZA,2016,2.52,14331.0,40551398.0
1447,Angola,AGO,2016,8.25,8453.0,28842482.0
2151,Argentina,ARG,2016,1.11,18875.0,43508459.0


In [11]:
#plotting a interactive choropleth by child mortality rate
fig = px.choropleth(df2016, locations="Code",
                    color="Child mortality (Select Gapminder, v10) (2017)",
                    hover_name="Entity", 
                    color_continuous_scale='Turbo')
                    
fig.show()
fig.write_html("child_mortality_choropleth.html")

In [12]:
#choropleth showing gdp per capita by country
fig = px.choropleth(df2016, locations="Code",
                    color="GDP per capita",
                    hover_name="Entity", 
                    color_continuous_scale='Turbo')
fig.show()
fig.write_html("gdp_per_capita_choropleth.html")

In [13]:
#this is another csv file that i found that contains country code and a bunch of geographic information
#this will make data cleaning a lot easier
continent=pd.read_csv('../input/iso3-and-continent-data/isocontinent.csv')
continent.head()


Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [14]:
#renaming column because i prefer it to say code instead of alpha-3
continent=continent.rename(columns={'alpha-3':'Code'})

In [15]:
#list all the columns in this dataframe
continent.columns

Index(['name', 'alpha-2', 'Code', 'country-code', 'iso_3166-2', 'region',
       'sub-region', 'intermediate-region', 'region-code', 'sub-region-code',
       'intermediate-region-code'],
      dtype='object')

In [16]:
#creating a new dataframe by merging df2016 and continent 
df2016 = pd.merge(left=df2016, right=continent, how='left', left_on='Code', right_on='Code')

In [17]:
#show me the last few rows of the dataframe
df2016.tail()

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,Population (historical estimates),name,alpha-2,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
160,Venezuela,VEN,2016,1.63,15219.0,29851249.0,Venezuela (Bolivarian Republic of),VE,862,ISO 3166-2:VE,Americas,Latin America and the Caribbean,South America,19.0,419.0,5.0
161,Vietnam,VNM,2016,2.16,6062.0,93640435.0,Viet Nam,VN,704,ISO 3166-2:VN,Asia,South-eastern Asia,,142.0,35.0,
162,Yemen,YEM,2016,5.53,2506.0,27168210.0,Yemen,YE,887,ISO 3166-2:YE,Asia,Western Asia,,142.0,145.0,
163,Zambia,ZMB,2016,6.34,3479.0,16363449.0,Zambia,ZM,894,ISO 3166-2:ZM,Africa,Sub-Saharan Africa,Eastern Africa,2.0,202.0,14.0
164,Zimbabwe,ZWE,2016,5.64,1534.0,14030338.0,Zimbabwe,ZW,716,ISO 3166-2:ZW,Africa,Sub-Saharan Africa,Eastern Africa,2.0,202.0,14.0


In [18]:
#scatter plot showing child mortality on the y axis and gdp per capita on the x axis
#bubble size represents population
fig = px.scatter(df2016, y="Child mortality (Select Gapminder, v10) (2017)", x="GDP per capita", color="sub-region",hover_name="Entity", log_x=True,size_max=60,size="Population (historical estimates)")
fig.show()
fig.write_html("country_gdp_childmortality_scatter.html")

In [19]:
#printing the number of N/A values by column
df2016.isna().sum()

Entity                                             0
Code                                               0
Year                                               0
Child mortality (Select Gapminder, v10) (2017)     0
GDP per capita                                     0
Population (historical estimates)                  0
name                                               0
alpha-2                                            1
country-code                                       0
iso_3166-2                                         0
region                                             0
sub-region                                         0
intermediate-region                               94
region-code                                        0
sub-region-code                                    0
intermediate-region-code                          94
dtype: int64

In [20]:
#selcting the columns that i will need in each dataframe
df2016_continet=df2016[['GDP per capita','sub-region','Child mortality (Select Gapminder, v10) (2017)']]
df2016_population=df2016[['sub-region','Population (historical estimates)']]

In [21]:
#group by 'sub region' and find mean of the other columns
df2016_continent=df2016_continet.groupby('sub-region').mean().reset_index()

#group by 'sub region' and find sum of the other columns
df2016_population=df2016_population.groupby('sub-region').sum().reset_index()

In [22]:
#merge the two dataframes after grouping
df_merged = pd.merge(left=df2016_continent, right=df2016_population, how='left', left_on='sub-region', right_on='sub-region')

In [23]:
#the first 30 rows of dataframe
df_merged.head(30)

Unnamed: 0,sub-region,GDP per capita,"Child mortality (Select Gapminder, v10) (2017)",Population (historical estimates)
0,Australia and New Zealand,41570.0,0.455,28921980.0
1,Central Asia,13381.2,3.014,69672930.0
2,Eastern Asia,24761.503717,0.936367,1628404000.0
3,Eastern Europe,19366.8,0.722,294252000.0
4,Latin America and the Caribbean,13241.153846,1.896458,626018500.0
5,Northern Africa,9423.5,2.778333,227768300.0
6,Northern America,48380.0,0.57,359398900.0
7,Northern Europe,41725.6,0.35,103802300.0
8,South-eastern Asia,15890.333333,2.707778,639827600.0
9,Southern Asia,6749.714286,4.077143,1849357000.0


In [24]:
#plot scatter plot showing sub region by gdp per capita and chil mortality
#bubble size represents population
fig = px.scatter(df_merged, x="GDP per capita", y="Child mortality (Select Gapminder, v10) (2017)", size="Population (historical estimates)", color="sub-region",
           hover_name="sub-region", log_x=False, size_max=60, trendline="ols", trendline_scope="overall")
fig.show()
fig.write_html("region_gdp_childmortality_scatter.html")

In [25]:
#from df selct only rows where year is between 2006 and 2016
df=df.loc[(df['Year'] >= 2006) & (df['Year'] <= 2016)]

In [26]:
#first 30 rows of df
df.head(30)

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,Population (historical estimates)
50,Afghanistan,AFG,2006,10.63,1057.0966,26433058.0
51,Afghanistan,AFG,2007,10.22,1259.9967,27100542.0
52,Afghanistan,AFG,2008,9.82,1319.6074,27722281.0
53,Afghanistan,AFG,2009,9.41,1557.3206,28394806.0
54,Afghanistan,AFG,2010,9.02,1627.6716,29185511.0
55,Afghanistan,AFG,2011,8.64,1792.0,30117411.0
56,Afghanistan,AFG,2012,8.28,1945.0,31161378.0
57,Afghanistan,AFG,2013,7.93,2025.0,32269592.0
58,Afghanistan,AFG,2014,7.61,2022.0,33370803.0
59,Afghanistan,AFG,2015,7.32,1928.0,34413603.0


In [27]:
#finding correlation of the last 11 rows between child mortality and gdp per capity
df['corr']=df['Child mortality (Select Gapminder, v10) (2017)'].rolling(11).corr(df['GDP per capita'])

In [28]:
#first 30 rows of df
df.head(30)

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,Population (historical estimates),corr
50,Afghanistan,AFG,2006,10.63,1057.0966,26433058.0,
51,Afghanistan,AFG,2007,10.22,1259.9967,27100542.0,
52,Afghanistan,AFG,2008,9.82,1319.6074,27722281.0,
53,Afghanistan,AFG,2009,9.41,1557.3206,28394806.0,
54,Afghanistan,AFG,2010,9.02,1627.6716,29185511.0,
55,Afghanistan,AFG,2011,8.64,1792.0,30117411.0,
56,Afghanistan,AFG,2012,8.28,1945.0,31161378.0,
57,Afghanistan,AFG,2013,7.93,2025.0,32269592.0,
58,Afghanistan,AFG,2014,7.61,2022.0,33370803.0,
59,Afghanistan,AFG,2015,7.32,1928.0,34413603.0,


In [29]:
#select rows where year is 2016
df_corr_2016=df.loc[df['Year']==2016]

In [30]:
#show last 30 rows of dataframe
df_corr_2016.tail(30)

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,Population (historical estimates),corr
47404,Slovakia,SVK,2016,0.59,25364.0,5442001.0,-0.940284
47635,Slovenia,SVN,2016,0.23,26908.0,2074205.0,0.440332
48411,South Africa,ZAF,2016,4.33,12139.0,56207649.0,-0.978706
49038,South Korea,KOR,2016,0.34,36103.0,50983446.0,-0.957036
49613,Spain,ESP,2016,0.33,30110.0,46634131.0,0.771715
49851,Sri Lanka,LKA,2016,0.94,11149.0,21021177.0,-0.970614
50123,Sudan,SDN,2016,6.51,3651.0,39847433.0,-0.609728
50847,Sweden,SWE,2016,0.29,44659.0,9836003.0,-0.490494
51516,Switzerland,CHE,2016,0.41,59662.0,8379915.0,-0.920004
51702,Syria,SYR,2016,1.9123,3091.0,17465567.0,-0.833845


In [31]:
#plot the correlation between gdp per capita and child mortality rates
fig = px.choropleth(df_corr_2016, locations="Code",
                    color="corr",
                    hover_name="Entity", 
                    color_continuous_scale='Turbo',
                    title='correlation between gdp per capita & child mortality rates',
                    height=800
                    )

   

                    
fig.show()

In [32]:
#checking out why italy has a positive correlation
italy=df.loc[df['Entity']=='Italy']
italy.tail(30)

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,Population (historical estimates),corr
24968,Italy,ITA,2006,0.43,35713.8042,58542619.0,-0.593875
24969,Italy,ITA,2007,0.42,36310.7349,58747862.0,-0.426079
24970,Italy,ITA,2008,0.41,35942.9393,58922116.0,-0.30325
24971,Italy,ITA,2009,0.41,34055.2256,59105622.0,-0.108425
24972,Italy,ITA,2010,0.4,34765.9379,59325232.0,0.136383
24973,Italy,ITA,2011,0.39,35151.0,59589070.0,0.344941
24974,Italy,ITA,2012,0.38,34068.0,59879469.0,0.528431
24975,Italy,ITA,2013,0.37,33094.0,60166828.0,0.714717
24976,Italy,ITA,2014,0.36,32829.0,60409622.0,0.79986
24977,Italy,ITA,2015,0.34,33118.0,60578489.0,0.849304


In [33]:
#checking why finland has possitive correlation
sweden=df.loc[df['Entity']=='Finland']
sweden.tail(30)

Unnamed: 0,Entity,Code,Year,"Child mortality (Select Gapminder, v10) (2017)",GDP per capita,Population (historical estimates),corr
17383,Finland,FIN,2006,0.36,38160.7026,5277490.0,-0.760738
17384,Finland,FIN,2007,0.35,39998.5254,5297741.0,-0.893104
17385,Finland,FIN,2008,0.33,40129.6533,5319452.0,-0.930678
17386,Finland,FIN,2009,0.32,36662.6133,5342266.0,-0.949675
17387,Finland,FIN,2010,0.3,37615.1134,5365784.0,-0.963279
17388,Finland,FIN,2011,0.29,38432.0,5390036.0,-0.972845
17389,Finland,FIN,2012,0.28,37704.0,5414769.0,-0.981623
17390,Finland,FIN,2013,0.26,37246.0,5438984.0,-0.996664
17391,Finland,FIN,2014,0.25,36858.0,5461410.0,-0.995974
17392,Finland,FIN,2015,0.24,36836.0,5481128.0,-0.992293
