In [290]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pylab as plt

%matplotlib inline

In [209]:
obs = pd.read_csv('./obesity_clean.csv', index_col = 0)
obs_means = pd.read_csv('./obesity_means.csv', index_col = 0)
obs_years = pd.read_csv('./obesity_years.csv', index_col = 0)
pop = pd.read_csv('./pop_clean.csv', index_col = 0)

In [210]:
obs.head(3)

Unnamed: 0,country,year,pct_obese,country_code,sex,latest
7,Afghanistan,2016,4.5,AFG,BTSX,True
9,Afghanistan,2015,4.3,AFG,BTSX,False
11,Afghanistan,2014,4.1,AFG,BTSX,False


In [211]:
obs_means.head(3)

Unnamed: 0,year,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,Uruguay,Uzbekistan,Vanuatu,Venezuela (Bolivarian Republic of),Viet Nam,Yemen,Yemen Arab Republic (until 1990),Zambia,Zimbabwe,mean_yr
0,1975,0.4,5.7,5.9,13.4,0.7,5.1,11.1,6.6,10.5,...,11.8,3.9,4.7,8.3,,,2.5,1.3,3.0,6.074346
1,1976,0.4,5.8,6.1,14.0,0.8,5.3,11.5,6.8,10.7,...,12.1,4.0,4.9,8.6,,,2.6,1.4,3.2,6.270681
2,1977,0.5,6.0,6.2,14.5,0.8,5.4,11.8,7.0,11.0,...,12.4,4.2,5.1,8.8,0.1,,2.7,1.5,3.3,6.467539


In [212]:
obs_years.head(3)

Unnamed: 0,country,1975,1976,1977,1978,1979,1980,1981,1982,1983,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Afghanistan,0.4,0.4,0.5,0.5,0.5,0.6,0.6,0.6,0.7,...,2.8,2.9,3.1,3.3,3.5,3.7,3.9,4.1,4.3,4.5
1,Albania,5.7,5.8,6.0,6.1,6.3,6.4,6.6,6.8,7.0,...,16.9,17.5,18.1,18.7,19.3,19.9,20.5,21.1,21.7,22.3
2,Algeria,5.9,6.1,6.2,6.4,6.7,6.9,7.2,7.4,7.8,...,19.2,19.9,20.7,21.4,22.2,23.1,23.9,24.8,25.7,26.6


In [213]:
pop.head(3)

Unnamed: 0,country,country_code,year,SP.POP.TOTL,SP.POP.0014.TO,adult_pop
0,Afghanistan,AFG,1965,9956320.0,4279406.0,5676914.0
1,Afghanistan,AFG,1966,10174836.0,4408352.0,5766484.0
2,Afghanistan,AFG,1967,10399926.0,4537722.0,5862204.0


In [214]:
# Merge the datasets on country code and year so we can work with both if needed.
merge = obs.merge(pop[['country_code', 'year', 'SP.POP.TOTL', 'SP.POP.0014.TO', 'adult_pop']], 
          how = 'left', 
          left_on = ['country_code', 'year'], 
          right_on = ['country_code', 'year'])

In [215]:
merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8022 entries, 0 to 8021
Data columns (total 9 columns):
country           8022 non-null object
year              8022 non-null int64
pct_obese         8022 non-null float64
country_code      8022 non-null object
sex               8022 non-null object
latest            8022 non-null bool
SP.POP.TOTL       7847 non-null float64
SP.POP.0014.TO    7847 non-null float64
adult_pop         7847 non-null float64
dtypes: bool(1), float64(4), int64(1), object(3)
memory usage: 571.9+ KB


In [217]:
# Check which countries having missing data
merge[merge.isna().any(axis = 1) == True].country.unique()

array(['Cook Islands', 'Eritrea', 'Germany, Federal Republic (former)',
       'Kiribati (until 1984)', 'Kuwait', 'Niue', 'Serbia',
       'South Viet Nam (former)', 'Sudan (until 2011)'], dtype=object)

In [231]:
# We have a few countries missing data. Most appear to be historical so ISO codes wont apply
# Kuwait does not fit this assumption and needs investigation as it exists in both dataframes
merge[merge['country_code'] == 'KWT'][pd.isna(merge[merge['country_code'] == 'KWT']).any(axis = 1)]

Unnamed: 0,country,year,pct_obese,country_code,sex,latest,SP.POP.TOTL,SP.POP.0014.TO,adult_pop
3844,Kuwait,1994,25.7,KWT,BTSX,False,,,
3845,Kuwait,1993,25.6,KWT,BTSX,False,,,
3846,Kuwait,1992,25.4,KWT,BTSX,False,,,


In [232]:
# Looks like Gulf War conflict years, these can be ignored. What about Eritrea?
merge[merge['country_code'] == 'ERI'][pd.isna(merge[merge['country_code'] == 'ERI']).any(axis = 1)]

Unnamed: 0,country,year,pct_obese,country_code,sex,latest,SP.POP.TOTL,SP.POP.0014.TO,adult_pop
2352,Eritrea,2016,4.1,ERI,BTSX,True,,,
2353,Eritrea,2015,3.8,ERI,BTSX,False,,,
2354,Eritrea,2014,3.6,ERI,BTSX,False,,,
2355,Eritrea,2013,3.4,ERI,BTSX,False,,,
2356,Eritrea,2012,3.2,ERI,BTSX,False,,,


In [233]:
# 2011 was the last data provided, so this too makes sense. Cook Islands has no data reported but 
# the current population is under 20000. Shouldn't affect much. Serbia?
merge[merge['country_code'] == 'SRB'][pd.isna(merge[merge['country_code'] == 'SRB']).any(axis = 1)]

Unnamed: 0,country,year,pct_obese,country_code,sex,latest,SP.POP.TOTL,SP.POP.0014.TO,adult_pop
6300,Serbia,1981,9.8,SRB,BTSX,False,,,
6301,Serbia,1980,9.5,SRB,BTSX,False,,,
6302,Serbia,1979,9.1,SRB,BTSX,False,,,
6303,Serbia,1978,8.8,SRB,BTSX,False,,,
6304,Serbia,1977,8.5,SRB,BTSX,False,,,
6305,Serbia,1976,8.2,SRB,BTSX,False,,,
6306,Serbia,1975,7.9,SRB,BTSX,False,,,
6334,Serbia,1989,12.7,SRB,BTSX,False,,,
6335,Serbia,1988,12.3,SRB,BTSX,False,,,
6336,Serbia,1987,12.0,SRB,BTSX,False,,,


In [235]:
# Serbia didn't officailly exist until 2006, was recorded as Yugoslavia prior to 1990. Looks like
# all NA's are valid. This shouldn't be a problem. We need a column showing the obese population
merge = merge.assign(obese_pop = lambda x: x.adult_pop * x.pct_obese * .01)

In [236]:
merge.head(3)

Unnamed: 0,country,year,pct_obese,country_code,sex,latest,SP.POP.TOTL,SP.POP.0014.TO,adult_pop,obese_pop
0,Afghanistan,2016,4.5,AFG,BTSX,True,35383128.0,15664859.0,19718269.0,887322.105
1,Afghanistan,2015,4.3,AFG,BTSX,False,34413603.0,15443806.0,18969797.0,815701.271
2,Afghanistan,2014,4.1,AFG,BTSX,False,33370794.0,15230627.0,18140167.0,743746.847


In [254]:
# Pivotting the merged set on years to look at rate of growth in obese population numbers
obs_sums = merge.pivot_table(index = 'year', columns = 'country', values = 'obese_pop').\
                                    reset_index().\
                                    rename_axis('', axis = 'columns')

In [255]:
obs_sums.head(3)

Unnamed: 0,year,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,United States of America,Uruguay,Uzbekistan,Vanuatu,Venezuela (Bolivarian Republic of),Viet Nam,Yemen,Yemen Arab Republic (until 1990),Zambia,Zimbabwe
0,1975,27738.248,83895.393,521474.391,4114.47,26463.808,1873.638,2026515.012,122889.228,1057115.85,...,18950340.0,241543.05,307490.391,2582.603,620170.937,,,87875.5,33133.776,95983.8
1,1976,28151.132,87965.642,554596.75,4448.78,31160.8,1968.42,2126952.83,130830.64,1092576.679,...,19906240.0,248831.781,326432.24,2781.142,666747.164,,,93129.504,36806.154,105372.064
2,1977,35650.355,93913.2,580828.214,4751.795,32280.848,2027.484,2208714.442,139175.96,1143350.12,...,20741600.0,257016.412,355657.848,2990.079,707066.008,29521.042,,99022.149,40746.735,111814.956


In [256]:
obs_sums['sum_yr'] = obs_sums.loc[:, obs_sums.columns != 'year'].sum(axis = 1, skipna = True)

In [257]:
obs_sums[['year', 'sum_yr']].head(3)

Unnamed: 0,year,sum_yr
0,1975,101690300.0
1,1976,107017700.0
2,1977,112218400.0


In [259]:
# Pivotting the merged set on years to look at rate of growth in adult total population numbers
adult_sums = merge.pivot_table(index = 'year', columns = 'country', values = 'adult_pop').\
                                    reset_index().\
                                    rename_axis('', axis = 'columns')

In [261]:
adult_sums['sum_yr'] = adult_sums.loc[:, adult_sums.columns != 'year'].sum(axis = 1, skipna = True)

In [263]:
adult_sums[['year', 'sum_yr']].head(3)

Unnamed: 0,year,sum_yr
0,1975,2445823000.0
1,1976,2498616000.0
2,1977,2583692000.0


In [268]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = adult_sums.year,
                        y = adult_sums.sum_yr,
                        name = 'Adult Population'))

fig.add_trace(go.Scatter(x = obs_sums.year,
                        y = obs_sums.sum_yr,
                        name = 'Obese Population'))


fig.show()

In [308]:
# Interesting. This paints a very different picture of the prevalence of obesity. When viewed by country,
# the trends are alarming. When viewed by total population, we see a lower prevlaence of obesity in the
# overall adult population. While the individual nations are getting fatter, people are reproducing at
# such a high rate, living to adulthood more often and living longer, lowering the perception of the overall 
# proportion of the population that actually is obese. Let's view this numerically

print('Difference in adult population 1975-2016: ',
      math.ceil(adult_sums[adult_sums['year'] == 2016].sum_yr.iloc[0] - adult_sums[adult_sums['year'] == 1975].sum_yr.iloc[0]))

print('Difference in obese population 1975-2016: ', 
      math.ceil(obs_sums[obs_sums['year'] == 2016].sum_yr.iloc[0] - obs_sums[obs_sums['year'] == 1975].sum_yr.iloc[0]))

print('\nDifference between adult and obese populations 1975: ',
      math.ceil(adult_sums[adult_sums['year'] == 1975].sum_yr.iloc[0] - obs_sums[obs_sums['year'] == 1975].sum_yr.iloc[0]))

print('Difference between adult and obese populations 2016: ', 
      math.ceil(adult_sums[adult_sums['year'] == 2016].sum_yr.iloc[0] - obs_sums[obs_sums['year'] == 2016].sum_yr.iloc[0]))

print('\nPercent increase in adult population 1975-2016: ', 
      round((adult_sums[adult_sums['year'] == 2016].sum_yr.iloc[0] / adult_sums[adult_sums['year'] == 1975].sum_yr.iloc[0]) * 100, 0))

print('Percent increase in obese population 1975-2016: ', 
      round((obs_sums[obs_sums['year'] == 2016].sum_yr.iloc[0] / obs_sums[obs_sums['year'] == 1975].sum_yr.iloc[0]) * 100, 0))

print('\nPercent of obese adult population 1975: ', 
      round((obs_sums[obs_sums['year'] == 1975].sum_yr.iloc[0] / adult_sums[adult_sums['year'] == 1975].sum_yr.iloc[0]) * 100, 0))

print('Percent of obese adult population 2016: ', 
      round((obs_sums[obs_sums['year'] == 2016].sum_yr.iloc[0] / adult_sums[adult_sums['year'] == 2016].sum_yr.iloc[0]) * 100, 0))


Difference in adult population 1975-2016:  2976763274
Difference in obese population 1975-2016:  611095628

Difference between adult and obese populations 1975:  2344132958
Difference between adult and obese populations 2016:  4709800605

Percent increase in adult population 1975-2016:  222.0
Percent increase in obese population 1975-2016:  701.0

Percent of obese adult population 1975:  4.0
Percent of obese adult population 2016:  13.0


In [340]:
# Obesity is on the rise at a very high rate, which was downplayed by the visualization. 
# What does the distribution of obese look like compared to the entire population
fig = go.Figure()

fig.add_trace(go.Bar(x = adult_sums['year'], 
                     y = adult_sums['sum_yr'], 
                     name = 'Adult Pop'))

fig.add_trace(go.Bar(x = obs_sums['year'], 
                     y = obs_sums['sum_yr'], 
                     name = 'Obese Pop'))

annotations = []

# Adding labels
# Title
annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor = 'left', yanchor = 'top',
                              text = 'World Population vs. Obese Population',
                              font = dict(family = 'Arial', 
                                          size = 24, 
                                          color = 'rgb(37,37,37)'),
                              showarrow = False))
# Source
annotations.append(dict(xref='paper', yref='paper', x=0.5, y=-0.1,
                              xanchor = 'center', yanchor = 'top',
                              text = 'Source: World Health Organization & World Bank',
                              font = dict(family = 'Arial', 
                                          size = 12, 
                                          color = 'rgb(150,150,150)'),
                              showarrow=False))

fig.update_layout(barmode = 'overlay', 
                  annotations = annotations, 
                  xaxis=dict(showline=True, 
                             showgrid=False, 
                             showticklabels=True, 
                             linecolor='rgb(204, 204, 204)', 
                             linewidth=2,ticks='outside', 
                             tickfont=dict(family='Arial', 
                                           size=12, 
                                           color='rgb(82, 82, 82)')), 
                  yaxis=dict(showgrid=False, 
                             zeroline=False, 
                             showline=False),
                 plot_bgcolor = 'White')

fig.update_traces(opacity = 0.75)

fig.show()

In [314]:
# This graph shows a much better representation of the data. Predictive modelling might show us where this
# could end up...
obs_sums.to_csv('./obesity_sums.csv')
adult_sums.to_csv('./pop_sums.csv')