In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw obesity workbook into a DataFrame; the path is relative to the current working directory.
obs = pd.read_excel('./obesity_adults.xlsx')

In [3]:
# Dimensions of the raw frame as (rows, columns).
obs.shape

(24570, 26)

In [4]:
# Preview the first three rows of the raw data.
obs.head(3)

Unnamed: 0,Location,Indicator,Period,Dim1,Dim2,Tooltip,IndicatorCode,FactValueForMeasure,FactValueString,FactValueUoM,...,DataSourceDimValueCode,SpatialDimValueCode,Dim1ValueCode,Dim2ValueCode,Dim3ValueCode,SlicingDimCode,SlicingDimValueCode,IsLatestYear,LatestYearLabel,FactValueNumericPrefix
0,Afghanistan,"Prevalence of obesity among adults, BMI &Great...",2016,Both sexes,,4.5 [2.8-6.7],NCD_BMI_30C,4.5,,,...,,AFG,BTSX,,,,,True,Latest,
1,Afghanistan,"Prevalence of obesity among adults, BMI &Great...",2016,Male,,2.7 [1.1-5.4],NCD_BMI_30C,2.7,,,...,,AFG,MLE,,,,,True,Latest,
2,Afghanistan,"Prevalence of obesity among adults, BMI &Great...",2016,Female,,6.2 [3.5-10.2],NCD_BMI_30C,6.2,,,...,,AFG,FMLE,,,,,True,Latest,


In [5]:
# Alphabetical list of the column names (features) in the dataset
sorted(obs.columns)

["'Facts'[LANGUAGE_CODE]",
 'DataSourceDimValueCode',
 'Dim1',
 'Dim1ValueCode',
 'Dim2',
 'Dim2ValueCode',
 'Dim3ValueCode',
 'FactComments',
 'FactValueForMeasure',
 'FactValueNumericPrefix',
 'FactValueString',
 'FactValueUoM',
 'Indicator',
 'IndicatorCode',
 'IsLatestYear',
 'LanguageName',
 'LanguageOrder',
 'LatestYearLabel',
 'Location',
 'Period',
 'SlicingDimCode',
 'SlicingDimValueCode',
 'SpaceTime',
 'SpatialDimValueCode',
 'TimeDimValueCode',
 'Tooltip']

In [6]:
# Summarise the object (text) columns: non-null count, distinct values, most frequent value and its frequency.
obs.describe(include = ['object'])

Unnamed: 0,Location,Indicator,Dim1,Tooltip,IndicatorCode,FactValueForMeasure,FactComments,LanguageName,'Facts'[LANGUAGE_CODE],SpaceTime,SpatialDimValueCode,Dim1ValueCode,LatestYearLabel
count,24570,24570,24570,24570,24570,24570,504,24570,24570,24570,24570,24570,24570
unique,199,1,3,15756,1,592,1,1,1,199,199,3,42
top,Sudan (until 2011),"Prevalence of obesity among adults, BMI &Great...",Both sexes,No data,NCD_BMI_30C,No data,No data available.� Estimate could not be calc...,English,en,SDN736,SDN736,BTSX,Latest
freq,234,24570,8190,504,24570,504,504,24570,24570,234,234,8190,597


In [7]:
# Indicator and IndicatorCode hold one single value across all observations, so they carry no information
obs = obs.drop(columns=['Indicator', 'IndicatorCode'])

In [8]:
# Every row is in English, so both language columns are dropped along with LanguageOrder
obs = obs.drop(columns=['LanguageName', "'Facts'[LANGUAGE_CODE]", 'LanguageOrder'])

In [9]:
# Dim1 and Dim1ValueCode describe the same feature; the ValueCode column is already a compact code, so Dim1 is dropped.
# Codes seen in the preview rows: BTSX = Both sexes, MLE = Male, FMLE = Female
obs = obs.drop(columns=['Dim1'])

In [10]:
# Tooltip (the value followed by a bracketed range, e.g. "4.5 [2.8-6.7]") is useful but not needed for this analysis
obs = obs.drop(columns=['Tooltip'])

In [11]:
# SpaceTime and SpatialDimValueCode both hold the country code; only SpaceTime is kept
obs = obs.drop(columns=['SpatialDimValueCode'])

In [12]:
# LatestYearLabel is a text representation of IsLatestYear rather than a bool; keeping IsLatestYear.
# NOTE(review): the summary above reports 42 distinct LatestYearLabel values, so it is not a plain
# two-valued flag - confirm nothing beyond Period/IsLatestYear is lost by dropping it.
obs = obs.drop(columns=['LatestYearLabel'])

In [13]:
# Preview the first three rows to check which columns remain at this point.
obs.head(3)

Unnamed: 0,Location,Period,Dim2,FactValueForMeasure,FactValueString,FactValueUoM,FactComments,SpaceTime,TimeDimValueCode,DataSourceDimValueCode,Dim1ValueCode,Dim2ValueCode,Dim3ValueCode,SlicingDimCode,SlicingDimValueCode,IsLatestYear,FactValueNumericPrefix
0,Afghanistan,2016,,4.5,,,,AFG,2016,,BTSX,,,,,True,
1,Afghanistan,2016,,2.7,,,,AFG,2016,,MLE,,,,,True,
2,Afghanistan,2016,,6.2,,,,AFG,2016,,FMLE,,,,,True,


In [14]:
# Count missing values per column; Dim2 and the FactValueXXX features are the ones under suspicion
na_per_column = obs.isna().sum()
print(na_per_column)

Location                      0
Period                        0
Dim2                      24570
FactValueForMeasure           0
FactValueString           24570
FactValueUoM              24570
FactComments              24066
SpaceTime                     0
TimeDimValueCode              0
DataSourceDimValueCode    24570
Dim1ValueCode                 0
Dim2ValueCode             24570
Dim3ValueCode             24570
SlicingDimCode            24570
SlicingDimValueCode       24570
IsLatestYear                  0
FactValueNumericPrefix    24570
dtype: int64


In [15]:
# Dim2 and the other dimension / slicing / data-source code columns are NaN on every row, so they are dropped
all_nan_dimension_cols = [
    'Dim2',
    'Dim2ValueCode',
    'Dim3ValueCode',
    'SlicingDimCode',
    'SlicingDimValueCode',
    'DataSourceDimValueCode',
]
obs = obs.drop(columns=all_nan_dimension_cols)


In [16]:
# The FactValueString / FactValueUoM / FactValueNumericPrefix columns are NaN on every row.
# FactComments is populated on only 504 rows and is removed as well.
obs = obs.drop(columns=['FactValueString', 'FactValueUoM', 'FactComments', 'FactValueNumericPrefix'])

In [17]:
# Check the dtype of each remaining column.
obs.dtypes

Location               object
Period                  int64
FactValueForMeasure    object
SpaceTime              object
TimeDimValueCode        int64
Dim1ValueCode          object
IsLatestYear             bool
dtype: object

In [18]:
# Summary statistics for the int64 columns, used to compare Period with TimeDimValueCode.
obs.describe(include = ['int64'])

Unnamed: 0,Period,TimeDimValueCode
count,24570.0,24570.0
mean,1995.5,1995.5
std,12.121165,12.121165
min,1975.0,1975.0
25%,1985.0,1985.0
50%,1995.5,1995.5
75%,2006.0,2006.0
max,2016.0,2016.0


In [19]:
# Period and TimeDimValueCode show identical summary statistics; Period is kept because it is easier to understand
obs = obs.drop(columns=['TimeDimValueCode'])

In [20]:
# Re-summarise the object (text) columns that are still in the frame.
obs.describe(include = ['object'])

Unnamed: 0,Location,FactValueForMeasure,SpaceTime,Dim1ValueCode
count,24570,24570,24570,24570
unique,199,592,199,3
top,Sudan (until 2011),No data,SDN736,BTSX
freq,234,504,234,8190


In [21]:
# Preview the reduced frame; this shows the column order before renaming.
obs.head(3)

Unnamed: 0,Location,Period,FactValueForMeasure,SpaceTime,Dim1ValueCode,IsLatestYear
0,Afghanistan,2016,4.5,AFG,BTSX,True
1,Afghanistan,2016,2.7,AFG,MLE,True
2,Afghanistan,2016,6.2,AFG,FMLE,True


In [23]:
# Rename columns for easier reading.
# A name-based mapping is used instead of assigning obs.columns positionally, so a change
# in column order or count cannot silently attach the wrong label to a column. Keys that are
# not present are ignored by rename, which also makes this cell safe to re-run.
obs = obs.rename(columns={
    'Location': 'country',
    'Period': 'year',
    'FactValueForMeasure': 'pct_obese',
    'SpaceTime': 'country_code',
    'Dim1ValueCode': 'sex',
    'IsLatestYear': 'latest',
})

In [24]:
# Preview the frame to confirm the new column names.
obs.head(3)

Unnamed: 0,country,year,pct_obese,country_code,sex,latest
0,Afghanistan,2016,4.5,AFG,BTSX,True
1,Afghanistan,2016,2.7,AFG,MLE,True
2,Afghanistan,2016,6.2,AFG,FMLE,True


In [28]:
# Check dtypes again; pct_obese is still object at this point.
obs.dtypes

country         object
year             int64
pct_obese       object
country_code    object
sex             object
latest            bool
dtype: object

In [143]:
# pct_obese should be float but is stored as object, so profile its text content first.
# Boolean masks are summed instead of indexing value_counts() by 0/1: that lookup raises
# KeyError whenever the requested True/False bucket happens to be empty.
pct_text = obs['pct_obese']
n_rows = len(obs)

print('Number of Observations:    ', n_rows)
print('Number of Empty Fields:    ', pct_text.isnull().sum())
# .eq(True) turns the NaN that .str methods return for non-string cells into False.
# "non-Alpha" here means: not made up purely of letters/digits (e.g. contains '.' or a space).
print('Number of non-Alpha Fields:', n_rows - pct_text.str.isalnum().eq(True).sum())
# Digit-only strings, i.e. values written without a decimal point.
print('Number of Numeric Fields:  ', pct_text.str.isdigit().eq(True).sum())
# Cells that consist solely of whitespace characters.
print('Fields with White Space:   ', pct_text.str.isspace().eq(True).sum())

Number of Observations:     24570
Number of Empty Fields:     0
Number of non-Alpha Fields: 22310
Number of Numeric Fields:   2260
Fields with White Space:    0


In [206]:
# Some cells hold the placeholder text 'No data', which is why the column was read as object.
# errors='coerce' converts every cell that cannot be parsed as a number into NaN.
obs['pct_obese'] = pd.to_numeric(obs['pct_obese'], errors = 'coerce')

# Sum the mask rather than using value_counts()[1], which raises KeyError when nothing is missing.
missing_pct = obs['pct_obese'].isna()
print('Fields with no data:', missing_pct.sum())
print('\nCountries with no Data:', obs.loc[missing_pct, 'country'].unique())

Fields with no data: 504

Countries with no Data: ['Monaco' 'San Marino' 'South Sudan' 'Sudan' 'Sudan (until 2011)']


In [207]:
# Drop the entries with no obesity figure. The column is named explicitly so that a NaN
# in any other column can never cause additional rows to be removed silently.
obs = obs.dropna(subset=['pct_obese'])

In [208]:
# Final check of row count, non-null counts and dtypes after cleaning.
obs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24066 entries, 0 to 24569
Data columns (total 6 columns):
country         24066 non-null object
year            24066 non-null int64
pct_obese       24066 non-null float64
country_code    24066 non-null object
sex             24066 non-null object
latest          24066 non-null bool
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 1.1+ MB


In [209]:
# Write the cleaned frame to CSV in the current working directory.
# NOTE(review): to_csv writes the row index by default, so the file gets a leading unnamed
# column holding the original (now non-contiguous) row labels - confirm that is wanted.
obs.to_csv('./obesity_clean.csv')