# Analysis of the UN's World Happiness Index with machine learning  
Maaike de Jong  
June 2020  
  
See the repository's [README](https://github.com/maaikedj/happiness-machine-learning/blob/master/README.md) file for background and details on the analysis and data.  

### Notebook 1: combining data files

In this notebook I combine data on the World Happiness Index 2015-2019 with data on World Development Indicators from the World Bank.

In [1]:
# import packages

import numpy as np
import pandas as pd

In [2]:
# import happiness index files of the past 5 years

hap15 = pd.read_csv('../data/2015.csv')
hap15.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [3]:
hap16 = pd.read_csv('../data/2016.csv')
hap16.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [4]:
hap17 = pd.read_csv('../data/2017.csv')
hap17.head()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [5]:
hap18 = pd.read_csv('../data/2018.csv')
hap18.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


In [6]:
hap19 = pd.read_csv('../data/2019.csv')
hap19.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [7]:
# Select only country/ Region, happiness rank and happiness score
# Dealing with inconsistencies in column names

hap15_select = hap15[['Country', 'Region', 'Happiness Rank', 'Happiness Score']]
hap16_select = hap16[['Country', 'Happiness Rank', 'Happiness Score']]
hap17_select = hap17[['Country', 'Happiness.Rank', 'Happiness.Score']]
hap18_select = hap18[['Country or region', 'Overall rank', 'Score']]
hap19_select = hap19[['Country or region', 'Overall rank', 'Score']]

In [8]:
hap15_s1 = hap15_select.rename(columns={'Happiness Rank': 'Happiness Rank 2015', 'Happiness Score': 'Happiness Score 2015'})
hap15_s1.head()

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015
0,Switzerland,Western Europe,1,7.587
1,Iceland,Western Europe,2,7.561
2,Denmark,Western Europe,3,7.527
3,Norway,Western Europe,4,7.522
4,Canada,North America,5,7.427


In [9]:
hap16_s1 = hap16_select.rename(columns={'Happiness Rank': 'Happiness Rank 2016', 'Happiness Score': 'Happiness Score 2016'})
hap16_s1.head()

Unnamed: 0,Country,Happiness Rank 2016,Happiness Score 2016
0,Denmark,1,7.526
1,Switzerland,2,7.509
2,Iceland,3,7.501
3,Norway,4,7.498
4,Finland,5,7.413


In [11]:
hap17_s1 = hap17_select.rename(columns={'Happiness.Rank': 'Happiness Rank 2017', 'Happiness.Score': 'Happiness Score 2017'})
hap17_s1.head()

Unnamed: 0,Country,Happiness Rank 2017,Happiness Score 2017
0,Norway,1,7.537
1,Denmark,2,7.522
2,Iceland,3,7.504
3,Switzerland,4,7.494
4,Finland,5,7.469


In [12]:
hap18_s1 = hap18_select.rename(columns={'Country or region': 'Country', 'Overall rank': 'Happiness Rank 2018', 'Score': 'Happiness Score 2018'})
hap18_s1.head()

Unnamed: 0,Country,Happiness Rank 2018,Happiness Score 2018
0,Finland,1,7.632
1,Norway,2,7.594
2,Denmark,3,7.555
3,Iceland,4,7.495
4,Switzerland,5,7.487


In [13]:
hap19_s1 = hap19_select.rename(columns={'Country or region': 'Country', 'Overall rank': 'Happiness Rank 2019', 'Score': 'Happiness Score 2019'})
hap19_s1.head()

Unnamed: 0,Country,Happiness Rank 2019,Happiness Score 2019
0,Finland,1,7.769
1,Denmark,2,7.6
2,Norway,3,7.554
3,Iceland,4,7.494
4,Netherlands,5,7.488


In [14]:
# before merging on country, check whether countries are written the same

list15 = hap15_s1['Country'].values.tolist()
list16 = hap16_s1['Country'].values.tolist()

In [15]:
(list(set(list15) - set(list16)))

['Swaziland',
 'Lesotho',
 'Djibouti',
 'Central African Republic',
 'Oman',
 'Mozambique',
 'Somaliland region']

In [16]:
(list(set(list16) - set(list15)))

# Somaliland region occurs in both lists but written slightly different, needs to be changed

['Namibia',
 'Belize',
 'Puerto Rico',
 'South Sudan',
 'Somaliland Region',
 'Somalia']

In [17]:
hap16_s2 = hap16_s1.replace({'Country': {'Somaliland Region': 'Somaliland region'}})

In [18]:
# merge 2016 df on 2015 df

df = pd.merge(hap15_s1, hap16_s2, on = 'Country', how = 'outer')

In [19]:
# before merging 2017 df on country, again check whether countries are written the same

df_list = df['Country'].values.tolist()
list17 = hap17_s1['Country'].values.tolist()

In [20]:
(list(set(df_list) - set(list17)))

['Swaziland',
 'Hong Kong',
 'Puerto Rico',
 'Djibouti',
 'Suriname',
 'Laos',
 'Comoros',
 'Taiwan',
 'Oman',
 'Somaliland region']

In [21]:
(list(set(list17) - set(df_list)))

# in the 2017 df the names for Hong Kong and Taiwan need to be changed

['Taiwan Province of China', 'Hong Kong S.A.R., China']

In [22]:
hap17_s2 = hap17_s1.replace({'Country': {'Hong Kong S.A.R., China': 'Hong Kong', 'Taiwan Province of China': 'Taiwan'}})

In [23]:
# merge dataframes

df2 = pd.merge(df, hap17_s2, on = 'Country', how = 'outer')

In [24]:
# again before merging 2018, check whether countries are written the same

df2_list = df2['Country'].values.tolist()
list18 = hap18_s1['Country'].values.tolist()

In [25]:
(list(set(df2_list) - set(list18)))

['Swaziland',
 'North Cyprus',
 'Puerto Rico',
 'Djibouti',
 'Trinidad and Tobago',
 'Suriname',
 'Comoros',
 'Oman',
 'Somaliland region']

In [26]:
(list(set(list18) - set(df2_list)))

['Trinidad & Tobago', 'Northern Cyprus']

In [27]:
# fix country names

hap18_s2 = hap18_s1.replace({'Country': {'Trinidad & Tobago': 'Trinidad and Tobago', 'Northern Cyprus': 'North Cyprus'}})

In [28]:
# merge dataframes

df3 = pd.merge(df2, hap18_s2, on = 'Country', how = 'outer')

In [29]:
# again before merging 2019, check whether countries are written the same

df3_list = df3['Country'].values.tolist()
list19 = hap19_s1['Country'].values.tolist()

In [30]:
(list(set(df3_list) - set(list19)))

['North Cyprus',
 'Angola',
 'Belize',
 'Puerto Rico',
 'Djibouti',
 'Trinidad and Tobago',
 'Suriname',
 'Macedonia',
 'Sudan',
 'Oman',
 'Somaliland region']

In [31]:
(list(set(list19) - set(df3_list)))

['Trinidad & Tobago', 'Gambia', 'North Macedonia', 'Northern Cyprus']

In [32]:
# fix country names

hap19_s2 = hap19_s1.replace({'Country': {'Trinidad & Tobago': 'Trinidad and Tobago', 'Northern Cyprus': 'North Cyprus', 'North Macedonia': 'Macedonia'}})

In [33]:
# merge dataframes

df4 = pd.merge(df3, hap19_s2, on = 'Country', how = 'outer')

In [34]:
# check complete df with years 2015-2019

df4.head(30)

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,Happiness Rank 2019,Happiness Score 2019
0,Switzerland,Western Europe,1.0,7.587,2.0,7.509,4.0,7.494,5.0,7.487,6.0,7.48
1,Iceland,Western Europe,2.0,7.561,3.0,7.501,3.0,7.504,4.0,7.495,4.0,7.494
2,Denmark,Western Europe,3.0,7.527,1.0,7.526,2.0,7.522,3.0,7.555,2.0,7.6
3,Norway,Western Europe,4.0,7.522,4.0,7.498,1.0,7.537,2.0,7.594,3.0,7.554
4,Canada,North America,5.0,7.427,6.0,7.404,7.0,7.316,7.0,7.328,9.0,7.278
5,Finland,Western Europe,6.0,7.406,5.0,7.413,5.0,7.469,1.0,7.632,1.0,7.769
6,Netherlands,Western Europe,7.0,7.378,7.0,7.339,6.0,7.377,6.0,7.441,5.0,7.488
7,Sweden,Western Europe,8.0,7.364,10.0,7.291,9.0,7.284,9.0,7.314,7.0,7.343
8,New Zealand,Australia and New Zealand,9.0,7.286,8.0,7.334,8.0,7.314,8.0,7.324,8.0,7.307
9,Australia,Australia and New Zealand,10.0,7.284,9.0,7.313,10.0,7.284,10.0,7.272,11.0,7.228


In [35]:
# make new column with average rank over the years

col = df4[['Happiness Rank 2015', 'Happiness Rank 2016', 'Happiness Rank 2017', 'Happiness Rank 2018', 'Happiness Rank 2019']]

df4['Rank mean'] = col.mean(axis = 1, skipna = True)
df4.head()

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,Happiness Rank 2019,Happiness Score 2019,Rank mean
0,Switzerland,Western Europe,1.0,7.587,2.0,7.509,4.0,7.494,5.0,7.487,6.0,7.48,3.6
1,Iceland,Western Europe,2.0,7.561,3.0,7.501,3.0,7.504,4.0,7.495,4.0,7.494,3.2
2,Denmark,Western Europe,3.0,7.527,1.0,7.526,2.0,7.522,3.0,7.555,2.0,7.6,2.2
3,Norway,Western Europe,4.0,7.522,4.0,7.498,1.0,7.537,2.0,7.594,3.0,7.554,2.8
4,Canada,North America,5.0,7.427,6.0,7.404,7.0,7.316,7.0,7.328,9.0,7.278,6.8


In [36]:
# make new rank column based on mean rank

df4['Rank overall'] = df4['Rank mean'].rank(method='first', ascending=True)

In [37]:
# make new column with average happiness score over 5 years

col2 = df4[['Happiness Score 2015', 'Happiness Score 2016', 'Happiness Score 2017', 'Happiness Score 2018', 'Happiness Score 2019']]

df4['Score mean'] = col2.mean(axis = 1, skipna = True)
df4.head()

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,Happiness Rank 2019,Happiness Score 2019,Rank mean,Rank overall,Score mean
0,Switzerland,Western Europe,1.0,7.587,2.0,7.509,4.0,7.494,5.0,7.487,6.0,7.48,3.6,4.0,7.5114
1,Iceland,Western Europe,2.0,7.561,3.0,7.501,3.0,7.504,4.0,7.495,4.0,7.494,3.2,3.0,7.511
2,Denmark,Western Europe,3.0,7.527,1.0,7.526,2.0,7.522,3.0,7.555,2.0,7.6,2.2,1.0,7.546
3,Norway,Western Europe,4.0,7.522,4.0,7.498,1.0,7.537,2.0,7.594,3.0,7.554,2.8,2.0,7.541
4,Canada,North America,5.0,7.427,6.0,7.404,7.0,7.316,7.0,7.328,9.0,7.278,6.8,7.0,7.3506


In [38]:
# sort on score mean

df4_sorted = df4.sort_values('Score mean', ascending = False)
df4_sorted.head(10)

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,Happiness Rank 2019,Happiness Score 2019,Rank mean,Rank overall,Score mean
2,Denmark,Western Europe,3.0,7.527,1.0,7.526,2.0,7.522,3.0,7.555,2.0,7.6,2.2,1.0,7.546
3,Norway,Western Europe,4.0,7.522,4.0,7.498,1.0,7.537,2.0,7.594,3.0,7.554,2.8,2.0,7.541
5,Finland,Western Europe,6.0,7.406,5.0,7.413,5.0,7.469,1.0,7.632,1.0,7.769,3.6,5.0,7.5378
0,Switzerland,Western Europe,1.0,7.587,2.0,7.509,4.0,7.494,5.0,7.487,6.0,7.48,3.6,4.0,7.5114
1,Iceland,Western Europe,2.0,7.561,3.0,7.501,3.0,7.504,4.0,7.495,4.0,7.494,3.2,3.0,7.511
6,Netherlands,Western Europe,7.0,7.378,7.0,7.339,6.0,7.377,6.0,7.441,5.0,7.488,6.2,6.0,7.4046
4,Canada,North America,5.0,7.427,6.0,7.404,7.0,7.316,7.0,7.328,9.0,7.278,6.8,7.0,7.3506
7,Sweden,Western Europe,8.0,7.364,10.0,7.291,9.0,7.284,9.0,7.314,7.0,7.343,8.6,9.0,7.3192
8,New Zealand,Australia and New Zealand,9.0,7.286,8.0,7.334,8.0,7.314,8.0,7.324,8.0,7.307,8.2,8.0,7.313
9,Australia,Australia and New Zealand,10.0,7.284,9.0,7.313,10.0,7.284,10.0,7.272,11.0,7.228,10.0,10.0,7.2762


In [39]:
# Import World Bank data with World Development Indicator values per country 1960-2019

WDI = pd.read_csv('../data/WDIData.csv')
WDI.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,,
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,87.199474,87.51226,88.129881,87.275323,88.720097,89.308602,90.283638,89.286856,,
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,75.958878,77.251714,78.165706,75.512153,78.211,79.065508,81.102134,79.2481,,
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,96.466418,96.435957,96.772853,96.466705,96.936319,97.290083,97.467915,97.063959,,
4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,22.260538,,,30.27713,,,37.165211,,,


In [40]:
# select relevant columns, years 2010-2019

WDI2 = WDI[['Country Name', 'Country Code', 'Indicator Name', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]

In [41]:
# create new column with average values over 2010-2019

col3 = WDI2[['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019']]

WDI2['2010_2019'] = col3.mean(axis = 1, skipna = True)

WDI2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,Country Name,Country Code,Indicator Name,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2010_2019
0,Arab World,ARB,Access to clean fuels and technologies for coo...,82.368101,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,,83.483502
1,Arab World,ARB,Access to electricity (% of population),86.959991,87.199474,87.51226,88.129881,87.275323,88.720097,89.308602,90.283638,89.286856,,88.297347
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",75.81616,75.958878,77.251714,78.165706,75.512153,78.211,79.065508,81.102134,79.2481,,77.814595
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",96.290866,96.466418,96.435957,96.772853,96.466705,96.936319,97.290083,97.467915,97.063959,,96.799008
4,Arab World,ARB,Account ownership at a financial institution o...,,22.260538,,,30.27713,,,37.165211,,,29.90096


In [42]:
# select relevant columns

WDI3 = WDI2[['Country Name', 'Country Code', 'Indicator Name', '2010_2019']]
WDI3.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,2010_2019
0,Arab World,ARB,Access to clean fuels and technologies for coo...,83.483502
1,Arab World,ARB,Access to electricity (% of population),88.297347
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",77.814595
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",96.799008
4,Arab World,ARB,Account ownership at a financial institution o...,29.90096


In [43]:
# transpose dataframe so that each country has one row and the indicators each have a column

WDI3_pivot = WDI3.pivot_table(index='Country Name', columns='Indicator Name', values='2010_2019', aggfunc = 'max').reset_index()

In [44]:
WDI3_pivot.head()
# there are 1417 columns, too many to review 

Indicator Name,Country Name,ARI treatment (% of children under 5 taken to a health provider),Access to clean fuels and technologies for cooking (% of population),Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)","Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)",...,"Women participating in the three decisions (own health care, major household purchases, and visiting family) (% of women age 15-49)",Women who believe a husband is justified in beating his wife (any of five reasons) (%),Women who believe a husband is justified in beating his wife when she argues with him (%),Women who believe a husband is justified in beating his wife when she burns the food (%),Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Women who believe a husband is justified in beating his wife when she neglects the children (%),Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Women who were first married by age 15 (% of women ages 20-24),Women who were first married by age 18 (% of women ages 20-24),Women's share of population ages 15+ living with HIV (%)
0,Afghanistan,61.0,26.255714,75.452054,69.495679,94.13795,11.286442,4.52978,17.912206,13.354609,...,32.6,85.2,59.2,18.2,66.9,48.4,33.4,8.8,36.0,29.066667
1,Albania,82.4,71.65,99.985714,100.0,99.985525,35.423217,31.456601,39.40929,37.21507,...,83.7,6.8,1.8,0.8,3.7,5.2,0.9,1.4,11.8,
2,Algeria,66.4,92.66,99.554766,98.939293,99.836065,42.179509,29.915664,54.430613,48.701731,...,,59.0,,,,,,0.4,2.5,44.511111
3,American Samoa,,,,,,,,,,...,,,,,,,,,,
4,Andorra,,100.0,100.0,100.0,99.988351,,,,,...,,,,,,,,,,


In [45]:
# only select columns with less than 10% missing data

WDI_select = WDI3_pivot.loc[:, WDI3_pivot.isnull().mean() < .10]

In [46]:
# list the remaining indicators

WDI_column_list = WDI_select.columns.tolist()

WDI_column_list

['Country Name',
 'Access to electricity (% of population)',
 'Access to electricity, rural (% of rural population)',
 'Access to electricity, urban (% of urban population)',
 'Adjusted savings: carbon dioxide damage (% of GNI)',
 'Adjusted savings: consumption of fixed capital (% of GNI)',
 'Adjusted savings: education expenditure (% of GNI)',
 'Adjusted savings: energy depletion (% of GNI)',
 'Adjusted savings: mineral depletion (% of GNI)',
 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, old (% of working-age population)',
 'Age dependency ratio, young (% of working-age population)',
 'Agricultural land (% of land area)',
 'Agricultural land (sq. km)',
 'Agriculture, forestry, and fishing, value added (% of GDP)',
 'Agriculture, forestry, and fishing, value added (current US$)',
 'Arable land (% of land area)',
 'Arable land (hectares per person)',
 'Bird species, threatened',
 'Birth ra

In [47]:
# select relevant indicators:

WDI_df = WDI_select[['Country Name', 'Access to electricity (% of population)',
                    'CO2 emissions (metric tons per capita)',
                    'Compulsory education, duration (years)',
                    'GDP growth (annual %)',
                    'GDP per capita (current US$)',
                    'Individuals using the Internet (% of population)',
                    'Land area (sq. km)',
                    'Life expectancy at birth, total (years)',
                    'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
                    'People using at least basic drinking water services (% of population)',
                    'Population density (people per sq. km of land area)',
                    'Population growth (annual %)',
                    'Primary education, duration (years)',
                    'Proportion of seats held by women in national parliaments (%)',
                    'Refugee population by country or territory of origin',
                    'Population, total',
                    'Renewable energy consumption (% of total final energy consumption)',
                    'School enrollment, primary (gross), gender parity index (GPI)',
                    'Secondary education, duration (years)',
                    'Terrestrial protected areas (% of total land area)',
                    'Urban population (% of total population)']]

In [48]:
WDI_df.head()

Indicator Name,Country Name,Access to electricity (% of population),CO2 emissions (metric tons per capita),"Compulsory education, duration (years)",GDP growth (annual %),GDP per capita (current US$),Individuals using the Internet (% of population),Land area (sq. km),"Life expectancy at birth, total (years)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"School enrollment, primary (gross), gender parity index (GPI)","Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
0,Afghanistan,75.452054,0.32917,9.0,4.808217,581.139111,7.539318,652860.0,62.875778,61.61651,...,2.992885,6.0,27.717,2659035.0,33263360.0,15.728023,0.69078,6.0,0.103138,24.597
1,Albania,99.985714,1.760634,8.777778,2.660677,4405.508884,58.427422,27400.0,77.678333,19.919271,...,-0.23452,5.0,21.461,12034.0,2888828.0,38.619974,1.025974,7.0,17.56073,56.345222
2,Algeria,99.554766,3.471263,10.0,2.9,4756.077939,30.608339,2381740.0,75.854667,35.684048,...,1.980571,5.0,25.106,4564.333,38998180.0,0.148607,0.947517,7.0,7.496707,70.164
3,American Samoa,,,,-1.166953,11219.707568,,200.0,,13.765959,...,-0.241357,6.0,,,55738.56,0.498655,,6.0,11.988496,87.323444
4,Andorra,100.0,5.928314,10.0,-0.322951,39620.559131,90.592816,470.0,,11.644685,...,-1.027003,6.0,42.13,5.125,79991.67,19.409272,,6.0,24.881817,88.439889


In [49]:
# Merge happiness index df with WDI df
# First, change column name to 'Country'

WDI_df.rename(columns = {'Country Name': 'Country'}, inplace = True)
WDI_df.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Indicator Name,Country,Access to electricity (% of population),CO2 emissions (metric tons per capita),"Compulsory education, duration (years)",GDP growth (annual %),GDP per capita (current US$),Individuals using the Internet (% of population),Land area (sq. km),"Life expectancy at birth, total (years)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"School enrollment, primary (gross), gender parity index (GPI)","Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
0,Afghanistan,75.452054,0.32917,9.0,4.808217,581.139111,7.539318,652860.0,62.875778,61.61651,...,2.992885,6.0,27.717,2659035.0,33263360.0,15.728023,0.69078,6.0,0.103138,24.597
1,Albania,99.985714,1.760634,8.777778,2.660677,4405.508884,58.427422,27400.0,77.678333,19.919271,...,-0.23452,5.0,21.461,12034.0,2888828.0,38.619974,1.025974,7.0,17.56073,56.345222
2,Algeria,99.554766,3.471263,10.0,2.9,4756.077939,30.608339,2381740.0,75.854667,35.684048,...,1.980571,5.0,25.106,4564.333,38998180.0,0.148607,0.947517,7.0,7.496707,70.164
3,American Samoa,,,,-1.166953,11219.707568,,200.0,,13.765959,...,-0.241357,6.0,,,55738.56,0.498655,,6.0,11.988496,87.323444
4,Andorra,100.0,5.928314,10.0,-0.322951,39620.559131,90.592816,470.0,,11.644685,...,-1.027003,6.0,42.13,5.125,79991.67,19.409272,,6.0,24.881817,88.439889


In [50]:
# Check whether country names need to be renamed

df4_list = df4['Country'].values.tolist()

WDI_list = WDI_df['Country'].values.tolist()

In [51]:
sorted((list(set(df4_list) - set(WDI_list))))

['Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Egypt',
 'Gambia',
 'Hong Kong',
 'Iran',
 'Ivory Coast',
 'Kyrgyzstan',
 'Laos',
 'Macedonia',
 'North Cyprus',
 'Palestinian Territories',
 'Russia',
 'Slovakia',
 'Somaliland region',
 'South Korea',
 'Swaziland',
 'Syria',
 'Taiwan',
 'Venezuela',
 'Yemen']

In [52]:
sorted(list(set(WDI_list) - set(df4_list)))

['American Samoa',
 'Andorra',
 'Antigua and Barbuda',
 'Arab World',
 'Aruba',
 'Bahamas, The',
 'Barbados',
 'Bermuda',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Cabo Verde',
 'Caribbean small states',
 'Cayman Islands',
 'Central Europe and the Baltics',
 'Channel Islands',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 "Cote d'Ivoire",
 'Cuba',
 'Curacao',
 'Dominica',
 'Early-demographic dividend',
 'East Asia & Pacific',
 'East Asia & Pacific (IDA & IBRD countries)',
 'East Asia & Pacific (excluding high income)',
 'Egypt, Arab Rep.',
 'Equatorial Guinea',
 'Eritrea',
 'Eswatini',
 'Euro area',
 'Europe & Central Asia',
 'Europe & Central Asia (IDA & IBRD countries)',
 'Europe & Central Asia (excluding high income)',
 'European Union',
 'Faroe Islands',
 'Fiji',
 'Fragile and conflict affected situations',
 'French Polynesia',
 'Gambia, The',
 'Gibraltar',
 'Greenland',
 'Grenada',
 'Guam',
 'Guinea-Bissau',
 'Guyana',
 'Heavily indebted poor countries (HIPC)',
 'High income',
 '

In [53]:
# Change country names that are in both lists so that they are the same

WDI_df2 = WDI_df.replace({'Country': {'Congo, Rep.': 'Congo (Brazzaville)', 'Congo, Dem. Rep.': 'Congo (Kinshasa)', 'Egypt, Arab Rep.': 'Egypt', 'Gambia, The': 'Gambia', 'Hong Kong SAR, China': 'Hong Kong', 'Iran, Islamic Rep.': 'Iran', "Cote d'Ivoire": 'Ivory Coast', 'Kyrgyz Republic': 'Kyrgyzstan', 'Lao PDR': 'Laos', 'North Macedonia': 'Macedonia', 'West Bank and Gaza': 'Palestinian Territories', 'Russian Federation': 'Russia', 'Slovak Republic': 'Slovakia', 'Korea, Rep.': 'South Korea', 'Eswatini': 'Swaziland', 'Syrian Arab Republic': 'Syria', 'Venezuela, RB': 'Venezuela', 'Yemen, Rep.': 'Yemen'}})


In [54]:
# check

WDI_list2 = WDI_df2['Country'].values.tolist()
sorted((list(set(df4_list) - set(WDI_list2))))

# remove 'North Cyprus', 'Somaliland region', 'Taiwan' after merge (no WDI data)

['North Cyprus', 'Somaliland region', 'Taiwan']

In [55]:
# Left merge WDI df on happiness index df on country

df5 = pd.merge(df4, WDI_df2, on = 'Country', how = 'left')

In [56]:
df5.head()

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"School enrollment, primary (gross), gender parity index (GPI)","Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
0,Switzerland,Western Europe,1.0,7.587,2.0,7.509,4.0,7.494,5.0,7.487,...,1.052495,6.0,30.8,15.0,8181440.0,22.946425,0.996315,7.0,9.660123,73.697333
1,Iceland,Western Europe,2.0,7.561,3.0,7.501,3.0,7.504,4.0,7.495,...,1.133978,7.0,41.44,2.833333,330144.0,76.467214,0.999472,7.0,17.914355,93.675778
2,Denmark,Western Europe,3.0,7.527,1.0,7.526,2.0,7.522,3.0,7.555,...,0.531352,6.9,38.311,7.75,5659816.0,27.220085,0.991766,6.0,17.921347,87.377222
3,Norway,Western Europe,4.0,7.522,4.0,7.498,1.0,7.537,2.0,7.594,...,1.059666,7.0,39.903,10.0,5121086.0,57.320681,1.000233,6.0,17.060995,80.678222
4,Canada,North America,5.0,7.427,6.0,7.404,7.0,7.316,7.0,7.328,...,1.078803,6.0,25.365,93.222222,35443590.0,22.181059,1.006599,6.0,9.687049,81.209556


In [57]:
print(df4.shape)
print(df5.shape)

(164, 15)
(164, 36)


In [58]:
# remove 'North Cyprus', 'Somaliland region', 'Taiwan'

df5[df5['Country'] == 'North Cyprus']

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"School enrollment, primary (gross), gender parity index (GPI)","Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
65,North Cyprus,Western Europe,66.0,5.695,62.0,5.771,61.0,5.81,58.0,5.835,...,,,,,,,,,,


In [59]:
df5[df5['Country'] == 'Somaliland region']

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"School enrollment, primary (gross), gender parity index (GPI)","Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
90,Somaliland region,Sub-Saharan Africa,91.0,5.057,97.0,5.057,,,,,...,,,,,,,,,,


In [60]:
df5[df5['Country'] == 'Taiwan']

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"School enrollment, primary (gross), gender parity index (GPI)","Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
37,Taiwan,Eastern Asia,38.0,6.298,34.0,6.379,33.0,6.422,26.0,6.441,...,,,,,,,,,,


In [61]:
# drop rows

df6 = df5.drop([df5.index[37], df5.index[65], df5.index[90]])

In [62]:
df6 = df6.reset_index(drop = True)

In [63]:
# check number of rows

df6.shape

(161, 36)

In [64]:
# Check missing data

df6.isnull().sum()

Country                                                                   0
Region                                                                    6
Happiness Rank 2015                                                       6
Happiness Score 2015                                                      6
Happiness Rank 2016                                                       7
Happiness Score 2016                                                      7
Happiness Rank 2017                                                       8
Happiness Score 2017                                                      8
Happiness Rank 2018                                                       7
Happiness Score 2018                                                      7
Happiness Rank 2019                                                       7
Happiness Score 2019                                                      7
Rank mean                                                                 0
Rank overall

In [65]:
# fix missing regions

df6[df6['Region'].isnull()]

Unnamed: 0,Country,Region,Happiness Rank 2015,Happiness Score 2015,Happiness Rank 2016,Happiness Score 2016,Happiness Rank 2017,Happiness Score 2017,Happiness Rank 2018,Happiness Score 2018,...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"School enrollment, primary (gross), gender parity index (GPI)","Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
155,Puerto Rico,,,,15.0,7.039,,,,,...,-1.75067,6.0,,,3506946.0,1.133982,1.02015,6.0,7.32611,93.672444
156,Belize,,,,52.0,5.956,50.0,5.956,49.0,5.956,...,2.186039,6.0,4.368,46.11111,353106.1,34.739543,0.955642,6.0,37.679577,45.393889
157,Somalia,,,,76.0,5.44,93.0,5.151,98.0,4.982,...,2.749931,6.0,15.576,1031512.0,13467010.0,94.00993,,6.0,0.83,42.565222
158,Namibia,,,,113.0,4.574,111.0,4.574,119.0,4.441,...,1.80562,7.0,34.872222,1213.444,2277900.0,27.538634,0.961368,5.0,37.890621,45.825111
159,South Sudan,,,,143.0,3.832,147.0,3.591,154.0,3.254,...,2.03107,6.0,27.384444,969789.0,10421940.0,32.200623,0.684945,6.0,15.501001,18.672778
160,Gambia,,,,,,,,,,...,3.001941,6.0,8.914,7590.667,2029234.0,53.860548,1.058881,6.0,4.108241,58.501556


In [67]:
# check region names

df6['Region'].value_counts()

Sub-Saharan Africa                 39
Central and Eastern Europe         29
Latin America and Caribbean        22
Western Europe                     20
Middle East and Northern Africa    20
Southeastern Asia                   9
Southern Asia                       7
Eastern Asia                        5
North America                       2
Australia and New Zealand           2
Name: Region, dtype: int64

In [68]:
# fill in regions

df6.loc[155, 'Region'] = 'Latin America and Caribbean'
df6.loc[156, 'Region'] = 'Latin America and Caribbean'
df6.loc[157, 'Region'] = 'Sub-Saharan Africa'
df6.loc[158, 'Region'] = 'Sub-Saharan Africa'
df6.loc[159, 'Region'] = 'Sub-Saharan Africa'
df6.loc[160, 'Region'] = 'Sub-Saharan Africa'


In [69]:
# make new df for ML analysis with just the country, region, average happiness score and the selected WDIs

dfML = df6.drop(['Happiness Rank 2015', 'Happiness Score 2015', 'Happiness Rank 2016', 'Happiness Score 2016', 'Happiness Rank 2017', 'Happiness Score 2017', 'Happiness Rank 2018', 'Happiness Score 2018', 'Happiness Rank 2019', 'Happiness Score 2019', 'Rank mean', 'Rank overall', 'Compulsory education, duration (years)', 'GDP growth (annual %)', 'School enrollment, primary (gross), gender parity index (GPI)'], axis=1)
dfML.head()

Unnamed: 0,Country,Region,Score mean,Access to electricity (% of population),CO2 emissions (metric tons per capita),GDP per capita (current US$),Individuals using the Internet (% of population),Land area (sq. km),"Life expectancy at birth, total (years)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",...,Population density (people per sq. km of land area),Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population)
0,Switzerland,Western Europe,7.5114,100.0,4.732688,82645.581204,86.791615,39516.0,83.026287,11.656622,...,207.041205,1.052495,6.0,30.8,15.0,8181440.0,22.946425,7.0,9.660123,73.697333
1,Iceland,Western Europe,7.511,100.0,5.921191,55453.949873,96.981917,100250.0,82.454472,7.178653,...,3.293207,1.133978,7.0,41.44,2.833333,330144.0,76.467214,7.0,17.914355,93.675778
2,Denmark,Western Europe,7.546,100.0,7.003336,58721.482142,94.347397,42087.78,80.440108,11.047112,...,134.484524,0.531352,6.9,38.311,7.75,5659816.0,27.220085,6.0,17.921347,87.377222
3,Norway,Western Europe,7.541,100.0,10.416053,87977.60266,95.537903,365163.3,81.969919,7.697953,...,14.024149,1.059666,7.0,39.903,10.0,5121086.0,57.320681,6.0,17.060995,80.678222
4,Canada,North America,7.3506,100.0,15.23418,48056.677329,86.4225,9093510.0,81.732249,7.558178,...,3.89768,1.078803,6.0,25.365,93.222222,35443590.0,22.181059,6.0,9.687049,81.209556


In [71]:
# make new column for refugees, percentage of total population (existing column gives absolute numbers)

dfML['Refugees country of origin (% of total population'] = dfML['Refugee population by country or territory of origin'] / dfML['Population, total']
dfML.head()

Unnamed: 0,Country,Region,Score mean,Access to electricity (% of population),CO2 emissions (metric tons per capita),GDP per capita (current US$),Individuals using the Internet (% of population),Land area (sq. km),"Life expectancy at birth, total (years)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population),Refugees country of origin (% of total population
0,Switzerland,Western Europe,7.5114,100.0,4.732688,82645.581204,86.791615,39516.0,83.026287,11.656622,...,1.052495,6.0,30.8,15.0,8181440.0,22.946425,7.0,9.660123,73.697333,2e-06
1,Iceland,Western Europe,7.511,100.0,5.921191,55453.949873,96.981917,100250.0,82.454472,7.178653,...,1.133978,7.0,41.44,2.833333,330144.0,76.467214,7.0,17.914355,93.675778,9e-06
2,Denmark,Western Europe,7.546,100.0,7.003336,58721.482142,94.347397,42087.78,80.440108,11.047112,...,0.531352,6.9,38.311,7.75,5659816.0,27.220085,6.0,17.921347,87.377222,1e-06
3,Norway,Western Europe,7.541,100.0,10.416053,87977.60266,95.537903,365163.3,81.969919,7.697953,...,1.059666,7.0,39.903,10.0,5121086.0,57.320681,6.0,17.060995,80.678222,2e-06
4,Canada,North America,7.3506,100.0,15.23418,48056.677329,86.4225,9093510.0,81.732249,7.558178,...,1.078803,6.0,25.365,93.222222,35443590.0,22.181059,6.0,9.687049,81.209556,3e-06


In [72]:
# save df for future use

dfML.to_csv('dfML.csv', index=False)

In [73]:
# Check missing values

dfML[dfML.isnull().any(axis=1)]

Unnamed: 0,Country,Region,Score mean,Access to electricity (% of population),CO2 emissions (metric tons per capita),GDP per capita (current US$),Individuals using the Internet (% of population),Land area (sq. km),"Life expectancy at birth, total (years)","PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",...,Population growth (annual %),"Primary education, duration (years)",Proportion of seats held by women in national parliaments (%),Refugee population by country or territory of origin,"Population, total",Renewable energy consumption (% of total final energy consumption),"Secondary education, duration (years)",Terrestrial protected areas (% of total land area),Urban population (% of total population),Refugees country of origin (% of total population
50,Moldova,Central and Eastern Europe,5.7586,100.0,1.73649,3158.653109,57.099315,32873.333333,71.019,17.662886,...,,4.0,21.094,3597.111,2821592.0,11.532727,7.0,4.150356,42.553,0.001275
66,Kosovo,Central and Eastern Europe,5.6062,99.866667,,3786.016418,86.668314,10887.0,71.063957,,...,0.516566,,,,1812420.0,20.718592,,,,
69,Hong Kong,Eastern Asia,5.4528,100.0,6.144035,40467.773639,80.39082,1050.0,83.979133,,...,0.737015,6.0,,18.33333,7236100.0,0.889434,6.2,41.88912,100.0,3e-06
104,Palestinian Territories,Middle East and Northern Africa,4.7366,99.869412,0.566854,2942.885665,52.261397,6020.0,73.320333,33.181225,...,2.377013,4.0,,96865.33,4173369.0,12.389768,8.0,8.359252,75.131,0.02321
114,Sudan,Sub-Saharan Africa,4.24175,47.913653,0.317103,2003.160548,21.760037,,64.065444,51.586651,...,2.366133,6.0,27.3,607575.3,38056000.0,61.762116,5.0,2.281175,33.762778,0.015965
152,Syria,Middle East and Northern Africa,3.2922,90.112048,2.254463,,27.236616,183630.0,70.751889,42.502831,...,-2.517698,5.2,12.6,3387095.0,18957070.0,1.927706,6.8,0.686869,53.381,0.178672
155,Puerto Rico,Latin America and Caribbean,7.039,100.0,,29172.397755,62.001665,8870.0,79.156149,9.415499,...,-1.75067,6.0,,,3506946.0,1.133982,6.0,7.32611,93.672444,
159,South Sudan,Sub-Saharan Africa,3.3825,15.123399,0.137748,1408.965472,5.700656,,56.401333,44.379443,...,2.03107,6.0,27.384444,969789.0,10421940.0,32.200623,6.0,15.501001,18.672778,0.093053


In [74]:
# create clean df without any missing values 

dfML_clean = dfML.drop([dfML.index[50], dfML.index[66], dfML.index[69], dfML.index[104], dfML.index[114], dfML.index[152], dfML.index[155], dfML.index[157], dfML.index[159]])


In [75]:
# check

dfML_clean.isnull().sum()

Country                                                                   0
Region                                                                    0
Score mean                                                                0
Access to electricity (% of population)                                   0
CO2 emissions (metric tons per capita)                                    0
GDP per capita (current US$)                                              0
Individuals using the Internet (% of population)                          0
Land area (sq. km)                                                        0
Life expectancy at birth, total (years)                                   0
PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)    0
People using at least basic drinking water services (% of population)     0
Population density (people per sq. km of land area)                       0
Population growth (annual %)                                              0
Primary educ

In [76]:
# check data types

dfML_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152 entries, 0 to 160
Data columns (total 22 columns):
Country                                                                   152 non-null object
Region                                                                    152 non-null object
Score mean                                                                152 non-null float64
Access to electricity (% of population)                                   152 non-null float64
CO2 emissions (metric tons per capita)                                    152 non-null float64
GDP per capita (current US$)                                              152 non-null float64
Individuals using the Internet (% of population)                          152 non-null float64
Land area (sq. km)                                                        152 non-null float64
Life expectancy at birth, total (years)                                   152 non-null float64
PM2.5 air pollution, mean annual exposure 

In [77]:
# save df for future use

dfML_clean.to_csv('dfML_clean.csv', index=False)