In [1]:
import pandas as pd
import re
import seaborn as sns
import os
import matplotlib.pyplot as plt
import requests
from lxml import html

The World Happiness Report is a landmark survey of the state of global happiness that ranks 156 countries by how happy their citizens perceive themselves to be. The World Happiness Report 2020 for the first time ranks cities around the world by their subjective well-being and digs more deeply into how the social, urban and natural environments combine to affect our happiness.

https://www.kaggle.com/datasets/mathurinache/world-happiness-report?resource=download&select=2022.csv

https://datosmacro.expansion.com/ipc-paises


a136dfe125d4c622d671e5c904689ced    https://fredaccount.stlouisfed.org/apikey

In [2]:
def downloading(path):
    """This function downloads from a raw link and saves the dataframe locally.
    args:
    :url: string. the link
    :name: string. name to save the file
    """
    df=pd.read_csv(path, encoding='latin')
    return df
    pass

In [3]:
df=pd.read_csv('data/happiness-2015.csv', encoding='latin')
df2017=pd.read_csv('data/happiness-2017.csv', encoding='latin')
df2019=pd.read_csv('data/happiness-2019.csv', encoding='latin')
df2021=pd.read_csv('data/happiness-2021.csv', encoding='latin')
df2022=pd.read_csv('data/happiness-2022.csv', encoding='latin')

In [4]:
df.sample()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
94,Albania,Central and Eastern Europe,95,4.959,0.05013,0.87867,0.80434,0.81325,0.35733,0.06413,0.14272,1.89894


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [6]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_average_yearly_temperature'

In [7]:
headers = {"User-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
res = requests.get(url, headers=headers)

In [8]:
print(res)

<Response [200]>


In [9]:
table = pd.read_html(res.content, encoding = 'utf8')

In [10]:
df2 = table[0] 

In [11]:
df2['Average yearly temperature (1961–1990 Celsius)'] = df2['Average yearly temperature (1961–1990 Celsius)'].str.replace('−', '-').astype(float)

In [12]:
df2.sample()

Unnamed: 0,Country,Average yearly temperature (1961–1990 Celsius)
7,Tuvalu,27.65


In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 2 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Country                                         192 non-null    object 
 1   Average yearly temperature (1961–1990 Celsius)  192 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.1+ KB


In [14]:
df2.describe()

Unnamed: 0,Average yearly temperature (1961–1990 Celsius)
count,192.0
mean,18.623125
std,8.451555
min,-5.35
25%,10.6625
50%,22.15
75%,25.425
max,28.29


In [15]:
df_countries = list(df['Country'].values)
print(df_countries)

['Switzerland', 'Iceland', 'Denmark', 'Norway', 'Canada', 'Finland', 'Netherlands', 'Sweden', 'New Zealand', 'Australia', 'Israel', 'Costa Rica', 'Austria', 'Mexico', 'United States', 'Brazil', 'Luxembourg', 'Ireland', 'Belgium', 'United Arab Emirates', 'United Kingdom', 'Oman', 'Venezuela', 'Singapore', 'Panama', 'Germany', 'Chile', 'Qatar', 'France', 'Argentina', 'Czech Republic', 'Uruguay', 'Colombia', 'Thailand', 'Saudi Arabia', 'Spain', 'Malta', 'Taiwan', 'Kuwait', 'Suriname', 'Trinidad and Tobago', 'El Salvador', 'Guatemala', 'Uzbekistan', 'Slovakia', 'Japan', 'South Korea', 'Ecuador', 'Bahrain', 'Italy', 'Bolivia', 'Moldova', 'Paraguay', 'Kazakhstan', 'Slovenia', 'Lithuania', 'Nicaragua', 'Peru', 'Belarus', 'Poland', 'Malaysia', 'Croatia', 'Libya', 'Russia', 'Jamaica', 'North Cyprus', 'Cyprus', 'Algeria', 'Kosovo', 'Turkmenistan', 'Mauritius', 'Hong Kong', 'Estonia', 'Indonesia', 'Vietnam', 'Turkey', 'Kyrgyzstan', 'Nigeria', 'Bhutan', 'Azerbaijan', 'Pakistan', 'Jordan', 'Montene

In [16]:
df2_countries = list(df2['Country'].values)

In [17]:
df_remaining_countries=[]
df2_remaining_countries=[]

for i in df_countries:
    if i not in df2_countries:
        df_remaining_countries.append(i)
        
for i in df2_countries:
    if i not in df_countries:
        df2_remaining_countries.append(i)
        
print(df_remaining_countries)

['Denmark', 'Norway', 'Taiwan', 'North Cyprus', 'Kosovo', 'Somaliland region', 'Macedonia', 'Swaziland', 'Palestinian Territories', 'Congo (Kinshasa)', 'Congo (Brazzaville)']


In [18]:
print(df2_remaining_countries)

['Kiribati', 'Maldives', 'Tuvalu', 'Palau', 'Gambia', 'Marshall Islands', 'Seychelles', 'Somalia', 'Brunei', 'Saint Vincent and the Grenadines', 'Guinea-Bissau', 'Samoa', 'Grenada', 'Barbados', 'Guyana', 'Federated States of Micronesia', 'Antigua and Barbuda', 'Solomon Islands', 'Eritrea', 'Saint Lucia', 'Belize', 'Timor-Leste', 'Papua New Guinea', 'Tonga', 'Cuba', 'Bahamas', 'Republic of the Congo', 'Equatorial Guinea', 'Saint Kitts and Nevis', 'Fiji', 'Democratic Republic of the Congo', 'Vanuatu', 'São Tomé and Príncipe', 'Cape Verde', 'Dominica', 'Eswatini', 'Namibia', 'Monaco', 'San Marino', 'North Macedonia', 'Andorra', 'Liechtenstein', 'North Korea', 'Norway [note 1]', 'Denmark [note 2]']


In [19]:
mask_Denmark = df2['Country'].str.contains('Denmark', case=False)

df2.loc[mask_Denmark, 'Country'] = 'Denmark'

In [20]:
mask_Cyprus = df2['Country'].str.contains('Cyprus', case=False)
df2.loc[mask_Cyprus, 'Country'] = 'Cyprus'

In [21]:
mask_Norway = df2['Country'].str.contains('Norway', case=False)
df2.loc[mask_Norway, 'Country'] = 'Norway'

In [22]:
mask_Somalia = df2['Country'].str.contains('Somaliland', case=False)
df2.loc[mask_Somalia, 'Country'] = 'Somalia'

In [23]:
mask_Somalia = df['Country'].str.contains('Somaliland', case=False)
df.loc[mask_Somalia, 'Country'] = 'Somalia'

In [24]:
mask_Macedonia = df2['Country'].str.contains('Macedonia', case=False)
df2.loc[mask_Macedonia, 'Country'] = 'Macedonia'

In [25]:
mask_Swaziland = df2['Country'].str.contains('Eswatini', case=False)
df2.loc[mask_Swaziland, 'Country'] = 'Swaziland'

In [26]:
mask_Democratic = df['Country'].str.contains('Kinshasa', case=False)
df.loc[mask_Democratic, 'Country'] = 'Democratic Republic of the Congo'

In [27]:
mask_Congo = df['Country'].str.contains('Brazzaville', case=False)
df.loc[mask_Congo, 'Country'] = 'Republic of the Congo'

In [28]:
df2_countries = list(df2['Country'].values)
df_countries = list(df['Country'].values)

In [29]:
df_remaining_countries=[]
df2_remaining_countries=[]

for i in df_countries:
    if i not in df2_countries:
        df_remaining_countries.append(i)
        
for i in df2_countries:
    if i not in df_countries:
        df2_remaining_countries.append(i)
        
print(df_remaining_countries)

['Taiwan', 'North Cyprus', 'Kosovo', 'Palestinian Territories']


In [30]:
df = pd.merge(df, df2, on='Country', how='outer')

In [31]:
pd.isna(df).sum()

Country                                            0
Region                                            38
Happiness Rank                                    38
Happiness Score                                   38
Standard Error                                    38
Economy (GDP per Capita)                          38
Family                                            38
Health (Life Expectancy)                          38
Freedom                                           38
Trust (Government Corruption)                     38
Generosity                                        38
Dystopia Residual                                 38
Average yearly temperature (1961–1990 Celsius)     4
dtype: int64

In [32]:
df = df.add_suffix(' 2015')


In [33]:
df.rename(columns = {"Economy (GDP per Capita) 2015":"GDP per Capita 2015" , "Average yearly temperature (1961–1990 Celsius) 2015": "Average yearly temperature", "Country 2015": "Country", "Region 2015": "Region"}, inplace=True)

In [34]:
df2019

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298
...,...,...,...,...,...,...,...,...,...
151,152,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411
152,153,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147
153,154,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025
154,155,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035


In [35]:
df2019 = df2019.add_suffix(' 2019')

In [36]:
df2019.rename(columns = {"Country or region 2019" : "Country", "Overall rank 2019":"Happiness Rank 2019", "Score 2019": "Happiness Score 2019", "GDP per capita 2019":"GDP per Capita 2019" }, inplace=True)

In [37]:
df2019

Unnamed: 0,Happiness Rank 2019,Country,Happiness Score 2019,GDP per Capita 2019,Social support 2019,Healthy life expectancy 2019,Freedom to make life choices 2019,Generosity 2019,Perceptions of corruption 2019
0,1,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298
...,...,...,...,...,...,...,...,...,...
151,152,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411
152,153,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147
153,154,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025
154,155,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035


In [38]:
df2019_countries = list(df2019['Country'].values)
df_countries = list(df['Country'].values)

In [39]:
df_remaining_countries=[]
df2019_remaining_countries=[]

for i in df_countries:
    if i not in df2019_countries:
        df_remaining_countries.append(i)
        
for i in df2019_countries:
    if i not in df_countries:
        df2019_remaining_countries.append(i)
        
print(df_remaining_countries)

['Oman', 'Suriname', 'Trinidad and Tobago', 'North Cyprus', 'Macedonia', 'Sudan', 'Democratic Republic of the Congo', 'Djibouti', 'Angola', 'Republic of the Congo', 'Kiribati', 'Maldives', 'Tuvalu', 'Palau', 'Marshall Islands', 'Seychelles', 'Brunei', 'Saint Vincent and the Grenadines', 'Guinea-Bissau', 'Samoa', 'Grenada', 'Barbados', 'Guyana', 'Federated States of Micronesia', 'Antigua and Barbuda', 'Solomon Islands', 'Eritrea', 'Saint Lucia', 'Belize', 'Timor-Leste', 'Papua New Guinea', 'Tonga', 'Cuba', 'Bahamas', 'Equatorial Guinea', 'Saint Kitts and Nevis', 'Fiji', 'Vanuatu', 'São Tomé and Príncipe', 'Cape Verde', 'Dominica', 'Monaco', 'San Marino', 'Andorra', 'Liechtenstein', 'North Korea']


In [40]:
print(df2019_remaining_countries)

['Trinidad & Tobago', 'Northern Cyprus', 'North Macedonia', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'South Sudan']


mask_Cyprus = df2019['Country'].str.contains('Northern Cyprus', case=False)
df2019.loc[mask_Cyprus, 'Country'] = 'North Cyprus'

In [41]:
mask_Congo = df2019['Country'].str.contains('Brazzaville', case=False)
df2019.loc[mask_Congo, 'Country'] = 'Republic of the Congo'

In [42]:
mask_Sudan = df2019['Country'].str.contains('Sudan', case=False)
df2019.loc[mask_Sudan, 'Country'] = 'Sudan'

In [43]:
mask_Democratic = df2019['Country'].str.contains('Kinshasa', case=False)
df2019.loc[mask_Democratic, 'Country'] = 'Democratic Republic of the Congo'

In [44]:
mask_Trinidad = df2019['Country'].str.contains('Trinidad & Tobago', case=False)
df2019.loc[mask_Trinidad, 'Country'] = 'Trinidad and Tobago'

In [45]:
mask_Macedonia = df2019['Country'].str.contains('Macedonia', case=False)
df2019.loc[mask_Macedonia, 'Country'] = 'Macedonia'

In [46]:
mask_Somalia = df2019['Country'].str.contains('Somaliland', case=False)
df2019.loc[mask_Somalia, 'Country'] = 'Somalia'

In [47]:
mask_Cyprus = df2019['Country'].str.contains('Northern Cyprus', case=False)
df2019.loc[mask_Cyprus, 'Country'] = 'North Cyprus'


In [48]:
df2017.sample()

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
119,Sri Lanka,120,4.44,4.553447,4.326553,1.00985,1.259976,0.625131,0.561213,0.490864,0.073654,0.419389


In [49]:
df2017 = df2017.add_suffix(' 2017')

In [50]:
df2017.rename(columns = {"Country 2017" : "Country", "Happiness.Rank 2017" :"Happiness Rank 2017", "Happiness.Score 2017" : "Happiness Score 2017", "Economy..GDP.per.Capita. 2017": "GDP per Capita 2017"}, inplace=True)

In [51]:
df2017_countries = list(df2017['Country'].values)
df_countries = list(df['Country'].values)

In [52]:
df_remaining_countries=[]
df2017_remaining_countries=[]

for i in df_countries:
    if i not in df2017_countries:
        df_remaining_countries.append(i)
        
for i in df2017_countries:
    if i not in df_countries:
        df2017_remaining_countries.append(i)
        
print(df_remaining_countries)

['Oman', 'Taiwan', 'Suriname', 'Hong Kong', 'Laos', 'Swaziland', 'Democratic Republic of the Congo', 'Djibouti', 'Republic of the Congo', 'Comoros', 'Kiribati', 'Maldives', 'Tuvalu', 'Palau', 'Gambia', 'Marshall Islands', 'Seychelles', 'Brunei', 'Saint Vincent and the Grenadines', 'Guinea-Bissau', 'Samoa', 'Grenada', 'Barbados', 'Guyana', 'Federated States of Micronesia', 'Antigua and Barbuda', 'Solomon Islands', 'Eritrea', 'Saint Lucia', 'Timor-Leste', 'Papua New Guinea', 'Tonga', 'Cuba', 'Bahamas', 'Equatorial Guinea', 'Saint Kitts and Nevis', 'Fiji', 'Vanuatu', 'São Tomé and Príncipe', 'Cape Verde', 'Dominica', 'Monaco', 'San Marino', 'Andorra', 'Liechtenstein', 'North Korea']


In [53]:
print(df2017_remaining_countries)

['Taiwan Province of China', 'Hong Kong S.A.R., China', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'South Sudan']


In [54]:
mask_Taiwan = df2017['Country'].str.contains('Taiwan', case=False)
df2017.loc[mask_Taiwan, 'Country'] = 'Taiwan'

In [55]:
mask_HongKong = df2017['Country'].str.contains('Hong Kong', case=False)
df2017.loc[mask_HongKong, 'Country'] = 'Hong Kong'

In [56]:
mask_Democratic = df2017['Country'].str.contains('Kinshasa', case=False)
df2017.loc[mask_Democratic, 'Country'] = 'Democratic Republic of the Congo'

In [57]:
mask_Congo = df2017['Country'].str.contains('Brazzaville', case=False)
df2017.loc[mask_Congo, 'Country'] = 'Republic of the Congo'

In [58]:
df2021.sample()

Unnamed: 0,ï»¿Country name,Regional indicator,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
147,Zimbabwe,Sub-Saharan Africa,3.145,0.058,3.259,3.03,7.943,0.75,56.201,0.677,-0.047,0.821,2.43,0.457,0.649,0.243,0.359,0.157,0.075,1.205


In [59]:
df2021 = df2021.add_suffix(' 2021')

In [60]:
df2021.columns

Index(['ï»¿Country name 2021', 'Regional indicator 2021', 'Ladder score 2021',
       'Standard error of ladder score 2021', 'upperwhisker 2021',
       'lowerwhisker 2021', 'Logged GDP per capita 2021',
       'Social support 2021', 'Healthy life expectancy 2021',
       'Freedom to make life choices 2021', 'Generosity 2021',
       'Perceptions of corruption 2021', 'Ladder score in Dystopia 2021',
       'Explained by: Log GDP per capita 2021',
       'Explained by: Social support 2021',
       'Explained by: Healthy life expectancy 2021',
       'Explained by: Freedom to make life choices 2021',
       'Explained by: Generosity 2021',
       'Explained by: Perceptions of corruption 2021',
       'Dystopia + residual 2021'],
      dtype='object')

In [61]:
df2021.rename(columns = {"ï»¿Country name 2021" : "Country", "Ladder score 2021": "Happiness Score 2021", "Explained by: Log GDP per capita 2021":"GDP per Capita 2021"}, inplace=True)

In [62]:
df2021_countries = list(df2021['Country'].values)
df_countries = list(df['Country'].values)

In [63]:
df_remaining_countries=[]
df2021_remaining_countries=[]

for i in df_countries:
    if i not in df2021_countries:
        df_remaining_countries.append(i)
        
for i in df2021_countries:
    if i not in df_countries:
        df2021_remaining_countries.append(i)
        
print(df_remaining_countries)

['Oman', 'Qatar', 'Taiwan', 'Suriname', 'Trinidad and Tobago', 'Hong Kong', 'Bhutan', 'Somalia', 'Macedonia', 'Sudan', 'Democratic Republic of the Congo', 'Djibouti', 'Angola', 'Republic of the Congo', 'Central African Republic', 'Syria', 'Kiribati', 'Tuvalu', 'Palau', 'Marshall Islands', 'Seychelles', 'Brunei', 'Saint Vincent and the Grenadines', 'Guinea-Bissau', 'Samoa', 'Grenada', 'Barbados', 'Guyana', 'Federated States of Micronesia', 'Antigua and Barbuda', 'Solomon Islands', 'Eritrea', 'Saint Lucia', 'Belize', 'Timor-Leste', 'Papua New Guinea', 'Tonga', 'Cuba', 'Bahamas', 'Equatorial Guinea', 'Saint Kitts and Nevis', 'Fiji', 'Vanuatu', 'São Tomé and Príncipe', 'Cape Verde', 'Dominica', 'Monaco', 'San Marino', 'Andorra', 'Liechtenstein', 'North Korea']


In [64]:
print(df2021_remaining_countries)

['Taiwan Province of China', 'Hong Kong S.A.R. of China', 'Congo (Brazzaville)', 'North Macedonia']


In [65]:
mask_Taiwan = df2021['Country'].str.contains('Taiwan', case=False)
df2021.loc[mask_Taiwan, 'Country'] = 'Taiwan'

In [66]:
mask_Congo = df2021['Country'].str.contains('Brazzaville', case=False)
df2021.loc[mask_Congo, 'Country'] = 'Republic of the Congo'

In [67]:
mask_HongKong = df2021['Country'].str.contains('Hong Kong', case=False)
df2021.loc[mask_HongKong, 'Country'] = 'Hong Kong'

In [68]:
mask_Macedonia = df2021['Country'].str.contains('Macedonia', case=False)
df2021.loc[mask_Macedonia, 'Country'] = 'Macedonia'

In [69]:
df.drop(columns=["Region", "Standard Error 2015", "Family 2015", "Health (Life Expectancy) 2015", "Freedom 2015", "Trust (Government Corruption) 2015", "Generosity 2015", "Dystopia Residual 2015"], axis=1, inplace=True)

In [70]:
df.head()

Unnamed: 0,Country,Happiness Rank 2015,Happiness Score 2015,GDP per Capita 2015,Average yearly temperature
0,Switzerland,1.0,7.587,1.39651,5.1
1,Iceland,2.0,7.561,1.30232,-0.7
2,Denmark,3.0,7.527,1.32548,-3.7
3,Norway,4.0,7.522,1.459,1.5
4,Canada,5.0,7.427,1.32629,-5.1


In [71]:
df2017.drop(columns=["Whisker.high 2017", "Whisker.low 2017", "Family 2017", "Health..Life.Expectancy. 2017", "Freedom 2017", "Generosity 2017", "Trust..Government.Corruption. 2017", "Dystopia.Residual 2017"], axis=1, inplace=True)

In [72]:
df2017.head()

Unnamed: 0,Country,Happiness Rank 2017,Happiness Score 2017,GDP per Capita 2017
0,Norway,1,7.537,1.616463
1,Denmark,2,7.522,1.482383
2,Iceland,3,7.504,1.480633
3,Switzerland,4,7.494,1.56498
4,Finland,5,7.469,1.443572


In [73]:
df2019.drop(columns=["Social support 2019", "Freedom to make life choices 2019", "Generosity 2019", "Perceptions of corruption 2019","Healthy life expectancy 2019"], axis=1, inplace=True)

In [74]:
df2019.head()

Unnamed: 0,Happiness Rank 2019,Country,Happiness Score 2019,GDP per Capita 2019
0,1,Finland,7.769,1.34
1,2,Denmark,7.6,1.383
2,3,Norway,7.554,1.488
3,4,Iceland,7.494,1.38
4,5,Netherlands,7.488,1.396


In [75]:
df2021.drop(columns=["Regional indicator 2021", "Standard error of ladder score 2021", "upperwhisker 2021", "lowerwhisker 2021","Social support 2021","Healthy life expectancy 2021", "Freedom to make life choices 2021", "Generosity 2021", "Perceptions of corruption 2021", "Ladder score in Dystopia 2021", "Explained by: Social support 2021", "Explained by: Healthy life expectancy 2021", "Explained by: Perceptions of corruption 2021", "Dystopia + residual 2021" ], axis=1, inplace=True)

In [76]:
df2021.drop(columns=["Explained by: Generosity 2021", "Explained by: Freedom to make life choices 2021","Logged GDP per capita 2021" ], axis=1, inplace=True)

In [77]:
df2021.sample()

Unnamed: 0,Country,Happiness Score 2021,GDP per Capita 2021
47,Serbia,6.078,1.101


In [78]:
df2021['Happiness Rank 2021'] = df2021['Happiness Score 2021'].rank(method='dense', ascending=False).astype(int)

In [79]:
df2021.head()

Unnamed: 0,Country,Happiness Score 2021,GDP per Capita 2021,Happiness Rank 2021
0,Finland,7.842,1.446,1
1,Denmark,7.62,1.502,2
2,Switzerland,7.571,1.566,3
3,Iceland,7.554,1.482,4
4,Netherlands,7.464,1.501,5


In [80]:
merged_df = pd.merge(df, df2017, on='Country', how='outer')

In [81]:
merged_df2 = pd.merge(merged_df, df2019, on='Country', how='outer')

In [82]:
merged_df3 = pd.merge(merged_df2, df2021, on='Country', how='outer')

In [83]:
df=merged_df3
df

Unnamed: 0,Country,Happiness Rank 2015,Happiness Score 2015,GDP per Capita 2015,Average yearly temperature,Happiness Rank 2017,Happiness Score 2017,GDP per Capita 2017,Happiness Rank 2019,Happiness Score 2019,GDP per Capita 2019,Happiness Score 2021,GDP per Capita 2021,Happiness Rank 2021
0,Switzerland,1.0,7.587,1.39651,5.10,4.0,7.494,1.564980,6.0,7.480,1.452,7.571,1.566,3.0
1,Iceland,2.0,7.561,1.30232,-0.70,3.0,7.504,1.480633,4.0,7.494,1.380,7.554,1.482,4.0
2,Denmark,3.0,7.527,1.32548,-3.70,2.0,7.522,1.482383,2.0,7.600,1.383,7.620,1.502,2.0
3,Norway,4.0,7.522,1.45900,1.50,1.0,7.537,1.616463,3.0,7.554,1.488,7.392,1.543,6.0
4,Canada,5.0,7.427,1.32629,-5.10,7.0,7.316,1.479204,9.0,7.278,1.365,7.103,1.447,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,San Marino,,,,11.85,,,,,,,,,
193,Andorra,,,,7.60,,,,,,,,,
194,Liechtenstein,,,,5.80,,,,,,,,,
195,North Korea,,,,5.70,,,,,,,,,


In [84]:
df_rank=df[["Country", "Happiness Rank 2015", "Happiness Rank 2017", "Happiness Rank 2019","Happiness Rank 2021"]]

In [85]:
df_score=df[['Country', 'Happiness Score 2015', 'Happiness Score 2017', 'Happiness Score 2019', 'Happiness Score 2021']]
df_score.sample()

Unnamed: 0,Country,Happiness Score 2015,Happiness Score 2017,Happiness Score 2019,Happiness Score 2021
122,Sierra Leone,4.507,4.709,4.374,3.849


In [86]:
df_mean = df_score.loc[:, ['Happiness Score 2015', 'Happiness Score 2017', 'Happiness Score 2019', 'Happiness Score 2021']].mean(axis=1).round(2)

In [87]:
df_score["Mean Score"]= df_mean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score["Mean Score"]= df_mean


In [88]:
df = df.dropna(subset=["Happiness Score 2015",
                       "Happiness Score 2017","Happiness Score 2019", 
                       "Happiness Score 2021"], how="all")

In [89]:
eliminated_rows=['North Cyprus','Oman', 'Suriname', 'Belize', 'South Sudan', 'Maldives', 'Djibouti' ]
for i in eliminated_rows:
    df = df.drop(df.loc[df['Country'] == i].index)
    

In [90]:
df.loc[df['Country'] == 'Taiwan', 'Average yearly temperature'] = 27.0
df.loc[df['Country'] == 'Kosovo', 'Average yearly temperature'] = 15.0
df.loc[df['Country'] == 'Palestinian Territories', 'Average yearly temperature'] = 20


In [91]:
df['Mean Happiness Score'] = df[['Happiness Score 2015', 'Happiness Score 2017', 'Happiness Score 2019', 'Happiness Score 2021']].mean(axis=1).round(2)


In [92]:
df['Mean GDP per Capita'] = df[['GDP per Capita 2015', 'GDP per Capita 2017', 'GDP per Capita 2019', 'GDP per Capita 2021']].mean(axis=1).round(2)

In [93]:
df[df.isna().any(axis=1)]

Unnamed: 0,Country,Happiness Rank 2015,Happiness Score 2015,GDP per Capita 2015,Average yearly temperature,Happiness Rank 2017,Happiness Score 2017,GDP per Capita 2017,Happiness Rank 2019,Happiness Score 2019,GDP per Capita 2019,Happiness Score 2021,GDP per Capita 2021,Happiness Rank 2021,Mean Happiness Score,Mean GDP per Capita
27,Qatar,28.0,6.611,1.69042,27.15,35.0,6.375,1.870766,29.0,6.374,1.684,,,,6.45,1.75
40,Trinidad and Tobago,41.0,6.168,1.21183,25.75,38.0,6.168,1.361356,39.0,6.192,1.231,,,,6.18,1.27
78,Bhutan,79.0,5.253,0.77042,7.4,97.0,5.011,0.885416,95.0,5.082,0.813,,,,5.12,0.82
90,Somalia,91.0,5.057,0.18847,27.05,93.0,5.151,0.022643,112.0,4.668,0.0,,,,4.96,0.07
98,Laos,99.0,4.876,0.59066,22.8,,,,105.0,4.796,0.764,5.03,0.808,98.0,4.9,0.72
100,Swaziland,101.0,4.867,0.71206,21.4,,,,135.0,4.212,0.811,4.308,0.849,128.0,4.46,0.79
117,Sudan,118.0,4.55,0.52107,26.9,130.0,4.139,0.659517,156.0,2.853,0.306,,,,3.85,0.5
119,Democratic Republic of the Congo,120.0,4.517,0.0,24.0,126.0,4.28,0.092102,127.0,4.418,0.094,,,,4.41,0.06
136,Angola,137.0,4.033,0.75778,21.55,140.0,3.795,0.858428,,,,,,,3.91,0.81
139,Comoros,140.0,3.956,0.23906,25.3,,,,142.0,3.973,0.274,4.289,0.488,129.0,4.07,0.33


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156 entries, 0 to 190
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Country                     156 non-null    object 
 1   Happiness Rank 2015         154 non-null    float64
 2   Happiness Score 2015        154 non-null    float64
 3   GDP per Capita 2015         154 non-null    float64
 4   Average yearly temperature  156 non-null    float64
 5   Happiness Rank 2017         152 non-null    float64
 6   Happiness Score 2017        152 non-null    float64
 7   GDP per Capita 2017         152 non-null    float64
 8   Happiness Rank 2019         155 non-null    float64
 9   Happiness Score 2019        155 non-null    float64
 10  GDP per Capita 2019         155 non-null    float64
 11  Happiness Score 2021        147 non-null    float64
 12  GDP per Capita 2021         147 non-null    float64
 13  Happiness Rank 2021         147 non-null

In [95]:
#df_score['Mean Rank'] = df_score['Mean Score'].rank(method='dense', ascending=False).astype(int)

In [96]:
#df_score = df_score.sort_values("Mean Rank", ascending=True)

In [97]:
#df_score = df_score.set_index("Mean Rank")

In [98]:
#df_mean_score = df_score[["Country", "Mean score"]]

In [99]:
#df_mean_score

In [100]:
df['Mean Rank'] = df['Mean Happiness Score'].rank(method='dense', ascending=False).astype(int)
df= df.sort_values("Mean Rank", ascending=True)

In [101]:
df = df.reset_index(drop=True)

In [102]:
df_mean=df[["Country", "Mean Rank", "Mean Happiness Score", "Mean GDP per Capita", "Average yearly temperature"]]
df_mean.head()

Unnamed: 0,Country,Mean Rank,Mean Happiness Score,Mean GDP per Capita,Average yearly temperature
0,Finland,1,7.62,1.38,1.55
1,Denmark,2,7.57,1.42,-3.7
2,Switzerland,3,7.53,1.49,5.1
3,Iceland,3,7.53,1.41,-0.7
4,Norway,4,7.5,1.53,1.5


In [103]:
df_mean.tail()

Unnamed: 0,Country,Mean Rank,Mean Happiness Score,Mean GDP per Capita,Average yearly temperature
151,Rwanda,129,3.42,0.33,17.85
152,Burundi,130,3.34,0.04,19.8
153,Syria,131,3.31,0.69,17.75
154,Afghanistan,132,3.27,0.36,12.6
155,Central African Republic,133,3.15,0.03,24.9


In [104]:
df['Country'].values

array(['Finland', 'Denmark', 'Switzerland', 'Iceland', 'Norway',
       'Netherlands', 'Sweden', 'New Zealand', 'Canada', 'Australia',
       'Israel', 'Austria', 'Costa Rica', 'Luxembourg', 'Ireland',
       'United States', 'Germany', 'United Kingdom', 'Belgium',
       'Czech Republic', 'United Arab Emirates', 'Mexico', 'France',
       'Brazil', 'Malta', 'Singapore', 'Chile', 'Qatar', 'Taiwan',
       'Panama', 'Uruguay', 'Saudi Arabia', 'Spain', 'Guatemala',
       'Argentina', 'Colombia', 'Bahrain', 'Thailand',
       'Trinidad and Tobago', 'Slovakia', 'Italy', 'Kuwait',
       'El Salvador', 'Uzbekistan', 'Slovenia', 'Poland', 'Lithuania',
       'Nicaragua', 'Ecuador', 'Japan', 'Kazakhstan', 'South Korea',
       'Cyprus', 'Kosovo', 'Bolivia', 'Jamaica', 'Romania', 'Estonia',
       'Peru', 'Moldova', 'Mauritius', 'Latvia', 'Russia', 'Paraguay',
       'Malaysia', 'Croatia', 'Belarus', 'Serbia', 'Libya', 'Philippines',
       'Portugal', 'Hungary', 'Hong Kong', 'Honduras', 'Tur

In [124]:
import wbdata

# Define the indicators we want to retrieve
indicators = {"SP.POP.TOTL": "Population"}

# Define the countries we want to retrieve data for
countries = ['FI', 'DK', 'CH', 'IS', 'NO', 'NL', 'SE', 'NZ', 'CA', 'AU', 'IL', 'AT', 'CR', 'LU', 'IE', 'US', 'DE', 'GB', 
             'BE', 'CZ', 'AE', 'MX', 'FR', 'BR', 'MT', 'SG', 'CL', 'QA', 'TW', 'PA', 'UY', 'SA', 'ES', 'GT', 'AR', 'CO', 
             'BH', 'TH', 'TT', 'SK', 'IT', 'KW', 'SV', 'UZ', 'SI', 'PL', 'LT', 'NI', 'EC', 'JP', 'KZ', 'KR', 'CY', 'XK', 
             'BO', 'JM', 'RO', 'EE', 'PE', 'MD', 'MU', 'LV', 'RU', 'PY', 'MY', 'HR', 'BY', 'RS', 'LY', 'PH', 'PT', 'HU', 
             'HK', 'HN', 'TM', 'VE', 'DZ', 'ME', 'BA', 'KG', 'ID', 'TR', 'GR', 'DO', 'PK', 'VN', 'CN', 'AZ', 'MN', 'TJ', 
             'MK', 'BT', 'NG', 'MA', 'JO', 'SO', 'LB', 'NP', 'LA', 'AL', 'BG', 'ZA', 'CM', 'GM', 'GH', 'MZ', 'BD', 'PS', 
             'IR', 'TN', 'AM', 'IQ', 'CG', 'NA', 'SN', 'KE', 'CI', 'UA', 'GE', 'GA', 'SZ', 'ZM', 'CD', 'MM', 'NE', 'KH', 
             'ET', 'MR', 'SL', 'LK', 'EG', 'ML', 'BF', 'BJ', 'UG', 'LR', 'IN', 'GN', 'TD', 'KM', 'LS', 'AO', 'MG', 'SD', 
             'HT', 'MW', 'ZW', 'BW', 'YE', 'TG', 'TZ', 'RW', 'BI', 'SY', 'AF', 'CF'
]

# Retrieve the data from the API and store it in a dataframe
df_pop = wbdata.get_dataframe(indicators, country=countries)
df_pop = df_pop.loc[df_pop.index.get_level_values("date") == "2019"]
df_pop.reset_index(inplace=True)
df_pop=df_pop.drop(columns=['date'])
df_pop = df_pop.rename(columns={'country': 'Country'})


In [106]:
countries_dict={'Czech Republic':'Czechia',
                'Slovakia':'Slovak Republic',
                'South Korea':'Korea, Rep',
                'Russia':'Russian Federation',
                'Hong Kong':'Hong Kong SAR, China',
                'Venezuela':'Venezuela, RB',
                'Kyrgyzstan':'Kyrgyz Republic',
                'Turkey':'Turkiye',
                'Macedonia':'North Macedonia',
                'Laos':'Lao PDR',
                'Gambia':'Gambia, The',
                'Iran':'Iran, Islamic Rep.',
                'Republic of the Congo':'Congo, Rep.',
                'Ivory Coast':"Cote d'Ivoire" ,
                'Swaziland':'Eswatini',
                'Democratic Republic of the Congo': 'Congo, Dem. Rep.',
                'Egypt':'Egypt',
                'Yemen':'Yemen, Rep.',
                'Syria':'Syrian Arab Republic',
                'Palestinian Territories': 'West Bank and Gaza'
}

for key, value in countries_dict.items():
        mask= df_pop['Country'].str.contains(value, case=False)
        df_pop.loc[mask, 'Country'] = key

In [107]:
merged_df2 = pd.merge(df, df_pop, on='Country', how='outer')

In [108]:
merged_df2.loc[merged_df2['Country'] == 'Taiwan', 'Population'] = 23733876

In [123]:
df=merged_df2


In [115]:
import plotly.express as px

In [125]:
fig = px.scatter(df, x="Mean Happiness Score", y="Average yearly temperature", size='Mean Happiness Score', color="Average yearly temperature",
                 hover_name="Country", title="Países más felices y su temperatura")

fig.show()

In [117]:

import plotly.graph_objects as go
import plotly.offline as opy

fig = go.Figure(go.Choropleth(
    locations = df['Country'],
    locationmode = "country names",
    z = df['Mean Rank'],
    text = df['Mean GDP per Capita'],
    colorscale = 'bluyl',
    autocolorscale = False,
    reversescale = False,
    marker_line_color = '#efefef',
    marker_line_width = 0.5,
    colorbar_title = 'Happiness Rank',       
    )
)
fig.update_layout(
    title_text = 'Happiness Score and GDP per capita',
    showlegend = False,
    geo = dict(
        scope = 'world',
        resolution = 50,
        projection_type = 'miller',
        showcoastlines = True,
        showocean = True,
        showcountries = True,
        oceancolor = '#eaeaea',
        lakecolor = '#eaeaea',
        coastlinecolor = '#dadada'
    )
)
fig.show()

In [118]:
def happiness_GDP(df):
    fig = px.scatter(df, x='Mean GDP per Capita', y='Mean Happiness Score', size='Population',
                 color='Country', hover_name='Population', log_x=True, size_max=60)

    fig.update_layout(title='Comparación de la Felicidad y el PIB per cápita por País',
                  xaxis_title='PIB per cápita (USD)',
                  yaxis_title='Puntuación de Felicidad',
                  legend_title='Country')

    fig.show()

In [119]:
happiness_GDP(df)