In [1]:
import pandas as pd
import re
import seaborn as sns
import os
import matplotlib.pyplot as plt
import requests
from lxml import html

In [2]:
def downloading_csv(path):
    """This function downloads from a raw link and saves the dataframe locally.
    args:
    :url: string. the link
    :name: string. name to save the file
    """
    df=pd.read_csv(path, encoding='latin')
    return df

In [3]:
path1= 'data/happiness-2015.csv'
path2='data/happiness-2017.csv'
path3='data/happiness-2019.csv'
path4='data/happiness-2021.csv'

In [4]:
df= downloading_csv(path1)
df.sample()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
16,Luxembourg,Western Europe,17,6.946,0.03499,1.56391,1.21963,0.91894,0.61583,0.37798,0.28034,1.96961


In [5]:
df2017= downloading_csv(path2)
df.sample()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
139,Comoros,Sub-Saharan Africa,140,3.956,0.04797,0.23906,0.79273,0.36315,0.22917,0.199,0.17441,1.95812


In [6]:
df2019= downloading_csv(path3)
df.sample()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
50,Bolivia,Latin America and Caribbean,51,5.89,0.05642,0.68133,0.97841,0.5392,0.57414,0.088,0.20536,2.82334


In [7]:
df2021= downloading_csv(path4)
df.sample()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
104,Honduras,Latin America and Caribbean,105,4.788,0.05648,0.59532,0.95348,0.6951,0.40148,0.06825,0.23027,1.84408


In [8]:
def downloading_html(path): 
    headers = {"User-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    res = requests.get(path, headers=headers)
    table = pd.read_html(res.content, encoding = 'utf8')
    df = table[0]
    return df

In [9]:
df2=downloading_html('https://en.wikipedia.org/wiki/List_of_countries_by_average_yearly_temperature')
df2.sample()

Unnamed: 0,Country,Average yearly temperature (1961–1990 Celsius)
166,Andorra,7.6


In [10]:
def cleaning_df2015(df, df2):
    countries_dict={'Denmark':'Denmark',
                    'Cyprus': 'Cyprus',
                    'Norway': 'Norway',
                    'Somaliland': 'Somalia',
                    'Macedonia':'Macedonia',
                    'Eswatini': 'Swaziland',
                    'Kinshasa': 'Democratic Republic of the Congo',
                    'Brazzaville':'Republic of the Congo'             
    }
    
    # Iterate over each element of the species2 column of the dataframe and replace the values for the keys
    for key, value in countries_dict.items():
        mask= df['Country'].str.contains(value, case=False)
        df.loc[mask, 'Country'] = key
        
    mask_Somalia = df['Country'].str.contains('Somaliland', case=False)
    df.loc[mask_Somalia, 'Country'] = 'Somalia'
    
    df = pd.merge(df, df2, on='Country', how='outer')
    df = df.add_suffix(' 2015')
    df.rename(columns = {"Economy (GDP per Capita) 2015":"GDP per Capita 2015" ,
                         "Average yearly temperature (1961–1990 Celsius) 2015": "Average yearly temperature", 
                         "Country 2015": "Country", "Region 2015": "Region"}, inplace=True)
    
    df.drop(columns=["Region", "Standard Error 2015", "Family 2015", "Health (Life Expectancy) 2015", "Freedom 2015", 
                     "Trust (Government Corruption) 2015", "Generosity 2015", "Dystopia Residual 2015"], axis=1, inplace=True)
    
    
    return df
    

In [11]:
df = cleaning_df2015(df, df2)
df.sample()

Unnamed: 0,Country,Happiness Rank 2015,Happiness Score 2015,GDP per Capita 2015,Average yearly temperature
4,Canada,5.0,7.427,1.32629,−5.10


In [12]:
def cleaning_df2017(df2017):
    df2017 = df2017.add_suffix(' 2017')
    df2017.rename(columns = {"Country 2017" : "Country", 
                             "Happiness.Rank 2017" :"Happiness Rank 2017", 
                             "Happiness.Score 2017" : "Happiness Score 2017", 
                             "Economy..GDP.per.Capita. 2017": "GDP per Capita 2017"}, inplace=True)
    
    df2017.drop(columns=["Whisker.high 2017", "Whisker.low 2017", "Family 2017", "Health..Life.Expectancy. 2017", "Freedom 2017", 
                         "Generosity 2017", "Trust..Government.Corruption. 2017", "Dystopia.Residual 2017"], axis=1, inplace=True)
    
    mask_Taiwan = df2017['Country'].str.contains('Taiwan', case=False)
    df2017.loc[mask_Taiwan, 'Country'] = 'Taiwan'

    mask_HongKong = df2017['Country'].str.contains('Hong Kong', case=False)
    df2017.loc[mask_HongKong, 'Country'] = 'Hong Kong'

    mask_Democratic = df2017['Country'].str.contains('Kinshasa', case=False)
    df2017.loc[mask_Democratic, 'Country'] = 'Democratic Republic of the Congo'

    mask_Congo = df2017['Country'].str.contains('Brazzaville', case=False)
    df2017.loc[mask_Congo, 'Country'] = 'Republic of the Congo'

    return df2017
    

In [13]:
df2017=cleaning_df2017(df2017)
df2017.sample()

Unnamed: 0,Country,Happiness Rank 2017,Happiness Score 2017,GDP per Capita 2017
127,Ivory Coast,128,4.18,0.603049


In [14]:
def cleaning_df2019(df2019):
    df2019 = df2019.add_suffix(' 2019')
    df2019.rename(columns = {"Country or region 2019" : "Country", 
                             "Overall rank 2019":"Happiness Rank 2019", 
                             "Score 2019": "Happiness Score 2019", 
                             "GDP per capita 2019":"GDP per Capita 2019" }, inplace=True)
    
    df2019.drop(columns=["Social support 2019", "Freedom to make life choices 2019", "Generosity 2019", 
                         "Perceptions of corruption 2019","Healthy life expectancy 2019"], axis=1, inplace=True)
    
    mask_Congo = df2019['Country'].str.contains('Brazzaville', case=False)
    df2019.loc[mask_Congo, 'Country'] = 'Republic of the Congo'

    mask_Sudan = df2019['Country'].str.contains('Sudan', case=False)
    df2019.loc[mask_Sudan, 'Country'] = 'Sudan'

    mask_Democratic = df2019['Country'].str.contains('Kinshasa', case=False)
    df2019.loc[mask_Democratic, 'Country'] = 'Democratic Republic of the Congo'

    mask_Trinidad = df2019['Country'].str.contains('Trinidad & Tobago', case=False)
    df2019.loc[mask_Trinidad, 'Country'] = 'Trinidad and Tobago'

    mask_Macedonia = df2019['Country'].str.contains('Macedonia', case=False)
    df2019.loc[mask_Macedonia, 'Country'] = 'Macedonia'

    mask_Somalia = df2019['Country'].str.contains('Somaliland', case=False)
    df2019.loc[mask_Somalia, 'Country'] = 'Somalia'

    mask_Cyprus = df2019['Country'].str.contains('Northern Cyprus', case=False)
    df2019.loc[mask_Cyprus, 'Country'] = 'North Cyprus'
    
    return df2019

In [15]:
df2019=cleaning_df2019(df2019)
df2019.sample()

Unnamed: 0,Happiness Rank 2019,Country,Happiness Score 2019,GDP per Capita 2019
44,45,Nicaragua,6.105,0.694


In [16]:
def cleaning_df2021(df2021):
    df2021 = df2021.add_suffix(' 2021')
    df2021.rename(columns = {"ï»¿Country name 2021" : "Country", 
                             "Ladder score 2021": "Happiness Score 2021", 
                             "Explained by: Log GDP per capita 2021":"GDP per Capita 2021"}, inplace=True)
    
    df2021.drop(columns=["Regional indicator 2021", "Standard error of ladder score 2021", "upperwhisker 2021", 
                         "lowerwhisker 2021","Social support 2021","Healthy life expectancy 2021", "Freedom to make life choices 2021", 
                         "Generosity 2021", "Perceptions of corruption 2021", "Ladder score in Dystopia 2021", 
                         "Explained by: Social support 2021", "Explained by: Healthy life expectancy 2021", 
                         "Explained by: Perceptions of corruption 2021", "Dystopia + residual 2021", "Explained by: Generosity 2021", 
                         "Explained by: Freedom to make life choices 2021","Logged GDP per capita 2021"  ], axis=1, inplace=True)
    
    mask_Taiwan = df2021['Country'].str.contains('Taiwan', case=False)
    df2021.loc[mask_Taiwan, 'Country'] = 'Taiwan'

    mask_Congo = df2021['Country'].str.contains('Brazzaville', case=False)
    df2021.loc[mask_Congo, 'Country'] = 'Republic of the Congo'

    mask_HongKong = df2021['Country'].str.contains('Hong Kong', case=False)
    df2021.loc[mask_HongKong, 'Country'] = 'Hong Kong'

    mask_Macedonia = df2021['Country'].str.contains('Macedonia', case=False)
    df2021.loc[mask_Macedonia, 'Country'] = 'Macedonia'
    
    df2021['Happiness Rank 2021'] = df2021['Happiness Score 2021'].rank(method='dense', ascending=False).astype(int)
    
    return df2021

In [17]:
df2021=cleaning_df2021(df2021)
df2021.sample()

Unnamed: 0,Country,Happiness Score 2021,GDP per Capita 2021,Happiness Rank 2021
59,Croatia,5.882,1.251,59


In [18]:
def merging(df,df1,df2,df3):
    merged_df = pd.merge(df, df1, on='Country', how='outer')
    merged_df2 = pd.merge(merged_df, df2, on='Country', how='outer')
    merged_df3 = pd.merge(merged_df2, df3, on='Country', how='outer')
    df=merged_df3
    
    return df

In [19]:
df=merging(df,df2017,df2019,df2021)
df.sample()

Unnamed: 0,Country,Happiness Rank 2015,Happiness Score 2015,GDP per Capita 2015,Average yearly temperature,Happiness Rank 2017,Happiness Score 2017,GDP per Capita 2017,Happiness Rank 2019,Happiness Score 2019,GDP per Capita 2019,Happiness Score 2021,GDP per Capita 2021,Happiness Rank 2021
125,Djibouti,126.0,4.369,0.44025,28.0,,,,,,,,,


In [20]:
def finaldf(df):
    df_score=df[['Country', 'Happiness Score 2015', 'Happiness Score 2017', 'Happiness Score 2019', 'Happiness Score 2021']]
    df_mean = df_score.loc[:, ['Happiness Score 2015', 'Happiness Score 2017', 'Happiness Score 2019', 'Happiness Score 2021']].mean(axis=1).round(2)
    df_score["Mean Score"]= df_mean
    df = df.dropna(subset=["Happiness Score 2015",
                       "Happiness Score 2017","Happiness Score 2019", 
                       "Happiness Score 2021"], how="all")
    
    eliminated_rows=['North Cyprus','Oman', 'Suriname', 'Belize', 'South Sudan', 'Maldives', 'Djibouti' ]
    for i in eliminated_rows:
        df = df.drop(df.loc[df['Country'] == i].index)
        
    df.loc[df['Country'] == 'Taiwan', 'Average yearly temperature'] = 27.0
    df.loc[df['Country'] == 'Kosovo', 'Average yearly temperature'] = 15.0
    df.loc[df['Country'] == 'Palestinian Territories', 'Average yearly temperature'] = 20
    
    df['Mean Happiness Score'] = df[['Happiness Score 2015', 'Happiness Score 2017', 'Happiness Score 2019', 
                                     'Happiness Score 2021']].mean(axis=1).round(2)
    
    df['Mean GDP per Capita'] = df[['GDP per Capita 2015', 'GDP per Capita 2017', 'GDP per Capita 2019', 
                                    'GDP per Capita 2021']].mean(axis=1).round(2)
    
    df['Mean Rank'] = df['Mean Happiness Score'].rank(method='dense', ascending=False).astype(int)
    df= df.sort_values("Mean Rank", ascending=True)
    df = df.reset_index(drop=True)
    
    return df

In [21]:
df= finaldf(df)
df.sample()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_score["Mean Score"]= df_mean


Unnamed: 0,Country,Happiness Rank 2015,Happiness Score 2015,GDP per Capita 2015,Average yearly temperature,Happiness Rank 2017,Happiness Score 2017,GDP per Capita 2017,Happiness Rank 2019,Happiness Score 2019,GDP per Capita 2019,Happiness Score 2021,GDP per Capita 2021,Happiness Rank 2021,Mean Happiness Score,Mean GDP per Capita,Mean Rank
82,Turkey,76.0,5.332,1.06098,9.9,69.0,5.5,1.198274,79.0,5.373,1.183,4.948,1.26,102.0,5.29,1.18,75


In [22]:
import plotly.graph_objects as go
import plotly.offline as opy
import plotly.express as px

In [29]:
def happiness_temperature(df):
    fig = px.line(df, x="Mean Rank", y="Average yearly temperature")
    fig.show()  

In [30]:
happiness_temperature(df)

In [25]:
def happiness_worldmap(df):
    
    import plotly.graph_objects as go
    import plotly.offline as opy

    fig = go.Figure(go.Choropleth(
        locations = df['Country'],
        locationmode = "country names",
        z = df['Mean Rank'],
        text = df['Mean GDP per Capita'],
        colorscale = 'bluyl',
        autocolorscale = False,
        reversescale = False,
        marker_line_color = '#efefef',
        marker_line_width = 0.5,
        colorbar_title = 'Happiness Rank',       
        )
    )
    fig.update_layout(
        title_text = 'Happiness Score and GDP per capita',
        showlegend = False,
        geo = dict(
            scope = 'world',
            resolution = 50,
            projection_type = 'miller',
            showcoastlines = True,
            showocean = True,
            showcountries = True,
            oceancolor = '#eaeaea',
            lakecolor = '#eaeaea',
            coastlinecolor = '#dadada'
        )
    )
    fig.show()

In [26]:
happiness_worldmap(df)

In [27]:
    """ mask_Somalia = df['Country'].str.contains('Somaliland', case=False)
    df.loc[mask_Somalia, 'Country'] = 'Somalia'  """

" mask_Somalia = df['Country'].str.contains('Somaliland', case=False)\ndf.loc[mask_Somalia, 'Country'] = 'Somalia'  "

In [28]:
    """ mask_Denmark = df2['Country'].str.contains('Denmark', case=False)
    df2.loc[mask_Denmark, 'Country'] = 'Denmark'
    
    mask_Cyprus = df2['Country'].str.contains('Cyprus', case=False)
    df2.loc[mask_Cyprus, 'Country'] = 'Cyprus'
    
    mask_Norway = df2['Country'].str.contains('Norway', case=False)
    df2.loc[mask_Norway, 'Country'] = 'Norway'
    
    mask_Somalia = df2['Country'].str.contains('Somaliland', case=False)
    df2.loc[mask_Somalia, 'Country'] = 'Somalia'
    
    mask_Macedonia = df2['Country'].str.contains('Macedonia', case=False)
    df2.loc[mask_Macedonia, 'Country'] = 'Macedonia'
    
    mask_Swaziland = df2['Country'].str.contains('Eswatini', case=False)
    df2.loc[mask_Swaziland, 'Country'] = 'Swaziland'
    
    mask_Democratic = df['Country'].str.contains('Kinshasa', case=False)
    df.loc[mask_Democratic, 'Country'] = 'Democratic Republic of the Congo'
    
    mask_Congo = df['Country'].str.contains('Brazzaville', case=False)
    df.loc[mask_Congo, 'Country'] = 'Republic of the Congo'  """

" mask_Denmark = df2['Country'].str.contains('Denmark', case=False)\ndf2.loc[mask_Denmark, 'Country'] = 'Denmark'\n\nmask_Cyprus = df2['Country'].str.contains('Cyprus', case=False)\ndf2.loc[mask_Cyprus, 'Country'] = 'Cyprus'\n\nmask_Norway = df2['Country'].str.contains('Norway', case=False)\ndf2.loc[mask_Norway, 'Country'] = 'Norway'\n\nmask_Somalia = df2['Country'].str.contains('Somaliland', case=False)\ndf2.loc[mask_Somalia, 'Country'] = 'Somalia'\n\nmask_Macedonia = df2['Country'].str.contains('Macedonia', case=False)\ndf2.loc[mask_Macedonia, 'Country'] = 'Macedonia'\n\nmask_Swaziland = df2['Country'].str.contains('Eswatini', case=False)\ndf2.loc[mask_Swaziland, 'Country'] = 'Swaziland'\n\nmask_Democratic = df['Country'].str.contains('Kinshasa', case=False)\ndf.loc[mask_Democratic, 'Country'] = 'Democratic Republic of the Congo'\n\nmask_Congo = df['Country'].str.contains('Brazzaville', case=False)\ndf.loc[mask_Congo, 'Country'] = 'Republic of the Congo'  "