In [34]:
import pandas as pd
import numpy as np
df = pd.read_csv("Happiness_Report_2012_2020.csv")
df.head()

Unnamed: 0,Report Year,Overall Rank,Country,Happiness Score
0,2015,1,Switzerland,7.587
1,2015,2,Iceland,7.561
2,2015,3,Denmark,7.527
3,2015,4,Norway,7.522
4,2015,5,Canada,7.427


In [35]:
# Any country that has a zero in any of the columns is not being compared on an equal footing, so remove it from the dataset
df_clean = df.replace({'0':np.nan, 0:np.nan})
df_clean = df_clean.dropna(how="any")
print(f"There were {len(df)} data points in the original Dataframe, but only {len(df_clean)} usable points")

There were 1247 data points in the original Dataframe, but only 1247 usable points


In [36]:
# Define the year values to loop through
years = [2012, 2013, 2015, 2016, 2017, 2018, 2019, 2020]

In [37]:
# Create a series of the of the countries to include in the analysis
countries = df_clean.Country.unique()

In [38]:
# Get the number of countries for future reference
country_counts = df_clean.Country.value_counts()

In [39]:
# Exclude countries that are not present in all years of the report
clean_countries = []
for country in countries:
    # If country has seven occurences, add it to clean_countries
    if country_counts[f"{country}"] == 8:
        clean_countries.append(country)

print(f"There were {len(countries)} in the clean dataset, but only {len(clean_countries)} countries that appeared in every report with clean data")

# Create a new dataframe that considers only the countries that are there every year
df_clean = df.loc[df['Country'].isin(clean_countries)]

There were 169 in the clean dataset, but only 139 countries that appeared in every report with clean data


In [40]:
# Create an empty df with columns "Country" "Report Year" "Position Change"
yoy_df = pd.DataFrame(columns = ['Country', 'Report Year', "Clean Rank", "Position Change", "Happiness Change"])

# grab only certain columns, and one year at a time. Sort it by 
dfs = []

for year in years:
    
    rows_df = df_clean.loc[df['Report Year'] == year, df_clean.columns.isin(["Country", "Report Year", "Overall Rank", "Happiness Score"])].sort_values(by=['Overall Rank'])
    rows_df["Clean Rank"] = np.arange(0, len(df_clean.loc[df['Report Year'] == year]))
    dfs.append(rows_df)

In [41]:
print(dfs[0].loc[dfs[0]["Country"] == "Norway", "Clean Rank"])
print(dfs[1].loc[dfs[1]["Country"] == "Norway", "Clean Rank"])
print(dfs[2].loc[dfs[2]["Country"] == "Norway", "Clean Rank"])
print(dfs[3].loc[dfs[3]["Country"] == "Norway", "Clean Rank"])
print(dfs[4].loc[dfs[4]["Country"] == "Norway", "Clean Rank"])
print(dfs[5].loc[dfs[5]["Country"] == "Norway", "Clean Rank"])

1093    2
Name: Clean Rank, dtype: int32
936    1
Name: Clean Rank, dtype: int32
3    3
Name: Clean Rank, dtype: int32
161    3
Name: Clean Rank, dtype: int32
315    0
Name: Clean Rank, dtype: int32
471    1
Name: Clean Rank, dtype: int32


In [42]:
for i in range(len(dfs) - 1):
    
    d_position = []
    d_happiness = []
    
    for row in range(len(dfs[i])):
        
        country = dfs[i+1].iloc[row]["Country"]
        # Get the data for a given country at the given year as well as the previous year
        last_report = dfs[i].loc[dfs[i]["Country"] == f"{country}"]
        new_report = dfs[i+1].loc[dfs[i+1]["Country"] == f"{country}"]

        # Get only the rank and the overall happiness score
        idx_new = new_report.index.to_list()
        idx_last = last_report.index.to_list()

        if len(idx_new) > 1:
            print(f"Error: Multiple reports recieved for {country} in {years[i+1]}")
                                                                       
        elif len(idx_last) >1 :
            print(f"Error: Multiple reports recieved for {country} in {years[i]}")

        else:
            new_pos = new_report.at[idx_new[0],"Clean Rank"]
            last_pos = last_report.at[idx_last[0],"Clean Rank"]
            new_happiness = new_report.at[idx_new[0],"Happiness Score"]
            last_happiness = last_report.at[idx_last[0],"Happiness Score"]

        d_position.append(last_pos - new_pos)
        d_happiness.append(new_happiness - last_happiness)

    dfs[i+1]["Position Change"] = d_position
    dfs[i+1]["Happiness Change"] = d_happiness
    #dfs[i+1] = dfs[i+1].astype({'Position Change': 'int32'})
                                                                       
    yoy_df = pd.concat([yoy_df, dfs[i+1]], ignore_index=True)

yoy_df = yoy_df.astype({'Position Change': 'int32'})
yoy_df = yoy_df.astype({'Clean Rank': 'int32'})



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [43]:
yoy_df.loc[yoy_df["Country"] == "Norway"]

Unnamed: 0,Clean Rank,Country,Happiness Change,Happiness Score,Overall Rank,Position Change,Report Year
1,1,Norway,0.125588,7.655,2.0,1,2013
142,3,Norway,-0.133,7.522,4.0,-2,2015
281,3,Norway,-0.024,7.498,4.0,0,2016
417,0,Norway,0.039,7.537,1.0,3,2017
557,1,Norway,0.057,7.594,2.0,-1,2018
697,2,Norway,-0.04,7.554,3.0,-1,2019
838,4,Norway,-0.066,7.488,5.0,-2,2020


In [44]:
summary_df = yoy_df.groupby(["Country"]).mean()

In [45]:
print(summary_df.sort_values(by="Position Change", ascending=False))

                     Clean Rank  Happiness Change  Happiness Score  \
Country                                                              
Benin                120.000000          0.247073         4.035571   
Congo (Brazzaville)  107.571429          0.197656         4.482629   
Philippines           71.000000          0.169258         5.418286   
Serbia                75.428571          0.172948         5.326743   
Latvia                62.142857          0.170463         5.625286   
...                         ...               ...              ...   
India                114.571429         -0.197192         4.262043   
Myanmar              113.428571         -0.145215         4.380286   
Turkmenistan          67.428571         -0.203154         5.522586   
Jordan                82.571429         -0.155493         5.135057   
Venezuela             63.857143         -0.262988         5.678457   

                     Overall Rank  Position Change  
Country                             

In [46]:
print(summary_df.sort_values(by="Happiness Change", ascending=False))

                     Clean Rank  Happiness Change  Happiness Score  \
Country                                                              
Benin                120.000000          0.247073         4.035571   
Congo (Brazzaville)  107.571429          0.197656         4.482629   
Serbia                75.428571          0.172948         5.326743   
Bulgaria             105.714286          0.172801         4.596500   
Latvia                62.142857          0.170463         5.625286   
...                         ...               ...              ...   
Botswana             127.571429         -0.155524         3.799843   
India                114.571429         -0.197192         4.262043   
Turkmenistan          67.428571         -0.203154         5.522586   
Afghanistan          133.000000         -0.239478         3.452986   
Venezuela             63.857143         -0.262988         5.678457   

                     Overall Rank  Position Change  
Country                             