In [196]:
import pandas as pd
import numpy as np
df = pd.read_csv("Happiness_Report_2015_2020.csv")
df.head()

Unnamed: 0,Report Year,Overall Rank,Country,Happiness Score,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,2015,1,Switzerland,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2.51738
1,2015,2,Iceland,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2.70201
2,2015,3,Denmark,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2.49204
3,2015,4,Norway,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2.46531
4,2015,5,Canada,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2.45176


In [197]:
# Any country that has a zero in any of the columns is not being compared on an equal footing, so remove it from the dataset
df_clean = df.replace({'0':np.nan, 0:np.nan})
df_clean = df_clean.dropna(how="any")
print(f"There were {len(df)} data points in the original Dataframe, but only {len(df_clean)} usable points")

There were 935 data points in the original Dataframe, but only 899 usable points


In [198]:
# Define the year values to loop through
years = [2015, 2016, 2017, 2018, 2019, 2020]

In [199]:
# Create a series of the of the countries to include in the analysis
countries = df_clean.Country.unique()

In [200]:
# Get the number of countries for future reference
country_counts = df_clean.Country.value_counts()

In [201]:
# Exclude countries that are not present in all years of the report
clean_countries = []
for country in countries:
    # If country has six occurences, add it to clean_countries
    if country_counts[f"{country}"] == 6:
        clean_countries.append(country)

print(f"There were {len(countries)} in the clean dataset, but only {len(clean_countries)} countries that appeared in every report with clean data")

# Create a new dataframe that considers only the countries that are there every year
df_clean = df.loc[df['Country'].isin(clean_countries)]

There were 170 in the clean dataset, but only 126 countries that appeared in every report with clean data


In [202]:
# Create an empty df with columns "Country" "Report Year" "Position Change"
yoy_df = pd.DataFrame(columns = ['Country', 'Report Year', "Clean Rank", "Position Change", "Happiness Change"])

# grab only certain columns, and one year at a time. Sort it by 
dfs = []

for year in years:
    
    rows_df = df_clean.loc[df['Report Year'] == year, df_clean.columns.isin(["Country", "Report Year", "Overall Rank", "Happiness Score"])].sort_values(by=['Overall Rank'])
    rows_df["Clean Rank"] = np.arange(0, len(df_clean.loc[df['Report Year'] == year]))
    dfs.append(rows_df)

In [203]:
print(dfs[0].loc[dfs[0]["Country"] == "Norway", "Clean Rank"])
print(dfs[1].loc[dfs[1]["Country"] == "Norway", "Clean Rank"])
print(dfs[2].loc[dfs[2]["Country"] == "Norway", "Clean Rank"])
print(dfs[3].loc[dfs[3]["Country"] == "Norway", "Clean Rank"])
print(dfs[4].loc[dfs[4]["Country"] == "Norway", "Clean Rank"])
print(dfs[5].loc[dfs[5]["Country"] == "Norway", "Clean Rank"])

3    3
Name: Clean Rank, dtype: int32
161    3
Name: Clean Rank, dtype: int32
315    0
Name: Clean Rank, dtype: int32
471    1
Name: Clean Rank, dtype: int32
628    2
Name: Clean Rank, dtype: int32
786    4
Name: Clean Rank, dtype: int32


In [204]:
for i in range(len(dfs) - 1):
    
    d_position = []
    d_happiness = []
    
    for row in range(len(dfs[i])):
        
        country = dfs[i+1].iloc[row]["Country"]
        # Get the data for a given country at the given year as well as the previous year
        last_report = dfs[i].loc[dfs[i]["Country"] == f"{country}"]
        new_report = dfs[i+1].loc[dfs[i+1]["Country"] == f"{country}"]

        # Get only the rank and the overall happiness score
        idx_new = new_report.index.to_list()
        idx_last = last_report.index.to_list()

        if len(idx_new) > 1:
            print(f"Error: Multiple reports recieved for {country} in {years[i+1]}")
                                                                       
        elif len(idx_last) >1 :
            print(f"Error: Multiple reports recieved for {country} in {years[i]}")

        else:
            new_pos = new_report.at[idx_new[0],"Clean Rank"]
            last_pos = last_report.at[idx_last[0],"Clean Rank"]
            new_happiness = new_report.at[idx_new[0],"Happiness Score"]
            last_happiness = last_report.at[idx_last[0],"Happiness Score"]

        d_position.append(last_pos - new_pos)
        d_happiness.append(new_happiness - last_happiness)

    dfs[i+1]["Position Change"] = d_position
    dfs[i+1]["Happiness Change"] = d_happiness
    #dfs[i+1] = dfs[i+1].astype({'Position Change': 'int32'})
                                                                       
    yoy_df = pd.concat([yoy_df, dfs[i+1]], ignore_index=True)

yoy_df = yoy_df.astype({'Position Change': 'int32'})
yoy_df = yoy_df.astype({'Clean Rank': 'int32'})



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [205]:
yoy_df.loc[yoy_df["Country"] == "Norway"]

Unnamed: 0,Clean Rank,Country,Happiness Change,Happiness Score,Overall Rank,Position Change,Report Year
3,3,Norway,-0.024,7.498,4.0,0,2016
126,0,Norway,0.039,7.537,1.0,3,2017
253,1,Norway,0.057,7.594,2.0,-1,2018
380,2,Norway,-0.04,7.554,3.0,-1,2019
508,4,Norway,-0.066,7.488,5.0,-2,2020


In [206]:
summary_df = yoy_df.groupby(["Country"]).mean()

Unnamed: 0_level_0,Clean Rank,Happiness Change,Happiness Score,Overall Rank,Position Change
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Albania,92.0,-0.01526,4.69734,108.4,-2.2
Algeria,63.6,-0.11998,5.54762,72.6,-5.6
Argentina,33.0,-0.11986,6.33954,36.2,-5.0
Armenia,101.4,0.06536,4.45856,120.6,0.4
Australia,9.4,-0.01224,7.26396,10.4,-0.4


In [209]:
print(summary_df.sort_values(by="Position Change", ascending=False))

                     Clean Rank  Happiness Change  Happiness Score  \
Country                                                              
Benin                     104.2           0.37520          4.27620   
Ivory Coast                94.2           0.31566          4.58886   
Congo (Brazzaville)        94.0           0.24108          4.61848   
Hungary                    61.8           0.24008          5.56948   
Honduras                   66.8           0.23304          5.47384   
...                         ...               ...              ...   
Zimbabwe                  117.0          -0.26216          3.74444   
Nigeria                    83.6          -0.10878          5.01862   
Jordan                     80.2          -0.11172          5.06788   
Zambia                    104.0          -0.27392          4.31048   
Venezuela                  74.8          -0.35136          5.18004   

                     Overall Rank  Position Change  
Country                             

In [208]:
print(summary_df.sort_values(by="Happiness Change", ascending=False))

Unnamed: 0_level_0,Clean Rank,Happiness Change,Happiness Score,Overall Rank,Position Change
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Benin,104.2,0.37520,4.27620,124.0,9.8
Ivory Coast,94.2,0.31566,4.58886,111.6,9.4
Guinea,110.4,0.25866,4.11226,132.0,6.4
Congo (Brazzaville),94.0,0.24108,4.61848,111.2,7.0
Hungary,61.8,0.24008,5.56948,70.0,7.0
...,...,...,...,...,...
Botswana,118.2,-0.17062,3.65938,144.0,-3.8
India,108.8,-0.19834,4.09946,131.4,-4.6
Zimbabwe,117.0,-0.26216,3.74444,142.0,-6.2
Zambia,104.0,-0.27392,4.31048,125.2,-8.8
