#### Contents
    1. Notebook Set-Up
    2. Data Overview
    3. Data Wrangling
    4. Data Cleaning
    5. Export

# 1. Notebook Set-Up

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# File pathway
# Root folder of the project. The default is the original local volume; set the
# HAPPINESS_PROJECT_PATH environment variable to run the notebook from another
# location without editing this cell.

path = os.environ.get(
    'HAPPINESS_PROJECT_PATH',
    r'/Volumes/RAID Volume/Data Analytics/6 Advanced Analytics & Dashboard Design/11-2022 World Happiness Report',
)

In [3]:
# Load the original World Happiness Report CSV into a DataFrame

raw_csv_path = os.path.join(path, '02 Data', 'Original Data', 'World_Happiness_Report_22.csv')
df = pd.read_csv(raw_csv_path)

# 2. Data Overview

In [4]:
# Dimensions of the imported data set as (rows, columns)
df.shape

(1157, 15)

In [5]:
# Preview the first five rows
df.head()

Unnamed: 0,Country,Region,Year,Happiness Rank,Happiness Score,Economy GDP per Capita,Family Social Support,Health Life Expectancy,Freedom to Make Choices,Perceptions of Corruption,Generosity,Dystopia Residual,Population 2020,Population 2019,"COVID-19 deaths per 100,000 population in 2020"
0,Afghanistan,South Asia,2015,153,3.575,0.32,0.303,0.303,0.234,0.097,0.365,1.952,38928341,38041754,5.628
1,Afghanistan,South Asia,2016,154,3.36,0.382,0.11,0.173,0.164,0.071,0.313,2.146,38928341,38041754,5.628
2,Afghanistan,South Asia,2017,141,3.794,0.401,0.582,0.181,0.106,0.061,0.312,2.151,38928341,38041754,5.628
3,Afghanistan,South Asia,2018,145,3.632,0.332,0.537,0.255,0.085,0.036,0.191,,38928341,38041754,5.628
4,Afghanistan,South Asia,2019,154,3.203,0.35,0.517,0.361,0.0,0.025,0.158,,38928341,38041754,5.628


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Year,Happiness Rank,Happiness Score,Economy GDP per Capita,Family Social Support,Health Life Expectancy,Freedom to Make Choices,Perceptions of Corruption,Generosity,Dystopia Residual,Population 2020,Population 2019,"COVID-19 deaths per 100,000 population in 2020"
count,1157.0,1157.0,1157.0,1157.0,1157.0,1157.0,1157.0,1157.0,1157.0,869.0,1157.0,1157.0,1157.0
mean,2018.520311,76.703544,5.451946,0.985399,1.038534,0.615833,0.444241,0.137,0.191637,2.131733,51702640.0,50930670.0,32.62215
std,2.289734,44.428954,1.112342,0.424901,0.325293,0.23481,0.151035,0.110447,0.119482,0.60039,169106300.0,165868700.0,38.962789
min,2015.0,1.0,2.404,0.0,0.0,0.0,0.0,0.0,0.0,0.187,0.0,0.0,0.0
25%,2017.0,38.0,4.609,0.68,0.831,0.455,0.351,0.06,0.105,1.743,5421242.0,5347896.0,1.786
50%,2019.0,77.0,5.432,1.021,1.069,0.643,0.459,0.104,0.175,2.172,11818620.0,11530580.0,13.586
75%,2021.0,115.0,6.253,1.296,1.279,0.792,0.561,0.178,0.25,2.533,37742160.0,37589260.0,53.558
max,2022.0,158.0,7.842,2.209,1.644,1.141,0.74,0.587,0.838,3.89,1439324000.0,1397715000.0,168.496


# 3. Data Wrangling

In [7]:
# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Data columns (total 15 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Country                                         1157 non-null   object 
 1   Region                                          1157 non-null   object 
 2   Year                                            1157 non-null   int64  
 3   Happiness Rank                                  1157 non-null   int64  
 4   Happiness Score                                 1157 non-null   float64
 5   Economy GDP per Capita                          1157 non-null   float64
 6   Family Social Support                           1157 non-null   float64
 7   Health Life Expectancy                          1157 non-null   float64
 8   Freedom to Make Choices                         1157 non-null   float64
 9   Perceptions of Corruption                

# 4. Data Cleaning

In [8]:
# Check for mixed data types

def find_mixed_type_columns(frame):
    """Return the names of columns whose values are not all of one Python type.

    Each column is mapped to the type of every value; a column is reported when
    more than one distinct type is found. An empty frame yields an empty list.
    """
    # Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1.
    return [
        name
        for name, values in frame.items()
        if values.map(type).nunique() > 1
    ]


# Print each offending column name; no output means every column is consistent.
for col in find_mixed_type_columns(df):
    print(col)

In [9]:
# Count the missing values in each column

df.isna().sum()

Country                                             0
Region                                              0
Year                                                0
Happiness Rank                                      0
Happiness Score                                     0
Economy GDP per Capita                              0
Family Social Support                               0
Health Life Expectancy                              0
Freedom to Make Choices                             0
Perceptions of Corruption                           0
Generosity                                          0
Dystopia Residual                                 288
Population 2020                                     0
Population 2019                                     0
COVID-19 deaths per 100,000 population in 2020      0
dtype: int64

In [10]:
# Drop the 'Dystopia Residual' column. Dystopia is an imaginary country that has
# the world's least-happy people; it is used only as a benchmark and is not one
# of the 6 main pillars. It is also the only column with missing values (288).
# errors='ignore' keeps this cell safe to re-run once the column is already gone.

df = df.drop(columns = ['Dystopia Residual'], errors = 'ignore')

In [11]:
# Look for fully duplicated rows (an empty result means there are none)

is_duplicate = df.duplicated()
df_dups = df.loc[is_duplicate]
df_dups

Unnamed: 0,Country,Region,Year,Happiness Rank,Happiness Score,Economy GDP per Capita,Family Social Support,Health Life Expectancy,Freedom to Make Choices,Perceptions of Corruption,Generosity,Population 2020,Population 2019,"COVID-19 deaths per 100,000 population in 2020"


In [12]:
# Confirm the shape of the cleaned DataFrame as (rows, columns)

df.shape

(1157, 14)

In [13]:
# Preview the first five rows of the cleaned data
df.head()

Unnamed: 0,Country,Region,Year,Happiness Rank,Happiness Score,Economy GDP per Capita,Family Social Support,Health Life Expectancy,Freedom to Make Choices,Perceptions of Corruption,Generosity,Population 2020,Population 2019,"COVID-19 deaths per 100,000 population in 2020"
0,Afghanistan,South Asia,2015,153,3.575,0.32,0.303,0.303,0.234,0.097,0.365,38928341,38041754,5.628
1,Afghanistan,South Asia,2016,154,3.36,0.382,0.11,0.173,0.164,0.071,0.313,38928341,38041754,5.628
2,Afghanistan,South Asia,2017,141,3.794,0.401,0.582,0.181,0.106,0.061,0.312,38928341,38041754,5.628
3,Afghanistan,South Asia,2018,145,3.632,0.332,0.537,0.255,0.085,0.036,0.191,38928341,38041754,5.628
4,Afghanistan,South Asia,2019,154,3.203,0.35,0.517,0.361,0.0,0.025,0.158,38928341,38041754,5.628


# 5. Export

In [14]:
# Export the cleaned data set as a pickle file

export_dir = os.path.join(path, '02 Data', 'Prepared Data')
# to_pickle does not create missing folders, so make sure the target exists first.
os.makedirs(export_dir, exist_ok = True)
df.to_pickle(os.path.join(export_dir, 'happy_clean.pkl'))