# In this Jupyter Notebook:
# 1. Import Libraries & Data
# 2. Data Cleaning
# 3. Data Wrangling
# 4. Data Dictionary
# 5. Imputation
# 6. Export Updated Dataframe to CSV

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# Import Data Set

In [2]:
import pandas as pd

# Folder holding the original yearly CSV exports.
# NOTE(review): this is an absolute, machine-specific Windows path.
file_path = r'C:\Users\natha\OneDrive\Desktop\Data Analytics\Jupyter\12-2023 World Happiness\02 Data\Original Data\\'

# Load one DataFrame per report year (2015 through 2019), in year order.
df_2015, df_2016, df_2017, df_2018, df_2019 = (
    pd.read_csv(file_path + f"{year}.csv") for year in range(2015, 2020)
)

# Comparing Dataframes

In [3]:
# List the column labels of every yearly frame so naming differences are visible.
yearly_frames = (df_2015, df_2016, df_2017, df_2018, df_2019)

[frame.columns for frame in yearly_frames]

[Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
        'Standard Error', 'Economy (GDP per Capita)', 'Family',
        'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
        'Generosity', 'Dystopia Residual'],
       dtype='object'),
 Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
        'Lower Confidence Interval', 'Upper Confidence Interval',
        'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)',
        'Freedom', 'Trust (Government Corruption)', 'Generosity',
        'Dystopia Residual'],
       dtype='object'),
 Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
        'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
        'Health..Life.Expectancy.', 'Freedom', 'Generosity',
        'Trust..Government.Corruption.', 'Dystopia.Residual'],
       dtype='object'),
 Index(['Overall rank', 'Country or region', 'Score', 'GDP per capita',
        'Social support', 'Healthy life exp

# Renaming Columns for Consistency

In [4]:
# Harmonise the column labels of the five yearly frames onto one naming scheme.
# Frames that share the same raw labels share one mapping; 'Generosity' already
# has its target name in every file, so it needs no entry.

# 2015 and 2016 use identical raw labels.
rename_2015_2016 = {
    'Happiness Rank': 'Happiness_Rank',
    'Happiness Score': 'Happiness_Score',
    'Economy (GDP per Capita)': 'GDP_Per_Capita',
    'Family': 'Social_Support',
    'Health (Life Expectancy)': 'Health_Life_Expectancy',
    'Freedom': 'Freedom_Life_Choices',
    'Trust (Government Corruption)': 'Perceptions_of_Corruption',
}

# 2017 uses dot-separated labels.
rename_2017 = {
    'Happiness.Rank': 'Happiness_Rank',
    'Happiness.Score': 'Happiness_Score',
    'Economy..GDP.per.Capita.': 'GDP_Per_Capita',
    'Family': 'Social_Support',
    'Health..Life.Expectancy.': 'Health_Life_Expectancy',
    'Freedom': 'Freedom_Life_Choices',
    'Trust..Government.Corruption.': 'Perceptions_of_Corruption',
}

# 2018 and 2019 use identical raw labels, including a differently named country column.
rename_2018_2019 = {
    'Overall rank': 'Happiness_Rank',
    'Score': 'Happiness_Score',
    'GDP per capita': 'GDP_Per_Capita',
    'Social support': 'Social_Support',
    'Healthy life expectancy': 'Health_Life_Expectancy',
    'Freedom to make life choices': 'Freedom_Life_Choices',
    'Perceptions of corruption': 'Perceptions_of_Corruption',
    'Country or region': 'Country',
}

df_2015 = df_2015.rename(columns=rename_2015_2016)
df_2016 = df_2016.rename(columns=rename_2015_2016)
df_2017 = df_2017.rename(columns=rename_2017)
df_2018 = df_2018.rename(columns=rename_2018_2019)
df_2019 = df_2019.rename(columns=rename_2018_2019)

# Dropping Unused Columns

In [5]:
# Remove the columns that are dropped from the 2015-2017 frames before the
# yearly frames are combined. Each frame is rebound to a new object instead of
# being mutated in place, and errors='ignore' lets this cell be re-run after the
# columns are already gone without raising KeyError.

# 2015: 'Standard Error', 'Dystopia Residual' and 'Region'
df_2015 = df_2015.drop(
    columns=['Standard Error', 'Dystopia Residual', 'Region'],
    errors='ignore',
)

# 2016: both confidence-interval bounds, 'Dystopia Residual' and 'Region'
df_2016 = df_2016.drop(
    columns=['Lower Confidence Interval', 'Upper Confidence Interval',
             'Dystopia Residual', 'Region'],
    errors='ignore',
)

# 2017: both whisker bounds and 'Dystopia.Residual' (dot-separated in this file)
df_2017 = df_2017.drop(
    columns=['Whisker.high', 'Whisker.low', 'Dystopia.Residual'],
    errors='ignore',
)

# Adding Year Column to Dataframes

In [6]:
# Tag every frame with the report year it was loaded for, so rows stay
# attributable to a year once the frames are stacked.
df_2015 = df_2015.assign(Year=2015)
df_2016 = df_2016.assign(Year=2016)
df_2017 = df_2017.assign(Year=2017)
df_2018 = df_2018.assign(Year=2018)
df_2019 = df_2019.assign(Year=2019)

# Spot-check one frame: 'Year' should appear as the last column.
print(df_2015.head())

       Country  Happiness_Rank  Happiness_Score  GDP_Per_Capita  \
0  Switzerland               1            7.587         1.39651   
1      Iceland               2            7.561         1.30232   
2      Denmark               3            7.527         1.32548   
3       Norway               4            7.522         1.45900   
4       Canada               5            7.427         1.32629   

   Social_Support  Health_Life_Expectancy  Freedom_Life_Choices  \
0         1.34951                 0.94143               0.66557   
1         1.40223                 0.94784               0.62877   
2         1.36058                 0.87464               0.64938   
3         1.33095                 0.88521               0.66973   
4         1.32261                 0.90563               0.63297   

   Perceptions_of_Corruption  Generosity  Year  
0                    0.41978     0.29678  2015  
1                    0.14145     0.43630  2015  
2                    0.48357     0.34139  2015 

# Fixing Column Order for Consistency

In [7]:
# Shared column layout applied to every yearly frame before concatenation.
columns_order = ['Country', 'Happiness_Rank', 'Happiness_Score', 'GDP_Per_Capita',
                 'Social_Support', 'Health_Life_Expectancy', 'Freedom_Life_Choices',
                 'Generosity', 'Perceptions_of_Corruption', 'Year']

# Select by label rather than reindex(columns=...): reindex would silently add
# an all-NaN column for any label a frame lacks (e.g. after a rename typo),
# whereas label selection raises KeyError and surfaces the problem immediately.
df_2015 = df_2015[columns_order]
df_2016 = df_2016[columns_order]
df_2017 = df_2017[columns_order]
df_2018 = df_2018[columns_order]
df_2019 = df_2019[columns_order]

In [8]:
# Confirm that all five frames now expose the same labels in the same order.
harmonised_frames = (df_2015, df_2016, df_2017, df_2018, df_2019)

[frame.columns for frame in harmonised_frames]

[Index(['Country', 'Happiness_Rank', 'Happiness_Score', 'GDP_Per_Capita',
        'Social_Support', 'Health_Life_Expectancy', 'Freedom_Life_Choices',
        'Generosity', 'Perceptions_of_Corruption', 'Year'],
       dtype='object'),
 Index(['Country', 'Happiness_Rank', 'Happiness_Score', 'GDP_Per_Capita',
        'Social_Support', 'Health_Life_Expectancy', 'Freedom_Life_Choices',
        'Generosity', 'Perceptions_of_Corruption', 'Year'],
       dtype='object'),
 Index(['Country', 'Happiness_Rank', 'Happiness_Score', 'GDP_Per_Capita',
        'Social_Support', 'Health_Life_Expectancy', 'Freedom_Life_Choices',
        'Generosity', 'Perceptions_of_Corruption', 'Year'],
       dtype='object'),
 Index(['Country', 'Happiness_Rank', 'Happiness_Score', 'GDP_Per_Capita',
        'Social_Support', 'Health_Life_Expectancy', 'Freedom_Life_Choices',
        'Generosity', 'Perceptions_of_Corruption', 'Year'],
       dtype='object'),
 Index(['Country', 'Happiness_Rank', 'Happiness_Score', 'GDP_Per

# Combining everything into a single dataframe

In [9]:
# Stack the yearly frames row-wise; ignore_index gives the result a fresh
# 0..n-1 index instead of repeating each frame's own index.
frames_by_year = [df_2015, df_2016, df_2017, df_2018, df_2019]
combined_df = pd.concat(frames_by_year, ignore_index=True)

# Show the stacked result.
print(combined_df)

                      Country  Happiness_Rank  Happiness_Score  \
0                 Switzerland               1            7.587   
1                     Iceland               2            7.561   
2                     Denmark               3            7.527   
3                      Norway               4            7.522   
4                      Canada               5            7.427   
..                        ...             ...              ...   
777                    Rwanda             152            3.334   
778                  Tanzania             153            3.231   
779               Afghanistan             154            3.203   
780  Central African Republic             155            3.083   
781               South Sudan             156            2.853   

     GDP_Per_Capita  Social_Support  Health_Life_Expectancy  \
0           1.39651         1.34951                 0.94143   
1           1.30232         1.40223                 0.94784   
2           1.3254

In [10]:
# Preview the first five rows of the combined frame.
combined_df.head(n=5)

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2015


# Checking Country Counts

In [11]:
# Count rows per country and print the whole listing; max_rows=None lifts the
# display truncation for this print only.
country_counts = combined_df['Country'].value_counts()
with pd.option_context('display.max_rows', None):
    print(country_counts)

Country
Switzerland                 5
Dominican Republic          5
Greece                      5
Lebanon                     5
Hungary                     5
Honduras                    5
Tajikistan                  5
Tunisia                     5
Palestinian Territories     5
Bangladesh                  5
Iran                        5
Ukraine                     5
Iraq                        5
South Africa                5
Ghana                       5
Zimbabwe                    5
Liberia                     5
Mongolia                    5
Bosnia and Herzegovina      5
Vietnam                     5
Albania                     5
Kyrgyzstan                  5
Nigeria                     5
Bhutan                      5
Azerbaijan                  5
Pakistan                    5
Jordan                      5
Montenegro                  5
China                       5
Zambia                      5
Iceland                     5
Serbia                      5
Portugal                    5
La

In [12]:
# Variant spellings of a country name -> the single spelling to keep.
country_mapping = dict([
    ('Trinidad & Tobago', 'Trinidad and Tobago'),
    ('Northern Cyprus', 'North Cyprus'),
    ('Hong Kong S.A.R., China', 'Hong Kong'),
    ('Taiwan Province of China', 'Taiwan'),
    ('Somaliland region', 'Somaliland Region'),
])

# Rewrite the variants in the Country column; names not in the mapping are unchanged.
consolidated_countries = combined_df['Country'].replace(country_mapping)
combined_df['Country'] = consolidated_countries

In [13]:
# Re-count rows per country after the name consolidation, again printing the
# full listing with display truncation lifted for this print only.
country_counts = combined_df['Country'].value_counts()
with pd.option_context('display.max_rows', None):
    print(country_counts)

Country
Switzerland                 5
Dominican Republic          5
Greece                      5
Lebanon                     5
Hungary                     5
Honduras                    5
Tajikistan                  5
Tunisia                     5
Palestinian Territories     5
Bangladesh                  5
Iran                        5
Ukraine                     5
Iraq                        5
South Africa                5
Ghana                       5
Zimbabwe                    5
Liberia                     5
Mongolia                    5
Bosnia and Herzegovina      5
Vietnam                     5
Albania                     5
Kyrgyzstan                  5
Nigeria                     5
Bhutan                      5
Azerbaijan                  5
Pakistan                    5
Jordan                      5
Iceland                     5
China                       5
Zambia                      5
Romania                     5
Serbia                      5
Portugal                    5
La

# Checking for Mixed Data Types

In [14]:
# Check every column of 'combined_df' for values of more than one Python type.
# Offending columns are collected so that the "nothing found" message is printed
# only when no column was reported. (A for/else would not do this: the else
# clause runs whenever the loop finishes without `break`, i.e. always here.)
mixed_type_columns = {}
for col in combined_df.columns:
    types = combined_df[col].apply(type).unique()
    if len(types) > 1:
        mixed_type_columns[col] = types
        print(f"Column '{col}' has mixed data types: {types}")

if not mixed_type_columns:
    print("No mixed data types found in the DataFrame.")

No mixed data types found in the DataFrame.


# Checking Data Types

In [15]:
# Show the dtype pandas holds for each column of the combined frame.
combined_df.dtypes

Country                       object
Happiness_Rank                 int64
Happiness_Score              float64
GDP_Per_Capita               float64
Social_Support               float64
Health_Life_Expectancy       float64
Freedom_Life_Choices         float64
Generosity                   float64
Perceptions_of_Corruption    float64
Year                           int64
dtype: object

# Checking Missing Values

In [16]:
# Per-column count of missing (NaN) entries.
missing_values = combined_df.isna().sum()

# Report the counts only when at least one column has a missing entry.
if not missing_values.any():
    print("No missing values found in the DataFrame.")
else:
    print("Missing values found in the DataFrame:")
    print(missing_values)

Missing values found in the DataFrame:
Country                      0
Happiness_Rank               0
Happiness_Score              0
GDP_Per_Capita               0
Social_Support               0
Health_Life_Expectancy       0
Freedom_Life_Choices         0
Generosity                   0
Perceptions_of_Corruption    1
Year                         0
dtype: int64


In [17]:
# Pull out the rows whose 'Perceptions_of_Corruption' entry is missing.
corruption_is_missing = combined_df['Perceptions_of_Corruption'].isna()
rows_with_missing_values = combined_df.loc[corruption_is_missing]

# Show them.
rows_with_missing_values

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
489,United Arab Emirates,20,6.774,2.096,0.776,0.67,0.284,0.186,,2018


# Impute Missing Values

In [18]:
# Impute the missing 'Perceptions_of_Corruption' entry for United Arab Emirates
# with the mean of that country's available (non-missing) entries.
uae_rows = combined_df['Country'] == 'United Arab Emirates'

# Available values only (NaN excluded).
uae_values = combined_df.loc[uae_rows, 'Perceptions_of_Corruption'].dropna()

# Sum and mean of the available values.
uae_sum = uae_values.sum()
uae_mean_perception = uae_values.mean()

# Fill ONLY the missing entry. Assigning to every United Arab Emirates row would
# overwrite the observed values of the other years with the mean as well.
uae_missing = uae_rows & combined_df['Perceptions_of_Corruption'].isna()
combined_df.loc[uae_missing, 'Perceptions_of_Corruption'] = uae_mean_perception
print(uae_mean_perception)

0.3119823909258842


In [19]:
# Recount missing entries per column after the imputation step.
missing_values = combined_df.isna().sum()

# Print the per-column counts if anything is still missing, otherwise confirm.
found_missing = missing_values.any()
if found_missing:
    print("Missing values found in the DataFrame:")
    print(missing_values)
if not found_missing:
    print("No missing values found in the DataFrame.")

No missing values found in the DataFrame.


# Checking for Zero (0) Values

In [20]:
# Flag each row in which at least one cell equals zero, then keep those rows.
row_has_zero = combined_df.eq(0).any(axis=1)
rows_with_any_zero_values = combined_df.loc[row_has_zero]

# Show them.
rows_with_any_zero_values

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
73,Indonesia,74,5.399,0.82827,1.08708,0.63793,0.46611,0.51535,0.0,2015
101,Greece,102,4.857,1.15406,0.92933,0.88213,0.07699,0.0,0.01397,2015
111,Iraq,112,4.677,0.98549,0.81889,0.60237,0.0,0.17922,0.13788,2015
119,Congo (Kinshasa),120,4.517,0.0,1.0012,0.09806,0.22605,0.24834,0.07625,2015
122,Sierra Leone,123,4.507,0.33024,0.95571,0.0,0.4084,0.21488,0.08786,2015
147,Central African Republic,148,3.678,0.0785,0.0,0.06699,0.48879,0.23835,0.08289,2015
233,Somalia,76,5.44,0.0,0.33613,0.11466,0.56778,0.27225,0.3118,2016
244,Bosnia and Herzegovina,87,5.163,0.93383,0.64367,0.70766,0.09511,0.29889,0.0,2016
256,Greece,99,5.033,1.24886,0.75473,0.80029,0.05822,0.0,0.04127,2016
268,Sierra Leone,111,4.635,0.36485,0.628,0.0,0.30685,0.23897,0.08196,2016


# Imputation for Zero (0) Values

### Indonesia

In [21]:
# Show every Indonesia row to see which year carries the zero value.
combined_df.loc[combined_df['Country'] == 'Indonesia']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
73,Indonesia,74,5.399,0.82827,1.08708,0.63793,0.46611,0.51535,0.0,2015
236,Indonesia,79,5.314,0.95104,0.87625,0.49374,0.39237,0.56521,0.00322,2016
395,Indonesia,81,5.262,0.995539,1.274445,0.492346,0.443323,0.611705,0.015317,2017
565,Indonesia,96,5.093,0.899,1.215,0.522,0.538,0.484,0.018,2018
717,Indonesia,92,5.192,0.931,1.203,0.66,0.491,0.498,0.028,2019


In [22]:
# Mean of Indonesia's non-zero 'Perceptions_of_Corruption' entries, computed
# from the frame itself rather than from figures hand-copied out of the
# displayed table (which pandas may show rounded).
indonesia_corruption = combined_df.loc[combined_df['Country'] == 'Indonesia', 'Perceptions_of_Corruption']
indonesia_perceptions_of_corruption = indonesia_corruption[indonesia_corruption != 0].mean()

In [23]:
# Impute Indonesia's zero 'Perceptions_of_Corruption' entry with the mean of
# Indonesia's non-zero entries. The mean is computed from the frame itself
# rather than from figures hand-copied out of the displayed table (which pandas
# may show rounded).
indonesia_rows = combined_df['Country'] == 'Indonesia'
indonesia_corruption_zero = combined_df['Perceptions_of_Corruption'] == 0
indonesia_perceptions_of_corruption = combined_df.loc[
    indonesia_rows & ~indonesia_corruption_zero, 'Perceptions_of_Corruption'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[indonesia_rows & indonesia_corruption_zero, 'Perceptions_of_Corruption'] = indonesia_perceptions_of_corruption

In [24]:
# Re-display the Indonesia rows to confirm the zero value was replaced.
combined_df.loc[combined_df['Country'] == 'Indonesia']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
73,Indonesia,74,5.399,0.82827,1.08708,0.63793,0.46611,0.51535,0.016134,2015
236,Indonesia,79,5.314,0.95104,0.87625,0.49374,0.39237,0.56521,0.00322,2016
395,Indonesia,81,5.262,0.995539,1.274445,0.492346,0.443323,0.611705,0.015317,2017
565,Indonesia,96,5.093,0.899,1.215,0.522,0.538,0.484,0.018,2018
717,Indonesia,92,5.192,0.931,1.203,0.66,0.491,0.498,0.028,2019


### Greece

In [25]:
# Show every Greece row to see which years carry zero values.
combined_df.loc[combined_df['Country'] == 'Greece']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
101,Greece,102,4.857,1.15406,0.92933,0.88213,0.07699,0.0,0.01397,2015
256,Greece,99,5.033,1.24886,0.75473,0.80029,0.05822,0.0,0.04127,2016
401,Greece,87,5.227,1.289487,1.239415,0.810199,0.095731,0.0,0.04329,2017
548,Greece,79,5.358,1.154,1.202,0.879,0.131,0.0,0.044,2018
707,Greece,82,5.287,1.181,1.156,0.999,0.067,0.0,0.034,2019


In [26]:
# Median of 'Generosity' across the whole dataset, ignoring zero entries.
generosity_is_zero = combined_df['Generosity'] == 0
nonzero_generosity = combined_df[~generosity_is_zero]['Generosity']
overall_median_generosity = np.median(nonzero_generosity)

# Greece's zero 'Generosity' entries take that dataset-wide median.
greece_zero_generosity = (combined_df['Country'] == 'Greece') & generosity_is_zero
combined_df.loc[greece_zero_generosity, 'Generosity'] = overall_median_generosity

In [27]:
# Re-display the Greece rows to confirm the zero 'Generosity' values were replaced.
combined_df.loc[combined_df['Country'] == 'Greece']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
101,Greece,102,4.857,1.15406,0.92933,0.88213,0.07699,0.202,0.01397,2015
256,Greece,99,5.033,1.24886,0.75473,0.80029,0.05822,0.202,0.04127,2016
401,Greece,87,5.227,1.289487,1.239415,0.810199,0.095731,0.202,0.04329,2017
548,Greece,79,5.358,1.154,1.202,0.879,0.131,0.202,0.044,2018
707,Greece,82,5.287,1.181,1.156,0.999,0.067,0.202,0.034,2019


### Iraq

In [28]:
# Show every Iraq row to see which year carries the zero value.
combined_df.loc[combined_df['Country'] == 'Iraq']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
111,Iraq,112,4.677,0.98549,0.81889,0.60237,0.0,0.17922,0.13788,2015
269,Iraq,112,4.575,1.07474,0.59205,0.51076,0.24856,0.19589,0.13636,2016
431,Iraq,117,4.497,1.10271,0.978613,0.50118,0.288556,0.199637,0.107216,2017
586,Iraq,117,4.456,1.01,0.971,0.536,0.304,0.148,0.095,2018
751,Iraq,126,4.437,1.043,0.98,0.574,0.241,0.148,0.089,2019


In [29]:
# Impute Iraq's zero 'Freedom_Life_Choices' entry with the mean of Iraq's
# non-zero entries. The mean is computed from the frame itself rather than from
# figures hand-copied out of the displayed table (which pandas may show rounded).
iraq_rows = combined_df['Country'] == 'Iraq'
iraq_freedom_zero = combined_df['Freedom_Life_Choices'] == 0
iraq_freedom_life_choices = combined_df.loc[
    iraq_rows & ~iraq_freedom_zero, 'Freedom_Life_Choices'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[iraq_rows & iraq_freedom_zero, 'Freedom_Life_Choices'] = iraq_freedom_life_choices

In [30]:
# Re-display the Iraq rows to confirm the zero value was replaced.
combined_df.loc[combined_df['Country'] == 'Iraq']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
111,Iraq,112,4.677,0.98549,0.81889,0.60237,0.270529,0.17922,0.13788,2015
269,Iraq,112,4.575,1.07474,0.59205,0.51076,0.24856,0.19589,0.13636,2016
431,Iraq,117,4.497,1.10271,0.978613,0.50118,0.288556,0.199637,0.107216,2017
586,Iraq,117,4.456,1.01,0.971,0.536,0.304,0.148,0.095,2018
751,Iraq,126,4.437,1.043,0.98,0.574,0.241,0.148,0.089,2019


### Congo (Kinshasa)

In [31]:
# Show every Congo (Kinshasa) row to see which year carries the zero value.
combined_df.loc[combined_df['Country'] == 'Congo (Kinshasa)']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
119,Congo (Kinshasa),120,4.517,0.0,1.0012,0.09806,0.22605,0.24834,0.07625,2015
282,Congo (Kinshasa),125,4.272,0.05661,0.80676,0.188,0.15602,0.25458,0.06075,2016
440,Congo (Kinshasa),126,4.28,0.092102,1.229023,0.191407,0.235961,0.246456,0.060241,2017
601,Congo (Kinshasa),132,4.245,0.069,1.136,0.204,0.312,0.197,0.052,2018
752,Congo (Kinshasa),127,4.418,0.094,1.125,0.357,0.269,0.212,0.053,2019


In [32]:
# Impute Congo (Kinshasa)'s zero 'GDP_Per_Capita' entry with the mean of that
# country's non-zero entries. The mean is computed from the frame itself rather
# than from figures hand-copied out of the displayed table (which pandas may
# show rounded).
congo_rows = combined_df['Country'] == 'Congo (Kinshasa)'
congo_gdp_zero = combined_df['GDP_Per_Capita'] == 0
congo_gdp_per_capita = combined_df.loc[
    congo_rows & ~congo_gdp_zero, 'GDP_Per_Capita'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[congo_rows & congo_gdp_zero, 'GDP_Per_Capita'] = congo_gdp_per_capita

In [33]:
# Re-display the Congo (Kinshasa) rows to confirm the zero value was replaced.
combined_df.loc[combined_df['Country'] == 'Congo (Kinshasa)']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
119,Congo (Kinshasa),120,4.517,0.077928,1.0012,0.09806,0.22605,0.24834,0.07625,2015
282,Congo (Kinshasa),125,4.272,0.05661,0.80676,0.188,0.15602,0.25458,0.06075,2016
440,Congo (Kinshasa),126,4.28,0.092102,1.229023,0.191407,0.235961,0.246456,0.060241,2017
601,Congo (Kinshasa),132,4.245,0.069,1.136,0.204,0.312,0.197,0.052,2018
752,Congo (Kinshasa),127,4.418,0.094,1.125,0.357,0.269,0.212,0.053,2019


### Sierra Leone

In [34]:
# Show every Sierra Leone row to see which years carry zero values.
combined_df.loc[combined_df['Country'] == 'Sierra Leone']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
122,Sierra Leone,123,4.507,0.33024,0.95571,0.0,0.4084,0.21488,0.08786,2015
268,Sierra Leone,111,4.635,0.36485,0.628,0.0,0.30685,0.23897,0.08196,2016
420,Sierra Leone,106,4.709,0.368421,0.984136,0.005565,0.318698,0.293041,0.071095,2017
582,Sierra Leone,113,4.571,0.256,0.813,0.0,0.355,0.238,0.053,2018
754,Sierra Leone,129,4.374,0.268,0.841,0.242,0.309,0.252,0.045,2019


In [35]:
# Impute Sierra Leone's zero 'Health_Life_Expectancy' entries with the mean of
# Sierra Leone's non-zero entries. The mean is computed from the frame itself
# rather than from figures hand-copied out of the displayed table (which pandas
# may show rounded).
sierra_rows = combined_df['Country'] == 'Sierra Leone'
sierra_health_zero = combined_df['Health_Life_Expectancy'] == 0
sierra_health_life_expectancy = combined_df.loc[
    sierra_rows & ~sierra_health_zero, 'Health_Life_Expectancy'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[sierra_rows & sierra_health_zero, 'Health_Life_Expectancy'] = sierra_health_life_expectancy

In [36]:
# Re-display the Sierra Leone rows to confirm the zero values were replaced.
combined_df.loc[combined_df['Country'] == 'Sierra Leone']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
122,Sierra Leone,123,4.507,0.33024,0.95571,0.123782,0.4084,0.21488,0.08786,2015
268,Sierra Leone,111,4.635,0.36485,0.628,0.123782,0.30685,0.23897,0.08196,2016
420,Sierra Leone,106,4.709,0.368421,0.984136,0.005565,0.318698,0.293041,0.071095,2017
582,Sierra Leone,113,4.571,0.256,0.813,0.123782,0.355,0.238,0.053,2018
754,Sierra Leone,129,4.374,0.268,0.841,0.242,0.309,0.252,0.045,2019


### Central African Republic

In [37]:
# Show every Central African Republic row to see which columns carry zero values.
combined_df.loc[combined_df['Country'] == 'Central African Republic']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
147,Central African Republic,148,3.678,0.0785,0.0,0.06699,0.48879,0.23835,0.08289,2015
469,Central African Republic,155,2.693,0.0,0.0,0.018773,0.270842,0.280876,0.056565,2017
624,Central African Republic,155,3.083,0.024,0.0,0.01,0.305,0.218,0.038,2018
780,Central African Republic,155,3.083,0.026,0.0,0.105,0.225,0.235,0.035,2019


In [38]:
# Impute Central African Republic's zero 'GDP_Per_Capita' entry with the mean of
# that country's non-zero entries. The mean is computed from the frame itself
# rather than from figures hand-copied out of the displayed table (which pandas
# may show rounded).
car_rows = combined_df['Country'] == 'Central African Republic'
car_gdp_zero = combined_df['GDP_Per_Capita'] == 0
car_gdp_per_capita = combined_df.loc[
    car_rows & ~car_gdp_zero, 'GDP_Per_Capita'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[car_rows & car_gdp_zero, 'GDP_Per_Capita'] = car_gdp_per_capita

In [39]:
# Re-display the Central African Republic rows to confirm the zero GDP value was replaced.
combined_df.loc[combined_df['Country'] == 'Central African Republic']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
147,Central African Republic,148,3.678,0.0785,0.0,0.06699,0.48879,0.23835,0.08289,2015
469,Central African Republic,155,2.693,0.042833,0.0,0.018773,0.270842,0.280876,0.056565,2017
624,Central African Republic,155,3.083,0.024,0.0,0.01,0.305,0.218,0.038,2018
780,Central African Republic,155,3.083,0.026,0.0,0.105,0.225,0.235,0.035,2019


In [40]:
# Median of 'Social_Support' across the whole dataset, ignoring zero entries.
social_support_is_zero = combined_df['Social_Support'] == 0
nonzero_social_support = combined_df[~social_support_is_zero]['Social_Support']
overall_median_Social_Support = np.median(nonzero_social_support)

# Central African Republic's zero 'Social_Support' entries take that dataset-wide median.
car_zero_social_support = (combined_df['Country'] == 'Central African Republic') & social_support_is_zero
combined_df.loc[car_zero_social_support, 'Social_Support'] = overall_median_Social_Support

In [41]:
# Re-display the Central African Republic rows to confirm the zero 'Social_Support' values were replaced.
combined_df.loc[combined_df['Country'] == 'Central African Republic']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
147,Central African Republic,148,3.678,0.0785,1.125,0.06699,0.48879,0.23835,0.08289,2015
469,Central African Republic,155,2.693,0.042833,1.125,0.018773,0.270842,0.280876,0.056565,2017
624,Central African Republic,155,3.083,0.024,1.125,0.01,0.305,0.218,0.038,2018
780,Central African Republic,155,3.083,0.026,1.125,0.105,0.225,0.235,0.035,2019


### Somalia

In [42]:
# Show every Somalia row to see which years carry zero values.
combined_df.loc[combined_df['Country'] == 'Somalia']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
233,Somalia,76,5.44,0.0,0.33613,0.11466,0.56778,0.27225,0.3118,2016
407,Somalia,93,5.151,0.022643,0.721151,0.113989,0.602127,0.291631,0.28241,2017
567,Somalia,98,4.982,0.0,0.712,0.115,0.674,0.238,0.282,2018
737,Somalia,112,4.668,0.0,0.698,0.268,0.559,0.243,0.27,2019


In [43]:
# Impute Somalia's zero 'GDP_Per_Capita' entries with the mean of Somalia's
# non-zero entries. The mean is computed from the frame itself rather than from
# a figure hand-copied out of the displayed table (which pandas may show rounded).
somalia_rows = combined_df['Country'] == 'Somalia'
somalia_gdp_zero = combined_df['GDP_Per_Capita'] == 0
Somalia_gdp_per_capita = combined_df.loc[
    somalia_rows & ~somalia_gdp_zero, 'GDP_Per_Capita'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[somalia_rows & somalia_gdp_zero, 'GDP_Per_Capita'] = Somalia_gdp_per_capita

In [44]:
# Re-display the Somalia rows to confirm the zero values were replaced.
combined_df.loc[combined_df['Country'] == 'Somalia']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
233,Somalia,76,5.44,0.022643,0.33613,0.11466,0.56778,0.27225,0.3118,2016
407,Somalia,93,5.151,0.022643,0.721151,0.113989,0.602127,0.291631,0.28241,2017
567,Somalia,98,4.982,0.022643,0.712,0.115,0.674,0.238,0.282,2018
737,Somalia,112,4.668,0.022643,0.698,0.268,0.559,0.243,0.27,2019


### Bosnia and Herzegovina

In [45]:
# Show every Bosnia and Herzegovina row to see which years carry zero values.
combined_df.loc[combined_df['Country'] == 'Bosnia and Herzegovina']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
95,Bosnia and Herzegovina,96,4.949,0.83223,0.91916,0.79081,0.09245,0.24808,0.00227,2015
244,Bosnia and Herzegovina,87,5.163,0.93383,0.64367,0.70766,0.09511,0.29889,0.0,2016
404,Bosnia and Herzegovina,90,5.182,0.982409,1.069336,0.705186,0.204403,0.328867,0.0,2017
562,Bosnia and Herzegovina,93,5.129,0.915,1.078,0.758,0.28,0.216,0.0,2018
703,Bosnia and Herzegovina,78,5.386,0.945,1.212,0.845,0.212,0.263,0.006,2019


In [46]:
# Impute Bosnia and Herzegovina's zero 'Perceptions_of_Corruption' entries with
# the mean of that country's non-zero entries. The mean is computed from the
# frame itself rather than from figures hand-copied out of the displayed table
# (which pandas may show rounded).
bah_rows = combined_df['Country'] == 'Bosnia and Herzegovina'
bah_corruption_zero = combined_df['Perceptions_of_Corruption'] == 0
BAH_Perceptions_of_Corruption = combined_df.loc[
    bah_rows & ~bah_corruption_zero, 'Perceptions_of_Corruption'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[bah_rows & bah_corruption_zero, 'Perceptions_of_Corruption'] = BAH_Perceptions_of_Corruption

In [47]:
# Re-display the Bosnia and Herzegovina rows to confirm the zero values were replaced.
combined_df.loc[combined_df['Country'] == 'Bosnia and Herzegovina']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
95,Bosnia and Herzegovina,96,4.949,0.83223,0.91916,0.79081,0.09245,0.24808,0.00227,2015
244,Bosnia and Herzegovina,87,5.163,0.93383,0.64367,0.70766,0.09511,0.29889,0.004135,2016
404,Bosnia and Herzegovina,90,5.182,0.982409,1.069336,0.705186,0.204403,0.328867,0.004135,2017
562,Bosnia and Herzegovina,93,5.129,0.915,1.078,0.758,0.28,0.216,0.004135,2018
703,Bosnia and Herzegovina,78,5.386,0.945,1.212,0.845,0.212,0.263,0.006,2019


### Sudan

In [48]:
# Show every Sudan row to see which year carries the zero value.
combined_df.loc[combined_df['Country'] == 'Sudan']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
117,Sudan,118,4.55,0.52107,1.01404,0.36878,0.10081,0.19062,0.1466,2015
290,Sudan,133,4.139,0.63069,0.81928,0.29759,0.0,0.18077,0.10039,2016
444,Sudan,130,4.139,0.659517,1.214009,0.290921,0.014996,0.182317,0.089848,2017
606,Sudan,137,4.139,0.605,1.24,0.312,0.016,0.134,0.082,2018


In [49]:
# Impute Sudan's zero 'Freedom_Life_Choices' entry with the mean of Sudan's
# non-zero entries. The mean is computed from the frame itself rather than from
# figures hand-copied out of the displayed table (which pandas may show rounded).
sudan_rows = combined_df['Country'] == 'Sudan'
sudan_freedom_zero = combined_df['Freedom_Life_Choices'] == 0
Sudan_Freedom_Life_Choices = combined_df.loc[
    sudan_rows & ~sudan_freedom_zero, 'Freedom_Life_Choices'
].mean()

# Overwrite only the zero entries; observed values stay untouched.
combined_df.loc[sudan_rows & sudan_freedom_zero, 'Freedom_Life_Choices'] = Sudan_Freedom_Life_Choices

In [50]:
# Re-display the Sudan rows to confirm the zero value was replaced.
combined_df.loc[combined_df['Country'] == 'Sudan']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
117,Sudan,118,4.55,0.52107,1.01404,0.36878,0.10081,0.19062,0.1466,2015
290,Sudan,133,4.139,0.63069,0.81928,0.29759,0.043935,0.18077,0.10039,2016
444,Sudan,130,4.139,0.659517,1.214009,0.290921,0.014996,0.182317,0.089848,2017
606,Sudan,137,4.139,0.605,1.24,0.312,0.016,0.134,0.082,2018


### Togo

In [51]:
# Show every Togo row to see which year carries the zero value.
combined_df.loc[combined_df['Country'] == 'Togo']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
157,Togo,158,2.839,0.20868,0.13995,0.28443,0.36453,0.16681,0.10731,2015
312,Togo,155,3.303,0.28123,0.0,0.24811,0.34678,0.17517,0.11587,2016
464,Togo,150,3.495,0.305445,0.431883,0.247106,0.380426,0.196896,0.095665,2017
608,Togo,139,3.999,0.259,0.474,0.253,0.434,0.158,0.101,2018
764,Togo,139,4.085,0.275,0.572,0.41,0.293,0.177,0.085,2019


In [52]:
# Calculate the mean for 'Social_Support' for Togo from its non-zero years.
# The mean is derived from the DataFrame instead of hand-typed literals, so it uses
# the stored values (the table display may show them rounded) and stays correct
# if the underlying data changes.
togo_rows = combined_df['Country'] == 'Togo'
togo_zero_rows = togo_rows & (combined_df['Social_Support'] == 0)
Togo_Social_Support = combined_df.loc[togo_rows & ~togo_zero_rows, 'Social_Support'].mean()

# Replace the zero value in 'Social_Support' for Togo with the calculated mean
combined_df.loc[togo_zero_rows, 'Social_Support'] = Togo_Social_Support

In [53]:
# Re-display Togo's rows to confirm the zero was replaced
combined_df.loc[combined_df['Country'].eq('Togo')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
157,Togo,158,2.839,0.20868,0.13995,0.28443,0.36453,0.16681,0.10731,2015
312,Togo,155,3.303,0.28123,0.404458,0.24811,0.34678,0.17517,0.11587,2016
464,Togo,150,3.495,0.305445,0.431883,0.247106,0.380426,0.196896,0.095665,2017
608,Togo,139,3.999,0.259,0.474,0.253,0.434,0.158,0.101,2018
764,Togo,139,4.085,0.275,0.572,0.41,0.293,0.177,0.085,2019


### Lesotho

In [54]:
# Inspect every yearly record for Lesotho before imputing
combined_df.loc[combined_df['Country'].eq('Lesotho')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
96,Lesotho,97,4.898,0.37545,1.04103,0.07612,0.31767,0.16388,0.12504,2015
453,Lesotho,139,3.808,0.521021,1.190095,0.0,0.390661,0.157497,0.119095,2017
610,Lesotho,141,3.808,0.472,1.215,0.079,0.423,0.116,0.112,2018
769,Lesotho,144,3.802,0.489,1.169,0.168,0.359,0.107,0.093,2019


In [55]:
# Calculate the mean for 'Health_Life_Expectancy' for Lesotho from its non-zero years.
# The mean is derived from the DataFrame instead of hand-typed literals, so it uses
# the stored values (the table display may show them rounded) and stays correct
# if the underlying data changes.
lesotho_rows = combined_df['Country'] == 'Lesotho'
lesotho_zero_rows = lesotho_rows & (combined_df['Health_Life_Expectancy'] == 0)
Lesotho_Health_Life_Expectancy = combined_df.loc[lesotho_rows & ~lesotho_zero_rows, 'Health_Life_Expectancy'].mean()

# Replace the zero value in 'Health_Life_Expectancy' for Lesotho with the calculated mean
combined_df.loc[lesotho_zero_rows, 'Health_Life_Expectancy'] = Lesotho_Health_Life_Expectancy

In [56]:
# Re-display Lesotho's rows to confirm the zero was replaced
combined_df.loc[combined_df['Country'].eq('Lesotho')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
96,Lesotho,97,4.898,0.37545,1.04103,0.07612,0.31767,0.16388,0.12504,2015
453,Lesotho,139,3.808,0.521021,1.190095,0.107707,0.390661,0.157497,0.119095,2017
610,Lesotho,141,3.808,0.472,1.215,0.079,0.423,0.116,0.112,2018
769,Lesotho,144,3.802,0.489,1.169,0.168,0.359,0.107,0.093,2019


### Angola

In [57]:
# Inspect every yearly record for Angola before imputing
combined_df.loc[combined_df['Country'].eq('Angola')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
136,Angola,137,4.033,0.75778,0.8604,0.16683,0.10384,0.12344,0.07122,2015
298,Angola,141,3.866,0.84731,0.66366,0.04991,0.00589,0.12071,0.08434,2016
454,Angola,140,3.795,0.858428,1.104412,0.049869,0.0,0.097926,0.06972,2017
611,Angola,142,3.795,0.73,1.125,0.269,0.0,0.079,0.061,2018


In [58]:
# Calculate the mean for 'Freedom_Life_Choices' for Angola from its non-zero years.
# The mean is derived from the DataFrame instead of hand-typed literals, so it uses
# the stored values (the table display may show them rounded) and stays correct
# if the underlying data changes.
angola_rows = combined_df['Country'] == 'Angola'
angola_zero_rows = angola_rows & (combined_df['Freedom_Life_Choices'] == 0)
Angola_Freedom_Life_Choices = combined_df.loc[angola_rows & ~angola_zero_rows, 'Freedom_Life_Choices'].mean()

# Replace the zero values in 'Freedom_Life_Choices' for Angola with the calculated mean
combined_df.loc[angola_zero_rows, 'Freedom_Life_Choices'] = Angola_Freedom_Life_Choices

In [59]:
# Re-display Angola's rows to confirm the zeros were replaced
combined_df.loc[combined_df['Country'].eq('Angola')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
136,Angola,137,4.033,0.75778,0.8604,0.16683,0.10384,0.12344,0.07122,2015
298,Angola,141,3.866,0.84731,0.66366,0.04991,0.00589,0.12071,0.08434,2016
454,Angola,140,3.795,0.858428,1.104412,0.049869,0.054865,0.097926,0.06972,2017
611,Angola,142,3.795,0.73,1.125,0.269,0.054865,0.079,0.061,2018


### Moldova

In [60]:
# Inspect every yearly record for Moldova before imputing
combined_df.loc[combined_df['Country'].eq('Moldova')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
51,Moldova,52,5.889,0.59448,1.01528,0.61826,0.32818,0.20951,0.01615,2015
212,Moldova,55,5.897,0.69177,0.83132,0.52309,0.25202,0.19997,0.01903,2016
370,Moldova,56,5.838,0.728871,1.251826,0.589465,0.240729,0.208779,0.010091,2017
536,Moldova,67,5.64,0.657,1.301,0.62,0.232,0.171,0.0,2018
696,Moldova,71,5.529,0.685,1.328,0.739,0.245,0.181,0.0,2019


In [61]:
# Calculate the mean for 'Perceptions_of_Corruption' for Moldova from its non-zero years.
# The mean is derived from the DataFrame instead of hand-typed literals, so it uses
# the stored values (the table display may show them rounded) and stays correct
# if the underlying data changes.
moldova_rows = combined_df['Country'] == 'Moldova'
moldova_zero_rows = moldova_rows & (combined_df['Perceptions_of_Corruption'] == 0)
Moldova_Perceptions_of_Corruption = combined_df.loc[moldova_rows & ~moldova_zero_rows, 'Perceptions_of_Corruption'].mean()

# Replace the zero values in 'Perceptions_of_Corruption' for Moldova with the calculated mean
combined_df.loc[moldova_zero_rows, 'Perceptions_of_Corruption'] = Moldova_Perceptions_of_Corruption

In [62]:
# Re-display Moldova's rows to confirm the zeros were replaced
combined_df.loc[combined_df['Country'].eq('Moldova')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
51,Moldova,52,5.889,0.59448,1.01528,0.61826,0.32818,0.20951,0.01615,2015
212,Moldova,55,5.897,0.69177,0.83132,0.52309,0.25202,0.19997,0.01903,2016
370,Moldova,56,5.838,0.728871,1.251826,0.589465,0.240729,0.208779,0.010091,2017
536,Moldova,67,5.64,0.657,1.301,0.62,0.232,0.171,0.01509,2018
696,Moldova,71,5.529,0.685,1.328,0.739,0.245,0.181,0.01509,2019


### Swaziland

In [63]:
# Inspect every yearly record for Swaziland before imputing
combined_df.loc[combined_df['Country'].eq('Swaziland')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
100,Swaziland,101,4.867,0.71206,1.07284,0.07566,0.30658,0.18259,0.0306,2015
760,Swaziland,135,4.212,0.811,1.149,0.0,0.313,0.074,0.135,2019


In [64]:
# Calculate the mean for 'Health_Life_Expectancy' for Swaziland from its non-zero years.
# The mean is derived from the DataFrame instead of a hand-typed literal, so it uses
# the stored value (the table display may show it rounded) and stays correct
# if the underlying data changes.
swaziland_rows = combined_df['Country'] == 'Swaziland'
swaziland_zero_rows = swaziland_rows & (combined_df['Health_Life_Expectancy'] == 0)
Swaziland_Health_Life_Expectancy = combined_df.loc[swaziland_rows & ~swaziland_zero_rows, 'Health_Life_Expectancy'].mean()

# Replace the zero value in 'Health_Life_Expectancy' for Swaziland with the calculated mean
combined_df.loc[swaziland_zero_rows, 'Health_Life_Expectancy'] = Swaziland_Health_Life_Expectancy

In [65]:
# Re-display Swaziland's rows to confirm the zero was replaced
combined_df.loc[combined_df['Country'].eq('Swaziland')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
100,Swaziland,101,4.867,0.71206,1.07284,0.07566,0.30658,0.18259,0.0306,2015
760,Swaziland,135,4.212,0.811,1.149,0.07566,0.313,0.074,0.135,2019


### Afghanistan

In [66]:
# Inspect every yearly record for Afghanistan before imputing
combined_df.loc[combined_df['Country'].eq('Afghanistan')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
152,Afghanistan,153,3.575,0.31982,0.30285,0.30335,0.23414,0.3651,0.09719,2015
311,Afghanistan,154,3.36,0.38227,0.11037,0.17344,0.1643,0.31268,0.07112,2016
455,Afghanistan,141,3.794,0.401477,0.581543,0.180747,0.10618,0.311871,0.061158,2017
614,Afghanistan,145,3.632,0.332,0.537,0.255,0.085,0.191,0.036,2018
779,Afghanistan,154,3.203,0.35,0.517,0.361,0.0,0.158,0.025,2019


In [67]:
# Calculate the mean for 'Freedom_Life_Choices' for Afghanistan from its non-zero years.
# The mean is derived from the DataFrame instead of hand-typed literals, so it uses
# the stored values (the table display may show them rounded) and stays correct
# if the underlying data changes.
afghanistan_rows = combined_df['Country'] == 'Afghanistan'
afghanistan_zero_rows = afghanistan_rows & (combined_df['Freedom_Life_Choices'] == 0)
Afghanistan_Freedom_Life_Choices = combined_df.loc[afghanistan_rows & ~afghanistan_zero_rows, 'Freedom_Life_Choices'].mean()

# Replace the zero value in 'Freedom_Life_Choices' for Afghanistan with the calculated mean
combined_df.loc[afghanistan_zero_rows, 'Freedom_Life_Choices'] = Afghanistan_Freedom_Life_Choices

In [68]:
# Re-display Afghanistan's rows to confirm the zero was replaced
combined_df.loc[combined_df['Country'].eq('Afghanistan')]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
152,Afghanistan,153,3.575,0.31982,0.30285,0.30335,0.23414,0.3651,0.09719,2015
311,Afghanistan,154,3.36,0.38227,0.11037,0.17344,0.1643,0.31268,0.07112,2016
455,Afghanistan,141,3.794,0.401477,0.581543,0.180747,0.10618,0.311871,0.061158,2017
614,Afghanistan,145,3.632,0.332,0.537,0.255,0.085,0.191,0.036,2018
779,Afghanistan,154,3.203,0.35,0.517,0.361,0.147405,0.158,0.025,2019


# Confirming No more zero values

In [69]:
# Build a row mask that is True wherever at least one cell equals zero
has_zero_cell = combined_df.eq(0).any(axis='columns')
rows_with_any_zero_values = combined_df.loc[has_zero_cell]

# Show the matching rows (an empty frame means no zeros remain)
rows_with_any_zero_values

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year


# Confirming No duplicates

In [70]:
# Flag rows that exactly repeat an earlier row of 'combined_df', then keep only those
is_repeated_row = combined_df.duplicated()
combined_df_duplicates = combined_df.loc[is_repeated_row]
combined_df_duplicates.head(n=20)

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year


# Confirming No Missing Values

In [71]:
# Count the null entries in each column of the DataFrame
missing_values = combined_df.isna().sum()

# Report the result; per-column counts are printed only when something is missing
if not missing_values.any():
    print("No missing values found in the DataFrame.")
else:
    print("Missing values found in the DataFrame:")
    print(missing_values)

No missing values found in the DataFrame.


In [72]:
# Preview the first five rows of the cleaned DataFrame
combined_df.head(n=5)

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2015


In [73]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly)
combined_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0
mean,78.69821,5.379018,0.916289,1.084664,0.613125,0.411822,0.219867,0.125749,2016.993606
std,45.182384,1.127456,0.406811,0.31892,0.246714,0.151334,0.121064,0.105727,1.417364
min,1.0,2.693,0.0153,0.10419,0.005565,0.00589,0.00199,0.001,2015.0
25%,40.0,4.50975,0.6065,0.871393,0.440183,0.309768,0.134,0.05425,2016.0
50%,79.0,5.322,0.982205,1.125,0.64731,0.431,0.202,0.091033,2017.0
75%,118.0,6.1895,1.236187,1.32725,0.808,0.531,0.278832,0.156243,2018.0
max,158.0,7.769,2.096,1.644,1.141,0.724,0.838075,0.55191,2019.0


In [74]:
# Show the data type pandas holds for each column
combined_df.dtypes

Country                       object
Happiness_Rank                 int64
Happiness_Score              float64
GDP_Per_Capita               float64
Social_Support               float64
Health_Life_Expectancy       float64
Freedom_Life_Choices         float64
Generosity                   float64
Perceptions_of_Corruption    float64
Year                           int64
dtype: object

# Data Dictionary to create Region column

In [75]:
# Collect each distinct country name once, in order of first appearance
unique_countries = pd.unique(combined_df['Country'])
print(unique_countries)

['Switzerland' 'Iceland' 'Denmark' 'Norway' 'Canada' 'Finland'
 'Netherlands' 'Sweden' 'New Zealand' 'Australia' 'Israel' 'Costa Rica'
 'Austria' 'Mexico' 'United States' 'Brazil' 'Luxembourg' 'Ireland'
 'Belgium' 'United Arab Emirates' 'United Kingdom' 'Oman' 'Venezuela'
 'Singapore' 'Panama' 'Germany' 'Chile' 'Qatar' 'France' 'Argentina'
 'Czech Republic' 'Uruguay' 'Colombia' 'Thailand' 'Saudi Arabia' 'Spain'
 'Malta' 'Taiwan' 'Kuwait' 'Suriname' 'Trinidad and Tobago' 'El Salvador'
 'Guatemala' 'Uzbekistan' 'Slovakia' 'Japan' 'South Korea' 'Ecuador'
 'Bahrain' 'Italy' 'Bolivia' 'Moldova' 'Paraguay' 'Kazakhstan' 'Slovenia'
 'Lithuania' 'Nicaragua' 'Peru' 'Belarus' 'Poland' 'Malaysia' 'Croatia'
 'Libya' 'Russia' 'Jamaica' 'North Cyprus' 'Cyprus' 'Algeria' 'Kosovo'
 'Turkmenistan' 'Mauritius' 'Hong Kong' 'Estonia' 'Indonesia' 'Vietnam'
 'Turkey' 'Kyrgyzstan' 'Nigeria' 'Bhutan' 'Azerbaijan' 'Pakistan' 'Jordan'
 'Montenegro' 'China' 'Zambia' 'Romania' 'Serbia' 'Portugal' 'Latvia'
 'Philip

In [76]:
# Data dictionary: lookup table from country name to its world region.
# Keys are the country names as they appear in the 'Country' column.

country_to_region = {
    'Switzerland': 'Western Europe',
    'Iceland': 'Western Europe',
    'Denmark': 'Western Europe',
    'Norway': 'Western Europe',
    'Canada': 'North America',
    'Finland': 'Western Europe',
    'Netherlands': 'Western Europe',
    'Sweden': 'Western Europe',
    'New Zealand': 'Australia and New Zealand',
    'Australia': 'Australia and New Zealand',
    'Israel': 'Middle East and Northern Africa',
    'Costa Rica': 'Latin America and Caribbean',
    'Austria': 'Western Europe',
    'Mexico': 'Latin America and Caribbean',
    'United States': 'North America',
    'Brazil': 'Latin America and Caribbean',
    'Luxembourg': 'Western Europe',
    'Ireland': 'Western Europe',
    'Belgium': 'Western Europe',
    'United Arab Emirates': 'Middle East and Northern Africa',
    'United Kingdom': 'Western Europe',
    'Oman': 'Middle East and Northern Africa',
    'Venezuela': 'Latin America and Caribbean',
    'Singapore': 'Southeastern Asia',
    'Panama': 'Latin America and Caribbean',
    'Germany': 'Western Europe',
    'Chile': 'Latin America and Caribbean',
    'Qatar': 'Middle East and Northern Africa',
    'France': 'Western Europe',
    'Argentina': 'Latin America and Caribbean',
    'Czech Republic': 'Central and Eastern Europe',
    'Uruguay': 'Latin America and Caribbean',
    'Colombia': 'Latin America and Caribbean',
    'Thailand': 'Southeastern Asia',
    'Saudi Arabia': 'Middle East and Northern Africa',
    'Spain': 'Western Europe',
    'Malta': 'Western Europe',
    'Taiwan': 'Eastern Asia',
    'Kuwait': 'Middle East and Northern Africa',
    'Suriname': 'Latin America and Caribbean',
    'Trinidad and Tobago': 'Latin America and Caribbean',
    'El Salvador': 'Latin America and Caribbean',
    'Guatemala': 'Latin America and Caribbean',
    'Uzbekistan': 'Central and Eastern Europe',
    'Slovakia': 'Central and Eastern Europe',
    'Japan': 'Eastern Asia',
    'South Korea': 'Eastern Asia',
    'Ecuador': 'Latin America and Caribbean',
    'Bahrain': 'Middle East and Northern Africa',
    'Italy': 'Western Europe',
    'Bolivia': 'Latin America and Caribbean',
    'Moldova': 'Central and Eastern Europe',
    'Paraguay': 'Latin America and Caribbean',
    'Kazakhstan': 'Central and Eastern Europe',
    'Slovenia': 'Central and Eastern Europe',
    'Lithuania': 'Central and Eastern Europe',
    'Nicaragua': 'Latin America and Caribbean',
    'Peru': 'Latin America and Caribbean',
    'Belarus': 'Central and Eastern Europe',
    'Poland': 'Central and Eastern Europe',
    'Malaysia': 'Southeastern Asia',
    'Croatia': 'Central and Eastern Europe',
    'Libya': 'Middle East and Northern Africa',
    'Russia': 'Central and Eastern Europe',
    'Jamaica': 'Latin America and Caribbean',
    'North Cyprus': 'Western Europe',
    'Cyprus': 'Western Europe',
    'Algeria': 'Middle East and Northern Africa',
    'Kosovo': 'Central and Eastern Europe',
    'Turkmenistan': 'Central and Eastern Europe',
    'Mauritius': 'Sub-Saharan Africa',
    'Hong Kong': 'Eastern Asia',
    'Estonia': 'Central and Eastern Europe',
    'Indonesia': 'Southeastern Asia',
    'Vietnam': 'Southeastern Asia',
    'Turkey': 'Middle East and Northern Africa',
    'Kyrgyzstan': 'Central and Eastern Europe',
    'Nigeria': 'Sub-Saharan Africa',
    'Bhutan': 'Southern Asia',
    'Azerbaijan': 'Central and Eastern Europe',
    'Pakistan': 'Southern Asia',
    'Jordan': 'Middle East and Northern Africa',
    'Montenegro': 'Central and Eastern Europe',
    'China': 'Eastern Asia',
    'Zambia': 'Sub-Saharan Africa',
    'Romania': 'Central and Eastern Europe',
    'Serbia': 'Central and Eastern Europe',
    'Portugal': 'Western Europe',
    'Latvia': 'Central and Eastern Europe',
    'Philippines': 'Southeastern Asia',
    'Somaliland Region': 'Sub-Saharan Africa',
    'Morocco': 'Middle East and Northern Africa',
    'Macedonia': 'Central and Eastern Europe',
    'Mozambique': 'Sub-Saharan Africa',
    'Albania': 'Central and Eastern Europe',
    'Bosnia and Herzegovina': 'Central and Eastern Europe',
    'Lesotho': 'Sub-Saharan Africa',
    'Dominican Republic': 'Latin America and Caribbean',
    'Laos': 'Southeastern Asia',
    'Mongolia': 'Eastern Asia',
    'Swaziland': 'Sub-Saharan Africa',
    'Greece': 'Western Europe',
    'Lebanon': 'Middle East and Northern Africa',
    'Hungary': 'Central and Eastern Europe',
    'Honduras': 'Latin America and Caribbean',
    'Tajikistan': 'Central and Eastern Europe',
    'Tunisia': 'Middle East and Northern Africa',
    'Palestinian Territories': 'Middle East and Northern Africa',
    'Bangladesh': 'Southern Asia',
    'Iran': 'Middle East and Northern Africa',
    'Ukraine': 'Central and Eastern Europe',
    'Iraq': 'Middle East and Northern Africa',
    'South Africa': 'Sub-Saharan Africa',
    'Ghana': 'Sub-Saharan Africa',
    'Zimbabwe': 'Sub-Saharan Africa',
    'Liberia': 'Sub-Saharan Africa',
    'India': 'Southern Asia',
    'Sudan': 'Sub-Saharan Africa',
    'Haiti': 'Latin America and Caribbean',
    'Congo (Kinshasa)': 'Sub-Saharan Africa',
    'Nepal': 'Southern Asia',
    'Ethiopia': 'Sub-Saharan Africa',
    'Sierra Leone': 'Sub-Saharan Africa',
    'Mauritania': 'Sub-Saharan Africa',
    'Kenya': 'Sub-Saharan Africa',
    'Djibouti': 'Sub-Saharan Africa',
    'Armenia': 'Central and Eastern Europe',
    'Botswana': 'Sub-Saharan Africa',
    'Myanmar': 'Southeastern Asia',
    'Georgia': 'Central and Eastern Europe',
    'Malawi': 'Sub-Saharan Africa',
    'Sri Lanka': 'Southern Asia',
    'Cameroon': 'Sub-Saharan Africa',
    'Bulgaria': 'Central and Eastern Europe',
    'Egypt': 'Middle East and Northern Africa',
    'Yemen': 'Middle East and Northern Africa',
    'Angola': 'Sub-Saharan Africa',
    'Mali': 'Sub-Saharan Africa',
    'Congo (Brazzaville)': 'Sub-Saharan Africa',
    'Comoros': 'Sub-Saharan Africa',
    'Uganda': 'Sub-Saharan Africa',
    'Senegal': 'Sub-Saharan Africa',
    'Gabon': 'Sub-Saharan Africa',
    'Niger': 'Sub-Saharan Africa',
    'Cambodia': 'Southeastern Asia',
    'Tanzania': 'Sub-Saharan Africa',
    'Madagascar': 'Sub-Saharan Africa',
    'Central African Republic': 'Sub-Saharan Africa',
    'Chad': 'Sub-Saharan Africa',
    'Guinea': 'Sub-Saharan Africa',
    'Ivory Coast': 'Sub-Saharan Africa',
    'Burkina Faso': 'Sub-Saharan Africa',
    'Afghanistan': 'Southern Asia',
    'Rwanda': 'Sub-Saharan Africa',
    'Benin': 'Sub-Saharan Africa',
    'Syria': 'Middle East and Northern Africa',
    'Burundi': 'Sub-Saharan Africa',
    'Togo': 'Sub-Saharan Africa',
    'Puerto Rico': 'Latin America and Caribbean',
    'Belize': 'Latin America and Caribbean',
    'Somalia': 'Sub-Saharan Africa',
    'Namibia': 'Sub-Saharan Africa',
    'South Sudan': 'Sub-Saharan Africa',
    'North Macedonia': 'Central and Eastern Europe',
    'Gambia': 'Sub-Saharan Africa',
}

# Creation of Region Column

In [77]:
# Create Region Column
# Series.map leaves NaN for any country that is not a key in country_to_region,
# so check for unmapped names first and fail loudly instead of exporting blanks.
region_by_row = combined_df['Country'].map(country_to_region)
unmapped_countries = combined_df.loc[region_by_row.isna(), 'Country'].unique().tolist()
if unmapped_countries:
    raise ValueError(f"No region defined in country_to_region for: {unmapped_countries}")

combined_df['Region'] = region_by_row

In [78]:
# Preview the first five rows, now including the 'Region' column
combined_df.head(n=5)

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year,Region
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2015,Western Europe
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2015,Western Europe
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2015,Western Europe
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2015,Western Europe
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2015,North America


In [79]:
# Number of non-null entries in each column
combined_df.notna().sum()

Country                      782
Happiness_Rank               782
Happiness_Score              782
GDP_Per_Capita               782
Social_Support               782
Health_Life_Expectancy       782
Freedom_Life_Choices         782
Generosity                   782
Perceptions_of_Corruption    782
Year                         782
Region                       782
dtype: int64

In [80]:
# Show the data type of each column, including the newly added 'Region' column
combined_df.dtypes

Country                       object
Happiness_Rank                 int64
Happiness_Score              float64
GDP_Per_Capita               float64
Social_Support               float64
Health_Life_Expectancy       float64
Freedom_Life_Choices         float64
Generosity                   float64
Perceptions_of_Corruption    float64
Year                           int64
Region                        object
dtype: object

In [81]:
# Final look at the first five rows before exporting
combined_df.head(n=5)

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,GDP_Per_Capita,Social_Support,Health_Life_Expectancy,Freedom_Life_Choices,Generosity,Perceptions_of_Corruption,Year,Region
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2015,Western Europe
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2015,Western Europe
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2015,Western Europe
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2015,Western Europe
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2015,North America


# Exporting Cleaned & Wrangled CSV

In [82]:
# Define the directory path
directory = r'C:\Users\natha\OneDrive\Desktop\Data Analytics\Jupyter\12-2023 World Happiness\02 Data\Prepared Data'

# Create the output folder if it does not exist yet; to_csv does not create
# missing parent directories and would otherwise fail on a first run
os.makedirs(directory, exist_ok=True)

# Set the file path for the CSV file
file_path = os.path.join(directory, 'happiness_data_cleaned.csv')

# Save the DataFrame to a CSV file without the index
combined_df.to_csv(file_path, index=False)