## This Notebook includes:
#### 1. Importing libraries and data
#### 2. Consistency checks
#### 3. Data Wrangling
#### 4. Exporting csv

### 1. Importing libraries and data

In [2]:
## importing libraries
import pandas as pd
import numpy as np
import os

In [3]:
## create directory path
# The project data folder can be overridden with the HAPPINESS_DATA_DIR
# environment variable so the notebook also runs on other machines; when the
# variable is not set, the original local folder is used unchanged.
path = os.environ.get('HAPPINESS_DATA_DIR', r'C:\Users\luke_\Documents\FinalProject\02_data')

In [4]:
# Load the raw happiness dataset from the original_data folder
source_file = os.path.join(path, 'original_data', 'happiness.csv')
df = pd.read_csv(source_file)

### 2. Consistency Checks for the DataFrame

In [5]:
## Preview the first five rows to confirm the import was successful
df.head(n=5)

Unnamed: 0,Year,Rank,Country name,Region,Happiness score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,2024,1,Finland,Europe and Central Asia,7.741,1.844,1.572,0.695,0.859,0.142,0.546
1,2024,2,Denmark,Europe and Central Asia,7.583,1.908,1.52,0.699,0.823,0.204,0.548
2,2024,3,Iceland,Europe and Central Asia,7.525,1.881,1.617,0.718,0.819,0.258,0.182
3,2024,4,Sweden,Europe and Central Asia,7.344,1.878,1.501,0.724,0.838,0.221,0.524
4,2024,5,Israel,Middle East and North Africa,7.341,1.803,1.513,0.74,0.641,0.153,0.193


In [6]:
# Check the dimensions of the dataframe as (rows, columns)
df.shape

(1506, 11)

In [7]:
# Let's get an overview of the data: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1506 entries, 0 to 1505
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Year                          1506 non-null   int64  
 1   Rank                          1506 non-null   int64  
 2   Country name                  1506 non-null   object 
 3   Region                        1506 non-null   object 
 4   Happiness score               1506 non-null   float64
 5   GDP per capita                1506 non-null   float64
 6   Social support                1506 non-null   float64
 7    Healthy life expectancy      1506 non-null   float64
 8   Freedom to make life choices  1506 non-null   float64
 9   Generosity                    1506 non-null   float64
 10  Perceptions of corruption     1506 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 129.6+ KB


In [8]:
# Let's look at the basic summary statistics (count, mean, std, min, quartiles, max)
# for all numerical columns
df.describe()

Unnamed: 0,Year,Rank,Happiness score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0,1506.0
mean,2019.374502,76.112882,5.44979,1.052746,1.05343,0.578175,0.466582,0.170902,0.155222
std,2.852379,43.896865,1.124127,0.463059,0.332298,0.239459,0.164827,0.105981,0.125084
min,2015.0,1.0,1.721,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,38.0,4.59775,0.74025,0.841,0.40125,0.367,0.092,0.064
50%,2019.0,76.0,5.473,1.0705,1.0925,0.602,0.4825,0.1515,0.1185
75%,2022.0,114.0,6.27125,1.374,1.318,0.76075,0.585,0.229,0.21
max,2024.0,158.0,7.842,2.209,1.644,1.141,0.863,0.838,0.82


In [9]:
# Count the missing (NaN/None) entries in every column of the dataframe
missing_values = df.isna().sum()
print(missing_values)
# Every count is zero, so there are no missing values.
# NOTE(review): this only detects NaN/None. The describe() output shows a minimum
# of 0.0 for several indicator columns, which could be placeholders for missing
# data - TODO confirm against the data source.

Year                            0
Rank                            0
Country name                    0
Region                          0
Happiness score                 0
GDP per capita                  0
Social support                  0
 Healthy life expectancy        0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64


In [10]:
# Collect rows that are exact copies of an earlier row (all columns compared)
is_repeated_row = df.duplicated()
duplicate_rows = df.loc[is_repeated_row]
print(duplicate_rows)
# An empty dataframe means there are no duplicates

Empty DataFrame
Columns: [Year, Rank, Country name, Region, Happiness score, GDP per capita, Social support,  Healthy life expectancy, Freedom to make life choices, Generosity, Perceptions of corruption]
Index: []


In [11]:
# Check the data type of each column in the DataFrame
data_types = df.dtypes
print(data_types)
# This checks out and is appropriate for the analysis

Year                              int64
Rank                              int64
Country name                     object
Region                           object
Happiness score                 float64
GDP per capita                  float64
Social support                  float64
 Healthy life expectancy        float64
Freedom to make life choices    float64
Generosity                      float64
Perceptions of corruption       float64
dtype: object


### 3. Data Wrangling

In [12]:
# I will change the column names to make them consistent.
# Printing the raw labels first shows their exact spelling, including stray
# whitespace (note the leading space in ' Healthy life expectancy').
print(df.columns)

Index(['Year', 'Rank', 'Country name', 'Region', 'Happiness score',
       'GDP per capita', 'Social support', ' Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')


In [13]:
# Renaming did not work for some of the columns, so strip leading/trailing
# whitespace from every column label first (e.g. ' Healthy life expectancy'
# starts with a space, so it would not match a key without that space).
df.columns = df.columns.str.strip()

In [14]:
# Map the original column headers to the snake_case names used from here on.
# NOTE(review): 'Region' has no entry and therefore keeps its capitalised name,
# and 'percieved_corruption' is misspelled. Both are left unchanged here in case
# later notebooks already rely on the exported column names - confirm before
# changing them.
column_renames = {
    'Year': 'year',
    'Rank': 'rank',
    'Country name': 'country_name',
    'Happiness score': 'happiness_score',
    'GDP per capita': 'gdp/capita',
    'Social support': 'social_support',
    'Healthy life expectancy': 'life_expectancy',
    'Freedom to make life choices': 'freedom_of_choice',
    'Generosity': 'generosity',
    'Perceptions of corruption': 'percieved_corruption',
}
df = df.rename(columns=column_renames)

# rename() silently skips keys that do not match an existing column (for example
# a header with stray whitespace), so verify that every target name is present.
# The check looks at the result rather than the inputs, so re-running this cell
# on an already renamed dataframe still passes.
missing_columns = [new_name for new_name in column_renames.values() if new_name not in df.columns]
assert not missing_columns, f'Columns were not renamed as expected: {missing_columns}'

In [15]:
# All good now! Preview the first rows to confirm the new column names.
df.head()

Unnamed: 0,year,rank,country_name,Region,happiness_score,gdp/capita,social_support,life_expectancy,freedom_of_choice,generosity,percieved_corruption
0,2024,1,Finland,Europe and Central Asia,7.741,1.844,1.572,0.695,0.859,0.142,0.546
1,2024,2,Denmark,Europe and Central Asia,7.583,1.908,1.52,0.699,0.823,0.204,0.548
2,2024,3,Iceland,Europe and Central Asia,7.525,1.881,1.617,0.718,0.819,0.258,0.182
3,2024,4,Sweden,Europe and Central Asia,7.344,1.878,1.501,0.724,0.838,0.221,0.524
4,2024,5,Israel,Middle East and North Africa,7.341,1.803,1.513,0.74,0.641,0.153,0.193


### 4. Exporting csv

In [16]:
# Write the checked dataframe to the prepared_data folder, without the index column
output_dir = os.path.join(path, 'prepared_data')
# Create the folder if it does not exist yet, so the export does not fail on a
# machine where only the original data has been set up.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, 'happiness_checked.csv'), index=False)