# EDA of World Happiness Report 2019

### Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

### Loading the Data

In [2]:
# Read the 2019 World Happiness Report CSV; the relative path means the file
# is looked up in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="2019.csv")
# Last expression of the cell: display the (rows, columns) tuple.
df.shape

(156, 9)

### Preprocessing the Data

In [3]:
# Preview the first rows of the table to check the column names and value formats.
df.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Overall rank,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,156.0,156.0,156.0,156.0,156.0,156.0,156.0,156.0
mean,78.5,5.407096,0.905147,1.208814,0.725244,0.392571,0.184846,0.110603
std,45.177428,1.11312,0.398389,0.299191,0.242124,0.143289,0.095254,0.094538
min,1.0,2.853,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.75,4.5445,0.60275,1.05575,0.54775,0.308,0.10875,0.047
50%,78.5,5.3795,0.96,1.2715,0.789,0.417,0.1775,0.0855
75%,117.25,6.1845,1.2325,1.4525,0.88175,0.50725,0.24825,0.14125
max,156.0,7.769,1.684,1.624,1.141,0.631,0.566,0.453


In [6]:
# Count the missing values in every column (isna is the same check as isnull).
df.isna().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [7]:
# Count the rows that are exact repeats of an earlier row.
df.duplicated().sum()

np.int64(0)

### Exploratory Data Analysis (EDA)
Finding the relationship between the target column (Score) and each of the other feature columns

In [8]:
# Feature columns that are compared against the target column "Score".
columns = [
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

In [9]:
# For every feature column: print its correlation with Score (pandas' default
# Series.corr method) and show a scatter plot of Score against that feature.
score_series = df["Score"]

for column in columns:
    print("Score vs.", column)

    corr = score_series.corr(df[column])
    print("Correlation: ", corr)

    plot_title = f"Score vs. {column}"
    fig = px.scatter(df, x=column, y="Score", title=plot_title)
    fig.show()

Score vs. GDP per capita
Correlation:  0.7938828678781276


Score vs. Social support
Correlation:  0.7770577880638643


Score vs. Healthy life expectancy
Correlation:  0.7798831492425828


Score vs. Freedom to make life choices
Correlation:  0.5667418257199902


Score vs. Generosity
Correlation:  0.07582369490389652


Score vs. Perceptions of corruption
Correlation:  0.38561307086647867


### Insights

1. There is a strong positive correlation (r ≈ 0.78–0.79) between Score and:
	* GDP per Capita
	* Social Support
	* Healthy Life Expectancy
2. There is a moderate positive correlation (r ≈ 0.57) between Score and:
	* Freedom to make Life Choices
3. There is a weak-to-moderate positive correlation (r ≈ 0.39) between Score and:
	* Perceptions of Corruption
4. There is a very weak positive correlation (r ≈ 0.08), i.e. almost no linear relationship, between Score and:
	* Generosity

### Exploratory Data Analysis (EDA) - Continued

After finding the relationships between the columns, we explore the countries and regions that have the highest and the lowest Scores.

In [10]:
# Order the table from the highest Score to the lowest, then slice off the
# first ten rows (highest scores) and the last ten rows (lowest scores).
df = df.sort_values(by="Score", ascending=False)
highest = df.iloc[:10]
lowest = df.iloc[-10:]

In [19]:
# Plot the Score of the ten highest- and ten lowest-scoring countries or
# regions side by side (left panel: highest, right panel: lowest).
fig = make_subplots(rows=1, cols=2, subplot_titles=("Highest Scores", "Lowest Scores"))

# One (frame, legend label) pair per subplot column. Naming each trace keeps
# the legend from falling back to plotly's generic "trace 0" / "trace 1".
panels = ((highest, "Highest Scores"), (lowest, "Lowest Scores"))
for panel_col, (subset, label) in enumerate(panels, start=1):
    fig.add_trace(
        go.Scatter(
            x=subset["Country or region"],
            y=subset["Score"],
            mode='markers+lines',
            name=label,
        ),
        row=1, col=panel_col,
    )

# Label the axes of both panels so the figure can be read on its own.
fig.update_xaxes(title_text="Country or region")
fig.update_yaxes(title_text="Score")

fig.update_layout(height=600, width=1100, title_text="Highest and Lowest Scores of Countries")
fig.show()

In [20]:
# For each feature column, draw one figure comparing that feature across the
# ten highest-scoring (left panel) and ten lowest-scoring (right panel)
# countries or regions.

# One (frame, legend label) pair per subplot column. Naming each trace keeps
# the legend from falling back to plotly's generic "trace 0" / "trace 1".
panels = ((highest, "Highest Scores"), (lowest, "Lowest Scores"))

for column in columns:
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Highest Scores", "Lowest Scores"))

    for panel_col, (subset, label) in enumerate(panels, start=1):
        fig.add_trace(
            go.Scatter(
                x=subset["Country or region"],
                y=subset[column],
                mode='markers+lines',
                name=label,
            ),
            row=1, col=panel_col,
        )

    # Label the axes of both panels; the y-axis carries the feature being plotted.
    fig.update_xaxes(title_text="Country or region")
    fig.update_yaxes(title_text=column)

    fig.update_layout(height=600, width=1100, title_text=f"{column} for Highest and Lowest Scored Countries")
    fig.show()

### Summary

We first took an overview of the dataset and found that it contains no null or duplicate values.

Then, we explored the relationship between the target column Score and the other columns, and visualized these relationships.

Finally, we compared the highest- and lowest-scoring countries or regions across the other columns using visualizations.