<a href="https://colab.research.google.com/github/mizzony/ML-class/blob/main/World_Happiness_Report_2022_Clustering%2C_GIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the 'ajaypalsinghlo/world-happiness-report-2022' dataset and keep
# the value kagglehub returns.
# NOTE(review): presumably this is the local directory holding the downloaded files — confirm.
ajaypalsinghlo_world_happiness_report_2022_path = kagglehub.dataset_download('ajaypalsinghlo/world-happiness-report-2022')

print('Data source import complete.')


# Import Libraries and Load Data

# World Happiness Report 2022 - Clustering
The World Happiness Report is a landmark survey of the state of global happiness. The reports review the state of happiness in the world today and show how the new science of happiness explains personal and national variations in happiness.

For more information about the dataset use the following Kaggle link:<br>
https://www.kaggle.com/datasets/ajaypalsinghlo/world-happiness-report-2022

![happy.jpg](attachment:62a8caa4-85f7-4ccd-9c89-858acb5de86e.jpg)<br>
**"Happiness is not a goal...it's a by-product of a life well-lived."** - Eleanor Roosevelt

# Import Libraries and Load Data

In [None]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import os
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# load data
# Read from the directory kagglehub returned in the download cell, since its
# location depends on where the notebook runs. If that cell was not run (or was
# deleted), fall back to the default Kaggle input directory.
data_dir = globals().get(
    "ajaypalsinghlo_world_happiness_report_2022_path",
    "/kaggle/input/world-happiness-report-2022",
)
df = pd.read_csv(os.path.join(data_dir, "World Happiness Report 2022.csv"))

# EDA - Exploratory Data Analysis

In [None]:
# Preview the top of the table to check the column names and value formats
df.head() # first 5 entries

In [None]:
# Preview the bottom of the table as well
df.tail()

In [None]:
# Row count, column names, non-null counts and dtypes in one summary
df.info()  # info about the samples, features and datatypes

In [None]:
# Count the missing values in each column
df.isnull().sum() # checking missing values

We have 146 samples, 12 features and no missing values. Let's look at the summary statistics and the correlations between the features.

In [None]:
# Summary statistics of the columns
df.describe() # statistical info of the columns

In [None]:
# Pairwise correlation between the numeric columns only. 'Country' holds
# strings, and pandas >= 2.0 raises on such columns instead of silently
# dropping them, so they are filtered out explicitly.
df.select_dtypes(include="number").corr()

In [None]:
# Correlation of the numeric features shown in a heatmap.
# Text columns such as 'Country' are excluded first, because pandas >= 2.0
# raises when corr() meets a non-numeric column.
plt.figure(figsize=(15,15))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True);

In [None]:
# Names and number of countries
countries = df["Country"]
print("Country Names:\n", countries.unique())
print("\nNumber of Countrys:", countries.nunique())

In [None]:
country_name = 'Austria'  # Change the name to look up another country's rank
rank_matches = df.loc[df['Country'] == country_name, 'RANK']
# An unknown or misspelled name matches no rows; report that instead of
# failing with an IndexError on the empty selection.
country_rank = rank_matches.iloc[0] if not rank_matches.empty else None
if country_rank is None:
    print(f"Country not found: {country_name!r}")
else:
    print(f"{country_name}'s Rank: {country_rank}")

## Data Visualization

In [None]:
# show all numeric columns as boxplots except for RANK
# The columns are selected by dtype and by name rather than by position, so
# the selection matches the stated intent even if the column order changes.
numeric_features = df.select_dtypes(include="number").drop(columns="RANK")
plt.figure(figsize=(20,15))
sns.boxplot(data=numeric_features)
plt.xticks(rotation=90);

In [None]:
# Lay the histograms out on a grid with 4 columns and as many rows as needed
n_cols = 4
num_rows = -(-len(df.columns) // n_cols)  # ceiling division: enough rows for every column
# squeeze=False keeps `axes` two-dimensional even when a single row is enough
fig, axes = plt.subplots(nrows=num_rows, ncols=n_cols, figsize=(20,15), squeeze=False)
grid_axes = axes.ravel()

# Create a histplot for each column in the DataFrame
for hist_ax, col in zip(grid_axes, df.columns):
    sns.histplot(data=df, x=col, ax=hist_ax)

# Hide the grid cells left over when the column count is not a multiple of 4
for unused_ax in grid_axes[len(df.columns):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

# Elbow Method and KMeans for Clustering

In [None]:
# Features for clustering: every column after the first two.
# 'cluster' is dropped when present, so that re-running this cell after the
# cluster labels have been added to df does not feed them back in as a feature.
x = df.iloc[:, 2:].drop(columns="cluster", errors="ignore")
x.head() # show features

## Elbow Method

In [None]:
# Elbow Method
allscore = []     # silhouette score for each evaluated k
allclusters = []  # the evaluated values of k
distances = []    # KMeans inertia for each evaluated k


def clust(n, random_state=42):
    """Fit KMeans on `x` for n consecutive cluster counts and record the results.

    The cluster count k runs from 2 to n + 1 (n models in total); it starts at
    2 because the silhouette score needs at least two clusters. For every k
    the silhouette score is printed, and the score, k and the model's inertia
    are stored in the module-level lists `allscore`, `allclusters` and
    `distances`.

    Parameters
    ----------
    n : int
        Number of models to fit.
    random_state : int, default 42
        Seed passed to KMeans so repeated runs give the same results.
    """
    # Start from empty lists so that calling the function again (e.g. when the
    # cell is re-run) does not append duplicate entries to the elbow plot data.
    for results in (allscore, allclusters, distances):
        results.clear()

    for k in range(2, n + 2):
        # n_init is set explicitly because its default differs between
        # scikit-learn versions.
        model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        pred = model.fit_predict(x)
        score = silhouette_score(x, pred)
        print("Cluster", k, "score:", score)
        # add the score, cluster count and inertia to the lists
        allscore.append(score)
        allclusters.append(k)
        distances.append(model.inertia_)


clust(10) # fits 10 models, for k = 2 .. 11

In [None]:
# Elbow curve: the recorded distances against the number of clusters k
plt.plot(allclusters, distances)
plt.xlabel("k")
plt.ylabel("distances");

We can see that the curve bends (the "elbow") at k = 3, so we use 3 clusters.

## KMeans

In [None]:
# Use 3 clusters, then fit the model and predict the labels.
# A fixed random_state keeps the cluster assignments and the (otherwise
# arbitrary) label numbering identical across runs, so the plots and the map
# further down stay comparable. n_init is set explicitly because its default
# differs between scikit-learn versions.
cluster_labels = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(x)
df['cluster'] = cluster_labels # Add the cluster labels to the DataFrame
print("Silhouette Score: ", silhouette_score(x,cluster_labels)) # Print the Silhouette Score

In [None]:
# Bar chart of the cluster sizes, with each bar annotated with its count
ax = sns.countplot(data=df, x="cluster")
bars = ax.containers[0]
ax.bar_label(bars);

## Plotting end results in Scatter Diagrams.

In [None]:
# Happiness score (x) against the GDP-per-capita column (y), coloured by cluster
sns.scatterplot(
    data=df,
    x='Happiness score',
    y='Explained by: GDP per capita',
    hue='cluster',
    palette='mako',
);

In [None]:
# Happiness score (x) against the social-support column (y), coloured by cluster
sns.scatterplot(
    data=df,
    x='Happiness score',
    y='Explained by: Social support',
    hue='cluster',
    palette="rocket",
);

In [None]:
# Happiness score (x) against the healthy-life-expectancy column (y), coloured by cluster
sns.scatterplot(
    data=df,
    x='Happiness score',
    y='Explained by: Healthy life expectancy',
    hue='cluster',
    palette="vlag",
);

In [None]:
# Happiness score (x) against the freedom-to-make-life-choices column (y), coloured by cluster
sns.scatterplot(
    data=df,
    x='Happiness score',
    y='Explained by: Freedom to make life choices',
    hue='cluster',
    palette="crest",
);

In [None]:
# Happiness score (x) against the 'Dystopia (1.83) + residual' column (y), coloured by cluster
sns.scatterplot(
    data=df,
    x='Happiness score',
    y='Dystopia (1.83) + residual',
    hue='cluster',
    palette="Spectral",
);

In [None]:
# Happiness score (x) against the generosity column (y), coloured by cluster
sns.scatterplot(
    data=df,
    x='Happiness score',
    y='Explained by: Generosity',
    hue='cluster',
    palette="viridis",
);

In [None]:
# GDP-per-capita column (x) against the healthy-life-expectancy column (y), coloured by cluster
sns.scatterplot(
    data=df,
    x='Explained by: GDP per capita',
    y='Explained by: Healthy life expectancy',
    hue='cluster',
    palette="turbo",
);

**Conclusion:**<br>
We can see that the Happiness Score rises with better social support, more freedom and higher GDP per capita.

# GIS - Geographic Info Systems
We are using GIS and plotly to visualize the clusters around the world.

In [None]:
#!pip install plotly

In [None]:
# Plotly imports for the choropleth map of the clusters
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot, plot
# Initialise plotly's notebook mode before any figure is rendered.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [None]:
# Choropleth trace: one entry per country, coloured by its cluster label
data = {
    "type": "choropleth",
    "colorscale": 'aggrnyl',
    "reversescale": True,
    "locations": df['Country'],
    "locationmode": "country names",
    "z": df["cluster"],
    "text": df["Country"],
    "colorbar": {'title': 'Cluster Groups'},
}

# Figure layout: title plus a frameless map in Mercator projection
layout = {
    "title": 'Cluster Groups around the World',
    "geo": {"showframe": False, "projection": {'type': 'mercator'}},
}

In [None]:
# Build the figure from the choropleth trace (`data`) and the `layout` dict
choromap=go.Figure(data=data,layout=layout)
# Render the map inline in the notebook.
# NOTE(review): validate=False presumably skips plotly's figure validation — confirm it is still needed.
iplot(choromap, validate=False)

![happy2.jpg](attachment:9c8dad03-fdd3-4b25-a172-0e9b18f2721e.jpg)<br>
**"Don't worry. Be happy."** - Bobby McFerrin