In [1]:
import pandas as pd

HAPPINESS = '/kaggle/input/world-happiness-report-2024/World Happiness Report_Feb24.csv'

df = pd.read_csv(filepath_or_buffer=HAPPINESS)

df.head()

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.35,0.451,50.5,0.718,0.164,0.882,0.414,0.258
1,Afghanistan,2009,4.402,7.509,0.552,50.8,0.679,0.187,0.85,0.481,0.237
2,Afghanistan,2010,4.758,7.614,0.539,51.1,0.6,0.118,0.707,0.517,0.275
3,Afghanistan,2011,3.832,7.581,0.521,51.4,0.496,0.16,0.731,0.48,0.267
4,Afghanistan,2012,3.783,7.661,0.521,51.7,0.531,0.234,0.776,0.614,0.268


Let's use dimension reduction to build some clusters. We want to cluster on the independent variables and color using the dependent variable.

In [2]:
import arrow
from plotly import express
from umap import UMAP

TARGET = 'Life Ladder'
COLUMNS = ['Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Positive affect', 'Negative affect']

time_start = arrow.now()
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=100)
umap_df = pd.DataFrame(data=reducer.fit_transform(X=df[COLUMNS].fillna(value=0.0)), columns=['x', 'y'])
umap_df[TARGET] = df[TARGET].tolist()
umap_df['year'] = df['year'].tolist()
umap_df['Country name'] = df['Country name'].tolist()
express.scatter(data_frame=umap_df, x='x', y='y', color=TARGET, hover_name='Country name', hover_data=['year'] ).show()
print('UMAP done in {}'.format(arrow.now() - time_start))

2024-05-23 17:55:08.823812: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-23 17:55:08.823965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-23 17:55:09.040528: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=100, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Thu May 23 17:55:21 2024 Construct fuzzy simplicial set
Thu May 23 17:55:27 2024 Finding Nearest Neighbors
Thu May 23 17:55:32 2024 Finished Nearest Neighbor Search
Thu May 23 17:55:36 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Thu May 23 17:55:38 2024 Finished embedding


UMAP done in 0:00:19.030441


UMAP tends to put the high happiness country/years together; the rest of the data is kind of a mix.

Let's color the same clusters based on their region/continent. We need to load some more data first.

In [3]:
ISO = '/kaggle/input/country-mapping-iso-continent-region/continents2.csv'

iso_df = pd.read_csv(filepath_or_buffer=ISO, usecols=['name', 'region'])
iso_df.head()

Unnamed: 0,name,region
0,Afghanistan,Asia
1,Åland Islands,Europe
2,Albania,Europe
3,Algeria,Africa
4,American Samoa,Oceania


In [4]:
import warnings

warnings.filterwarnings(action='ignore', category=FutureWarning)
region_df = umap_df.merge(right=iso_df, left_on='Country name', right_on='name', how='inner')
express.scatter(data_frame=region_df, x='x', y='y', color='region', hover_name='Country name', hover_data=['year'] ).show()

Very broadly the happy countries are in Europe and Oceania and the unhappy countries are in Africa.