# Clustering Methods

### Imports

In [1]:
# Import standard libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Sklearn import
from sklearn.model_selection import train_test_split # Splitting the data set
from sklearn.model_selection import KFold, cross_val_score # Cross validation
from sklearn.preprocessing import MinMaxScaler, StandardScaler # Normalization and standard scaler
from sklearn.preprocessing import PolynomialFeatures # Polynomial features
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # Label and 1-hot encoding
from sklearn.linear_model import LogisticRegression # Logistic regression model
from sklearn.linear_model import LogisticRegressionCV # Logistic regression model with cross validation
from sklearn.neighbors import KNeighborsClassifier # KNN Algorithm
from sklearn.model_selection import GridSearchCV   # Grid search for cross validation
from sklearn.tree import DecisionTreeClassifier, plot_tree # Decision Trees
from sklearn.metrics import accuracy_score  # Accuracy
from sklearn.metrics import confusion_matrix # Confusion matrix
from sklearn.metrics import precision_score, recall_score, f1_score  # Precision, recall, and f1 score
from sklearn.linear_model import LinearRegression # Regression linear model
from sklearn.linear_model import Ridge # Ridge model
from sklearn.linear_model import RidgeCV # Ridge with cross validation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # Metrics for errors

# Import to load arff file from url
from scipy.io import arff
import urllib.request
import io

In [2]:
world_metrics_subset_path = "https://raw.githubusercontent.com/mivelikikh/what_makes_us_happy/main/data/world_metrics_subset.csv"
world_metrics_subset = pd.read_csv(world_metrics_subset_path)

world_metrics_subset

Unnamed: 0,country,development_index,life_expect,life_exp60,basic_water,gdp_per_capita,eco_footprint,pf_rol,ef_legal,adult_mortality,infant_mort,age1-4mort,happiness_score
0,Angola,0.52,62.63262,17.34829,55.08428,4665.91,0.93,3.451814,2.963635,237.96940,0.057900,0.007520,3.866
1,Burundi,0.39,60.09811,16.59126,60.20415,276.69,0.80,2.961470,3.495487,290.18580,0.052420,0.006450,2.905
2,Benin,0.48,61.08568,17.20543,66.32024,746.83,1.41,4.129480,3.822761,242.37410,0.066690,0.009390,3.484
3,Burkina Faso,0.39,60.32101,15.48575,48.26772,671.07,1.21,4.860575,3.687657,254.60270,0.055795,0.008635,3.739
4,Botswana,0.69,66.05297,17.42258,89.40444,7743.50,3.83,5.641684,5.950516,249.24130,0.032560,0.002040,3.974
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,New Zealand,0.91,82.24739,25.29202,100.00000,37488.30,5.60,7.868546,8.715280,66.05728,0.003975,0.000235,7.334
133,Japan,0.89,84.16616,26.39402,98.97000,46201.60,5.02,7.643490,7.586987,50.82619,0.001980,0.000195,5.921
134,Cambodia,0.55,69.36723,17.36710,76.94537,877.64,1.21,2.566741,4.277907,170.49700,0.027600,0.001110,3.907
135,South Korea,0.89,82.66409,25.26966,99.67540,24155.80,5.69,7.438183,6.391154,60.81405,0.002955,0.000125,5.835


In [3]:
# Displaying the data types
world_metrics_subset.dtypes

country               object
development_index    float64
life_expect          float64
life_exp60           float64
basic_water          float64
gdp_per_capita       float64
eco_footprint        float64
pf_rol               float64
ef_legal             float64
adult_mortality      float64
infant_mort          float64
age1-4mort           float64
happiness_score      float64
dtype: object

In [4]:
world_metrics_subset.describe()

Unnamed: 0,development_index,life_expect,life_exp60,basic_water,gdp_per_capita,eco_footprint,pf_rol,ef_legal,adult_mortality,infant_mort,age1-4mort,happiness_score
count,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
mean,0.704919,72.962743,20.26201,88.432938,15695.127153,3.423285,5.166594,5.235571,150.25442,0.021038,0.001848,5.41692
std,0.155286,7.307447,3.026119,15.825358,22170.466768,2.432932,1.628152,1.530037,78.374564,0.020495,0.002789,1.124471
min,0.34,53.1305,13.34951,38.85259,276.69,0.61,2.060257,2.002915,49.20185,0.00179,7e-05,2.905
25%,0.59,68.03925,17.60933,82.80058,1627.9,1.46,3.974523,4.030095,87.42916,0.005,0.00024,4.459
50%,0.73,74.84309,20.11714,95.9924,5880.8,2.84,4.757804,5.071814,135.3411,0.013175,0.000545,5.303
75%,0.83,77.76206,22.39087,99.58675,18103.1,4.98,6.382863,6.158069,194.9528,0.03256,0.002235,6.324
max,0.94,84.16616,26.39402,100.0,114665.0,15.82,8.687101,8.798181,388.8057,0.0908,0.014615,7.526


In [5]:
world_metrics_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   country            137 non-null    object 
 1   development_index  137 non-null    float64
 2   life_expect        137 non-null    float64
 3   life_exp60         137 non-null    float64
 4   basic_water        137 non-null    float64
 5   gdp_per_capita     137 non-null    float64
 6   eco_footprint      137 non-null    float64
 7   pf_rol             137 non-null    float64
 8   ef_legal           137 non-null    float64
 9   adult_mortality    137 non-null    float64
 10  infant_mort        137 non-null    float64
 11  age1-4mort         137 non-null    float64
 12  happiness_score    137 non-null    float64
dtypes: float64(12), object(1)
memory usage: 14.0+ KB


In [6]:
world_metrics_subset.columns

Index(['country', 'development_index', 'life_expect', 'life_exp60',
       'basic_water', 'gdp_per_capita', 'eco_footprint', 'pf_rol', 'ef_legal',
       'adult_mortality', 'infant_mort', 'age1-4mort', 'happiness_score'],
      dtype='object')

In [7]:
world_metrics_subset['country'].unique()

array(['Angola', 'Burundi', 'Benin', 'Burkina Faso', 'Botswana',
       'Tanzania', 'Uganda', 'South Africa', 'Zambia', 'Zimbabwe', 'Chad',
       'Togo', 'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Senegal',
       'Sierra Leone', 'Madagascar', 'Mali', 'Mauritania', 'Mauritius',
       'Malawi', 'Kenya', 'Liberia', 'Algeria', 'Ethiopia', 'Gabon',
       'Ghana', 'Guinea', 'Cameroon', 'DR Congo', 'Congo Republic',
       'Argentina', 'United States', 'Venezuela', 'Paraguay',
       'El Salvador', 'Suriname', 'Trinidad and Tobago', 'Uruguay',
       'Mexico', 'Nicaragua', 'Panama', 'Peru', 'Guatemala', 'Honduras',
       'Haiti', 'Jamaica', 'Costa Rica', 'Dominican Republic', 'Ecuador',
       'Bolivia', 'Brazil', 'Canada', 'Chile', 'Colombia',
       'United Arab Emirates', 'Bahrain', 'Egypt', 'Iran', 'Tunisia',
       'Pakistan', 'Qatar', 'Saudi Arabia', 'Iraq', 'Jordan', 'Kuwait',
       'Lebanon', 'Libya', 'Morocco', 'Albania', 'Armenia', 'Austria',
       'Azerbaijan', 'Belgium', 'Sw

## Scaling

To scale our data, we will use two methods:
- MinMaxScaler
- StandardScaler

In [8]:
# Extract features and class
X_metrics = world_metrics_subset[['development_index', 'life_expect', 'life_exp60',
       'basic_water', 'gdp_per_capita', 'eco_footprint', 'pf_rol', 'ef_legal',
       'adult_mortality', 'infant_mort', 'age1-4mort']]
y_metrics = world_metrics_subset['happiness_score']

In [9]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_metrics, y_metrics, 
                                                            test_size=0.2, 
                                                            random_state=39, 
                                                            shuffle=True)

print(f"The training set has {X_train.shape[0]} observations, and the test set has {X_test.shape[0]} observations.")

The training set has 109 observations, and the test set has 28 observations.


In [10]:
# Encode the countries

# Extract the column of interest
#item_en = X_train[['country']].values.ravel()
#item_en_test = X_test[['country']].values.ravel()
#Define the encoder
#le = LabelEncoder()
#Fit the encoder
#le.fit(item_en)
#Transform the train and the test set
#X_train = X_train.assign(Item=le.transform(item_en))
#X_test = X_test.assign(Item=le.transform(item_en_test))
#print(X_train[['Item']])

### StandardScaler

In [11]:
# Rescaling
scaler_std = StandardScaler() # Define the scaler
scaler_std.fit(X_train)   # Fit the scaler

# Transform the train and the test set
X_train_std = scaler_std.transform(X_train)
X_test_std = scaler_std.transform(X_test)

### MinMaxScaler

In [12]:
# Rescaling
scaler_minmax = MinMaxScaler() # Define the scaler
scaler_minmax.fit(X_train) # Fit the scaler

# Transform the train and the test set
X_train_minmax = scaler_minmax.transform(X_train)
X_test_minmax = scaler_minmax.transform(X_test)

## Clustering Methods

To perform the clustering we will use the following methods:
- K-Nearest Neighbors
- Agglomarative Clustering
- Other method

### KNN Model

In [15]:
# Define parameters to test
#grid = {'n_neighbors':np.arange(1,12),     # array from 1 to 12 neighbors (because we want to capture 1-11)
        #'p':np.arange(1,3),                # array from 1 to 3, distance metrics
        #'weights':['uniform','distance']   # weights
       #}

# Define and fit model
#knn = KNeighborsClassifier()
#knn_cv = GridSearchCV(knn, grid, cv=7)
#knn_cv.fit(X_train, y_train)

### KNN model with standard scaler

In [18]:
# Define the model
model_knn_std = KNeighborsClassifier(n_neighbors=9, p=2, weights='uniform')

# Fit our model
model_knn_std.fit(X_train_std, y_train)

ValueError: Unknown label type: 'continuous'

### KNN model with minmax scaler

In [19]:
# Define the model
model_knn_minmax = KNeighborsClassifier(n_neighbors=9, p=2, weights='uniform')

# Fit our model
model_kNN_std.fit(X_train_minmax, y_train_minmax)

NameError: name 'model_kNN_std' is not defined

In [20]:
# Set up KMeans model with the number of clusters=4 
kmeans1 = KMeans(n_clusters=4, random_state=17, n_init='auto') 

# Fit the model
kmeans1.fit(X)

NameError: name 'KMeans' is not defined

### Agglomerative Model

In [None]:
# Set up hierarchical algorithm
agglomerative = AgglomerativeClustering(n_clusters=4, metric='euclidean', linkage='ward')

# Fit model
agglomerative.fit(X)