# Clustering - Tuto 1

In this tuto we play with a realistic 1D dataset: Vertical mean temperature from Argo data

We use clustering with GMM to analyse the PDF of the dataset.

The goal is to understand better how GMM works, and its limitations.

The optimal number of clusters is determined with the BIC, and the issues raised by samples that are not independent are demonstrated.

(c) G. Maze


***
*First, let's make sure the Python env is correct to run this notebook*:

In [1]:
# Download the course helper module into a throw-away directory, import it,
# and run its environment check.
import os
import sys
import tempfile
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded, in which case the attribute access below
# raises AttributeError.
import urllib.request

with tempfile.TemporaryDirectory() as local:
    # Make the downloaded file importable as `utils`.
    sys.path.append(local)
    urllib.request.urlretrieve("https://raw.githubusercontent.com/obidam/ds2-2024/main/utils.py", os.path.join(local, "utils.py"))
    from utils import check_up_env
    check_up_env(with_tuto=True)

Running on your own environment


In [2]:
# Libraries import section
import os, sys

import xarray as xr
import numpy as np
from scipy import stats
from scipy import signal
from intake import open_catalog

from sklearn import preprocessing
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

import matplotlib
# matplotlib.use('agg')
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.cm as cm
import cartopy
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import seaborn as sns
sns.set(context="notebook", style="whitegrid", palette="deep", color_codes=True)

from tuto_tools import create_map, gaussian, plot_normal

ModuleNotFoundError: No module named 'intake'

# Load data

From Google cloud storage [see specific tuto here](https://github.com/obidam/ds2-2023/blob/main/practice/environment/02-Access_to_data_in_the_cloud.ipynb).

We work with a 1-dimensional dataset: local 0-2000m vertical mean measurements (eg: temperature, salinity), from Argo floats.

In [None]:
# Open the course data catalog and fetch the Argo vertical-mean dataset.
# NOTE(review): read_chunked() presumably returns a lazily loaded (chunked)
# dataset - confirm against the intake documentation.
catalog_url = 'https://raw.githubusercontent.com/obidam/ds2-2024/main/ds2_data_catalog.yml'
cat = open_catalog(catalog_url)
entry = cat['argo_global_vertical_mean']
ds = entry.read_chunked()

# Report the dataset size in GB, then its content.
size_gb = ds.nbytes / 1e9
print('This dataset holds: %.3f GB' % size_gb)
print(ds)

# Stats for 1D data

In [None]:
# Select the variable to analyse and the label used in figure titles.
X = ds['TEMP']
Xlabel = "VERTICAL MEAN TEMP"
# Optional filter, disabled by default:
# X = X[np.where(X>3)]

# Show the data, its label and its shape.
for item in (X, Xlabel, X.shape):
    print(item)

In [None]:
# Map of the sample locations, coloured by the value of X.
fig, proj, ax = create_map()
# Use pyplot.get_cmap rather than plt.cm.get_cmap: the latter was deprecated in
# Matplotlib 3.7 and removed in 3.9. The second argument resamples the
# colormap to 8 discrete colours.
cmap = plt.get_cmap('gist_ncar', 8)
plt.scatter(X.LONGITUDE, X.LATITUDE, 1, X, cmap=cmap, vmin=0, vmax=16)
plt.colorbar()
plt.title(Xlabel)

In [None]:
# Histogram of the raw values of X on a single wide panel.
panel_size = plt.figaspect(0.5)
fig, axes = plt.subplots(1, 1, figsize=panel_size, dpi=120)
axes.set_title('Histogram')
sns.histplot(X, ax=axes)

# Pre-process data

Normalisation step

In [None]:
%%time
# From xarray, X has shape: (N_PROF,)
# For scikit-learn we need X with shape: [N_PROF,1]
X0 = X.values[np.newaxis].T
print("Data shape [n_samples, n_features]:", X0.shape) # shape [n_samples, n_features=1]

# Fit the scaler object:
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(X0)

# The mean and std profiles are in the scaler object properties:
X_ave = scaler.mean_
X_std = scaler.scale_
print("Data mean, std:", X_ave, X_std)

# Normalize data:
Xn = scaler.transform(X0)       

# Here, we only center data:
Xc = preprocessing.StandardScaler(with_std=False).fit(X0).transform(X0)

In [None]:
# Density histogram of the normalised data, with a Gaussian of the same mean
# and standard deviation drawn in the colour of the histogram bars.
fig, axes = plt.subplots(1, 1, figsize=plt.figaspect(0.5), dpi=120)
axes.set_title('PDF and Gaussian fit to normalised data')
ax = sns.histplot(Xn, stat='density', kde=False)
bar_color = ax.patches[-1].get_facecolor()
plot_normal(np.mean(Xn), np.std(Xn), color=bar_color)

# Clustering in 1D

Clearly, from the figure above, one can see that the dataset is not Gaussian and exhibits several modes. In other words, the data samples aggregate into several clusters.

Let's identify them

## Use a GMM to identify modes (the "clusters") in the distribution

In [None]:
from sklearn.mixture import GaussianMixture as GMM

# Fit a 4-component Gaussian Mixture Model to the normalised data.
# fit() returns the estimator itself, so creation and training can be chained.
classifier = GMM(n_components=4).fit(Xn)
print(classifier)

In [None]:
# Predict class labels:
# each sample of Xn is assigned to one component of the fitted mixture, so
# `labels` holds one component index per sample.
labels = classifier.predict(Xn)
print(labels)

In [None]:
# See more about clusters: print the mean, standard deviation and weight of
# each mixture component.
#
# The component count is read from the fitted model rather than from the
# distinct predicted labels: a component that is never the most likely one for
# any sample would otherwise shrink the count, and the loop would then skip
# the last component(s) while the weights still sum to 1 over all of them.
n_clusters = classifier.n_components
for k in range(n_clusters):
    # covariances_ stores variances. With the default 'full' covariance and a
    # single feature it has shape (n_components, 1, 1), so index the scalar
    # and take its square root to print what the "std" label announces.
    sigma = np.sqrt(classifier.covariances_[k, 0, 0])
    print("Cluster %i mean(std), weight: %0.2f (%0.2f), %0.2f" % (
        k, classifier.means_[k, 0], sigma, classifier.weights_[k]))
print(np.sum(classifier.weights_)) # Must be 1.

In [None]:
# Evaluate the mixture pdf on a regular grid: the weighted sum of the
# individual component pdfs.
x = np.linspace(-3, 3, 200)
gmm_pdf = np.zeros(x.shape)
components = zip(classifier.weights_[:n_clusters],
                 classifier.means_[:n_clusters],
                 classifier.covariances_[:n_clusters])
for weight, mean, cov in components:
    # NOTE(review): the third argument is a variance (covariances_). Confirm
    # that tuto_tools.gaussian expects a variance and not a standard deviation.
    gmm_pdf += weight * gaussian(x, mean[0], cov[0])

In [None]:
# Overlay the fitted mixture (black) and each weighted component (one colour
# per component) on the density histogram of the normalised data.
colors = sns.husl_palette(n_clusters)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5), dpi=90)
ax = sns.histplot(Xn, stat='density', kde=False, ax=ax)
ax.set_title('Histogram of normalised data with GMM clustering results (%i clusters)' % n_clusters)
plt.plot(x, gmm_pdf, 'k', linewidth=2)
for k, col in enumerate(colors):
    # Raw string: "\l", "\m" and "\s" are not valid Python escapes, and a
    # normal string literal triggers a SyntaxWarning on Python 3.12+.
    # The variance is indexed down to a scalar ([k, 0, 0], default 'full'
    # covariance with one feature) so that the format receives a plain number
    # rather than a length-1 array.
    label = r"$\lambda_%i=%0.0f$%%: $\mu_%i$=%0.2f ($\sigma^2_%i=%0.2f$)" % (
        k, classifier.weights_[k]*100,
        k, classifier.means_[k, 0],
        k, classifier.covariances_[k, 0, 0])
    component_pdf = classifier.weights_[k] * gaussian(
        x, classifier.means_[k,0], classifier.covariances_[k,0])
    plt.plot(x, component_pdf, color=col, linewidth=2, label=label)
plt.legend()

## How to select the number of clusters?

This is the most problematic question in clustering.

It is rather difficult to determine the most appropriate number of components automatically. Different methods exist, mostly based on estimating the most probable K or on minimizing a given metric such as the mixture entropy or the misfit with the observed PDF (Fraley et al., 1998). A popular method is the Bayesian Information Criterion (BIC, Schwarz, 1978). The BIC is an empirical approximation of the model probability, computed as:

$$BIC(K) = -2\,\mathcal{L}(K) + N_f(K)\,\log(n) \label{eq:bic}$$

where $\mathcal{L}(K)$ is the log likelihood of the trained model with $K$ classes, $N_f(K)=K-1+K\,D+K\,D\,(D+1)/2$ is the number of independent parameters to be estimated (the sum of the component weights, Gaussian means and covariance matrix elements in the D-dimensional data space) and $n$ is the number of profiles used to train the model.

The BIC is empirical because the first r.h.s. term decreases as the number of classes $K$ increases, while the second r.h.s. term is a penalty term that increases with $K$ and thus prevents the model from overfitting the data. The sum of the two terms is expected to exhibit a minimum for the most appropriate $K$.

In [None]:
# Compute the BIC for 1..kmax components, in two flavours:
#  - BIC : scikit-learn's value, which counts every sample of Xn;
#  - BICf: the same criterion recomputed as if only `n` samples were available.
kmax = 20
BIC = np.zeros((kmax))
BICf = np.zeros((kmax))

print(Xn.shape[0])  # total number of samples, for comparison with n
# Sample count used in BICf.
# NOTE(review): 2196 is presumably an estimate of the number of independent
# observations - confirm where this value comes from.
n = 2196
D = 1 # Nb of dimensions (loop invariant)

for k in range(kmax):
    print(k, kmax)
    K = k + 1  # number of components of this model
    this_gmm = GMM(n_components=K).fit(Xn)
    BIC[k] = this_gmm.bic(Xn)
    # Nb of independent parameters to estimate: weights, means, covariances
    Nf = (K - 1) + K*D + K*D*(D + 1)/2
    # score() is the per-sample average log-likelihood, so n*score is the
    # log-likelihood of a set of n samples.
    BICf[k] = -2*n*this_gmm.score(Xn) + Nf*np.log(n)

In [None]:
# Compare both BIC curves after standardising each one (zero mean, unit
# standard deviation) so that they share the same vertical scale.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5), dpi=90)
k_values = np.arange(kmax) + 1
curves = (
    (BIC, 'Raw BIC'),
    (BICf, 'BIC using only samples of independant observations'),
)
for values, legend in curves:
    plt.plot(k_values, (values - np.mean(values)) / np.std(values), label=legend)
plt.ylabel('Normalized BIC')
plt.xticks(k_values)
plt.legend()

In [None]:
# Since we have more observations than the maximum independent sample size,
# we can run several BIC computations, each on a random subsample of size n.
kmax = 20
Nrun = 30
BIC = np.zeros((kmax, Nrun))
BICf = np.zeros((kmax, Nrun))

print(Xn.shape[0])  # total number of samples, for comparison with n
n = 2196 # Nb of independent samples
D = 1 # Nb of dimensions (loop invariant)

for run in range(Nrun):
    print(run, Nrun)
    for k in range(kmax):
        K = k + 1  # number of components of this model
        # Passing the population size as an int draws from arange(size)
        # without rebuilding an array from a range on every iteration.
        # NOTE(review): a new subsample is drawn for every k, so the models of
        # one run are not compared on the same data - confirm this is intended.
        ii = np.random.choice(X.shape[0], n, replace=False)
        sample = Xn[ii]  # index once instead of copying the subsample three times
        this_gmm = GMM(n_components=K).fit(sample)
        BIC[k, run] = this_gmm.bic(sample)
        # Nb of independent parameters to estimate: weights, means, covariances
        Nf = (K - 1) + K*D + K*D*(D + 1)/2
        # score() is the per-sample average log-likelihood, so n*score is the
        # log-likelihood of the n-sample subset.
        BICf[k, run] = -2*n*this_gmm.score(sample) + Nf*np.log(n)

In [None]:
# Mean BIC over all runs, with a thin grey envelope at +/- one standard
# deviation across runs.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5), dpi=90)
BICfmean = BICf.mean(axis=1)
BICfstd = BICf.std(axis=1)
# Standardised mean curve; computed here but not drawn by this cell.
normBICfmean = (BICfmean - BICfmean.mean()) / BICfmean.std()

k_values = np.arange(kmax) + 1
plt.plot(k_values, BICfmean,
         label='BIC using independant observations')
for envelope in (BICfmean + BICfstd, BICfmean - BICfstd):
    plt.plot(k_values, envelope, color=[0.7]*3, linewidth=0.5)
plt.ylabel('BIC')
plt.xlabel('Number of clusters')
plt.xticks(k_values)
plt.legend()

## Exercise

Change the number of clusters in the GMM.

What happens when K=2 ?