# LCPB 23-24 exercise 2 (Data visualization and clustering)
- Andrea Semenzato 2130973
- Pietro Bernardi 2097494
- Tomàs Mezquita 2109239
- Mariam Chokheli 2122278

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import axes3d
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 300

### POINT 1

### “Eps” ($\epsilon$) and “minPts” (mP) in DBSCAN algorithm for clustering

### Refine the grid with more values of ε and mP and plot a heat-map showing the normalized mutual information (NMI) between true and predicted clusters, similar to the one on the right.

### Is the high NMI region showing a correlation between ε and mP?




Yes, the two parameters are strongly correlated. In the heat map, the region where the NMI is non-zero forms a band with a roughly linear trend: when ε is increased, each neighbourhood contains more points, so the minimum number of points (mP) must be increased as well to obtain a meaningful clustering. It is also worth noting that the best results are obtained with a small number of minimum points.


<img src="img1ex2.png" width="600">






This is the code used to generate the heat map of point 1:

# Parameter grid for the DBSCAN scan: 30 evenly spaced eps values in
# [r/2, 2.5*r] and the odd min_samples values 1, 3, ..., 49.
# NOTE(review): `r`, `X`, `y_true`, `DBSCAN` and `NMI` are not defined in this
# cell; they are expected to come from earlier cells -- confirm.
eps_range = np.linspace(r / 2, 2.5 * r, 30)
min_sample_range = np.arange(1, 50, 2, dtype=int)

# Coordinate matrices for the heat map (x: min_samples, y: eps).
XX, YY = np.meshgrid(min_sample_range, eps_range)

DIMY = len(eps_range)
DIMX = len(min_sample_range)
it = 0

# nmi[i, j] holds the NMI between the DBSCAN labels obtained with
# (eps_range[i], min_sample_range[j]) and the true labels.
nmi = np.zeros((DIMY, DIMX))
for i, j in np.ndindex(DIMY, DIMX):
    eps = eps_range[i]
    min_samples = min_sample_range[j]
    model = DBSCAN(eps=eps, min_samples=min_samples)
    model.fit(X)
    y_hat = model.labels_
    nmi[i, j] = NMI(y_hat, y_true)

# Heat map of the NMI over the (min_samples, eps) grid.
plt.pcolormesh(XX, YY, nmi)
plt.xlabel('Min samples')
plt.ylabel('Eps')
plt.title('NMI Heat map')
plt.colorbar()
plt.show()

### POINT 2

### Understanding the 12-dimensional data: use principal component analysis (PCA) to visualize the first components of the data.
### Does it help understand its structure?

In [None]:
# 2D and 3D PCA on the 12-dimensional dataset
#
# Loads the samples and their class labels, standardizes the features,
# projects them onto the first two / three principal components and draws,
# for each projection, one combined scatter plot plus one scatter plot per
# class. Finally prints the explained variance ratios of both projections.

# Loading data (tab-separated samples, integer class labels)
data = np.loadtxt("x_12d.dat", delimiter='\t')
y = np.loadtxt("y_12d.dat", dtype=int)
colors = ['r', 'b', 'gold']

# Data standardization
scaler = StandardScaler()
X = scaler.fit_transform(data)

# PCA transformations
pca_2d = PCA(n_components=2)
pca_3d = PCA(n_components=3)
X_pca_2d = pca_2d.fit_transform(X)
X_pca_3d = pca_3d.fit_transform(X)

unique_labels = np.unique(y)

# One colour per class. The palette is cycled so that, if there are more
# classes than colours, no class is silently dropped from the combined plots
# (zip truncation) and the per-class plots do not raise IndexError.
class_colors = [colors[k % len(colors)] for k in range(len(unique_labels))]

# 2D PCA visualizations
fig = plt.figure(figsize=(16, 8))

# Combined 2D PCA plot
ax1 = fig.add_subplot(2, 4, 1)
for i, color in zip(unique_labels, class_colors):
    ax1.scatter(X_pca_2d[y == i, 0], X_pca_2d[y == i, 1], c=color, alpha=0.8, label=f'Class {i}')
ax1.set_title('Combined 2D PCA Visualization')
ax1.set_xlabel('Principal Component 1')
ax1.set_ylabel('Principal Component 2')
ax1.grid(True)
# The scatter labels are only visible once a legend is drawn.
ax1.legend()

# Individual 2D PCA plots
for index, label in enumerate(unique_labels):
    ax = fig.add_subplot(2, 4, index + 2)
    ax.scatter(X_pca_2d[y == label, 0], X_pca_2d[y == label, 1], c=class_colors[index], alpha=0.8, label=f'Class {label}')
    ax.set_title(f'2D PCA Class {label}')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.grid(True)

# 3D PCA visualizations
fig_3d = plt.figure(figsize=(16, 8))

# Combined 3D PCA plot
ax2 = fig_3d.add_subplot(2, 4, 1, projection='3d')
for i, color in zip(unique_labels, class_colors):
    ax2.scatter(X_pca_3d[y == i, 0], X_pca_3d[y == i, 1], X_pca_3d[y == i, 2], c=color, alpha=0.8, label=f'Class {i}')
ax2.set_title('Combined 3D PCA Visualization')
ax2.set_xlabel('Principal Component 1')
ax2.set_ylabel('Principal Component 2')
ax2.set_zlabel('Principal Component 3')
ax2.grid(True)
# Same colour key as in the combined 2D plot.
ax2.legend()

# Individual 3D PCA plots
for index, label in enumerate(unique_labels):
    ax = fig_3d.add_subplot(2, 4, index + 2, projection='3d')
    ax.scatter(X_pca_3d[y == label, 0], X_pca_3d[y == label, 1], X_pca_3d[y == label, 2], c=class_colors[index], alpha=0.8, label=f'Class {label}')
    ax.set_title(f'3D PCA Class {label}')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')
    ax.grid(True)

plt.show()

# Print explained variance ratios
print("Explained variance ratio for 2D PCA:", pca_2d.explained_variance_ratio_)
print("Explained variance ratio for 3D PCA:", pca_3d.explained_variance_ratio_)