# 2. PCA analysis and Clustering

## Loading of extracted high-level features

In [None]:
import numpy as np

# Open the archive that holds the pre-extracted high-level features
data = np.load('dataset_features.npz')

# Show the names of the arrays stored in the archive
print(data.files)

# Pull out the train and test splits (features and labels) by key
trainset_features, trainset_labels = data['trainset_features'], data['trainset_labels']
testset_features, testset_labels = data['testset_features'], data['testset_labels']

# Category names; iterated later to label the scatter plots
class_labels = data['class_labels']

# Short aliases for the training data
X, y = trainset_features, trainset_labels

# Display the shape of the training feature matrix
X.shape

## PCA Analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the training features before running PCA
scaler = StandardScaler()
X_rescaled = scaler.fit_transform(X)

# n_components=None keeps all components, so the complete spectrum of
# explained variance is available afterwards
pca = PCA(n_components=None).fit(X_rescaled)

# Proportion of variance explained by each principal component
pve = pca.explained_variance_ratio_
print(f'Total number of PCA components  : {len(pve)}')
print(f'Value of first 5 PCA components : {pve[:5]}')

## Scree plot of PCA components

In [None]:
import matplotlib.pyplot as plt

# Wide canvas so that all component bars fit side by side
plt.figure(figsize=(13,5))

# x positions (0-based component indices) and running total of the
# per-component variance ratios
pca_component = np.arange(len(pve))
pve_cumsum = np.cumsum(pve)

# One bar per component: individual proportion of variance explained
plt.bar(pca_component, pve)

# Label only every 5th component to keep the axis readable
plt.xticks(pca_component[::5], rotation=90)

# Overlay the cumulative proportion as a step curve
plt.step(pca_component, pve_cumsum)

# Axis titles
plt.xlabel("principal component")
plt.ylabel("proportion of variance explained")

In [None]:
# For each variance target (10%, 20%, ..., 90%), report how many leading
# components are needed for the cumulative explained variance to exceed it
for var_percent in np.arange(10,100,10):
    threshold = var_percent/100

    # argmax on a boolean array returns the index of the first True;
    # adding 1 turns that 0-based index into a component count
    n_needed = np.argmax(pve_cumsum > threshold)+1

    print(f'Components explaining {var_percent}% of variance: {n_needed}')

__Observation:__
- There are a total of 280 PCA components which is equal to the number of samples in the training set
- For explaining 100% of the variance in the dataset we obviously need all the 280 components
- And with just 2 components we can already explain 10% of the variance

## PCA(2) basis and k-means clustering

We start by transforming (projecting) the trainset data into the basis formed by the first 2 PCA components.

In [None]:
# Project the rescaled training data with the fitted PCA, then keep only the
# coordinates along the first two components
X_train_projected = pca.transform(X_rescaled)
X_pca2 = X_train_projected[:, :2]


We also apply the k-means clustering algorithm to these 2 components of the PCA-transformed data, with the number of clusters equal to the number of categories we want to classify.

In [None]:
from sklearn.cluster import KMeans

# Six clusters, with a fixed seed so the clustering is reproducible
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit the clustering on the 2D PCA coordinates of the training set
kmeans.fit(X_pca2)


Finally, we show both data transformations in two side-by-side plots.

In [None]:
from util import decode_class

fig, axes = plt.subplots(1,2,figsize=(16,8))

# Decode the training labels once instead of on every loop iteration.
# NOTE(review): assumes decode_class is a pure function returning one integer
# class index per sample -- confirm against util.decode_class.
train_class_idx = decode_class(trainset_labels)

# Left panel: training samples in the 2D PCA space, one colour per true category
for label_idx, label in enumerate(class_labels):
    # Boolean mask selecting the samples that belong to this category
    sample_idx = train_class_idx == label_idx

    # Plot the samples of this category
    axes[0].scatter(
        X_pca2[sample_idx, 0],
        X_pca2[sample_idx, 1],
        s=20,
        label=label,
    )
axes[0].set_aspect('equal')
axes[0].set_xlabel('PCA component 1')
axes[0].set_ylabel('PCA component 2')
axes[0].set_title('PCA components of each class')
axes[0].legend()


# Right panel: the same points, coloured by their k-means cluster.
# NOTE(review): the cluster order is hand-picked, presumably so that the
# colours line up with the matching categories in the left panel -- confirm.
for cluster in [2, 4, 5, 0, 3, 1]:
    # Get points in this cluster
    idx = kmeans.labels_ == cluster
    x1, x2 = X_pca2[idx, 0], X_pca2[idx, 1]

    # Plot points
    axes[1].scatter(x1, x2, s=20, label=f"cluster {cluster}")

    # Mark the cluster centroid with a black star
    centroid = kmeans.cluster_centers_[cluster]
    axes[1].plot(centroid[0], centroid[1], marker="*", color="black", markersize=18)

axes[1].set_aspect('equal')
axes[1].set_xlabel("PCA component 1")
axes[1].set_ylabel("PCA component 2")
axes[1].set_title('k-means clustering of PCA components')
axes[1].legend()



__Observation:__

- In the left plot, we see that just the first two PCA components already allow for a good separation of some categories.
    - 'bike', 'van', and 'car' form relatively distinct clusters in the 2D PCA projection, suggesting that the first two principal components capture significant variance for these classes.
    - Other classes (e.g., 'truck' and 'other') appear to overlap more, indicating that the first two components may not fully separate these categories.

- By comparing the right plot clusters with the true categorization shown in the left plot, we observe that:
    - The 'bike', 'car' and 'van' categories align relatively well with the k-means clusters 2, 4 and 1 respectively
    - Cluster 5 seems to capture about half of the 'motorcycle' category
    - For the 'other' and 'truck' categories, these are grouped together into cluster 3

## PCA transformation of test dataset

In [None]:
# Standardize the test features with the scaler that was fitted on the
# training set. Fitting a fresh scaler on the test set would rescale it with
# different means/variances than the data the PCA was fitted on, so the
# projected test points would not be comparable with the training points.
X_test_rescaled = scaler.transform(testset_features)

# Transform the test dataset to the 2D PCA space
test_pca2 = pca.transform(X_test_rescaled)[:,:2]


Next we overlay the test samples, scaled and projected into the 2D PCA space, on the previously shown train data. Each data point is colored according to its true category. We use a star "*" marker to differentiate the test set samples from the train set samples (represented by circles).

In [None]:
plt.figure(figsize=(10, 10))

# Decode the training labels once instead of on every loop iteration.
# NOTE(review): assumes decode_class is a pure function returning one integer
# class index per sample -- confirm against util.decode_class.
train_class_idx = decode_class(trainset_labels)

# Training samples: small default markers, one colour per true category
for label_idx, label in enumerate(class_labels):
    # Boolean mask selecting the training samples of this category
    sample_idx = train_class_idx == label_idx

    plt.scatter(
        X_pca2[sample_idx, 0],
        X_pca2[sample_idx, 1],
        s=10,
        label=label,
    )

plt.gca().set_aspect('equal')
plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
plt.title('PCA components of the train set (circles) and test set (stars)')
# The legend is built before the test points are drawn, so each category is
# listed only once
plt.legend()

# Reset the property cycle so the test scatter calls start again from the
# first colour and each category reuses the colour of its training samples
plt.gca().set_prop_cycle(None)

# Same one-off decoding for the test labels
test_class_idx = decode_class(testset_labels)

# Test samples: larger star markers
for label_idx, label in enumerate(class_labels):
    # Boolean mask selecting the test samples of this category
    sample_idx = test_class_idx == label_idx

    plt.scatter(
        test_pca2[sample_idx, 0],
        test_pca2[sample_idx, 1],
        s=80,
        label=label,
        marker='*',
    )
    


__Observation:__

- By counting the star points that are located in regions where the predominant train samples correspond to a different category, we estimated that around 3 to 6 samples (out of 50) will be hard to classify