In [None]:
# Import all libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import umap.umap_ as umap

# Part 1. Visualizing Unlabeled Training Data.

In [None]:
# Read the facial-feature table from disk
relative_path = "faces/faces_AU.csv" # Change this path if you need to
data = pd.read_csv(relative_path)

# Gaze and action-unit columns used for the analysis.
# Note: these column names start with a space in the CSV header.
feature_columns = [
    ' gaze_0_x', ' gaze_0_y', ' gaze_0_z',
    ' gaze_1_x', ' gaze_1_y', ' gaze_1_z',
    ' gaze_angle_x', ' gaze_angle_y',
    ' AU05_r', ' AU06_r', ' AU07_r',
]
selected_features = data[feature_columns]

# Standardize so every feature contributes on the same scale to PCA
scaler = StandardScaler()
features_scaled = scaler.fit_transform(selected_features)

# Project the standardized features onto the first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(features_scaled)

# Scatter plot of the 2-D projection
fig, ax = plt.subplots(figsize=(8, 6))
pc1, pc2 = principal_components.T  # one coordinate array per component
ax.scatter(pc1, pc2, alpha=0.5)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA of Faces Data')
ax.grid(True)
plt.show()

# Fraction of the total variance captured by each of the two components
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance for each component:", explained_variance_ratio)

After standardizing the features, we used PCA to plot the data points in 2 dimensions. The results of the plot are shown above with the explained variance for each component.

PCA reduces the dimensionality of the data by transforming it into a new coordinate system. The first principal component lies along the direction of maximum variance, and the second component lies along the direction of maximum remaining variance. Our plot shows that most of our data is clustered in one area, as expected.

The explained variance ratio of the principal component represents the proportion of the dataset's variance that lies along the axis of that component. In other words, it shows how much variance is captured by each component. As shown above, the explained variance for our two components is 0.301 and 0.251.  This means that the first component explains about 30.1% of the total variance in the dataset and the second component explains 25.1% of the total variance. In total, our two components explain 30.1+25.1=55.2% of the total variance.

Our two components together capture a big portion of the total variance in our data, making them useful for reducing the dimensionality of our data, while retaining information.

# Part 2. K-Means and Silhouette Score.

In [None]:

# Candidate cluster counts and the silhouette score obtained for each one
k_values = range(2, 11)
silhouette_scores = []

for k in k_values:
    # Cluster the 2-D PCA projection into k groups
    kmeans = KMeans(n_clusters=k, random_state=0, n_init='auto')
    clusters = kmeans.fit_predict(principal_components)

    # Mean silhouette coefficient of this clustering
    silhouette_avg = silhouette_score(principal_components, clusters)
    silhouette_scores.append(silhouette_avg)

# Silhouette score as a function of K
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_values, silhouette_scores, marker='o')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score vs. Number of Clusters')
ax.grid(True)
plt.show()

# Highest score and the K that produced it (the first K wins on ties)
optimal_score = max(silhouette_scores)
optimal_k = k_values[silhouette_scores.index(optimal_score)]

print("Optimal number of clusters (K):", optimal_k)
print("Optimal silhouette score:", optimal_score)


As shown in the plot above, the optimal silhouette score is 0.368, which corresponds to a k value (number of clusters) of 3.

The total silhouette score range is from -1 to 1, and our optimal score is 0.368, indicating a moderate score. This suggests our clusters are fairly well-separated but some overlapping is expected between our clusters. Looking at the plot, this is the best score as increasing the number of clusters would decrease our score and provide a worse result.

# Part 3. Cluster Interpretation.

In [None]:

# Re-fit K-Means on the PCA projection using the K selected by the silhouette sweep
k = optimal_k
kmeans = KMeans(n_clusters=k, random_state=0, n_init='auto')
clusters = kmeans.fit_predict(principal_components)

# One scatter call per cluster so each gets its own colour and legend entry
fig, ax = plt.subplots(figsize=(8, 6))
for cluster_id in range(k):
    in_cluster = clusters == cluster_id
    ax.scatter(
        principal_components[in_cluster, 0],
        principal_components[in_cluster, 1],
        label=f'Cluster {cluster_id}',
    )

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title(f'K-Means Clustering with K={k}')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Group the image identifiers by the cluster each row was assigned to.
# The 'img' column is read once as a plain Python list: indexing
# data.iloc[idx]['img'] inside the loop would build a full row Series for
# every sample and coerce 'img' to the row's common dtype (e.g. int -> float).
image_numbers = data['img'].tolist()

# One (initially empty) list of image numbers per cluster id
cluster_images = {cluster_id: [] for cluster_id in range(k)}

# Rows of `data` and entries of `clusters` are aligned by position
for image_number, cluster_id in zip(image_numbers, clusters):
    cluster_images[cluster_id].append(image_number)

# Print the image numbers for each cluster
for cluster_id, images in cluster_images.items():
    print(f'Cluster {cluster_id}: {images}')


As shown in the k-means plot above, our PCA-reduced data has been separated into our optimal k (3) clusters, labeled cluster 0, 1, and 2. Then a dictionary was created to map each cluster to the images assigned to it. The results are printed above. Finally, I selected random images from each cluster to look for similarities within each cluster, so I can label them correctly.

Also, we will be mostly looking at the eyes and cheeks because the selected features were gaze, AU5 (upper eyelid), AU6 (cheeks) and AU7 (eyelid tightener).

The first thing I noticed was in Cluster 2, most people were smiling, which made their cheeks rise (AU6), and they were looking directly into the camera. For Cluster 0, the cheeks were not raised, eyelids were not tightened, and upper eyelids were raised, which made most people look a little sad. Most faces were also looking into the camera. Finally, for Cluster 1, the lids were tightened but the cheeks were lowered, and most people were not looking directly into the camera. So to label these, we could call Cluster 0: lowered cheeks and upper eyelids raised, Cluster 1: looking away, and Cluster 2: cheeks raised.

It is worth mentioning that these labels are not perfect because based on our silhouette scores, we know there is overlapping between our clusters.

# Part 4. Use UMAP.

In [None]:
# Embed the standardized features in 2-D with UMAP (seeded for reproducibility)
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_embeddings = umap_reducer.fit_transform(features_scaled)

# Track the best (K, score) pair seen so far; -1 marks "nothing found yet"
best_k, best_score = -1, -1
for k in range(2, 11):
    # Cluster the UMAP embedding into k groups and score the result
    kmeans_umap = KMeans(n_clusters=k, random_state=42, n_init='auto')
    umap_clusters = kmeans_umap.fit_predict(umap_embeddings)
    silhouette_avg = silhouette_score(umap_embeddings, umap_clusters)

    # Only a strictly higher score replaces the current best, so the
    # smallest K wins when scores tie
    best_k, best_score = (
        (k, silhouette_avg) if silhouette_avg > best_score else (best_k, best_score)
    )

print("Optimal number of clusters:", best_k)
print("Best score:", best_score)


In [None]:
# Show the UMAP embedding coloured by the K-Means clusters that were found on
# the PCA projection.
#
# The labels are taken from `clusters` instead of `kmeans.labels_`: `kmeans`
# is re-bound by other cells in this notebook, so reading it here made the
# colouring depend on which cell happened to run last.
#
# The embedding computed above is reused; refitting UMAP with the same
# parameters and random_state=42 would only repeat that work.
standard_embedding = umap_embeddings

plt.figure(figsize=(8, 6))
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=clusters, s=0.1, cmap='Spectral')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.title('UMAP Embedding Coloured by PCA K-Means Clusters')
plt.grid(True)
plt.show()

In [None]:
# Candidate cluster counts and the silhouette score obtained for each one
k_values = range(2, 11)
silhouette_scores = []

for k in k_values:
    # Cluster the 2-D UMAP embedding into k groups
    kmeans = KMeans(n_clusters=k, random_state=0, n_init='auto')
    clusters_umap = kmeans.fit_predict(umap_embeddings)

    # Mean silhouette coefficient of this clustering
    silhouette_avg = silhouette_score(umap_embeddings, clusters_umap)
    silhouette_scores.append(silhouette_avg)

# Silhouette score as a function of K
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_values, silhouette_scores, marker='o')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score vs. Number of Clusters')
ax.grid(True)
plt.show()

# Highest score and the K that produced it (the first K wins on ties)
optimal_score = max(silhouette_scores)
optimal_k = k_values[silhouette_scores.index(optimal_score)]

print("Optimal number of clusters (K):", optimal_k)
print("Optimal silhouette score:", optimal_score)


In [None]:
# Use the K selected by the silhouette search on the UMAP embedding
k = best_k

# Fit the final K-Means model on the UMAP embedding.
# n_init='auto' is passed explicitly so this call is configured like every
# other KMeans call in the notebook.
kmeans = KMeans(n_clusters=k, random_state=0, n_init='auto')
clusters_umap = kmeans.fit_predict(umap_embeddings)

# One scatter call per cluster; each needs a label, otherwise plt.legend()
# has nothing to show and the legend is empty.
plt.figure(figsize=(8, 6))
for cluster_id in range(k):
    cluster_data = umap_embeddings[clusters_umap == cluster_id]
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], label=f'Cluster {cluster_id}')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.title('UMAP with K-Means Clustering')
plt.legend()
plt.show()

In [None]:
# Group the image identifiers by the UMAP-based cluster of each row.
# The 'img' column is read once as a plain Python list: indexing
# data.iloc[idx]['img'] inside the loop would build a full row Series for
# every sample and coerce 'img' to the row's common dtype (e.g. int -> float).
image_numbers = data['img'].tolist()

# One (initially empty) list of image numbers per cluster id
cluster_images = {cluster_id: [] for cluster_id in range(k)}

# Rows of `data` and entries of `clusters_umap` are aligned by position
for image_number, cluster_id in zip(image_numbers, clusters_umap):
    cluster_images[cluster_id].append(image_number)

# Print the image numbers for each cluster
for cluster_id, images in cluster_images.items():
    print(f'Cluster {cluster_id}: {images}')

As shown above, K-Means on the UMAP embedding found the same number of clusters as on the PCA projection (3). However, the optimal silhouette score was higher than in Part 2 (0.484 versus 0.368). Looking at the k-means plot, we can see the data points are much more spread out than in the PCA plot.

The results from our two steps are very similar as they both have three clusters and show similar shared expressions within the clusters.

Overall, we observe that cluster 0 has lowered cheeks and upper eyelids raised, while cluster 1 is looking away with cheeks lowered and eyelids tightened, and cluster 2 has cheeks raised, while directly looking into the camera. People in cluster 2 seem the happiest, while in cluster 0 a lot of people look sad, and in cluster 1 people are mostly indifferent.