In [1]:
# Import all required libraries
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler

In [2]:
# Fetch the Olivetti faces dataset, shuffled with a fixed seed so every run
# sees the samples in the same order.
olivetti_faces = fetch_olivetti_faces(shuffle=True, random_state=42)

# Feature matrix and target labels used by the cells below.
X = olivetti_faces.data
y = olivetti_faces.target

In [3]:
# First split: hold out 40% of the samples as a temporary pool, keeping 60%
# for training. Stratifying on y keeps the label proportions equal in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=40,
)

# Second split: cut the temporary pool in half, giving a validation set and a
# test set (each 20% of the full data), again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=40,
)

In [8]:
# Baseline model: a 5-nearest-neighbours classifier on the raw features.
classifier = KNeighborsClassifier(n_neighbors=5)

# k is the number of cross-validation folds (not the number of neighbours).
k = 5
cross_val_scores = cross_val_score(classifier, X_train, y_train, cv=k)

# Mean accuracy over the k folds.
average_accuracy = np.mean(cross_val_scores)

# cross_val_score works on clones, so the classifier itself is still unfitted:
# fit it on the full training set, then score it on the held-out validation set.
validation_accuracy = classifier.fit(X_train, y_train).score(X_val, y_val)

print(f'Average Accuracy (Cross-Validation): {average_accuracy:.2f}')
print(f'Accuracy on Validation Set: {validation_accuracy:.2f}')

Average Accuracy (Cross-Validation): 0.71
Accuracy on Validation Set: 0.71


In [10]:
# Reduce dimensionality using K-Means: pick the number of clusters with the
# highest silhouette score on the training set, then represent every sample by
# its distances to the fitted cluster centres.
n_clusters_range = range(2, 9)

best_silhouette_score = -1  # lower bound of the silhouette score
best_n_clusters = 0
best_kmeans = None

for n_clusters in n_clusters_range:
    # n_init is set explicitly because its default differs between
    # scikit-learn releases (10 before 1.4, 'auto' afterwards); pinning it
    # keeps the clustering reproducible and avoids the FutureWarning.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=40)
    cluster_labels = kmeans.fit_predict(X_train)

    silhouette_avg = silhouette_score(X_train, cluster_labels)

    # Report the silhouette score obtained for this number of clusters
    print(f'Number of Clusters: {n_clusters}, Silhouette Score: {silhouette_avg:.2f}')

    # Keep the fitted model with the best score seen so far
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        best_n_clusters = n_clusters
        best_kmeans = kmeans

# Report the winning configuration
print(f'Best Number of Clusters: {best_n_clusters}, Best Silhouette Score: {best_silhouette_score:.2f}')

# best_kmeans was already fitted on the entire training set inside the loop
# (fit_predict), so it is used directly instead of being fitted a second time.
# NOTE(review): transform() yields one column per cluster, so a small
# best_n_clusters gives a very low-dimensional representation.
X_train_reduced = best_kmeans.transform(X_train)
X_val_reduced = best_kmeans.transform(X_val)


Number of Clusters: 2, Silhouette Score: 0.15
Number of Clusters: 3, Silhouette Score: 0.11
Number of Clusters: 4, Silhouette Score: 0.10
Number of Clusters: 5, Silhouette Score: 0.10
Number of Clusters: 6, Silhouette Score: 0.09
Number of Clusters: 7, Silhouette Score: 0.09
Number of Clusters: 8, Silhouette Score: 0.09
Best Number of Clusters: 2, Best Silhouette Score: 0.15


In [11]:
# Build a 5-nearest-neighbours classifier and train it on the K-Means-reduced
# training features (fit returns the estimator itself, so the call is chained).
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train_reduced, y_train)

# Accuracy on the validation set, transformed with the same K-Means model.
validation_accuracy = classifier.score(X_val_reduced, y_val)

# Report the validation accuracy
print(f'Accuracy on Validation Set: {validation_accuracy:.2f}')

Accuracy on Validation Set: 0.24


In [14]:
# Tunable settings for this cell
n_components = 100  # number of principal components to keep
eps = 0.3           # DBSCAN neighbourhood radius, measured in cosine distance
min_samples = 5     # DBSCAN min_samples (density threshold for a core point)

# Flatten every image into a single feature vector (one row per image)
X = olivetti_faces.images.reshape(len(olivetti_faces.images), -1)

# Standardize the feature vectors
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Project the standardized features onto the leading principal components
pca = PCA(n_components=n_components, random_state=40)
X_pca = pca.fit_transform(X_scaled)

# Pairwise cosine distances between all samples in PCA space
cosine_distances = pairwise_distances(X_pca, metric='cosine')

# Run DBSCAN directly on the precomputed distance matrix
dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
labels = dbscan.fit(cosine_distances).labels_

# Count the clusters found; the noise label (-1) is excluded from the count
n_clusters = len(set(labels) - {-1})
print(f'Number of Clusters: {n_clusters}')


Number of Clusters: 21
