# Import the dataset

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI "Wine Quality" dataset (repository id 186):
# https://archive.ics.uci.edu/dataset/186/wine+quality
wine_quality = fetch_ucirepo(id=186)
# Backward-compatible alias: this object used to be bound to the misleading
# name `online_retail` (a different UCI dataset). Keep the old name alive so
# that any cell still referring to it keeps working.
online_retail = wine_quality

# Feature table of the dataset (used as a pandas DataFrame below)
X = wine_quality.data.features
print(X.head())

In [2]:
# Reduce the dimensionality of the data by discarding these columns
columns_to_drop = [
    'fixed_acidity', 'volatile_acidity', 'free_sulfur_dioxide',
    'total_sulfur_dioxide', 'density', 'sulphates',
]
# Start from the freshly fetched feature table instead of from `X` itself so
# that this cell can be re-executed: `X` is rebound here (and converted to a
# NumPy array further down), so `X.drop(...)` would raise on a second run.
X = online_retail.data.features.drop(columns=columns_to_drop)
# Storing the remaining feature names (used to label the plots later on)
features = X.columns

In [None]:
# Preview the first few rows of the dataframe to check its structure after
# the column drop; as the last expression of the cell, the result is what the
# notebook displays.
X.head()

In [None]:
# Show the (rows, columns) shape of the dataframe, i.e. how many samples and
# how many features remain.
X.shape

In [5]:
# Convert the dataframe to a NumPy array. From here on `X` no longer carries
# column names; they were saved in `features` beforehand.
# NOTE: this rebinds `X`, so the cell cannot be run twice in a row (an array
# has no `to_numpy` method).
X = X.to_numpy()

# KMeans clustering

In [6]:
# Plotting library used by the figure cells further down
import matplotlib.pyplot as plt
# KMeans estimator from scikit-learn
from sklearn.cluster import KMeans

# First attempt: 3 clusters, with a fixed random state (42) so that repeated
# runs give the same result
kmeans = KMeans(
    n_clusters=3,
    random_state=42,
)

In [7]:
# Scale the data (optional)
# To enable scaling, uncomment exactly one `scaler = ...` alternative below
# together with the `Xscaled = scaler.fit_transform(X)` line, and comment out
# the `Xscaled = X.copy()` fallback at the bottom of this cell.
from sklearn.preprocessing import StandardScaler
# StandardScaler will scale the data to have mean 0 and standard deviation 1
# scaler = StandardScaler()

# MinMaxScaler will scale the data to a given range (default is 0 to 1)
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# MaxAbsScaler will scale the data to the range [-1, 1] based on the maximum absolute value
# from sklearn.preprocessing import MaxAbsScaler
# scaler = MaxAbsScaler()

# Xscaled = scaler.fit_transform(X)

# NOTE: with every scaler commented out, `Xscaled` is only an unscaled copy of
# `X`. The later cells that mention "scaled data" then operate on raw values.
Xscaled = X.copy()

In [None]:
# Fit KMeans on `Xscaled` and keep the cluster index assigned to every sample
# (`fit` returns the estimator itself, so the labels can be read off directly)
labels = kmeans.fit(Xscaled).labels_
# Show the assignments of the first ten samples
labels[:10]

In [None]:
# Pairwise scatter matrix: one panel per ordered pair of features
n_features = len(features)
fig, axes = plt.subplots(n_features, n_features, figsize=(20, 20))
for row, x_name in enumerate(features):
    for col, y_name in enumerate(features):
        panel = axes[row, col]
        if row == col:
            # Diagonal panels (a feature against itself) are left blank
            panel.axis('off')
            continue
        # x-axis: feature of this grid row, y-axis: feature of this grid
        # column, colour: cluster label of each point
        panel.scatter(Xscaled[:, row], Xscaled[:, col], c=labels, cmap='viridis', s=10)
        panel.set_xlabel(x_name)
        panel.set_ylabel(y_name)
        panel.set_title(f'{x_name} vs {y_name}')
# Avoid overlapping labels/titles between neighbouring panels, then render
plt.tight_layout()
plt.show()

In [None]:
# Metric used to compare clusterings with different numbers of clusters
from sklearn.metrics import silhouette_score

# Candidate numbers of clusters: 2 up to and including 10
range_n_clusters = [*range(2, 11)]
# Best candidate so far; -1 is the lower bound of the silhouette score, and 2
# is the smallest candidate
best_n_clusters, best_silhouette_score = 2, -1

for n_clusters in range_n_clusters:
    # Fit a fresh KMeans model for this candidate (same seed every time)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(Xscaled)
    cluster_labels = kmeans.labels_
    # Mean silhouette over all samples for this partition
    silhouette_avg = silhouette_score(Xscaled, cluster_labels)
    print(f"For n_clusters = {n_clusters}, the average silhouette_score is : {silhouette_avg}")
    # Only a strictly higher score replaces the current best, so ties keep
    # the smaller number of clusters
    if silhouette_avg > best_silhouette_score:
        best_n_clusters, best_silhouette_score = n_clusters, silhouette_avg

# Report the winner
print(f"The optimal number of clusters is {best_n_clusters} with a silhouette score of {best_silhouette_score:.2f}")

# Go back to the KMeans cells above, set n_clusters to the optimal value printed here, and refit the model

# Hierarchical clustering

In [None]:
# SciPy supplies the hierarchical-clustering tools, scikit-learn the metric
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.metrics import silhouette_score

# Build the agglomerative merge tree once. With method='ward', each merge is
# chosen to minimise the variance within the resulting clusters.
linkage_matrix = linkage(Xscaled, method='ward')

# Candidate numbers of clusters: 2 up to and including 10
range_n_clusters = [*range(2, 11)]
# Best candidate so far; -1 is the lower bound of the silhouette score, and 2
# is the smallest candidate
best_n_clusters_hierarchical, best_silhouette_score_hierarchical = 2, -1

for n_clusters in range_n_clusters:
    # Cut the tree into flat clusters, asking for at most `n_clusters` of them
    cluster_labels = fcluster(linkage_matrix, n_clusters, criterion='maxclust')
    # Mean silhouette over all samples for this cut
    silhouette_avg = silhouette_score(Xscaled, cluster_labels)
    print(f"For n_clusters = {n_clusters}, the average silhouette_score is : {silhouette_avg}")
    # Only a strictly higher score replaces the current best, so ties keep
    # the smaller number of clusters
    if silhouette_avg > best_silhouette_score_hierarchical:
        best_n_clusters_hierarchical, best_silhouette_score_hierarchical = n_clusters, silhouette_avg

# Report the winner
print(f"The optimal number of clusters for hierarchical clustering is {best_n_clusters_hierarchical} with a silhouette score of {best_silhouette_score_hierarchical:.2f}")

In [None]:
# Visualize a truncated view of the hierarchical merge tree
plt.figure(figsize=(6, 4))

# truncate_mode='lastp' with p=30 condenses the tree so that only the last 30
# merged clusters are shown as leaves. A leaf may therefore stand for a single
# data point or for a whole cluster of data points.
dendrogram(linkage_matrix, truncate_mode='lastp', p=30)

plt.title('Hierarchical Clustering Dendrogram (truncated)')

# In a truncated dendrogram, SciPy labels a leaf that stands for several
# samples with the number of samples in parentheses; only singleton leaves
# show an actual sample index. The previous label 'Sample index' was therefore
# misleading.
plt.xlabel('Cluster size in parentheses (or sample index)')

# The height of each merge is the linkage distance between the merged clusters
plt.ylabel('Distance')

plt.show()

# Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA  # dimensionality reduction for the 2-D view

# Number of clusters for the final KMeans model.
# NOTE(review): this hard-coded value overwrites the `best_n_clusters` computed
# by the silhouette search in the KMeans section above; confirm that 4 is
# really the intended choice.
best_n_clusters = 4
# Fit the final model (fixed random state for reproducibility) and keep the
# cluster index assigned to every sample
kmeans = KMeans(n_clusters=best_n_clusters, random_state=42).fit(Xscaled)
cluster_labels = kmeans.labels_

# Reduce the data to 2 principal components so the clusters can be drawn
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(Xscaled)
# Show the shape of the projected data
X_pca.shape

In [None]:
# Scatter plot of the PCA-reduced data, one colour per cluster
plt.figure(figsize=(8, 6))

# Palette indexed by cluster label
colors = ['red', 'green', 'blue', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']

# Iterate over the labels actually present in `cluster_labels` instead of
# assuming they are exactly 0..best_n_clusters-1. Both names are globals that
# other cells rebind, so trusting `best_n_clusters` could silently drop
# clusters from the plot or add empty legend entries.
for cluster in sorted(set(cluster_labels)):
    # Boolean mask selecting the samples assigned to this cluster
    members = cluster_labels == cluster
    # x: first principal component, y: second principal component.
    # The colour index wraps around so a label beyond the palette length
    # cannot raise an IndexError.
    plt.scatter(X_pca[members, 0], X_pca[members, 1],
                color=colors[cluster % len(colors)], label=f'Cluster {cluster}', marker='o', s=20)

plt.title('KMeans Clusters Visualized using PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

# The legend maps each colour to its cluster label
plt.legend()

plt.show()