In [None]:
#Loading the Dataset:

from sklearn.datasets import load_iris
import pandas as pd

# Fetch the Iris bunch and wrap its measurements in a DataFrame whose
# columns are named after the features.
data = load_iris()
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# The species codes are stored separately in y. X is built from the
# measurements only, so the labels are never part of the clustering input.
y = pd.Series(data.target)

# Preview the first five rows of the feature table.
print(X.head())


In [None]:
#A) KMeans Clustering (4 marks)

#Provide a brief description of how KMeans clustering works.
'''KMeans clustering is an iterative algorithm that divides a dataset into K distinct, non-overlapping subsets (clusters). 
The algorithm works as follows:

1. Initialize K centroids randomly.
2. Assign each data point to the nearest centroid.
3. Recalculate the centroids as the mean of the assigned points.
4. Repeat steps 2 and 3 until convergence (when assignments no longer change).'''

#Explain why KMeans clustering might be suitable for the Iris dataset.
'''KMeans is suitable for the Iris dataset because:

The dataset has well-defined groups that can be separated in a multi-dimensional space.
It allows for efficient clustering of the continuous numerical features (sepal length, sepal width, petal length, petal width).'''

#Apply KMeans clustering to the preprocessed Iris dataset and visualize the clusters.

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Choosing the number of clusters (K)
k = 3  # Based on the known species in the dataset

# Cluster on the measurement columns only, selected by name. X gains a
# 'Cluster' column below, so fitting on the whole frame would feed that label
# back in as a feature whenever this cell is re-run.
feature_cols = list(data.feature_names)

# Applying KMeans clustering
# n_init is set explicitly because its default differs between scikit-learn
# releases; together with random_state this pins the result across versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X[feature_cols])
X['Cluster'] = kmeans.labels_

# Centroid coordinates follow the order of feature_cols, so the two plotted
# axes are looked up by name instead of assuming positions 0 and 1.
x_col, y_col = 'sepal length (cm)', 'sepal width (cm)'
x_idx, y_idx = feature_cols.index(x_col), feature_cols.index(y_col)

# Visualizing the clusters
plt.figure(figsize=(10, 6))
plt.scatter(X[x_col], X[y_col], c=X['Cluster'], cmap='viridis', marker='o', edgecolor='k', s=100)
plt.scatter(kmeans.cluster_centers_[:, x_idx], kmeans.cluster_centers_[:, y_idx], c='red', s=200, alpha=0.75, marker='X')  # centroids
plt.title('KMeans Clustering of Iris Dataset')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.show()


In [None]:
#B) Hierarchical Clustering (4 marks)

#Provide a brief description of how Hierarchical clustering works.
'''Hierarchical clustering builds a tree-like structure (dendrogram) to represent the relationships between data points. 
There are two main types:

Agglomerative: Starts with each point as its own cluster and merges them based on proximity.
Divisive: Starts with one cluster and recursively splits it into smaller clusters.'''

#Explain why Hierarchical clustering might be suitable for the Iris dataset.
'''Hierarchical clustering is suitable for the Iris dataset because:

The merge hierarchy itself is built without specifying the number of clusters in advance; a cluster count is only needed when cutting the tree into flat clusters.
The dendrogram can provide insights into the data structure and the relationships between clusters.'''

#Apply Hierarchical clustering to the preprocessed Iris dataset and visualize the clusters.
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the clustering input from the measurement columns only, selected by
# name. X may already carry label columns added by other cells (and gains
# 'Hierarchical_Cluster' below); including any of them would leak cluster
# labels into the distance computation.
features = X[list(data.feature_names)].to_numpy()

# Applying Hierarchical clustering
# Ward linkage is stated explicitly so it visibly matches the dendrogram below.
hierarchical = AgglomerativeClustering(n_clusters=k, linkage='ward')
X['Hierarchical_Cluster'] = hierarchical.fit_predict(features)

# Dendrogram
# The linkage is computed on the same feature matrix used for the clustering,
# so the tree and the flat clusters describe the same data.
plt.figure(figsize=(12, 7))
linked = linkage(features, method='ward')
# Leaf labels are passed as a plain array of the species codes held in y.
dendrogram(linked, orientation='top', labels=y.to_numpy(), distance_sort='descending', show_leaf_counts=True)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Iris Species')
plt.ylabel('Euclidean distances')
plt.show()

# Visualizing Hierarchical clusters
plt.figure(figsize=(10, 6))
plt.scatter(X['sepal length (cm)'], X['sepal width (cm)'], c=X['Hierarchical_Cluster'], cmap='viridis', marker='o', edgecolor='k', s=100)
plt.title('Hierarchical Clustering of Iris Dataset')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.show()
