# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def k_means(data, k, max_iter=300):
  """
  Performs k-means clustering (Lloyd's algorithm) on the given data.

  Args:
    data: An array-like of shape (n_samples, n_features). A 1-D input is
      treated as n_samples points with a single feature each.
    k: The number of clusters; must satisfy 1 <= k <= n_samples.
    max_iter: Upper bound on the number of assign/update rounds, so the
      loop terminates even if the centroids keep moving.

  Returns:
    A NumPy integer array of length n_samples holding the cluster index
    (0 .. k-1) assigned to each data point.

  Raises:
    ValueError: If data is not 1-D/2-D, is empty, or k is out of range.
  """
  points = np.asarray(data, dtype=float)
  if points.ndim == 1:
    points = points.reshape(-1, 1)
  if points.ndim != 2:
    raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")

  n_samples = points.shape[0]
  if n_samples == 0:
    raise ValueError("data must contain at least one point")
  if not 1 <= k <= n_samples:
    raise ValueError("k must be between 1 and the number of data points")

  # Seed the centroids with k distinct data points. Drawing random integers
  # from the global min/max range can place centroids far from every point
  # (leaving clusters empty) and fails outright when the range collapses.
  centroids = points[np.random.choice(n_samples, size=k, replace=False)]

  clusters = np.zeros(n_samples, dtype=int)
  for _ in range(max_iter):
    # Pairwise distances: (n, 1, d) - (1, k, d) -> (n, k, d) -> (n, k).
    distances = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
    clusters = np.argmin(distances, axis=1)

    # Move each centroid to the mean of its members. A cluster with no
    # members keeps its previous centroid; taking the mean of an empty
    # selection would yield NaN and the convergence test could never pass.
    new_centroids = centroids.copy()
    for i in range(k):
      members = points[clusters == i]
      if len(members):
        new_centroids[i] = members.mean(axis=0)

    # If the centroids have not changed, stop.
    if np.allclose(centroids, new_centroids):
      break

    centroids = new_centroids

  return clusters

def pca(data):
  """
  Performs principal component analysis on the given data.

  The input is not modified; centering is done on a floating-point copy.

  Args:
    data: An array-like of shape (n_samples, n_features). At least two
      samples are needed for the sample covariance to be defined.

  Returns:
    A tuple (eigenvectors, eigenvalues). Column i of eigenvectors is the
    i-th principal direction and eigenvalues[i] is the variance along it;
    both are ordered by decreasing eigenvalue.
  """

  # Center a float copy so the caller's array is left untouched and
  # integer input does not fail on in-place subtraction.
  centered = np.asarray(data, dtype=float)
  centered = centered - centered.mean(axis=0)

  # Compute the covariance matrix (features are columns). atleast_2d covers
  # the single-feature case, where np.cov returns a 0-d array.
  covariance = np.atleast_2d(np.cov(centered, rowvar=False))

  # The covariance matrix is symmetric, so eigh applies; it returns
  # (eigenvalues, eigenvectors) -- in that order -- with real values.
  eigenvalues, eigenvectors = np.linalg.eigh(covariance)

  # Sort both outputs together, largest eigenvalue first, so that
  # eigenvalues[i] always belongs to eigenvectors[:, i].
  order = np.argsort(eigenvalues)[::-1]

  return eigenvectors[:, order], eigenvalues[order]

def _scatter_components(projected, colors, title):
  """Scatter-plots projected points on PC1 vs PC2, colored by `colors`."""
  plt.figure()
  plt.scatter(projected[:, 0], projected[:, 1], c=colors, cmap="tab10")
  plt.xlabel("PC1")
  plt.ylabel("PC2")
  plt.title(title)
  plt.show()


def main():
  """
  Loads the Iris CSV, runs PCA and k-means on a training split, and shows
  three PC1-vs-PC2 scatter plots: by species, by k-means cluster, and by
  species again under the "Actual Species" title.
  """
  # Load the data.
  data = pd.read_csv("Iris Dataset.csv")

  # Pull the labels out BEFORE removing the Species column; reading
  # data["Species"] after the drop raises KeyError.
  species = data["Species"]
  features = data.drop("Species", axis=1)
  # NOTE(review): some Iris CSV exports carry an "Id" row-number column.
  # It is dropped if present so it cannot dominate the variance -- confirm
  # against the actual file.
  features = features.drop(columns=["Id"], errors="ignore")

  # Split the data into training and test sets (the test part is unused).
  X_train, _, y_train, _ = train_test_split(features, species, test_size=0.25)
  X_train = X_train.to_numpy(dtype=float)

  # matplotlib's `c=` needs numbers to map through a colormap, so encode
  # the species names as integer codes.
  species_codes = pd.factorize(y_train)[0]

  # Perform PCA on the training data.
  eigenvectors, _ = pca(X_train)

  # Project the centered training data onto the first two principal
  # components; these coordinates are what the scatter plots show.
  projected = (X_train - X_train.mean(axis=0)) @ eigenvectors[:, :2]

  # Plot the data in the principal-component plane.
  _scatter_components(projected, species_codes, "PCA of Iris Data")

  # Perform k-means clustering on the training data with k=3.
  clusters = k_means(X_train, 3)

  # Plot the data with the clusters colored differently.
  _scatter_components(projected, clusters, "K-Means Clustering of Iris Data")

  # Plot the actual species of the data points.
  _scatter_components(projected, species_codes, "Actual Species of Iris Data")

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
  main()
