In [2]:
# Q1


import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

# Load the Iris dataset and keep only the two attributes analysed in Q1.
iris = load_iris()
X = iris.data[:, [0, 3]]  # Select Sepal Length and Petal Width

# (a) Find all the principal components
# Two input attributes, so two components is the full decomposition.
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)

# (b) Scatter plot of mean-centered data
X_centered = X - np.mean(X, axis=0)
plt.scatter(X_centered[:, 0], X_centered[:, 1], label='Mean Centered Data')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Width')

# (c) Show all the principal components on the same plot
# The plotted cloud has already been mean-centered, so its centroid is the
# origin. The arrows therefore start at (0, 0); anchoring them at the mean of
# the *uncentered* data would draw them away from the points they describe.
for i, (length, vector) in enumerate(zip(pca.explained_variance_, pca.components_)):
    # Scale each unit direction to 3 standard deviations along that component.
    v = vector * 3 * np.sqrt(length)
    plt.quiver(0, 0, v[0], v[1], color='r', scale=1, scale_units='xy', angles='xy', label=f'PC{i+1}')

plt.legend()
plt.show()

# (d) Project samples onto the first principal component
# The projected samples are drawn on the horizontal axis (second coordinate
# forced to zero) on top of the faded two-component scores for comparison.
X_proj = X_pca[:, 0]  # First principal component
plt.scatter(X_proj, np.zeros_like(X_proj), label='Projected Samples')
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.3, label='Original PCA Data')
plt.xlabel('1st Principal Component')
plt.legend()
plt.show()

# (e) Function to compute number of PCs required to capture p% variance
def required_pcs(p, explained_variance_ratio=None):
    """Return the smallest number of leading PCs that capture at least p% variance.

    Parameters
    ----------
    p : float
        Target share of variance as a percentage, between 0 and 100.
    explained_variance_ratio : array-like, optional
        Per-component variance fractions in component order. When omitted,
        ``pca.explained_variance_ratio_`` of the module-level ``pca`` object is
        read at call time, so the answer follows the most recently fitted model.

    Returns
    -------
    int
        1-based count of components.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 100], if there are no ratios, or if all
        available components together capture less than ``p`` percent.
    """
    if not 0 <= p <= 100:
        raise ValueError(f'p must be a percentage between 0 and 100, got {p!r}')
    if explained_variance_ratio is None:
        explained_variance_ratio = pca.explained_variance_ratio_

    cumulative_variance = np.cumsum(np.asarray(explained_variance_ratio, dtype=float))
    if cumulative_variance.size == 0:
        raise ValueError('no explained-variance ratios available')

    target = p / 100
    # argmax on an all-False mask returns 0, which would silently report
    # "1 component" for an unreachable target; look at the hits explicitly.
    reached = np.flatnonzero(cumulative_variance >= target)
    if reached.size:
        return int(reached[0]) + 1
    # The running sum can end a few ulps below the target (e.g. p=100);
    # treat that as "all components are needed" rather than a failure.
    if np.isclose(cumulative_variance[-1], target):
        return int(cumulative_variance.size)
    raise ValueError(
        f'the available components capture only {cumulative_variance[-1] * 100:.2f}% '
        f'of the variance, which is less than the requested {p}%'
    )

# Report how many PCs are needed for each of the 96%, 97% and 98% targets.
for p in (96, 97, 98):
    num_pcs = required_pcs(p)
    print(f'Number of PCs required to capture {p}% variance: {num_pcs}')
# Q2 

X = iris.data[:, [1, 2]]  # Select Sepal Width and Petal Length

# Repeat all steps from Task 1 for the new attributes
# (a) Find all the principal components
# Two input attributes, so two components is the full decomposition.
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)

# (b) Scatter plot of mean-centered data
X_centered = X - np.mean(X, axis=0)
plt.scatter(X_centered[:, 0], X_centered[:, 1], label='Mean Centered Data')
plt.xlabel('Sepal Width')
plt.ylabel('Petal Length')

# (c) Show all the principal components on the same plot
# The plotted cloud has already been mean-centered, so its centroid is the
# origin. The arrows therefore start at (0, 0); anchoring them at the mean of
# the *uncentered* data would draw them away from the points they describe.
for i, (length, vector) in enumerate(zip(pca.explained_variance_, pca.components_)):
    # Scale each unit direction to 3 standard deviations along that component.
    v = vector * 3 * np.sqrt(length)
    plt.quiver(0, 0, v[0], v[1], color='r', scale=1, scale_units='xy', angles='xy', label=f'PC{i+1}')

plt.legend()
plt.show()

# (d) Project samples onto the first principal component
# The projected samples are drawn on the horizontal axis (second coordinate
# forced to zero) on top of the faded two-component scores for comparison.
X_proj = X_pca[:, 0]  # First principal component
plt.scatter(X_proj, np.zeros_like(X_proj), label='Projected Samples')
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.3, label='Original PCA Data')
plt.xlabel('1st Principal Component')
plt.legend()
plt.show()

# (e) Number of PCs required to capture 96%, 97% and 98% of the variance
for p in (96, 97, 98):
    num_pcs = required_pcs(p)
    print(f'Number of PCs required to capture {p}% variance: {num_pcs}')


#  Q3 


# Q3 uses every attribute of the Iris data rather than a pair of columns.
X = iris.data

# (a) Compute all PCAs in normal form
# No component limit is passed to PCA here; fit() returns the fitted estimator.
pca = PCA().fit(X)
X_pca = pca.transform(X)

# (b) How many PCs are required to capture 95% of the variance?
p = 95
num_pcs = required_pcs(p)
print(f'Number of PCs required to capture {p}% variance: {num_pcs}')

# (c) Scatter plot of samples projected onto the PCs
# Keep only the leading num_pcs score columns, then plot the first two.
X_proj = X_pca[:, :num_pcs]
fig, ax = plt.subplots()
ax.scatter(X_proj[:, 0], X_proj[:, 1])
ax.set_xlabel('1st Principal Component')
ax.set_ylabel('2nd Principal Component')
ax.set_title('Scatter Plot of Samples on PCs')
plt.show()



# Q4 
import numpy as np
import matplotlib.pyplot as plt # type: ignore
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML (parser='auto') and keep only the first 1000 rows
# so that the decomposition below stays quick.
mnist = fetch_openml('mnist_784', version=1, parser='auto')
X = mnist.data[:1000]

# (a) Compute all PCAs
# No component limit is passed to PCA here; fit() returns the fitted estimator.
pca = PCA().fit(X)
X_pca = pca.transform(X)

# (b) How many PCs to capture 95% variance?
def required_pcs(p, explained_variance_ratio=None):
    """Return the smallest number of leading PCs that capture at least p% variance.

    Parameters
    ----------
    p : float
        Target share of variance as a percentage, between 0 and 100.
    explained_variance_ratio : array-like, optional
        Per-component variance fractions in component order. When omitted,
        ``pca.explained_variance_ratio_`` of the module-level ``pca`` object is
        read at call time, so the answer follows the most recently fitted model.

    Returns
    -------
    int
        1-based count of components.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 100], if there are no ratios, or if all
        available components together capture less than ``p`` percent.
    """
    if not 0 <= p <= 100:
        raise ValueError(f'p must be a percentage between 0 and 100, got {p!r}')
    if explained_variance_ratio is None:
        explained_variance_ratio = pca.explained_variance_ratio_

    cumulative_variance = np.cumsum(np.asarray(explained_variance_ratio, dtype=float))
    if cumulative_variance.size == 0:
        raise ValueError('no explained-variance ratios available')

    target = p / 100
    # Indexing the first hit directly would fail with an opaque IndexError
    # when no prefix reaches the target; inspect the hits explicitly instead.
    reached = np.flatnonzero(cumulative_variance >= target)
    if reached.size:
        return int(reached[0]) + 1
    # The running sum can end a few ulps below the target (e.g. p=100);
    # treat that as "all components are needed" rather than a failure.
    if np.isclose(cumulative_variance[-1], target):
        return int(cumulative_variance.size)
    raise ValueError(
        f'the available components capture only {cumulative_variance[-1] * 100:.2f}% '
        f'of the variance, which is less than the requested {p}%'
    )

# Smallest number of PCs whose cumulative variance reaches 95%.
p = 95
num_pcs = required_pcs(p)
print(f'Number of PCs required to capture {p}% variance: {num_pcs}')

# (c) Scores along the 1st PC, drawn as points on a horizontal line (y = 0).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], np.zeros_like(X_pca[:, 0]), s=10, alpha=0.5)
ax.set_xlabel('1st Principal Component')
ax.set_title('Projection on 1st PC')
plt.show()

# (d) Scores along the 1st PC against scores along the 2nd PC.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], s=10, alpha=0.5)
ax.set_xlabel('1st Principal Component')
ax.set_ylabel('2nd Principal Component')
ax.set_title('1st PC vs 2nd PC')
plt.show()

ModuleNotFoundError: No module named 'matplotlib'