In [1]:
import numpy as np
# With Eigen Decomposition
def pca_custom(data, n_components):
    """
    Reduce the dimensionality of the data with PCA via eigen decomposition.

    The data is mean-centered (it is NOT scaled to unit variance), the sample
    covariance matrix of the features is eigen-decomposed, and the centered
    data is projected onto the eigenvectors with the largest eigenvalues.

    Parameters:
    - data: Array-like input of shape (samples, features); at least 2 samples
      are required so the sample covariance is defined
    - n_components: The number of principal components to keep (non-negative);
      values larger than the number of features keep every component

    Returns:
    - transformed_data: Data projected onto the selected components,
      shape (samples, min(n_components, features))
    - explained_variance_ratio: Fraction of the total variance carried by each
      selected component, in descending order

    Raises:
    - ValueError: If data is not 2-D, has fewer than 2 samples, or
      n_components is negative
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(
            f"data must be 2-D with shape (samples, features), got ndim={data.ndim}"
        )
    if data.shape[0] < 2:
        raise ValueError("data must contain at least 2 samples")
    if n_components < 0:
        # A negative value would silently slice off trailing components below.
        raise ValueError(f"n_components must be non-negative, got {n_components}")

    # Step 1: Mean-center each feature (no variance scaling is applied)
    mean_centered_data = data - np.mean(data, axis=0)
    # Shape of mean_centered_data: (samples, features)

    # Step 2: Compute the covariance matrix of the features.
    # np.cov collapses a single-feature input to a 0-d array, which eigh
    # rejects, so force the result back to (features, features).
    covariance_matrix = np.atleast_2d(np.cov(mean_centered_data, rowvar=False))

    # Step 3: Eigen decomposition (eigh: the covariance matrix is symmetric)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    # eigenvectors: matrix whose columns are the corresponding eigenvectors

    # Step 4: Order eigenpairs by descending eigenvalue
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]

    # Step 5: Select the top n_components eigenvectors
    selected_eigenvectors = sorted_eigenvectors[:, :n_components]
    # Shape of selected_eigenvectors: (features, n_components)

    # Step 6: Project the centered data onto the selected components
    transformed_data = mean_centered_data @ selected_eigenvectors
    # Shape of transformed_data: (samples, n_components)

    # Explained variance ratio: each kept eigenvalue over the sum of all of them
    total_variance = np.sum(sorted_eigenvalues)
    explained_variance_ratio = sorted_eigenvalues[:n_components] / total_variance

    return transformed_data, explained_variance_ratio

# Demo: run pca_custom on random data and report shapes and variance ratios.
n_components = 2  # keep the two leading principal components
data = np.random.rand(100, 5)  # 100 samples x 5 features

transformed_data, explained_variance_ratio = pca_custom(data, n_components)

# Expected shapes: original (100, 5), transformed (100, 2)
for label, value in (
    ("Original data shape:", data.shape),
    ("Transformed data shape:", transformed_data.shape),
    ("Explained variance ratio:", explained_variance_ratio),
):
    print(label, value)


Original data shape: (100, 5)
Transformed data shape: (100, 2)
Explained variance ratio: [0.2848268  0.22198853]
