In [3]:
import numpy as np


def eig(matrix, num_iterations=1000, tol=1e-10):
    """
    Approximate eigenvalues and eigenvectors with the unshifted QR algorithm.

    Each iteration factors the current iterate as A = QR and replaces it with
    the similar matrix RQ, while the orthogonal factors are multiplied together
    into a cumulative Q. Iteration stops early once every entry strictly below
    the diagonal is within ``tol`` of zero (i.e. the iterate is numerically
    upper triangular), or after ``num_iterations`` steps.

    Note:
        The columns of the accumulated Q are eigenvectors only when ``matrix``
        is symmetric (e.g. a covariance matrix). For a general matrix they are
        Schur vectors, and matrices with complex eigenvalues will not converge
        with this real-valued iteration.

    Args:
        matrix (array-like): A real square matrix (n x n).
        num_iterations (int): Maximum number of QR iterations.
        tol (float): Absolute tolerance for the sub-diagonal entries.

    Returns:
        tuple: ``(eigenvalues, eigenvectors)`` where ``eigenvalues`` is the
        diagonal of the final iterate (length n, in no particular order) and
        ``eigenvectors`` is the n x n accumulated orthogonal matrix whose
        column ``i`` corresponds to ``eigenvalues[i]``.

    Raises:
        ValueError: If ``matrix`` is not a two-dimensional square array.
    """
    # Convert first so that nested lists and other array-likes are accepted.
    A = np.array(matrix, dtype=np.float64)  # Ensure double precision
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError(
            f"eig expects a square 2-D matrix, got array with shape {A.shape}"
        )

    n = A.shape[0]
    Q_total = np.eye(n)  # Cumulative product of the Q factors

    for _ in range(num_iterations):
        # One QR step: A_{k+1} = R_k Q_k is similar to A_k.
        Q, R = np.linalg.qr(A)
        A = np.dot(R, Q)

        # Accumulate the orthogonal transformations.
        Q_total = np.dot(Q_total, Q)

        # QR iteration converges to upper-triangular form, so only the
        # strictly lower triangle has to vanish. Testing the upper triangle
        # as well would block the early exit for any non-symmetric iterate.
        if np.allclose(np.tril(A, k=-1), 0, atol=tol):
            break

    # np.diagonal returns a read-only view; hand back an independent array.
    eigenvalues = np.diagonal(A).copy()
    eigenvectors = Q_total
    return eigenvalues, eigenvectors

In [4]:
import pandas as pd
import numpy as np

# Read the dataset CSV from the Kaggle input directory into a DataFrame
file_path = '/kaggle/input/wine-quality-clustering-unsupervised/winequality-red.csv'
df = pd.read_csv(file_path)

# Keep the numeric columns only, drop rows with missing values,
# and convert what remains to a NumPy array
numeric_data = (
    df.select_dtypes(include=[np.number])
    .dropna()
    .to_numpy()
)

def PCA(data_matrix, n_components):
    """
    Project data onto its leading principal components and reconstruct it.

    The data is mean-centered per column, the sample covariance matrix
    (normalised by n_samples - 1) is decomposed with ``eig``, and the
    eigenvectors belonging to the largest eigenvalues are used as the
    projection basis.

    Args:
        data_matrix (array-like): Samples in rows, features in columns.
        n_components (int): Number of principal components to keep. Values
            larger than the number of features keep every component.

    Returns:
        tuple: ``(reduced_data, reconstructed_data)`` where ``reduced_data`` is
        the centered data projected onto the selected components and
        ``reconstructed_data`` is that projection mapped back to the original
        feature space with the column means added back.

    Raises:
        ValueError: If ``n_components`` is negative, or if fewer than two
            samples are supplied (the sample covariance is undefined).
    """
    data_matrix = np.asarray(data_matrix)  # accept array-likes, not only ndarrays

    # A negative count would be interpreted by slicing as "all but the last k".
    if n_components < 0:
        raise ValueError(f"n_components must be non-negative, got {n_components}")

    n_samples = data_matrix.shape[0]
    # Guard against dividing by zero in the covariance normalisation below.
    if n_samples < 2:
        raise ValueError(
            f"PCA needs at least 2 samples to estimate covariance, got {n_samples}"
        )

    mean_vector = np.mean(data_matrix, axis=0)
    centered_matrix = data_matrix - mean_vector
    covariance_matrix = np.dot(centered_matrix.T, centered_matrix) / (n_samples - 1)

    eigvals, eigvecs = eig(covariance_matrix)

    # Order components by decreasing eigenvalue (explained variance).
    sorted_indices = np.argsort(eigvals)[::-1]
    top_indices = sorted_indices[:n_components]

    eigenvector_subset = eigvecs[:, top_indices]
    reduced_data = np.dot(centered_matrix, eigenvector_subset)
    reconstructed_data = np.dot(reduced_data, eigenvector_subset.T) + mean_vector

    return reduced_data, reconstructed_data





In [45]:
# Per-feature variance of the original data and its total, used as the baseline
variance_before_apply_PCA = numeric_data.var(axis=0)
total_variance_before_apply_PCA = variance_before_apply_PCA.sum()

# Sweep every possible number of components, from 1 up to the feature count
for i in range(1, numeric_data.shape[1] + 1):
    reduced_data, reconstructed_data = PCA(numeric_data, n_components=i)
    print(f"the shape after apply pca {reduced_data.shape}")

    # Mean squared error between the original data and its reconstruction
    reconstruction_error = np.square(numeric_data - reconstructed_data).mean()
    print(f"Reconstruction Error (MSE): {reconstruction_error:.6f}")

    # Total variance retained by the projected data
    variance_after_apply_PCA = reduced_data.var(axis=0)
    total_variance_after_apply_PCA = variance_after_apply_PCA.sum()

    # Note: the printed value is a fraction in [0, 1] (variance lost), not a percentage
    print(
        f"the percentage of the change in variance when use n_components by {i} is: ",
        (1 - total_variance_after_apply_PCA / total_variance_before_apply_PCA),
    )
    print("\n*------------------------------------------------------------------------------------------*\n")
    


the shape after apply pca (1599, 1)
Reconstruction Error (MSE): 5.381712
the percentage of the change in variance when use n_components by 1 is:  0.053920486465276585

*------------------------------------------------------------------------------------------*

the shape after apply pca (1599, 2)
Reconstruction Error (MSE): 0.556146
the percentage of the change in variance when use n_components by 2 is:  0.00557213909160148

*------------------------------------------------------------------------------------------*

the shape after apply pca (1599, 3)
Reconstruction Error (MSE): 0.297044
the percentage of the change in variance when use n_components by 3 is:  0.002976148226376152

*------------------------------------------------------------------------------------------*

the shape after apply pca (1599, 4)
Reconstruction Error (MSE): 0.145048
the percentage of the change in variance when use n_components by 4 is:  0.0014532614517668296

*---------------------------------------------

In [13]:
# Choose 5 principal components and show the shapes and the projected data
reduced_data, reconstructed_data = PCA(numeric_data, n_components=5)
print(
    f"the shape before apply PCA {numeric_data.shape}",
    "*------------------------------------------------------------------------------------------*",
    f"the shape after apply PCA {reduced_data.shape}",
    "*------------------------------------------------------------------------------------------*",
    f"data after apply PCA with 5 components: \n{reduced_data}",
    sep="\n",
)


the shape before apply PCA (1599, 12)
*------------------------------------------------------------------------------------------*
the shape after apply PCA (1599, 5)
*------------------------------------------------------------------------------------------*
data after apply PCA with 5 components: 
[[-13.22202658  -2.03192212   1.18123474   0.47564207  -1.20021245]
 [ 22.04025471   4.40179054   0.35499069   0.2602393   -0.75290663]
 [  7.16536169  -2.50832073   0.62463767   0.27530638  -0.72707587]
 ...
 [ -3.43293096  14.27427694   1.73227854  -0.21146278   0.35664677]
 [  1.13557385  16.30769238   2.18955318   0.294478    -0.75029295]
 [ -3.87592057   3.13011173   1.84248483  -1.73878746   0.2121722 ]]
