In [8]:
import pandas as pd
import numpy as np
from pca_utils import PCA

In [9]:
# Relative path of the CSV that holds the dataset.
file_path = 'datasets/winequality-red.csv'

# Read the CSV into a DataFrame.
df = pd.read_csv(file_path)

# Build the numeric matrix in three explicit steps:
#   1. keep only the numeric columns,
#   2. discard any row that still contains a missing value,
#   3. convert the result to a NumPy array.
numeric_columns = df.select_dtypes(include=[np.number])
complete_rows = numeric_columns.dropna()
numeric_data = complete_rows.to_numpy()

In [6]:
# Baseline: per-feature variance of the original data, and the sum of those
# variances. The sum is the denominator of the variance comparison below.
variance_before_apply_PCA = numeric_data.var(axis=0)
total_variance_before_apply_PCA = variance_before_apply_PCA.sum()

n_features = numeric_data.shape[1]
separator = "\n*------------------------------------------------------------------------------------------*\n"

# Try every component count from 1 up to the number of columns.
for i in range(1, n_features + 1):
    # PCA (from pca_utils) returns the reduced data and a reconstruction.
    reduced_data, reconstructed_data = PCA(numeric_data, n_components=i)
    print(f"The shape after applying PCA: {reduced_data.shape}")

    # Mean squared error between the original data and its reconstruction.
    residual = numeric_data - reconstructed_data
    reconstruction_error = np.mean(residual ** 2)
    print(f"Reconstruction Error (MSE): {reconstruction_error:.6f}")

    # Per-column variance of the reduced data, and its total.
    variance_after_apply_PCA = np.var(reduced_data, axis=0)
    total_variance_after_apply_PCA = np.sum(variance_after_apply_PCA)

    # NOTE: despite the variable name and the printed label, this value is a
    # fraction (1 - after/before); it is never multiplied by 100.
    variance_ratio = total_variance_after_apply_PCA / total_variance_before_apply_PCA
    percentage_change_in_variance = (1 - variance_ratio)
    print(f"The percentage of variance change with n_components={i}: {percentage_change_in_variance:.6f}")

    print(separator)

The shape after applying PCA: (1599, 1)
Reconstruction Error (MSE): 5.381712
The percentage of variance change with n_components=1: 0.053920

*------------------------------------------------------------------------------------------*

The shape after applying PCA: (1599, 2)
Reconstruction Error (MSE): 0.556146
The percentage of variance change with n_components=2: 0.005572

*------------------------------------------------------------------------------------------*

The shape after applying PCA: (1599, 3)
Reconstruction Error (MSE): 0.297044
The percentage of variance change with n_components=3: 0.002976

*------------------------------------------------------------------------------------------*

The shape after applying PCA: (1599, 4)
Reconstruction Error (MSE): 0.145048
The percentage of variance change with n_components=4: 0.001453

*------------------------------------------------------------------------------------------*

The shape after applying PCA: (1599, 5)
Reconstruction E

In [7]:
# Choose 5 as the number of components, run PCA once with that setting, then
# report the shape before and after, followed by the reduced data.
reduced_data, reconstructed_data = PCA(numeric_data, n_components=5)

divider = "*------------------------------------------------------------------------------------------*"

# A single print call with sep="\n" emits the same five lines, in the same
# order, as five separate print calls would.
print(
    f"the shape before apply PCA {numeric_data.shape}",
    divider,
    f"the shape after apply PCA {reduced_data.shape}",
    divider,
    f"data after apply PCA with 5 components: \n{reduced_data}",
    sep="\n",
)


the shape before apply PCA (1599, 12)
*------------------------------------------------------------------------------------------*
the shape after apply PCA (1599, 5)
*------------------------------------------------------------------------------------------*
data after apply PCA with 5 components: 
[[-13.22202658  -2.03192212   1.18123474   0.47564207  -1.20021245]
 [ 22.04025471   4.40179054   0.35499069   0.2602393   -0.75290663]
 [  7.16536169  -2.50832073   0.62463767   0.27530638  -0.72707587]
 ...
 [ -3.43293096  14.27427694   1.73227854  -0.21146278   0.35664677]
 [  1.13557385  16.30769238   2.18955318   0.294478    -0.75029295]
 [ -3.87592057   3.13011173   1.84248483  -1.73878746   0.2121722 ]]
