In [100]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.linalg as la

df = pd.read_csv("E:\data sets\mathsdataset.csv")
df.head()

Unnamed: 0,C1,C2,C3,C4,C5
0,65.0,73.0,34.0,71.0,74.0
1,64.0,80.0,51.0,75.0,87.0
2,68.0,80.0,52.0,64.0,77.0
3,61.0,71.0,53.0,85.0,87.0
4,67.0,51.0,54.0,45.0,66.0


In [57]:
# Prepare the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)


In [79]:
#Covariance Matrix Calculation
covariance_matrix = np.cov(scaled_data.T)
covariance_matrix 

array([[ 1.02380952,  0.29943684,  0.22704342, -0.02003864, -0.12535509],
       [ 0.29943684,  1.02380952,  0.28042538,  0.6188326 ,  0.59438571],
       [ 0.22704342,  0.28042538,  1.02380952,  0.24097965,  0.30827073],
       [-0.02003864,  0.6188326 ,  0.24097965,  1.02380952,  0.63019808],
       [-0.12535509,  0.59438571,  0.30827073,  0.63019808,  1.02380952]])

In [78]:
#Eigenvalue and Eigenvector Calculation
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

# Print the results
print("Eigenvalues:")
print(eigenvalues)
print("\nEigenvectors:")
print(eigenvectors)

Eigenvalues:
[2.43568319 1.22187655 0.79612226 0.2818988  0.38346682]

Eigenvectors:
[[ 0.11575128  0.83486689  0.33549402  0.42067727  0.0085631 ]
 [ 0.55037393  0.12635536  0.36432537 -0.69779944  0.24792302]
 [ 0.3357494   0.36923272 -0.84820177 -0.14667182 -0.09990333]
 [ 0.53505135 -0.23163286  0.16819621  0.19402363 -0.77079903]
 [ 0.53355654 -0.3115205  -0.08351397  0.52625923  0.57822922]]


In [85]:
#Sorting the Components
sorted_indices = np.argsort(eigenvalues)[::-1]  # Sort eigenvalues in descending order
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]
# Printing the results
print("Sorted Eigenvalues:")
print(sorted_eigenvalues)
print("\nSorted Eigenvectors:")
print(sorted_eigenvectors)

Sorted Eigenvalues:
[2.43568319 1.22187655 0.79612226 0.38346682 0.2818988 ]

Sorted Eigenvectors:
[[ 0.11575128  0.83486689  0.33549402  0.0085631   0.42067727]
 [ 0.55037393  0.12635536  0.36432537  0.24792302 -0.69779944]
 [ 0.3357494   0.36923272 -0.84820177 -0.09990333 -0.14667182]
 [ 0.53505135 -0.23163286  0.16819621 -0.77079903  0.19402363]
 [ 0.53355654 -0.3115205  -0.08351397  0.57822922  0.52625923]]


In [89]:
#Choosing the Number of Components
explained_variance_ratio = sorted_eigenvalues / np.sum(sorted_eigenvalues)
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1  # Select components explaining 95% of the variance

print("Explained Variance Ratio:")
print(explained_variance_ratio)
print("Cumulative Variance Ratio:")
print(cumulative_variance_ratio)


Explained Variance Ratio:
[0.47580788 0.23869216 0.15552156 0.0749098  0.0550686 ]
Cumulative Variance Ratio:
[0.47580788 0.71450004 0.8700216  0.9449314  1.        ]


In [172]:
#Projection
pca = PCA(n_components=5)
transformed_data = pca.fit_transform(scaled_data)
X_pca = pd.DataFrame(transformed_data)
X_pca.head()


Unnamed: 0,0,1,2,3,4
0,2.001956,-1.840493,3.447027,0.416684,-0.115572
1,-0.198231,-1.357451,1.231248,0.846801,-0.007221
2,0.763188,-0.319936,1.196589,0.932971,0.550687
3,-0.260523,-1.858092,0.666702,-0.050577,-0.544116
4,3.640094,0.098667,-0.271699,0.79493,-0.281268


In [173]:
#Feature Selection
# Identify the components that contribute most to the variance explained
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)


In [178]:
# Determine the number of components explaining a desired amount of variance (e.g., 90%)
desired_variance = 0.8
n_components = np.argmax(cumulative_variance_ratio >= desired_variance) + 1


In [183]:
# Select the top n_components and reconstruct the data
X_selected = df_pca.iloc[:, :n_components]
X_reconstructed = np.dot(X_selected, pca.components_[:n_components, :]) + pca.mean_

In [185]:
# Step 4: Error Analysis
# Calculate the reconstruction error (mean squared error) between the original and reconstructed data
mse = np.mean((X_pca - X_reconstructed) ** 2)
mse

0    3.881038
1    1.734767
2    3.087083
3    1.141652
4    1.073857
dtype: float64