# ANALYSING HOW A PRINCIPLE COMPONENT ANALYSIS WORKS

#  Implementation of PCA on Iris data set from Scikit, using SVD 

In [82]:
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

## Iris data set
iris = load_iris()

## Initialization
X = iris.data  
y = iris.target

## Step 1: Centering data

X_centered = X - np.mean(X, axis=0)

## X_centered standardized

scaled = StandardScaler()
Standardized_X = scaled.fit_transform(X_centered)

## Step 2: Compute covariance matrix using SVD

U, S, V_t = np.linalg.svd(Standardized_X, full_matrices=False)
cov_matrix = np.dot(V_t.T * S**2, V_t)

## Step 3: Computing cummulative explained variance ratio to help in determining what principal components to retain

variance_ratio = S**2 / np.sum(S**2)
cumulative_variance_ratio = np.cumsum(variance_ratio)


## Step 4: Select principal components based on cumulative explained variance ratio 

targeted_variance_explained = 0.95  
components_cumulative = np.argmax(cumulative_variance_ratio >= targeted_variance_explained) + 1

ncomponents = components_cumulative  

## Perform PCA with the selected number of components
pca = PCA(n_components=ncomponents)
pca.fit(X_centered)

## Access principal components
principal_components = pca.components_

## Step 5: Transforming data using SVD

Transformed_X = np.dot(Standardized_X, principal_components.T)


print("Principal components are:")
print(principal_components)
print("\nData Transformed is:")
print(Transformed_X)


Principal components are:
[[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
 [ 0.65658877  0.73016143 -0.17337266 -0.07548102]]

Data Transformed is:
[[-2.03106506  0.48431026]
 [-2.0213579  -0.51520868]
 [-2.19653936 -0.32830794]
 [-2.12348909 -0.59565432]
 [-2.09431031  0.57283359]
 [-1.73712933  1.34587222]
 [-2.18338808 -0.09149359]
 [-2.00670574  0.22681794]
 [-2.22084331 -1.08107679]
 [-2.03928615 -0.3470462 ]
 [-1.88992239  1.04929019]
 [-2.04559167  0.05784895]
 [-2.11230873 -0.58483041]
 [-2.47732381 -0.95305556]
 [-1.91921191  1.90132462]
 [-1.80042995  2.44465673]
 [-1.93189314  1.38528844]
 [-1.98390267  0.47437452]
 [-1.63346958  1.42639954]
 [-1.99358214  0.96876287]
 [-1.73417006  0.52533968]
 [-1.92696295  0.79074634]
 [-2.4642279   0.29401996]
 [-1.70459141  0.08877929]
 [-1.89951882  0.02828678]
 [-1.88018755 -0.45535933]
 [-1.86369001  0.19709242]
 [-1.93858566  0.55401366]
 [-1.96781981  0.39578692]
 [-2.0504665  -0.35787011]
 [-1.98722125 -0.44639345]
 [-1.7372271

#  Reading the data set golub data set

In [49]:
import pandas as pd
file_path = "golub.csv" 
data = pd.read_csv(file_path)
data


Unnamed: 0,Samples,BM.PB,Gender,Source,tissue.mf,cancer,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
0,39,BM,F,DFCI,BM:f,allB,-1363.276427,-1058.585495,-541.469194,74.349803,...,-35.081447,1565.618803,-485.680714,-170.261227,-919.114293,1675.050053,389.769289,-526.449219,-268.963924,-779.643092
1,40,BM,F,DFCI,BM:f,allB,-796.285053,-1167.103365,7.538493,83.544731,...,-404.737767,622.498054,-1275.354673,214.828233,-750.220666,-441.589276,9.841713,-671.911209,-671.911209,-644.272577
2,42,BM,F,DFCI,BM:f,allB,-679.139168,-1069.832308,-690.301829,-112.075981,...,193.780934,162.525483,-692.534361,517.498108,-674.674103,1265.396405,-65.192805,-714.859684,3672.066149,-822.021231
3,47,BM,M,DFCI,BM:m,allB,-1164.400197,-1109.939891,-990.127218,-238.574994,...,-256.002292,-31.625831,-621.975549,-458.594630,-630.689198,937.767618,-264.715941,-635.046022,475.944222,-794.070116
4,48,BM,F,DFCI,BM:f,allB,-1299.653758,-1401.998536,-1077.543813,-437.344560,...,-509.203660,383.591216,-905.517483,-36.675640,-1116.739685,316.087213,-210.879518,-953.423549,-291.448812,-1057.945876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,29,BM,,CALGB,BM:NA,aml,-589.611534,-877.740714,-618.647808,-192.037937,...,-236.709128,475.796364,-502.502712,-4.418936,-652.151201,0.048183,-509.203391,-609.713570,4925.046956,-627.582046
68,30,BM,,CALGB,BM:NA,aml,-1552.764727,-1288.774596,-1085.544097,-232.814069,...,-415.092970,657.628835,-754.508853,102.411495,-817.363646,657.628835,-410.902650,-905.360357,-109.199643,-951.453872
69,31,BM,,CALGB,BM:NA,aml,-772.071972,-809.591203,-593.303872,-193.834414,...,-621.995048,605.104501,-575.647763,-226.939618,-750.001836,596.276447,95.284364,-758.829890,-407.914731,-772.071972
70,32,BM,,CALGB,BM:NA,aml,-1124.873709,-1026.849847,-935.360908,-135.921851,...,-96.712306,1188.489447,-887.438131,88.443879,-859.120126,1693.856917,-236.124022,-769.809496,-205.627709,-861.298435


#  Uploading the data set in to matrix

In [69]:
A = data.to_numpy()
A

array([[39, 'BM', 'F', ..., -526.4492185893072, -268.96392361976007,
        -779.6430919760288],
       [40, 'BM', 'F', ..., -671.9112088490682, -671.9112088490682,
        -644.2725769000689],
       [42, 'BM', 'F', ..., -714.8596835849695, 3672.066149087798,
        -822.0212306426248],
       ...,
       [31, 'BM', nan, ..., -758.8298902647056, -407.91473137119885,
        -772.0719717323849],
       [32, 'BM', nan, ..., -769.8094961108546, -205.6277092450704,
        -861.2984345215222],
       [33, 'BM', nan, ..., -937.506736866882, 760.4336513947948,
        -933.2724715594964]], dtype=object)

#  Determining the number of rows and columns of matrix A

In [68]:
num_rows, num_columns = A.shape
print("Number of rows of matrix A:", num_rows)
print("Number of columns of matrix A:", num_columns)


Number of rows of matrix A: 72
Number of columns of matrix A: 7135


# (6) Rank of Sample covariance matrix

In [70]:
# Excluding non-numeric columns
numerical_columns = data.select_dtypes(include=[np.number]).columns

# Removing the first column
numerical_columns = numerical_columns[1:]

# New matrix without the first column and non numeric columns
A_numeric = data[numerical_columns]


## Data matrix standardized
scaler = StandardScaler()
A_numeric_scaled = scaler.fit_transform(A_numeric)

Sigma = np.cov(A_numeric_scaled)


rank = np.linalg.matrix_rank(Sigma)

print("Rank of the sample covariance matrix Sigma is:", rank)


Rank of the sample covariance matrix Sigma is: 71


# (7)  Principal components both general and all those that explain 95% of the total variance

In [71]:
eigenvals, eigenvecs = np.linalg.eigh(Sigma)

## Eigenvalues and corresponding eigenvectors sorted in descending order
ids = np.argsort(eigenvals)[::-1]
eigenvals = eigenvals[ids]
eigenvecs = eigenvecs[:, ids]

principal_components_1 = eigenvecs

threshold = 1e-10  

## Counting the number of eigenvalues above the threshold
num_principal_components = np.sum(eigenvals > threshold)


print("Principal components of the dataset are:")
print(principal_components_1)

print("Total number of principal components is:", num_principal_components)



Principal components of the dataset are:
[[-0.0775121   0.06771408  0.08406691 ... -0.01222254 -0.0164755
   0.11785113]
 [-0.08024876  0.1052726   0.1478624  ...  0.0021952   0.04728771
   0.11785113]
 [-0.00294399 -0.04295139 -0.05347065 ... -0.01779791 -0.03447103
   0.11785113]
 ...
 [-0.00967697  0.07652213 -0.0786428  ... -0.03118667  0.0483311
   0.11785113]
 [ 0.08433685  0.12676813 -0.09697544 ...  0.02487616 -0.0071052
   0.11785113]
 [ 0.12763622  0.25771071 -0.248957   ... -0.0294624  -0.01132518
   0.11785113]]
Total number of principal components is: 71


### Principal components that explain 95% of the total variance

In [63]:
## Explained variance ratio
explained_var_ratio = eigenvals / np.sum(eigenvals)

## Cummulative explained variance ratio
cummulative_explained_var_ratio = np.cumsum(explained_var_ratio)

## Targeted percentage of total variance to be explained
percentage_explained = 0.95  

## Number of principal components that explain the targeted percentage of total variance
components = np.argmax(cummulative_explained_var_ratio >= percentage_explained) + 1

principal_components = principal_components[:, :components]

print("Number of principal components that explain 95% of the total variance is:", components)
print("Selected principal components:")
print(principal_components)



Number of principal components that explain 95% of the total variance is: 59
Selected principal components:
[[-0.0775121   0.06771408  0.08406691 ... -0.01490875  0.01415589
   0.05773765]
 [-0.08024876  0.1052726   0.1478624  ...  0.0395039  -0.04477017
  -0.00825602]
 [-0.00294399 -0.04295139 -0.05347065 ...  0.05072252 -0.05323075
  -0.01716934]
 ...
 [-0.00967697  0.07652213 -0.0786428  ... -0.10129978  0.17816096
  -0.11155602]
 [ 0.08433685  0.12676813 -0.09697544 ...  0.01477278 -0.04616776
   0.07331358]
 [ 0.12763622  0.25771071 -0.248957   ...  0.06760259 -0.02359174
   0.04786125]]


# (8) Eigen Values for the sample covariance matrix

In [56]:
eigenvalues = np.linalg.eigvals(Sigma)

print("Eigenvalues of the sample covariance matrix Sigma:")
print(eigenvalues)


Eigenvalues of the sample covariance matrix Sigma:
[1.14603875e+01 8.30327974e+00 3.66960217e+00 2.54451415e+00
 2.16230352e+00 2.00969004e+00 1.75309100e+00 1.58098861e+00
 1.37469076e+00 1.29826902e+00 1.16322862e+00 1.13410045e+00
 1.04748397e+00 1.01803671e+00 9.48745154e-01 8.76906953e-16
 8.85853420e-01 8.24235725e-01 8.09868605e-01 7.30073829e-01
 6.97559816e-01 6.91139673e-01 6.74409013e-01 6.43149359e-01
 6.40591849e-01 6.25324722e-01 5.94042773e-01 5.79809812e-01
 2.19587559e-01 2.22652578e-01 2.24870070e-01 5.72098747e-01
 5.62249878e-01 2.41202252e-01 5.44811907e-01 2.51977627e-01
 2.56433072e-01 5.30571462e-01 2.71491210e-01 5.16072974e-01
 5.08208713e-01 2.77940294e-01 5.00084641e-01 4.92120870e-01
 4.85337543e-01 4.75686796e-01 2.85534898e-01 2.91268924e-01
 2.99109976e-01 3.04694428e-01 3.12600073e-01 4.65439942e-01
 4.58476870e-01 4.53604113e-01 4.47654105e-01 4.40792198e-01
 4.36839035e-01 3.26720125e-01 4.23108870e-01 3.34561539e-01
 3.37093333e-01 3.44102433e-01 3.4