# Singular Value Decomposition

In [102]:
import numpy as np
import pandas as pd

### Loading Scores

In [103]:
# Load the marks table from marks.csv (relative path, so it is resolved against
# the notebook's current working directory).
marks = pd.read_csv("marks.csv")

In [104]:
marks

Unnamed: 0,x1,x2
0,83.7,77.3
1,75.1,66.5
2,56.9,51.3
3,82.9,81.0
4,88.5,78.0
5,73.6,68.3
6,60.2,52.0
7,52.0,57.7
8,77.5,70.3
9,58.0,58.9


### Centering Data

In [105]:
avg = marks.mean(axis = 0)

In [144]:
avg

array([67.19 , 61.935])

In [106]:
# Column means as a plain NumPy array. Derived directly from `marks` (not from
# the current value of `avg`) so that re-running this cell is safe whether
# `avg` is still a pandas Series or has already been converted to an ndarray.
avg = marks.mean(axis=0).to_numpy()

In [107]:
# Centre every column on zero by subtracting that column's mean from each row.
centered_marks = marks.sub(marks.mean(axis=0), axis="columns")

In [108]:
centered_marks

Unnamed: 0,x1,x2
0,16.51,15.365
1,7.91,4.565
2,-10.29,-10.635
3,15.71,19.065
4,21.31,16.065
5,6.41,6.365
6,-6.99,-9.935
7,-15.19,-4.235
8,10.31,8.365
9,-9.19,-3.035


In [109]:
# Convert the centered DataFrame to a NumPy array; this is the matrix that is
# passed to np.linalg.svd below.
X = centered_marks.to_numpy()

In [110]:
X

array([[ 16.51 ,  15.365],
       [  7.91 ,   4.565],
       [-10.29 , -10.635],
       [ 15.71 ,  19.065],
       [ 21.31 ,  16.065],
       [  6.41 ,   6.365],
       [ -6.99 ,  -9.935],
       [-15.19 ,  -4.235],
       [ 10.31 ,   8.365],
       [ -9.19 ,  -3.035],
       [  4.01 ,  -1.535],
       [ 15.81 ,  11.165],
       [ -9.19 ,  -2.435],
       [  6.21 ,  -0.635],
       [  1.01 ,   4.565],
       [-11.49 , -14.835],
       [-21.89 , -18.335],
       [  8.11 ,   1.265],
       [ -5.09 ,  -0.935],
       [-23.99 , -20.235]])

### Decomposition of the matrix 

In [111]:
# X is the mean-centered marks matrix built in the previous step (20 rows x 2 columns).
# s comes back as a 1-D array of singular values, not as a matrix.
U, s, VT = np.linalg.svd(X, full_matrices=False)  # thin SVD
# Diagonal-matrix form of the singular values, for use in matrix products.
S = np.diag(s)

In [112]:
VT

array([[-0.76940665, -0.63875927],
       [-0.63875927,  0.76940665]])

In [113]:
U

array([[-0.30427308,  0.09125026],
       [-0.12164122, -0.11014559],
       [ 0.1987783 , -0.11512011],
       [-0.32789184,  0.33137329],
       [-0.3602198 , -0.08949283],
       [-0.12158253,  0.05741153],
       [ 0.15842682, -0.22734501],
       [ 0.19448169,  0.46084437],
       [-0.17939294, -0.01069254],
       [ 0.12174309,  0.25279753],
       [-0.02844203, -0.26763016],
       [-0.26074339, -0.10786539],
       [ 0.11656424,  0.2858105 ],
       [-0.0590833 , -0.31860406],
       [-0.04990317,  0.20503811],
       [ 0.24750641, -0.29139636],
       [ 0.38584325, -0.00891254],
       [-0.09523693, -0.30085269],
       [ 0.06099017,  0.18105986],
       [ 0.42407625, -0.01752815]])

In [114]:
s

array([74.00404867, 13.98371482])

In [115]:
# Scale each left singular vector by its singular value (S is diag(s)).
U @ S

array([[-22.51744   ,   1.2760176 ],
       [ -9.00194268,  -1.54024448],
       [ 14.71039928,  -1.60980681],
       [-24.26532398,   4.63382962],
       [-26.65772341,  -1.25144225],
       [ -8.99759939,   0.80282639],
       [ 11.72422585,  -3.17912776],
       [ 14.39243253,   6.44431618],
       [-13.27580387,  -0.14952147],
       [  9.0094815 ,   3.53504853],
       [ -2.10482518,  -3.74246389],
       [-19.2960664 ,  -1.50835884],
       [  8.62622594,   3.99669252],
       [ -4.37240316,  -4.4552683 ],
       [ -3.69303679,   2.86719449],
       [ 18.3164762 ,  -4.07480362],
       [ 28.55396281,  -0.12463047],
       [ -7.04791841,  -4.20703828],
       [  4.51351977,   2.53188948],
       [ 31.38335939,  -0.24510863]])

### Reconstructing the original matrix

In [120]:
# Multiply the three SVD factors back together; S is the diagonal matrix diag(s).
reconstruct = U @ S @ VT

In [121]:
# Add the column means back to undo the centering.
reconstruct_marks = reconstruct + avg

In [124]:
# Element-wise reconstruction error, rounded to two decimals.
(reconstruct_marks - marks).round(2)

Unnamed: 0,x1,x2
0,-0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


### What happens if we keep only one component?

In [125]:
s

array([74.00404867, 13.98371482])

In [126]:
# Keep only the leading singular value and zero out the rest. The value is read
# from `s` rather than retyped from the printed output, so it keeps full
# precision and stays correct if the input data changes.
s1 = [s[0]] + [0] * (len(s) - 1)

In [127]:
# Same product as the full reconstruction, but with s1 in place of s, so only
# the first singular value contributes.
reconstruct1 = U @ np.diag(s1) @ VT

In [142]:
reconstruct1

array([[ 17.32506807,  14.38322357],
       [  6.92615456,   5.75007435],
       [-11.31827903,  -9.39640393],
       [ 18.66990163,  15.49970068],
       [ 20.51062966,  17.02786799],
       [  6.9228128 ,   5.74730003],
       [ -9.02069733,  -7.48895796],
       [-11.07363329,  -9.19329972],
       [ 10.21449178,   8.48004281],
       [ -6.93195498,  -5.75488984],
       [  1.61946649,   1.3444766 ],
       [ 14.8465218 ,  12.32554132],
       [ -6.6370756 ,  -5.5100818 ],
       [  3.36415606,   2.79291306],
       [  2.84144707,   2.35896149],
       [-14.09281859, -11.699819  ],
       [-21.96960886, -18.23910849],
       [  5.42271529,   4.50192323],
       [ -3.47273212,  -2.8830526 ],
       [-24.14656541, -20.04641179]])

In [128]:
# Add the column means back so the one-component reconstruction is on the
# original marks scale.
reconstruct_marks1 = reconstruct1 + avg

In [129]:
# Per-entry error of the one-component reconstruction, rounded to two decimals.
(reconstruct_marks1 - marks).round(2)

Unnamed: 0,x1,x2
0,0.82,-0.98
1,-0.98,1.19
2,-1.03,1.24
3,2.96,-3.57
4,-0.8,0.96
5,0.51,-0.62
6,-2.03,2.45
7,4.12,-4.96
8,-0.1,0.12
9,2.26,-2.72


### How much information is lost?

In [130]:
# Take the singular values straight from the np.linalg.svd result instead of a
# hand-copied, rounded literal, so this cell stays correct if the data changes.
singular_values = s.tolist()

In [132]:
# Square each singular value.
singular_values_sq = np.square(singular_values)

In [133]:
# Share of the total squared singular values carried by each component.
singular_values_sq / singular_values_sq.sum()

array([0.96552551, 0.03447449])

# Using scikit-learn for PCA

In [135]:
from sklearn.decomposition import PCA

In [155]:
pca = PCA()

In [156]:
pca.fit(centered_marks)

In [157]:
pca.explained_variance_ratio_

array([0.96552551, 0.03447449])

In [158]:
pca.components_

array([[-0.76940665, -0.63875927],
       [-0.63875927,  0.76940665]])

In [None]:
# Singular values stored on the fitted PCA object, shown for comparison with
# `s` from np.linalg.svd above.
pca.singular_values_

### Projecting the data onto the principal components

In [159]:
pca.transform(centered_marks)

array([[-22.51744   ,   1.2760176 ],
       [ -9.00194268,  -1.54024448],
       [ 14.71039928,  -1.60980681],
       [-24.26532398,   4.63382962],
       [-26.65772341,  -1.25144225],
       [ -8.99759939,   0.80282639],
       [ 11.72422585,  -3.17912776],
       [ 14.39243253,   6.44431618],
       [-13.27580387,  -0.14952147],
       [  9.0094815 ,   3.53504853],
       [ -2.10482518,  -3.74246389],
       [-19.2960664 ,  -1.50835884],
       [  8.62622594,   3.99669252],
       [ -4.37240316,  -4.4552683 ],
       [ -3.69303679,   2.86719449],
       [ 18.3164762 ,  -4.07480362],
       [ 28.55396281,  -0.12463047],
       [ -7.04791841,  -4.20703828],
       [  4.51351977,   2.53188948],
       [ 31.38335939,  -0.24510863]])

In [160]:
# Map the component scores back through the principal axes, then add the
# column means to return to the original marks scale.
(pca.transform(centered_marks) @ pca.components_) + avg

array([[83.7, 77.3],
       [75.1, 66.5],
       [56.9, 51.3],
       [82.9, 81. ],
       [88.5, 78. ],
       [73.6, 68.3],
       [60.2, 52. ],
       [52. , 57.7],
       [77.5, 70.3],
       [58. , 58.9],
       [71.2, 60.4],
       [83. , 73.1],
       [58. , 59.5],
       [73.4, 61.3],
       [68.2, 66.5],
       [55.7, 47.1],
       [45.3, 43.6],
       [75.3, 63.2],
       [62.1, 61. ],
       [43.2, 41.7]])