In [5]:
import numpy as np
import pandas as pd
from scipy import linalg as la
from scipy import stats
from matplotlib import pyplot as plt
%matplotlib inline
#from matplotlib.ticker import AutoMinorLocator, MultipleLocator, FuncFormatter
#import seaborn as sns

1\. **PCA on 3D dataset**

* Generate a dataset with 3 features each with N entries (N being ${\cal O}(1000)$). With $N(\mu,\sigma)$ the normali distribution with mean $\mu$ and $\sigma$  standard deviation, generate the 3 variables $x_{1,2,3}$ such that:
    * $x_1$ is distributed as $N(0,1)$
    * $x_2$ is distributed as $x_1+N(0,3)$
    * $x_3$ is given by $2x_1+x_2$
* Find the eigenvectors and eigenvalues of the covariance matrix of the dataset
* Find the eigenvectors and eigenvalues using SVD. Check that they are two procedure yields to same result
* What percent of the total variability is explained by the principal components? Given how the dataset was constructed, do these make sense? Reduce the dimensionality of the system so that at least 99% of the total variability is retained.
* Redefine the data in the basis yielded by the PCA procedure
* Plot the data points in the original and the new coordiantes as a set of scatter plots. Your final figure should have 2 rows of 3 plots each, where the columns show the (0,1), (0,2) and (1,2) proejctions.


In [75]:
x1=np.random.normal(0, 1, 1000)
x2=x1+np.random.normal(0, 3, 1000)
x3=2*x1+x2
data = pd.DataFrame(
    { 'x1' : x1,
      'x2' : x2,
      'x3' : x3,
    }
    )
data

Unnamed: 0,x1,x2,x3
0,1.716107,0.852863,4.285077
1,-0.553477,-4.019568,-5.126522
2,0.782507,5.032375,6.597388
3,-1.028497,-4.300463,-6.357457
4,-0.680380,-2.104214,-3.464974
5,0.260760,-6.031062,-5.509543
6,-1.862402,-5.536014,-9.260817
7,-0.018197,-3.125550,-3.161944
8,1.476033,4.362691,7.314757
9,0.152357,1.357075,1.661789


In [87]:
c=np.cov(data, rowvar=False)
print("Covariance matrix:")
print(c)
l, V = la.eig(c)
print("\nCovariance matrix again with eigenvalues-eigenvectors decomposition:")
print(np.dot(V,np.dot(np.diag(l), la.inv(V))))
print("\nEigenvectors:","\nFirst->",V[0],"\nSecond->",V[1],"\nThird->",V[2])
print("\nEigenvalues:",l)
U, spectrum, Vt = la.svd(c)
print("\nCovariance matrix again with singular value decomposition:")
print(np.dot(U,np.dot(np.diag(spectrum), Vt)))
print("\nEigenvalues with SVD:",spectrum) #they are in non decreasing order
                                          #the last one is different since the machine precision is 1e-16

print("\nWeight of first eigenvalue:",spectrum[0]/spectrum.sum())
print("\nWeight of second eigenvalue:",spectrum[1]/spectrum.sum())
print("\nWeight of first+second eigenvalue:",(spectrum[0]+spectrum[1])/spectrum.sum())


#general case
threshold=0.99
i=1
while ((spectrum[:i].sum())/(spectrum.sum()))<threshold:
    i+=1
#Reducing the dimensionality to k<p simply means setting to zero all but first k diagonal values
spectrum[i:len(spectrum)]=0
print("\nTo keep the "+str(threshold*100)+"% we need",i,"eigenvalues")



print("\nRotate the data to obtain a trivial column:")
datarot=np.dot(data,U) #the last column becomes all 1e-16 which means zero
pd.DataFrame(datarot)


Covariance matrix:
[[ 0.98076416  0.90825825  2.86978657]
 [ 0.90825825  9.07383894 10.89035545]
 [ 2.86978657 10.89035545 16.62992859]]

Covariance matrix again with eigenvalues-eigenvectors decomposition:
[[ 0.98076416+0.j  0.90825825+0.j  2.86978657+0.j]
 [ 0.90825825+0.j  9.07383894+0.j 10.89035545+0.j]
 [ 2.86978657+0.j 10.89035545+0.j 16.62992859+0.j]]

Eigenvectors: 
First-> [-0.11996478 -0.81649658  0.56474931] 
Second-> [-0.57170905 -0.40824829 -0.71167556] 
Third-> [-0.8116386   0.40824829  0.41782307]

Eigenvalues: [ 2.47251425e+01+0.j -1.83875423e-15+0.j  1.95938922e+00+0.j]

Covariance matrix again with singular value decomposition:
[[ 0.98076416  0.90825825  2.86978657]
 [ 0.90825825  9.07383894 10.89035545]
 [ 2.86978657 10.89035545 16.62992859]]

Eigenvalues with SVD: [2.47251425e+01 1.95938922e+00 3.95797547e-16]

Weight of first eigenvalue: 0.926572096234674

Weight of second eigenvalue: 0.07342790376532611

Weight of first+second eigenvalue: 1.0

To keep the 99.0% we

Unnamed: 0,0,1,2
0,-4.171396,2.152613,-1.394845e-15
1,6.525304,0.406074,7.766757e-16
2,-8.325622,-0.382957,-1.197793e-15
3,7.741955,-0.176601,1.229390e-15
4,4.096927,-0.334473,7.105476e-16
5,7.888489,2.137409,2.005330e-16
6,10.904848,-0.981327,1.761328e-15
7,4.355444,0.892968,2.567848e-16
8,-8.608201,0.785042,-1.386493e-15
9,-2.142901,-0.185420,-2.888704e-16


In [85]:
datarot2=datarot[:,0:2] #I don't want the last column since is always zero
pd.DataFrame(datarot2)

Unnamed: 0,0,1
0,-4.171396,2.152613
1,6.525304,0.406074
2,-8.325622,-0.382957
3,7.741955,-0.176601
4,4.096927,-0.334473
5,7.888489,2.137409
6,10.904848,-0.981327
7,4.355444,0.892968
8,-8.608201,0.785042
9,-2.142901,-0.185420


2\. **PCA on a nD dataset**

Start from the dataset you have genereted in the previous exercise and add uncorrelated random noise. Such noise should be represented by other 10 uncorrelated variables normal distributed, with standar deviation much smaller (say, a factor 50) than those used to generate the $x_1$ and $x_2$.

Repeat the PCA procedure and compare the results with what you obtained before

3 \. **Looking at an oscillating spring** (optional)

Imagine you have $n$ cameras looking at a spring oscillating along the $x$ axis. Each  camera record the motion of the spring looking at it along a given direction defined by the pair $(\theta_i, \phi_i)$, the angles in spherical coordinates. 

Start from the simulation of the records (say ${\cal O}(1000)$) of the spring's motion along the x axis, assuming a little random noise affects the measurements along the $y$. Rotate such dataset to emulate the records of each camera.

Perform a Principal Component Analysis on the thus obtained dataset, aiming at finding the only one coordinate that really matters.


4\. **PCA on the MAGIC dataset** (optional)

Perform a PCA on the magic04.data dataset