# PCA

In [131]:
import numpy as np
import scipy.io as sio
from plotly.offline import plot, iplot, init_notebook_mode
from plotly.graph_objs import Scatter3d, Scatter
from sklearn.decomposition import PCA


In [200]:
def plot2D(x, y, color):
    """Show a 2-D marker scatter plot through plotly's ``iplot``.

    Args:
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        color: Forwarded unchanged as the marker ``color``.
    """
    # Fixed marker appearance; only the colour is caller-controlled.
    marker_style = dict(
        size=5,
        color=color,
        colorscale='Viridis',
        opacity=0.8,
    )
    trace = Scatter(x=x, y=y, mode='markers', marker=marker_style)
    iplot([trace])

def plot3D(x,y,z,color):
    """Show a 3-D marker scatter plot through plotly's ``iplot``.

    Args:
        x: Values for the first axis.
        y: Values for the second axis.
        z: Values for the third axis.
        color: Forwarded unchanged as the marker ``color``.
    """
    # Fixed marker appearance; only the colour is caller-controlled.
    marker_style = dict(
        size=5,
        color=color,
        colorscale='Viridis',
        opacity=0.8,
    )
    trace = Scatter3d(x=x, y=y, z=z, mode='markers', marker=marker_style)
    iplot([trace])
    

def skleanr_PCA(data, desiredVariancePercentage):
    """Run scikit-learn's PCA on column-centred data and return the scores.

    Args:
        data: 2-D array; the mean is taken over axis 0 and subtracted
            before fitting.
        desiredVariancePercentage: Passed as the first positional argument
            of ``PCA`` (``n_components``). Per the scikit-learn docs a float
            in (0, 1) selects components by explained variance.

    Returns:
        The transposed result of ``fit_transform``, so each row holds one
        component's scores.
    """
    centred = data - data.mean(0)
    model = PCA(desiredVariancePercentage)
    scores = model.fit_transform(centred)
    return scores.transpose()
    

In [28]:
# Load the array stored under key 'D' of the .mat file and transpose it.
# The rest of the notebook averages over axis 0 and plots data[:, 0..2] as
# x/y/z, i.e. it treats each row of `data` as one sample.
data = sio.loadmat('./data/toydata.mat')['D'].transpose()

In [196]:
# Implements PCA using the Singular Value Decomposition (SVD) of the
# given dataMatrix.
#
# Input:
#     dataMatrix (nrSamples x nrDims): one sample per row; the mean is
#         taken over axis 0
#     desiredVariancePercentage: fraction of the total variance to retain,
#         e.g. 0.9 for 90%
# Output:
#     projectedData (nrComponents x nrSamples)
# Computed internally but not returned: the eigenvectors, the eigenvalues,
#     meanDataMatrix and demeanedDataMatrix.

def usingSVD(dataMatrix, desiredVariancePercentage=1.0):
    """Project data onto its leading principal components, computed via SVD.

    Args:
        dataMatrix: 2-D array with one sample per row and one feature per
            column (the mean is taken over axis 0).
        desiredVariancePercentage: Fraction of the total variance that the
            retained components must reach, e.g. ``0.9``. Components are
            added in order of decreasing variance until the cumulative
            fraction is at least this value. Values <= 0 keep no component.

    Returns:
        Array of shape (nrComponents, nrSamples); row ``k`` holds every
        sample's coordinate along the k-th principal component.
    """
    # De-mean the feature space.
    meanDataMatrix = dataMatrix.mean(0)
    demeanedDataMatrix = dataMatrix - meanDataMatrix

    # numpy returns V already transposed: the ROWS of Vh are the right
    # singular vectors, so transpose to get one eigenvector per column.
    # full_matrices=False avoids building an nrSamples x nrSamples U.
    _, Sigma, Vh = np.linalg.svd(demeanedDataMatrix, full_matrices=False)

    # Squared singular values are the eigenvalues of X^T X. numpy already
    # returns them in descending order, so no re-sorting is needed.
    eigenValues = Sigma ** 2
    eigenVectors = Vh.transpose()

    # Sign convention: flip each eigenvector so that its largest-magnitude
    # coefficient is positive. This makes the output deterministic.
    pivot = np.abs(eigenVectors).argmax(axis=0)
    signs = np.sign(eigenVectors[pivot, np.arange(eigenVectors.shape[1])])
    signs[signs == 0] = 1.0
    eigenVectors = eigenVectors * signs

    # Smallest number of components whose cumulative variance fraction
    # reaches the target. If rounding keeps the total just below the target
    # (e.g. 0.9999999 < 1.0) keep every component instead of indexing past
    # the end.
    explained = np.cumsum(eigenValues / np.sum(eigenValues))
    if desiredVariancePercentage <= 0:
        r = 0
    else:
        reached = np.flatnonzero(explained >= desiredVariancePercentage)
        r = int(reached[0]) + 1 if reached.size else len(explained)

    eigvecs = eigenVectors[:, :r]

    # Project the data: (r x nrDims) . (nrDims x nrSamples).
    projectedData = np.dot(eigvecs.transpose(), demeanedDataMatrix.transpose())

    return projectedData

In [197]:
# SVD-based PCA, requesting the components that cover 90% of the variance.
projectedDataSVD = usingSVD(data, .9)

In [198]:
# Reference projection from scikit-learn, used by the comparison plots below.
# NOTE(review): this name was read but never assigned in the notebook;
# skleanr_PCA is the only helper producing a (components x samples) result
# that was otherwise unused -- confirm this is the intended source.
projectedDataSVD2 = skleanr_PCA(data, .9)
projectedDataSVD2

In [199]:
# Enable plotly's notebook mode, then show the raw 3-D point cloud (blue)
# followed by the first two rows of each 2-D projection (red).
init_notebook_mode(connected=True)
plot3D(data[:, 0], data[:, 1], data[:, 2], 'blue')
plot2D(projectedDataSVD[0], projectedDataSVD[1], 'red')
plot2D(projectedDataSVD2[0], projectedDataSVD2[1], 'red')


In [193]:
# Implements PCA using the eigenvalue decomposition of the matrix
# X^T X built from the de-meaned dataMatrix X.
#
# Input:
#     dataMatrix (nrSamples x nrDims): one sample per row; the mean is
#         taken over axis 0
#     desiredVariancePercentage: fraction of the total variance to retain,
#         e.g. 0.9 for 90%
# Output:
#     projectedData (nrComponents x nrSamples)
# Computed internally but not returned: the eigenvectors, the eigenvalues,
#     meanDataMatrix and demeanedDataMatrix.

 
def usingCOV(dataMatrix, desiredVariancePercentage=1.0):
    """Project data onto its leading principal components via eigendecomposition.

    The eigenvectors are those of ``X^T X`` for the de-meaned data ``X``.
    This is the covariance matrix up to a constant factor, which does not
    affect the eigenvectors or the variance fractions.

    Args:
        dataMatrix: 2-D array with one sample per row and one feature per
            column (the mean is taken over axis 0).
        desiredVariancePercentage: Fraction of the total variance that the
            retained components must reach, e.g. ``0.9``. Components are
            added in order of decreasing variance until the cumulative
            fraction is at least this value. Values <= 0 keep no component.

    Returns:
        Array of shape (nrComponents, nrSamples); row ``k`` holds every
        sample's coordinate along the k-th principal component.
    """
    # De-mean the feature space.
    meanDataMatrix = dataMatrix.mean(0)
    demeanedDataMatrix = dataMatrix - meanDataMatrix

    # Unnormalised covariance (scatter) matrix, nrDims x nrDims.
    covMatrix = np.dot(demeanedDataMatrix.transpose(), demeanedDataMatrix)

    # The matrix is symmetric, so use eigh: it guarantees real eigenvalues
    # and orthonormal eigenvectors. eigh returns eigenvalues in ascending
    # order; reverse so the largest-variance direction comes first.
    eigenValues, eigenVectors = np.linalg.eigh(covMatrix)
    eigenValues = eigenValues[::-1]
    eigenVectors = eigenVectors[:, ::-1]
    # X^T X is positive semi-definite; negative values are rounding noise.
    eigenValues = np.clip(eigenValues, 0.0, None)

    # Sign convention: flip each eigenvector so that its largest-magnitude
    # coefficient is positive. This makes the output deterministic.
    pivot = np.abs(eigenVectors).argmax(axis=0)
    signs = np.sign(eigenVectors[pivot, np.arange(eigenVectors.shape[1])])
    signs[signs == 0] = 1.0
    eigenVectors = eigenVectors * signs

    # Smallest number of components whose cumulative variance fraction
    # reaches the target. If rounding keeps the total just below the target
    # (e.g. 0.9999999 < 1.0) keep every component instead of indexing past
    # the end.
    explained = np.cumsum(eigenValues / np.sum(eigenValues))
    if desiredVariancePercentage <= 0:
        r = 0
    else:
        reached = np.flatnonzero(explained >= desiredVariancePercentage)
        r = int(reached[0]) + 1 if reached.size else len(explained)

    eigvecs = eigenVectors[:, :r]

    # Project the data: (r x nrDims) . (nrDims x nrSamples).
    projectedData = np.dot(eigvecs.transpose(), demeanedDataMatrix.transpose())

    return projectedData

In [194]:
# Eigendecomposition-based PCA, requesting the components that cover 90% of
# the variance.
projectedDataCOV = usingCOV(data, 0.9)

In [195]:
# Enable plotly's notebook mode, then show the raw 3-D point cloud (blue)
# followed by the first two rows of each 2-D projection (red).
init_notebook_mode(connected=True)
plot3D(data[:, 0], data[:, 1], data[:, 2], 'blue')
plot2D(projectedDataCOV[0], projectedDataCOV[1], 'red')
plot2D(projectedDataSVD2[0], projectedDataSVD2[1], 'red')
