# PCA

In [2]:
import numpy as np
import scipy.io as sio
from plotly.offline import plot, iplot, init_notebook_mode
from plotly.graph_objs import Scatter3d, Scatter


In [7]:
def plot2D(x, y, color):
    """Show a 2-D marker scatter plot of ``y`` against ``x``.

    x, y  : coordinate sequences handed to the plotly ``Scatter`` trace.
    color : marker colour passed through to plotly unchanged.

    Nothing is returned; the trace is handed to ``iplot`` for display.
    """
    marker_style = dict(size=5, color=color, colorscale='Viridis', opacity=0.8)
    trace = Scatter(x=x, y=y, mode='markers', marker=marker_style)
    iplot([trace])

def plot3D(x,y,z,color):
    """Show a 3-D marker scatter plot of the points ``(x, y, z)``.

    x, y, z : coordinate sequences handed to the plotly ``Scatter3d`` trace.
    color   : marker colour passed through to plotly unchanged.

    Nothing is returned; the trace is handed to ``iplot`` for display.
    """
    marker_style = dict(size=5, color=color, colorscale='Viridis', opacity=0.8)
    trace = Scatter3d(x=x, y=y, z=z, mode='markers', marker=marker_style)
    iplot([trace])
    
    

In [4]:
# Load the toy data set stored under key 'D' of the .mat file and transpose it;
# the rest of the notebook treats rows as samples and columns as dimensions
# (column means are taken in the PCA functions, columns 0..2 are plotted as x/y/z).
data = sio.loadmat('./data/toydata.mat')['D'].transpose()

{'D': array([[  9.54122105e+00,   8.35856268e+00,   1.27192705e+01,
           1.17446818e+01,   1.07754479e+01,   9.45151552e+00,
           7.74550863e+00,   8.42393129e+00,   8.46098975e+00,
           1.05036840e+01,   8.60968935e+00,   1.05927497e+01,
           1.15987434e+01,   1.29857303e+01,   9.86756722e+00,
           8.00684973e+00,   1.02660209e+01,   1.04673576e+01,
           9.23295035e+00,   1.11777095e+01,   1.19337330e+01,
           1.21326137e+01,   9.22068213e+00,   1.15291778e+01,
           8.76998517e+00,   9.76201696e+00,   4.79581030e+00,
           9.45132604e+00,   6.90891330e+00,   1.12137999e+01,
           8.79632882e+00,   1.04482003e+01,   9.30890721e+00,
           9.79124680e+00,   1.39729895e+01,   1.35877182e+01,
           1.35694961e+01,   8.20532748e+00,   7.48581696e+00,
           1.07377091e+01,   7.74424262e+00,   9.17789911e+00,
           1.12393888e+01,   9.49787529e+00,   1.06740746e+01,
           6.59398902e+00,   1.02259791e+01,   7.5

In [9]:
# PCA using the Singular Value Decomposition (SVD) of the de-meaned data matrix.
#
# Input:
#     dataMatrix                 (nrSamples x nrDims), one sample per row
#     desiredVariancePercentage  fraction of the total variance to keep (0..1)
# Output:
#     projectedData              (nrComponents x nrSamples)
#
# The eigenvectors, eigenvalues, mean of the data and the de-meaned data
# matrix are computed inside the function but are not returned.

def usingSVD(dataMatrix, desiredVariancePercentage=1.0):
    """Project data onto its leading principal components, computed via SVD.

    Parameters
    ----------
    dataMatrix : array_like, shape (nrSamples, nrDims)
        One observation per row, one feature per column.
    desiredVariancePercentage : float, default 1.0
        Fraction (0..1) of the total variance the kept components must
        explain. At least one component is always kept, and never more than
        the decomposition provides.

    Returns
    -------
    projectedData : ndarray, shape (nrComponents, nrSamples)
        The de-meaned samples expressed in the basis of the kept principal
        directions; row ``i`` holds the scores on component ``i``.

    Raises
    ------
    ValueError
        If ``dataMatrix`` is not a non-empty 2-D array.
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    if dataMatrix.ndim != 2 or dataMatrix.size == 0:
        raise ValueError("dataMatrix must be a non-empty 2-D array (nrSamples x nrDims)")

    # De-mean the feature space (column-wise mean = mean of every dimension).
    meanDataMatrix = dataMatrix.mean(axis=0)
    demeanedDataMatrix = dataMatrix - meanDataMatrix

    # Thin SVD: only the right singular vectors are needed, so avoid building
    # the (nrSamples x nrSamples) left factor of the full decomposition.
    _, singularValues, Vt = np.linalg.svd(demeanedDataMatrix, full_matrices=False)

    # Squared singular values are proportional to the covariance eigenvalues
    # (factor 1 / (nrSamples - 1)), which is all the variance ratio needs.
    # np.linalg.svd already returns them in descending order, so no re-sorting.
    eigenValues = singularValues ** 2
    eigenVectors = Vt.T  # columns are the principal directions

    # Sign convention: the entry with the largest magnitude in each column is
    # made positive by flipping the WHOLE column, which keeps it an eigenvector.
    pivotRows = np.argmax(np.abs(eigenVectors), axis=0)
    pivotSigns = np.sign(eigenVectors[pivotRows, np.arange(eigenVectors.shape[1])])
    pivotSigns[pivotSigns == 0] = 1.0
    eigenVectors = eigenVectors * pivotSigns

    # Smallest number of components whose cumulative variance reaches the
    # requested fraction. Comparing against the last cumulative value itself
    # guarantees the threshold is reachable for fractions <= 1 despite rounding.
    cumulativeVariance = np.cumsum(eigenValues)
    threshold = desiredVariancePercentage * cumulativeVariance[-1]
    reached = np.nonzero(cumulativeVariance >= threshold)[0]
    r = int(reached[0]) + 1 if reached.size else eigenValues.size

    # Keep the eigenvectors of the desired variance, e.g. the first two if
    # together they explain 90% of the variance.
    eigvecs = eigenVectors[:, :r]

    # Project the de-meaned data onto the kept directions.
    projectedData = np.dot(eigvecs.transpose(), demeanedDataMatrix.transpose())

    return projectedData

In [10]:
# Run the SVD-based PCA on the toy data, asking for 0.9 of the total variance.
projectedDataSVD = usingSVD(data, .9)

In [11]:
init_notebook_mode(connected=True)

# First three columns of the data as a 3-D point cloud.
plot3D(data[:, 0], data[:, 1], data[:, 2], color='blue')

# First two rows of the SVD-based projection as a 2-D scatter.
plot2D(projectedDataSVD[0], projectedDataSVD[1], color='red')


In [12]:
# PCA using the eigenvalue decomposition of the covariance matrix of the data.
#
# Input:
#     dataMatrix                 (nrSamples x nrDims), one sample per row
#     desiredVariancePercentage  fraction of the total variance to keep (0..1)
# Output:
#     projectedData              (nrComponents x nrSamples)
#
# The eigenvectors, eigenvalues, mean of the data and the de-meaned data
# matrix are computed inside the function but are not returned.

 
def usingCOV(dataMatrix, desiredVariancePercentage=1.0):
    """Project data onto its leading principal components, computed from the
    eigenvalue decomposition of the covariance matrix.

    Parameters
    ----------
    dataMatrix : array_like, shape (nrSamples, nrDims)
        One observation per row, one feature per column.
    desiredVariancePercentage : float, default 1.0
        Fraction (0..1) of the total variance the kept components must
        explain. At least one component is always kept, and never more than
        the number of dimensions.

    Returns
    -------
    projectedData : ndarray, shape (nrComponents, nrSamples)
        The de-meaned samples expressed in the basis of the kept principal
        directions; row ``i`` holds the scores on component ``i``.

    Raises
    ------
    ValueError
        If ``dataMatrix`` is not 2-D or has fewer than two samples (the
        sample covariance is undefined for a single observation).
    """
    dataMatrix = np.asarray(dataMatrix, dtype=float)
    if dataMatrix.ndim != 2 or dataMatrix.shape[0] < 2 or dataMatrix.shape[1] == 0:
        raise ValueError("dataMatrix must be 2-D with at least two samples and one dimension")

    # De-mean the feature space (column-wise mean = mean of every dimension).
    meanDataMatrix = dataMatrix.mean(axis=0)
    demeanedDataMatrix = dataMatrix - meanDataMatrix

    # Covariance between dimensions; atleast_2d covers the single-dimension
    # case, where np.cov collapses the result to a scalar.
    covMatrix = np.atleast_2d(np.cov(demeanedDataMatrix, rowvar=False))

    # The covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues (ascending) and orthonormal eigenvectors as columns.
    eigenValues, eigenVectors = np.linalg.eigh(covMatrix)

    # Order by decreasing eigenvalue. The eigenvectors are the COLUMNS, so the
    # permutation has to be applied along axis 1.
    order = np.argsort(eigenValues)[::-1]
    eigenValues = eigenValues[order]
    eigenVectors = eigenVectors[:, order]

    # Sign convention (same as the SVD variant): the entry with the largest
    # magnitude in each eigenvector is made positive by flipping the whole column.
    pivotRows = np.argmax(np.abs(eigenVectors), axis=0)
    pivotSigns = np.sign(eigenVectors[pivotRows, np.arange(eigenVectors.shape[1])])
    pivotSigns[pivotSigns == 0] = 1.0
    eigenVectors = eigenVectors * pivotSigns

    # Smallest number of components whose cumulative variance reaches the
    # requested fraction. Comparing against the last cumulative value itself
    # guarantees the threshold is reachable for fractions <= 1 despite rounding.
    cumulativeVariance = np.cumsum(eigenValues)
    threshold = desiredVariancePercentage * cumulativeVariance[-1]
    reached = np.nonzero(cumulativeVariance >= threshold)[0]
    r = int(reached[0]) + 1 if reached.size else eigenValues.size

    # Keep the eigenvectors of the desired variance, e.g. the first two if
    # together they explain 90% of the variance.
    eigvecs = eigenVectors[:, :r]

    # Project the de-meaned data onto the kept directions.
    projectedData = np.dot(eigvecs.transpose(), demeanedDataMatrix.transpose())

    return projectedData

In [13]:
# Run the covariance-based PCA on the toy data, asking for 0.9 of the total variance.
projectedDataCOV = usingCOV(data, 0.9)

In [14]:
init_notebook_mode(connected=True)

# First three columns of the data as a 3-D point cloud.
plot3D(data[:, 0], data[:, 1], data[:, 2], color='blue')

# First two rows of the covariance-based projection ...
plot2D(projectedDataCOV[0], projectedDataCOV[1], color='red')

# ... followed by the SVD-based projection for a side-by-side comparison.
plot2D(projectedDataSVD[0], projectedDataSVD[1], color='red')


In [15]:
# Scratch example: build a small 2x3 integer array and display it as the cell output.
a = np.array([[1,2,3],[4,5,6]])
a


array([[1, 2, 3],
       [4, 5, 6]])

In [16]:
# Indexing a 2-D array with a single integer selects a row (here the first one).
a[0]

array([1, 2, 3])