In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

# Load dataset

In [1]:
def load_data(filename):
    '''
    Read a CSV file and hand back its contents as a plain array
    
    param:   filename (str) - filename/path of the CSV file to parse
    return:  the parsed table's values (the DataFrame's ``.values``);
             with read_csv defaults the first line is used as column
             names and is therefore not part of the returned values
    '''
    # NOTE: orientation is not checked here - the caller must make sure
    # the file is laid out as samples (rows) x features (columns)
    return pd.read_csv(filename).values

In [2]:
def norm_data(data):
    '''
    Normalize data to z-values (0 mean and 1 std dev for each feature)
    
    A feature whose standard deviation is exactly 0 (a constant column)
    cannot be scaled to unit variance; it is returned as all zeros instead
    of the NaN values a plain division by zero would produce.
    
    param:   data - data values (n_samples, n_features)
    return:  data - normalized data, same shape as the input
    '''
    means = data.mean(axis=0)    # mean for each feature
    stdevs = data.std(axis=0)    # std dev for each feature
    # Dividing a centred constant column by its std is 0/0 -> NaN.
    # Swapping a zero std for 1 leaves the (all-zero) centred column at 0.
    safe_stdevs = np.where(stdevs == 0, 1.0, stdevs)
    return (data - means) / safe_stdevs

# Covariance matrix?

# PCA 

In [4]:
from sklearn.decomposition import PCA
def pca(data, n_components=None):
    '''
    Does PCA on normalized data set. Can optionally set fewer components
    
    params:
        data: normalized data (n_samples, n_features)
        n_components: passed straight to sklearn's PCA; the default (None)
            leaves the number of components up to sklearn's own default
    return: 
        data_pc: data transformed onto principal components
        components:  principal axes in feature space, array, shape (n_components, n_features)
        weights: fraction of variance explained by each of the selected components
            (PCA.explained_variance_ratio_). array, shape (n_components,)
    '''
    # Named `model` so the estimator does not shadow this function's own name
    model = PCA(n_components=n_components)
    # fit_transform, not fit: fit() returns the estimator object itself,
    # while callers are promised the samples projected onto the components
    data_pc = model.fit_transform(data)
    components = model.components_
    weights = model.explained_variance_ratio_
    
    return data_pc, components, weights

In [5]:
def cum_var_plot(weights, desired_var):
    '''
    Cumulative variance plot (number of components vs cumulative variance captured) with calculated number 
    of PCs required to get to a certain desired variance explained
    
    Draws on the current matplotlib axes: the cumulative curve plus a dashed
    vertical line at the required number of components.
    
    params:
        weights: variance explained by each of the selected components. array, shape (n_components,)
        desired_var:  cumulative variance to find number of PCs for, in the same
            units as weights (e.g. 0.9 when weights are fractions summing to 1)
    return
        pcs_req:   (int) smallest number of leading pcs capturing at least desired_var
        captured_var:   (float) exact cumulative variance captured by pcs_req
    raises
        ValueError: if even all components together do not reach desired_var
    '''
    INDEX_SHIFT = 1    # component counts are 1-based, array indices are 0-based
    cum_var = np.cumsum(weights)
    # 0-based positions at which the running total has reached the target
    reached = np.flatnonzero(cum_var >= desired_var)
    if reached.size == 0:
        raise ValueError(
            'desired_var is larger than the total variance captured by all components'
        )
    # First position that reaches the target, converted to a component count
    pcs_req = int(reached[0]) + INDEX_SHIFT
    captured_var = float(cum_var[pcs_req - INDEX_SHIFT])
    plt.plot(range(INDEX_SHIFT, len(cum_var) + INDEX_SHIFT), cum_var)
    plt.axvline(x=pcs_req, ymin=0, ymax=1, color='k', linestyle='--')
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative variance captured')
    plt.title('Cumulative Variance Captured by Principal Components')
    
    return pcs_req, captured_var