In [72]:
import pandas as pd
import numpy as np

In [73]:
# Sample dataset used to try out the functions defined in this notebook.
POINTS_CSV_URL = 'https://raw.githubusercontent.com/ryanleeallred/datasets/master/points.csv'
df = pd.read_csv(POINTS_CSV_URL)

## Standardizing

The process of rescaling data onto the standard scale, where mean = 0 and std = 1.

This allows features to be compared with each other because, once standardized, they sit on the same scale.

### Raw

In [74]:
def std_raw(df):
    """Standardize every column of ``df`` to mean 0 and standard deviation 1.

    Each column is standardized separately with ``(x - mean) / std``. The
    standard deviation is the pandas default (sample std, ``ddof=1``).

    The input frame is left untouched: a new DataFrame with the same index
    and column labels is returned, so re-running the calling cell always
    starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one feature per column.

    Returns
    -------
    pandas.DataFrame
        Standardized copy of ``df``. A constant column (std of 0) comes out
        as NaN because of the division by zero.
    """
    mean = df.mean()  # per-column means
    std = df.std()  # per-column sample standard deviations
    # Vectorized arithmetic aligns on column labels, so each column is
    # shifted and scaled by its own statistics without mutating ``df``.
    return (df - mean) / std

### Using a library

In [75]:
from sklearn.preprocessing import StandardScaler

def std_lib(df):
    """Standardize ``df`` with scikit-learn's ``StandardScaler``.

    The scaler is fitted on ``df`` and then used to transform that same data.
    The transformed values are wrapped in a new DataFrame built without
    explicit labels, so the result carries default integer row/column labels.
    """
    scaler = StandardScaler().fit(df)  # fit the scaler to the dataset
    scaled_values = scaler.transform(df)  # transform with the fitted scaler
    return pd.DataFrame(scaled_values)

## Principal Component Analysis (PCA)

Using the dataset's features, PCA derives new features (principal components) that capture as much of the dataset's variance as possible. It does this by projecting the dataset onto the eigenvectors of its variance-covariance matrix, ordered from the largest eigenvalue to the smallest.

### Raw

In [83]:
### Finding the variance-covariance matrix

def cov_raw(std_df):
    """Compute the sample variance-covariance matrix by hand.

    The columns are centered on their means, the cross-product matrix
    ``X.T @ X`` is formed, and the result is divided by ``n - 1`` (``n`` being
    the number of rows). For data that is already standardized the centering
    is a no-op.

    Parameters
    ----------
    std_df : pandas.DataFrame or array-like
        Numeric data, one feature per column (typically standardized).

    Returns
    -------
    pandas.DataFrame
        Square matrix (features x features) with default integer labels.
        With fewer than two rows the sample covariance is undefined and the
        entries come out as NaN.
    """
    values = np.asarray(std_df, dtype=float)
    n_rows = values.shape[0]
    centered = values - values.mean(axis=0)
    scatter = np.dot(centered.T, centered)
    # Normalize by the degrees of freedom. Dividing by element [0][0] is only
    # equivalent when the first column happens to have unit variance.
    return pd.DataFrame(scatter / (n_rows - 1))

def cov_lib(std_df):
    """Return the variance-covariance matrix of ``std_df`` via ``DataFrame.cov``."""
    covariance = std_df.cov()
    return covariance

### Finding the eigenvectors of our covariance matrix

def eigen_lib(cov):
    """Eigen-decompose the square matrix ``cov`` with ``np.linalg.eig``.

    The result of ``np.linalg.eig`` is returned unchanged; it unpacks as
    ``(eigenvalues, eigenvectors)``.
    """
    decomposition = np.linalg.eig(cov)
    return decomposition

### Project the data onto the eigenvectors, ordered by largest eigenvalue

def PCA_raw(std_df, n_PCA=None):
    """Project ``std_df`` onto its principal components, computed by hand.

    The covariance matrix from ``cov_raw`` is eigen-decomposed with
    ``eigen_lib``, the eigenvectors are ordered by descending eigenvalue, and
    the data is projected onto them.

    Parameters
    ----------
    std_df : pandas.DataFrame
        Standardized data, one feature per column.
    n_PCA : int, optional
        Number of leading components to keep. ``None`` (the default) keeps
        every component.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``std_df`` and one column per kept component,
        with the highest-variance component first.
    """
    cov = cov_raw(std_df)
    val, vec = eigen_lib(cov)

    # Rank eigenvectors (the columns of ``vec``) by descending eigenvalue.
    order = np.argsort(val)[::-1]
    components = vec[:, order]
    if n_PCA is not None:
        components = components[:, :n_PCA]

    # Project the standardized data onto the selected eigenvectors.
    return pd.DataFrame(std_df.dot(components))

### Using a library

In [84]:
from sklearn.decomposition import PCA

def PCA_lib(std_df, n_PCA):
    """Project ``std_df`` onto its principal components with scikit-learn.

    A ``PCA`` model constructed with ``n_PCA`` is fitted on ``std_df`` and
    then used to transform that same data. The transformed values are
    returned wrapped in a new DataFrame.
    """
    model = PCA(n_PCA).fit(std_df)
    projected = model.transform(std_df)
    return pd.DataFrame(projected)