## Principal Components Analysis

In [1]:
# Notebook-wide setup: silence R warnings (warn = -1) and keep package
# messages and warnings out of the knitted chunk output.
options(warn = -1)
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE
)

In [6]:
library(tidyverse)

In [8]:
# Compact overview of the data set: one line per column with its type and
# leading values.
dplyr::glimpse(USArrests)

Rows: 50
Columns: 4
$ Murder   <dbl> 13.2, 10.0, 8.1, 8.8, 9.0, 7.9, 3.3, 5.9, 15.4, 17.4, 5.3, 2.…
$ Assault  <int> 236, 263, 294, 190, 276, 204, 110, 238, 335, 211, 46, 120, 24…
$ UrbanPop <int> 58, 48, 80, 50, 91, 78, 77, 72, 80, 60, 83, 54, 83, 65, 57, 6…
$ Rape     <dbl> 21.2, 44.5, 31.0, 19.5, 40.6, 38.7, 11.1, 15.8, 31.9, 25.8, 2…


In [10]:
# Per-column summary statistics: the columns sit on very different scales,
# so their means differ widely.
summary(object = USArrests)

     Murder          Assault         UrbanPop          Rape      
 Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
 1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
 Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
 Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
 3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
 Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00  

In [11]:
# The column means differ widely (see the summary above), and so do the
# variances. vapply() walks the data frame column by column and returns a
# named numeric vector without first coercing the data frame to a matrix.
vapply(USArrests, var, numeric(1))

# The variables must be scaled before PCA: UrbanPop is the percentage of each
# state's population living in urban areas, which is not comparable to the
# number of assaults per 100,000 individuals. Without scaling, the principal
# components would be driven by Assault, which has the largest mean and
# variance.

# prcomp() centers each variable to mean zero by default; scale. = TRUE also
# rescales each variable to standard deviation one. The argument is spelled
# `scale.` (trailing dot); `scale = TRUE` only works via partial matching.
pr.out <- prcomp(USArrests, scale. = TRUE)

names(pr.out)

# center and scale hold the means and standard deviations of the variables
# that were used for scaling prior to running PCA.
# rotation holds the principal component loadings: each column contains the
# corresponding loading vector.
pr.out$center
pr.out$scale
pr.out$rotation

# We see 4 distinct principal components.
# In general there are min(n - 1, p) informative principal components in a
# data set with n observations and p variables.

Unnamed: 0,PC1,PC2,PC3,PC4
Murder,-0.5358995,-0.4181809,0.3412327,0.6492278
Assault,-0.5831836,-0.1879856,0.2681484,-0.74340748
UrbanPop,-0.2781909,0.8728062,0.3780158,0.13387773
Rape,-0.5434321,0.1673186,-0.8177779,0.08902432
