In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import metrics


%matplotlib inline


# Case #1: Congressional Voting Data

After you've downloaded the data from the repository, go ahead and load it with pandas.

In [3]:
# Load the congressional voting records into a DataFrame.
votes_csv_path = '../assets/datasets/votes.csv'
votes = pd.read_csv(votes_csv_path)

In [4]:
# Preview the first five rows to check the column layout and the vote encoding.
votes.head(n=5)

Unnamed: 0.1,Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
0,1,republican,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,2,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,3,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,4,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,5,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y


Next, let's define the x and y variables: 

In [5]:
# Build the feature matrix from the 12 columns at positions 2..13.
# Encoding: 'y' -> 1 and anything else -> 0, so 'n' AND missing votes (NaN) both
# become 0; a missing vote is indistinguishable from a "no" in this matrix.
# `.iloc` replaces the `.ix` indexer (removed in pandas 1.0); selection is positional.
# NOTE(review): the head() preview lists votes V1..V16, but this slice keeps only
# 12 columns -- confirm the remaining vote columns are excluded on purpose.
x = (votes.iloc[:, 2:14].values == 'y').astype(int)
# Target labels: the column at position 1 (the party label in the preview -- TODO confirm).
y = votes.iloc[:, 1].values

x

array([[0, 1, 0, ..., 1, 0, 1],
       [0, 1, 0, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 1, 0],
       ..., 
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 1, 0, 1]])

In [6]:
# Standardize each feature column (StandardScaler defaults) before running PCA.
vote_scaler = StandardScaler()
xStand = vote_scaler.fit_transform(x)



Next, create the covariance matrix from the standardized x-values and decompose it to find the eigenvalues and eigenvectors.

In [7]:
# np.cov expects one variable per row, so hand it the transposed
# (features x samples) view of the standardized data.
features_by_samples = np.transpose(xStand)
covMat1 = np.cov(features_by_samples)
# Eigendecomposition of the covariance matrix; column i of eigenVectors
# belongs to eigenValues[i].
eigen_result = np.linalg.eig(covMat1)
eigenValues, eigenVectors = eigen_result

Now, let's check the eigenvalues: 

In [8]:
# Show the eigenvalues in the order np.linalg.eig returned them (not sorted).
print(eigenValues)

[ 5.62972764  1.39305636  1.06146894  0.80225641  0.70993356  0.14530936
  0.54284626  0.49155717  0.21552425  0.38824614  0.34390066  0.30382302]


And the eigenvectors: 

In [9]:
# Eigenvectors are the COLUMNS of this matrix: column i pairs with eigenValues[i].
print(eigenVectors)

[[ 0.22061923 -0.19191042 -0.2311798   0.6015333   0.64692288 -0.05913583
   0.01353542 -0.16004841  0.00585837  0.21273334  0.07328259 -0.02731374]
 [-0.05724413 -0.62900238  0.01619862  0.44103428 -0.6015398   0.01795272
  -0.13185009 -0.1133197   0.08989291  0.0289876  -0.05163865 -0.04816312]
 [ 0.33800352 -0.11542286  0.14791992  0.01534538  0.00982922 -0.21322592
   0.4402007  -0.12772666  0.05088414 -0.334095   -0.42978186  0.54382131]
 [-0.35586317  0.15138904 -0.06237562  0.18068828 -0.0459195  -0.43415925
  -0.2911962  -0.17756453 -0.5028514  -0.18302562  0.23248396  0.40812969]
 [-0.37548378 -0.05339551 -0.01789308  0.13818482  0.1446782   0.73353445
   0.09844158  0.00808002  0.08273321 -0.31979114  0.17012079  0.36046194]
 [-0.29680203 -0.13345763  0.29959174 -0.12218445  0.0233219  -0.11677767
   0.60507797 -0.45888542 -0.03985356  0.2338475   0.34935622 -0.14217777]
 [ 0.33307938  0.17630807  0.03474003  0.04300489 -0.05955743 -0.03096424
  -0.17048157 -0.44244235  0.289

To find the principal components, find the eigenpairs, and sort them from highest to lowest. 

In [10]:
# Pair each eigenvalue's magnitude with its eigenvector (column i of eigenVectors).
eigenPairs = [(np.abs(eigenValues[i]), eigenVectors[:, i]) for i in range(len(eigenValues))]
# Sort on the eigenvalue only, largest first. A bare tuple sort falls through to
# comparing the eigenvector arrays whenever two eigenvalues tie, which raises
# "The truth value of an array with more than one element is ambiguous".
eigenPairs.sort(key=lambda pair: pair[0], reverse=True)
# Print just the sorted eigenvalues as a quick check of the ordering.
for pair in eigenPairs:
    print(pair[0])

5.62972764069
1.39305636407
1.06146893606
0.802256406825
0.709933557454
0.542846258132
0.491557170284
0.388246138447
0.34390066459
0.303823016763
0.215524254248
0.145309362018


Now, calculate the explained variance. Recall the methods we learned in lesson 2.2!

In [11]:
# Total variance is the sum of all eigenvalues of the covariance matrix.
totalEigen = sum(eigenValues)
# Share of the total carried by each component, in percent, largest first.
descending_eigenvalues = sorted(eigenValues, reverse=True)
varExpl = [100 * (ev / totalEigen) for ev in descending_eigenvalues]

In [12]:
# Per-component explained variance in percent, ordered largest to smallest.
print(varExpl)

[46.806547817243143, 11.582116130419827, 8.8252398132121161, 6.6701011601925959, 5.9025127190609199, 4.5133194641610288, 4.0868929483411272, 3.2279468215724685, 2.8592507362439656, 2.52603810871845, 1.791906634937757, 1.208127645896595]


Now, calculate the cumulative explained variance from the per-component values above.

In [13]:
# Running sum of the per-component percentages -> cumulative explained variance.
cvarex = np.asarray(varExpl).cumsum()

In [14]:
# Cumulative explained variance in percent; entry k covers the first k+1 components.
print(cvarex)

[  46.80654782   58.38866395   67.21390376   73.88400492   79.78651764
   84.2998371    88.38673005   91.61467687   94.47392761   96.99996572
   98.79187235  100.        ]


Now, conduct the PCA using scikit-learn.

In [15]:
# Number of principal components to keep for the voting data.
N_VOTE_COMPONENTS = 5
PCA_set = PCA(n_components=N_VOTE_COMPONENTS)
# Fit on the standardized votes and project them onto the retained components.
Y = PCA_set.fit_transform(xStand)

# Case #2: Airport Delays

In [16]:
# Load the airport operations table into a DataFrame.
airports_csv_path = '../assets/datasets/airport_operations.csv'
airports = pd.read_csv(airports_csv_path)

In [17]:
# Preview the first five rows of the airport operations table.
airports.head(n=5)

Unnamed: 0,airport,year,departures for metric computation,arrivals for metric computation,percent on-time gate departures,percent on-time airport departures,percent on-time gate arrivals,average_gate_departure_delay,average_taxi_out_time,average taxi out delay,average airport departure delay,average airborne delay,average taxi in delay,average block delay,average gate arrival delay
0,ABQ,2004,53971,53818,0.803,0.7809,0.7921,10.38,9.89,2.43,12.1,2.46,0.83,2.55,10.87
1,ABQ,2005,51829,51877,0.814,0.7922,0.8001,9.6,9.79,2.29,11.2,2.26,0.89,2.34,10.24
2,ABQ,2006,49682,51199,0.7983,0.7756,0.7746,10.84,9.89,2.16,12.33,2.12,0.84,2.66,11.82
3,ABQ,2007,53255,53611,0.8005,0.7704,0.7647,11.29,10.34,2.4,12.95,2.19,1.29,3.06,12.71
4,ABQ,2008,49589,49512,0.8103,0.7844,0.7875,10.79,10.41,2.41,12.32,1.82,1.03,2.79,11.48


First, let's define the x and y variables: the airport is going to be our "y" variable (the label), and the numeric metric columns are our "x" variables.

In [18]:
# Features: the 12 columns at positions 2..13. `.iloc` replaces the `.ix` indexer
# (removed in pandas 1.0) and makes the positional selection explicit.
# NOTE(review): the stop index 14 leaves out at least the last column shown in the
# head() preview -- confirm that exclusion is intended.
x2 = airports.iloc[:, 2:14].values
# Labels: the column at position 0 (the airport code in the preview -- TODO confirm).
y2 = airports.iloc[:, 0].values

Then, standardize the x variable for analysis

In [19]:
# Standardize the airport features (StandardScaler defaults).
# Note: this rebinds xStand, replacing the standardized voting data from Case #1.
airport_scaler = StandardScaler()
xStand = airport_scaler.fit_transform(x2)

Next, create the covariance matrix from the standardized x-values and decompose it to find the eigenvalues and eigenvectors.

In [20]:
# np.cov expects one variable per row, so pass the transposed
# (features x samples) view of the standardized airport data.
airport_features_by_samples = np.transpose(xStand)
covMat = np.cov(airport_features_by_samples)
# Eigendecomposition; column i of eigenVectors2 belongs to eigenValues2[i].
eigen_result2 = np.linalg.eig(covMat)
eigenValues2, eigenVectors2 = eigen_result2

Then, check your eigenvalues and eigenvectors:

In [21]:
# Print the eigenvalues, then the eigenvector matrix, each starting on its own line.
print(eigenValues2, eigenVectors2, sep='\n')

[  5.71594128e+00   4.10771763e+00   8.46070622e-01   4.92674626e-01
   4.68096266e-01   1.66713004e-01   8.78260266e-02   7.27989129e-02
   5.25813963e-02   2.99299918e-03   1.67049793e-04   7.05899593e-04]
[[ -1.92844661e-01  -3.85272521e-01  -4.00175037e-01  -1.64039585e-01
    1.86296653e-01   2.78156147e-01   7.86539360e-02   3.17041164e-02
    7.52767812e-02  -1.14438376e-02  -7.07864735e-01  -3.44682322e-02]
 [ -1.92353674e-01  -3.85057896e-01  -4.02190449e-01  -1.66802678e-01
    1.84403875e-01   2.78351867e-01   9.04981279e-02   3.77083630e-02
    8.09001581e-02   1.35190021e-02   7.04235896e-01   3.71215770e-02]
 [  2.87689649e-01  -3.33455724e-01   2.51323774e-01   8.13357968e-02
    7.50865080e-03   2.37735910e-02  -5.78177983e-02   1.41563250e-02
    4.95309257e-01  -6.57324120e-01   2.23495764e-02  -2.32792117e-01]
 [  3.80590914e-01  -1.72431188e-01   8.79462334e-02   7.06458258e-02
    2.02464908e-01   3.45123821e-03  -3.15914078e-01   9.30547916e-02
    4.84973886e-01 

To find the principal components, find the eigenpairs, and sort them from highest to lowest.

In [22]:
# Pair each eigenvalue's magnitude with its eigenvector (column i of eigenVectors2).
eigenPairs2 = [(np.abs(eigenValues2[i]), eigenVectors2[:, i]) for i in range(len(eigenValues2))]
# Sort on the eigenvalue only, largest first. A bare tuple sort falls through to
# comparing the eigenvector arrays whenever two eigenvalues tie, which raises
# "The truth value of an array with more than one element is ambiguous".
eigenPairs2.sort(key=lambda pair: pair[0], reverse=True)
# Print just the sorted eigenvalues as a quick check of the ordering.
for pair in eigenPairs2:
    print(pair[0])

5.71594128131
4.10771763057
0.846070621527
0.492674626426
0.468096266456
0.166713003709
0.0878260265801
0.0727989128724
0.0525813962709
0.0029929991831
0.000705899593354
0.000167049792638


Next, calculate the explained variance and the cumulative explained variance.

In [23]:
# Total variance is the sum of all eigenvalues of the airport covariance matrix.
# Note: totalEigen and varExpl are rebound here, replacing the Case #1 values.
totalEigen = sum(eigenValues2)
# Share of the total carried by each component, in percent, largest first.
descending_eigenvalues2 = sorted(eigenValues2, reverse=True)
varExpl = [100 * (ev / totalEigen) for ev in descending_eigenvalues2]
# Running sum of those percentages -> cumulative explained variance.
cumulvarExpl = np.asarray(varExpl).cumsum()

In [24]:
# Cumulative explained variance in percent; entry k covers the first k+1 components.
print(cumulvarExpl)

[ 47.57620567  81.76648321  88.80868815  92.9094282   96.80559213
  98.19321523  98.92422853  99.53016479  99.96782207  99.99273407
  99.99860957 100.        ]


In [25]:
# Number of principal components to keep for the airport data.
N_AIRPORT_COMPONENTS = 2
airports_pca = PCA(n_components=N_AIRPORT_COMPONENTS)
# Fit on the standardized airport features and project onto the retained components.
# Note: this rebinds Y, replacing the Case #1 projection.
Y = airports_pca.fit_transform(xStand)