In [1]:
%%javascript
// Raise the output-area auto-scroll threshold to 9999 so that long cell outputs
// are presumably rendered in full instead of inside a scrolling box.
// NOTE(review): relies on the `IPython` JavaScript global being available in the
// front end -- confirm it exists in the environment this notebook is run in.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [2]:
# Prepare the environment
%pylab inline
%matplotlib inline

import pandas as pd

import sklearn
from sklearn import cluster
from sklearn.decomposition import PCA

import scipy
from scipy.stats import moment
from scipy.stats import ttest_ind

# For running regressions and R-type formulas
import statsmodels
import statsmodels.api as sm
# R type regression formulas
import statsmodels.formula.api as smf

import patsy



Populating the interactive namespace from numpy and matplotlib


In [4]:
# Load Data
# Both CSVs are read with identical options (index_col=False: do not use the
# first column as the index), so read them in one pass and unpack the two
# frames in order: destinations first, then the training sample.
destinations, sample_train = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ('../data/destinations.csv', '../data/train_sample.csv')
)

### Principal Component Analysis

A Summary of the PCA Approach
    1. Standardise the data
    2. Obtain the Eigenvectors and Eigenvalues from the covariance matrix or correlation matrix, or perform Singular Value Decomposition
    3. Sort eigenvalues in descending order and choose the k eigenvectors that correspond to the k largest eigenvalues where k is the number of dimensions of the new feature subspace (k≤d)
    4. Compute projection matrix (W) from selected k eigenvectors.
    5. Transform the original dataset X via W to obtain a k-dimensional feature subspace Y

In [None]:
# Step 1 - Standardise the data
# PCA is sensitive to the scale of each feature, so every column is rescaled
# before the covariance / correlation matrix is computed.
from sklearn.preprocessing import StandardScaler

# NOTE(review): `stock_df` is not defined in any cell visible here (the load cell
# creates `destinations` and `sample_train`) -- confirm where it comes from.
# NOTE(review): fit_transform presumably returns a NumPy array rather than a
# DataFrame, despite the `_df` suffix -- later cells use it with np.cov / np.corrcoef.
stock_std_df = StandardScaler().fit_transform(stock_df)

In [None]:
# Step 2 - Obtain the Eigenvectors and Eigenvalues from the covariance matrix or correlation matrix
# Covariance matrix of the standardised data.
# rowvar=False: each column is a variable and each row an observation
# (equivalent to passing the transposed array).
stock_std_cov = np.cov(stock_std_df, rowvar=False)
# Eigendecomposition of the covariance matrix.
# A covariance matrix is symmetric, so use eigh rather than the general-purpose
# eig: eigh always returns real eigenvalues (in ascending order) and orthonormal
# eigenvectors, whereas eig can return complex values caused by round-off.
# Column i of eig_vecs is the eigenvector that belongs to eig_vals[i].
eig_vals, eig_vecs = np.linalg.eigh(stock_std_cov)

In [None]:
# Note: particularly in finance, the correlation matrix is typically used instead of the covariance matrix.
# For standardised data the two are interchangeable: the correlation matrix is the covariance matrix
# normalised by the standard deviations, so both decompositions give the same eigenvectors and the same
# explained-variance ratios. (The eigenvalues themselves differ only by the constant factor n/(n-1),
# because StandardScaler divides by the population standard deviation while np.cov uses n-1.)
#
# This cell overwrites eig_vals / eig_vecs with the correlation-based decomposition.

# rowvar=False: each column is a variable and each row an observation
stock_std_corr = np.corrcoef(stock_std_df, rowvar=False)
# Eigendecomposition of the correlation matrix.
# The matrix is symmetric, so eigh is used: it guarantees real eigenvalues
# (returned in ascending order) and orthonormal eigenvectors.
eig_vals, eig_vecs = np.linalg.eigh(stock_std_corr)



In [None]:
# Step 3 - Sorting Eigenvector, Eigenvalue pairs
# We look for which eigenvector(s) to drop without losing too much information: the eigenvectors with
# the lowest eigenvalues bear the least information about the distribution of the data.

# Make a list of (eigenvalue, eigenvector) tuples.
# Column i of eig_vecs is the eigenvector for eig_vals[i], so iterate over the
# rows of the transpose to pair each eigenvalue with its own vector.
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))

# Sort the (eigenvalue, eigenvector) tuples from high to low.
# Sort on the eigenvalue only: comparing whole tuples falls through to comparing
# the eigenvector arrays whenever two eigenvalues are equal, which raises
# "ValueError: The truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Show the eigenvalues in descending order
for pair in eig_pairs:
    print(pair[0])

In [None]:
# Explained variance: the share of the total variance (in percent) carried by
# each principal component, largest first, plus its running total.
tot = sum(eig_vals)
# Vectorised form of (eigenvalue / total) * 100 over the descending eigenvalues
var_exp = np.array(sorted(eig_vals, reverse=True)) / tot * 100
cum_var_exp = np.cumsum(var_exp)

In [None]:
# Step 4 - Projection Matrix
# The projection matrix is used to transform the dataset to the new feature subspace.
# Stack the eigenvectors of the two largest eigenvalues as columns, giving a
# (d x 2) matrix. column_stack infers d from the vectors themselves, so this
# works for any number of features instead of only for exactly 5.
matrix_w = np.column_stack([eig_vec for _, eig_vec in eig_pairs[:2]])



In [None]:
# Step 5 - Project the dataset onto the new feature subspace
# Y = X . W, where X is the standardised data (n_samples x d) and W is the
# (d x k) projection matrix. The projection must be applied to the data itself,
# not to its covariance matrix, to obtain one k-dimensional row per observation.
Y = stock_std_df.dot(matrix_w)
Y

#### Visualise PCA

In [None]:
# Plot the individual and cumulative explained variance per principal component.

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in Matplotlib 3.6 and
# the old names were later removed; pick whichever name this installation
# provides and fall back to the default style if neither exists.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
# One bar / step per component; derived from the data instead of hard-coding 5
n_components = len(var_exp)

with plt.style.context(whitegrid_style):
    plt.figure(figsize=(6, 4))

    plt.bar(range(n_components), var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    plt.step(range(n_components), cum_var_exp, where='mid',
             label='cumulative explained variance')
    # var_exp / cum_var_exp are percentages (scaled by 100), not ratios
    plt.ylabel('Explained variance (%)')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()

### Clustering of Hotels by Latent Features