# Import Libraries

In [1]:
import numpy as np
import pandas as pd

# Import Dataset

In [2]:
# Load the header-less CSV and wrap it in a NumPy matrix in one step.
# np.matrix is kept on purpose: column slices of a matrix stay 2-D, and the
# later `final[:, 0]` cell relies on that to print a column.
data = np.matrix(pd.read_csv('data.csv', header=None))
print("Original Data", data, sep="\n")

Original Data
[[0.5 0.7]
 [1.  1.1]
 [1.5 1.6]
 [1.1 0.9]
 [1.9 2.2]
 [2.2 2.9]
 [2.6 2.7]
 [2.  1.6]
 [2.5 2.4]
 [3.1 3. ]]


# Preprocessing Data

In [3]:
# Standardize each column: subtract the column mean, divide by the column
# standard deviation. np.std uses its default ddof=0 here.
column_means = np.mean(data, axis=0)
column_stds = np.std(data, axis=0)
data_std = (data - column_means) / column_stds

# Calculating Eigenvectors & Eigenvalues

In [4]:
# np.cov treats each row as one variable by default, so the standardized
# data is transposed to put the features on the rows.
cov_matrix = np.cov(np.transpose(data_std))

# Eigen-decomposition of the covariance matrix; column i of `eigen_vecs` is
# the eigenvector for `eigen_vals[i]`. np.linalg.eig does not promise any
# particular ordering of the eigenvalues.
eigen_vals, eigen_vecs = np.linalg.eig(cov_matrix)

## Eigenvalues

In [7]:
# Show the eigenvalues under a heading (heading and values on separate lines).
print("Eigenvalues", eigen_vals, sep="\n")

Eigenvalues
[2.1494305  0.07279173]


## Eigenvectors

In [8]:
# Show the eigenvector matrix under a heading; each column is one eigenvector.
print("Eigenvectors", eigen_vecs, sep="\n")

Eigenvectors
[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]


# Transform Data using Eigenvectors

In [6]:
# Project the standardized samples onto the eigenvector basis:
# column j of `final` holds every sample's coordinate along eigenvector j.
final = np.dot(data_std, eigen_vecs)
print("Transformed Data", final, sep="\n")

Transformed Data
[[-2.29649685  0.16564047]
 [-1.48493521  0.05849416]
 [-0.58532165  0.03939976]
 [-1.56916825 -0.20948047]
 [ 0.31047304  0.20022808]
 [ 1.20244884  0.54097911]
 [ 1.39382819 -0.0026079 ]
 [-0.12596767 -0.41995421]
 [ 1.03780164 -0.17489286]
 [ 2.11733792 -0.19780613]]


# Compress Data to 1D

Which column should we choose as our new compressed dataset? To answer this, we first sort the eigenvalues in descending order and then choose the eigenvector paired with the largest eigenvalue; projecting the data onto that eigenvector gives the new compressed data.

## Sort

In [9]:
# Pair each eigenvalue magnitude with its eigenvector (the rows of
# eigen_vecs.T are the columns of eigen_vecs), then order the pairs from the
# largest eigenvalue to the smallest. The sort key looks only at the
# eigenvalue, so the eigenvector arrays are never compared.
eig_pairs = sorted(
    ((np.abs(value), vector) for value, vector in zip(eigen_vals, eigen_vecs.T)),
    key=lambda pair: pair[0],
    reverse=True,
)

In [12]:
# Print the sorted (eigenvalue, eigenvector) pairs to check the ordering by eye.
print('Eigenvalues in descending order and the corresponding eigenvectors:', eig_pairs, sep='\n')

Eigenvalues in descending order and the corresponding eigenvectors:
[(2.149430495007601, array([0.70710678, 0.70710678])), (0.07279172721462168, array([-0.70710678,  0.70710678]))]


From the sorted list we see that the first eigenvector has the largest eigenvalue, so it is the one we keep. The new data is therefore the first column of the transformed data computed above, since that column is the projection onto this eigenvector.

In [11]:
# Keep only the principal component with the largest eigenvalue.
# The column is looked up from the eigenvalues instead of being hard-coded to
# 0, because np.linalg.eig does not guarantee that eigenvalues come back in
# any particular order. np.abs mirrors the magnitude used when sorting the
# (eigenvalue, eigenvector) pairs; on ties argmax picks the first column.
top_component = int(np.argmax(np.abs(eigen_vals)))
new_data = final[:, top_component]
print("New Data")
print(new_data)

New Data
[[-2.29649685]
 [-1.48493521]
 [-0.58532165]
 [-1.56916825]
 [ 0.31047304]
 [ 1.20244884]
 [ 1.39382819]
 [-0.12596767]
 [ 1.03780164]
 [ 2.11733792]]
