In [None]:

from pandas import *
from numpy import *
set_printoptions(legacy = '1.25')

# read the csv into a plain array, one row per image
mnist = read_csv('mnist.csv').to_numpy()

# column 0 is taken as the label, the remaining columns as the features
labels, dataset = mnist[:,0], mnist[:,1:]

# covariance between features (cov expects variables along rows, hence .T);
# its trace is the total variance
Q = cov(dataset.T)
totvar = Q.trace()


In [None]:

from scipy.linalg import eigh

# use eigh for symmetric matrices such as the covariance Q;
# it returns the eigenvalues in ascending order
lamda, V = eigh(Q)

# reverse into descending order (largest variance first);
# named lamda_desc rather than `sorted` so that the Python
# builtin sorted() is not shadowed in later cells
lamda_desc = lamda[::-1]
percent = lamda_desc * 100 / totvar

# cumulative percentage explained by the leading eigenvalues
sums = cumsum(percent)

# first 20 rows: individual and cumulative percentages
data = array([percent, sums])
print(round(data.T[:20], 3))

d = len(lamda)
from matplotlib.pyplot import *

# one step per eigenvalue, with edges 0..d
stairs(percent, range(d+1))
grid()
show()


In [None]:

# projection matrix onto top n
# eigenvectors of the covariance matrix
# of dataset

def pca(dataset, n):
	"""Return the projection matrix onto the top n principal directions.

	dataset: 2-d array, one sample per row and one feature per column.
	n: number of leading eigenvectors of the covariance matrix to keep.

	Returns the d x d matrix P = Vn Vn^T, where d is the number of
	features and the columns of Vn are the eigenvectors belonging to
	the n largest eigenvalues of the covariance matrix.
	"""
	Q = cov(dataset.T)
	# eigh returns the eigenvalues in increasing order;
	# columns of V are the matching eigenvectors
	_, V = eigh(Q)
	# reverse the columns so the largest eigenvalues come first
	V = V[:,::-1]
	Vproj = V[:,:n] # top n columns
	P = dot(Vproj, Vproj.T)
	return P


In [None]:

from scipy.linalg import svd
	
# projection matrix onto top n 
# right singular vectors of 
# centered dataset
	
def pca_with_svd(dataset, n):
	"""Return the projection matrix onto the top n right singular vectors.

	dataset: 2-d array, one sample per row and one feature per column.
	n: number of leading right singular vectors of the centered
	   dataset to keep.

	Returns the d x d matrix P = Vn Vn^T, where d is the number of
	features and the columns of Vn are the top n right singular vectors.
	"""
	# center dataset
	mu = mean(dataset, axis = 0)
	A = dataset - mu
	# Only Vt is used below. With the default full_matrices = True, svd
	# also builds a square U with one row and one column per sample,
	# which is wasted time and memory for a tall dataset. The reduced
	# factorization is requested whenever it still returns every right
	# singular vector (rows >= columns), so Vt keeps the same shape.
	rows, cols = A.shape
	# rows of Vt are right singular vectors
	_, _, Vt = svd(A, full_matrices = rows < cols)
	# no need to sort, already decreasing order
	Vproj = Vt[:n] # top n rows
	P = dot(Vproj.T, Vproj)
	return P


In [None]:

def display_image(v, row, col, i, dims = (28,28)):
	"""Draw the flat pixel vector v as a grayscale image in subplot i.

	v: array-like that reshape can turn into an array of shape dims.
	row, col: number of rows and columns of the subplot grid.
	i: position of this image within the grid, as passed to add_subplot.
	dims: shape the vector is reshaped to; defaults to 28 x 28.

	Draws into the module-level figure `fig`, which must be created
	before this function is called.
	"""
	A = reshape(v, dims)
	# draw on the axes just created rather than on whatever
	# pyplot currently considers the active axes
	ax = fig.add_subplot(row, col, i)
	ax.axis('off')
	ax.imshow(A, cmap = 'gray_r')


In [None]:

# 2 x 4 grid: the original image followed by seven projections
row, col = 2, 4
fig = figure(figsize = (10,5))

# slot 1 shows the second image of the dataset as is
v = dataset[1]
display_image(v, row, col, 1)

# slots 2..8: project v onto fewer and fewer principal directions
for i, n in enumerate((784, 600, 350, 150, 50, 10, 1), 2):
	# pca_with_svd(dataset, n) can be used here instead, either will work
	P = pca(dataset, n)
	projv = P.dot(v)
	display_image(projv, row, col, i)


In [None]:

from sklearn.decomposition import PCA

# PCA model that keeps n components
n = 10
engine = PCA(n)


In [None]:

# fit the model on dataset and keep the transformed samples
reduced = engine.fit_transform(dataset)
# shape of the transformed array is the cell output
reduced.shape


In [None]:

# map the reduced samples back through the fitted model
projected = engine.inverse_transform(reduced)
# shape of the result is the cell output
projected.shape


In [None]:

from matplotlib.pyplot import *

# 2 x 4 grid: the original image followed by seven reconstructions
row, col = 2, 4
fig = figure(figsize = (10,5))

# slot 1 shows the second image of the dataset as is
v = dataset[1]
display_image(v, row, col, 1)

# slots 2..8: refit PCA with fewer and fewer components and show the
# reconstruction of the same image.
# NOTE: the scatter cells below reuse `reduced` as left by the last
# pass of this loop (n = 3).
for i, n in enumerate((784, 600, 350, 150, 50, 10, 3), 2):
	engine = PCA(n)
	reduced = engine.fit_transform(dataset)
	projected = engine.inverse_transform(reduced)
	# second image again, after the round trip
	projv = projected[1]
	display_image(projv, row, col, i)


In [None]:

# one color per label value 0..9
colors = ('blue', 'red', 'green', 'orange', 'gray', 'cyan','turquoise', 'black', 'orchid', 'brown')

for i, color in enumerate(colors):
	# rows whose label is i, first two reduced features only
	points = reduced[labels == i, :2]
	scatter(points[:,0], points[:,1], label = i, c = color, edgecolor = 'k')

legend(loc = 'upper right')
grid()
show()


In [None]:

%matplotlib ipympl

# 3-d axes for the scatter of the first three reduced features
ax = axes(projection = '3d')

# one color per label value 0..9
colors = ('blue', 'green', 'black', 'brown', 'gray', 'cyan' , 'turquoise', 'orange', 'orchid', 'red')

for i, color in enumerate(colors): 
	# take the first three features explicitly, so the unpacking below
	# always passes x, y, z even when reduced holds more components
	points = reduced[labels == i, :3]
	ax.scatter(*points.T, label = i, c = color, edgecolor = 'k')

ax.axis('equal')
ax.axis('off')
# legend on the same axes the points were drawn on
ax.legend(loc = 'upper right')
show()


In [None]:


from sklearn import datasets

# NOTE: dataset and labels are rebound here to the iris data
iris = datasets.load_iris()
dataset = iris['data']
labels = iris['target']

# reduce to two components for a flat scatter plot
n = 2
engine = PCA(n_components = n)
reduced = engine.fit_transform(dataset)

# drop figures left open by earlier cells
close('all')

colors = [ 'red', 'green', 'blue']
# label value i corresponds to iris['target_names'][i]; taking the names
# from the dataset keeps every legend entry attached to the right class
names = [ 'Iris-' + name for name in iris['target_names'] ]

for i, color in enumerate(colors):
	# rows whose label is i
	points = reduced[labels == i]
	scatter(*points.T, label = names[i], c = color, edgecolor='k')

grid()
legend(loc = 'upper right')
show()
