In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics

In [None]:
# Seed the legacy global NumPy RNG so the draws below are repeatable.
np.random.seed(1)

# Parameters of the Gaussian distribution we sample from.
mu, sigma = 0.0, 1.0

numDims = 1        # dimensionality of each sample
numSamples = 1000  # number of samples to draw

# Draw a (numSamples, numDims) array of normally distributed values.
X = np.random.normal(loc=mu, scale=sigma, size=[numSamples, numDims])

# Sample statistics taken over every entry of X.
empiricalMean = X.mean()
empiricalStdDev = X.std()

print("empirical=", empiricalMean)
print("empirical std=", empiricalStdDev)

In [None]:
numBins = 30

# Normalised histogram of the samples. plt.hist hands back the per-bin
# densities (freq) and the bin edges (bins, one more entry than freq).
freq, bins, ignored = plt.hist(X, numBins, density=True)

# Re-draw the same histogram from the returned freq/bins values.
# Each bar is anchored on the left edge of its bin and given that bin's true
# width; with plt.bar's defaults (width=0.8, centred) the bars would be far
# wider than a bin and shifted relative to the histogram above.
plt.bar(bins[:-1], freq, width=np.diff(bins), align='edge')
plt.show()
print(len(freq))


In [None]:
# Wrap the NumPy samples in a DataFrame and write them out as CSV
# (the DataFrame index is written as the first column).
my_dataframe = pd.DataFrame(data=X)
my_dataframe.to_csv(path_or_buf="normalData1.csv")


In [None]:
# Re-seed so this cell produces the same samples every time it is run.
np.random.seed(1)

numDims = 2
numSamples = 10000
mu = np.zeros(numDims)    # zero mean in every dimension
covmat = np.eye(numDims)  # identity covariance matrix

# One row per sample, one column per dimension.
X = np.random.multivariate_normal(mean=mu, cov=covmat, size=numSamples)
print("matrix size =", X.shape)


In [None]:
# Scatter plot of the two sample dimensions against each other.
fig, ax = plt.subplots()
ax.plot(X[:, 0], X[:, 1], '.')
ax.set_xlabel('Dim 1')
ax.set_ylabel('Dim 2')
ax.set_title('ScatterPlot 2 dimensions')
ax.axis('equal')  # same scale on both axes
plt.show()  # render the figure instead of echoing ax.axis()'s return value


In [None]:
# Column-wise mean and covariance of the samples.
# rowvar=False tells np.cov that each column of X is a variable.
empiricalMean = np.mean(X, axis=0)
empirical_covMat = np.cov(X, rowvar=False)
print("Empirical mean =", empiricalMean)
print("empirical covmat =", empirical_covMat)

In [None]:
# 2-D histogram of the samples, numBins bins along each axis.
numBins = 40
plt.hist2d(X[:, 0], X[:, 1], bins=numBins)
plt.colorbar()
plt.show()

# Persist the samples as CSV via a DataFrame.
my_dataframe = pd.DataFrame(data=X)
my_dataframe.to_csv(path_or_buf="normalData2.csv")


In [None]:
# import the necessary modules here
# (these repeat the imports made in the notebook's first cell)
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
# For repeatability of results
np.random.seed(1)


# Parameters of the correlated 2-D Gaussian
numDims = 2
numSamples = 10000
rho = 0.7                      # off-diagonal entry of the covariance matrix
mu = np.zeros(numDims)         # zero mean in every dimension
covmat = [[1, rho], [rho, 1]]  # unit variances on the diagonal
print(covmat)

In [None]:
# Draw numSamples points from the multivariate normal described by mu/covmat
X = np.random.multivariate_normal(mean=mu, cov=covmat, size=numSamples)
print("matrix size =", X.shape)

In [None]:
# Scatter the first sample coordinate against the second.
# Equal axis scaling keeps the shape of the point cloud undistorted.
plt.plot(X[:, 0], X[:, 1], '.')
plt.axis('equal')
plt.show()


In [None]:
# Column-wise mean and covariance matrix of the generated samples.
# rowvar=False makes np.cov treat each column of X as a variable.
empirical_Mean = np.mean(X, axis=0)
empirical_CovMat = np.cov(X, rowvar=False)
print("empirical mean = ", empirical_Mean)
print("empirical Covariance = ", empirical_CovMat)

In [None]:
# 2-D histogram of the samples, numBins bins along each axis
numBins = 40
plt.hist2d(X[:, 0], X[:, 1], bins=numBins)
plt.colorbar()
plt.show()


# Write the samples to a CSV file by way of a DataFrame
my_dataframe = pd.DataFrame(data=X)
my_dataframe.to_csv(path_or_buf="normalData3.csv")

# end of task-3

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
def euclidean_distance(p1, p2):
    """Return the Euclidean (L2) distance between points p1 and p2.

    Both arguments are converted with np.array first, so lists, tuples and
    arrays are all accepted.
    """
    delta = np.array(p2) - np.array(p1)
    return np.sqrt((delta ** 2).sum())
%matplotlib inline
sns.set_style('darkgrid')
avg_distances = []
for n in range(2, 100):
    avg_distances.append(np.mean([euclidean_distance(np.random.randint(low=-100, high=100, size=n), [0 for i in range(n)]) for p in range(500)]))
plt.figure(figsize=(10,10))
plt.plot(range(2,100), avg_distances,'bs-')
plt.plot( np.diff(avg_distances),'ro-')
plt.xlabel('Number of dimensions')
plt.ylabel('Euclidean Distance')
plt.show()

In [None]:
# Read the header-less, comma-separated file and keep it as a NumPy array.
data = pd.read_csv('data.csv', sep=",", header=None).to_numpy()
print(data.shape)

In [None]:
# First column against second column, drawn with equal axis scaling.
plt.plot(data[:, 0], data[:, 1], '.', markersize=14)
plt.title('Original Data')
plt.axis('equal')
plt.show()


We now proceed with Implementing PCA using the following steps:

1) normalise the data
2) compute the covariance matrix of data
3) compute the eigenvectors (U) and eigenvalues (S) of the covariance matrix

In [None]:
# Standardise every column: subtract its mean, divide by its std deviation.
mu = np.mean(data, axis=0)    # per-column mean
sigma = np.std(data, axis=0)  # per-column standard deviation

Xnorm = (data - mu) / sigma
print(Xnorm[:5])  # first five normalised rows

Calculate the covariance matrix of normalised data

In [None]:
# Covariance matrix of the normalised data: Xnorm^T Xnorm divided by the
# number of rows m.
m = Xnorm.shape[0]
covmat = Xnorm.T.dot(Xnorm) / m
print(covmat)

Calculate the eigenvectors and eigenvalues of the covariance matrix

In [None]:
# covmat is symmetric (Xnorm^T Xnorm / m), so use eigh: it is the routine
# for symmetric matrices and returns real eigenvalues in ascending order.
# np.linalg.eig makes no promise about ordering, yet the later selection
# U[:, 0:k] treats the leading columns as the top principal components.
S, U = np.linalg.eigh(covmat)

# Reverse so S is in descending order and U[:, i] stays paired with S[i].
S = S[::-1]
U = U[:, ::-1]

print('Eigen values: {}'.format(S))
print('Eigen vectors:')
print(U)

So now we have found the principal components ($U$): the set of axes that capture the maximum variation in the data. What can we do with them now?

We can do the following:

1. **Decorrelation**: Project our data onto $U$ to get decorrelated data.
2. **Dimensionality reduction**: Reduce $U$ to contain only those axes that carry the most information, then project our data onto this reduced $U$.

In [None]:
# Project the normalised data onto the eigenvector basis U;
# Z holds the resulting (decorrelated) coordinates.
Z = Xnorm.dot(U)

In [None]:
# One row, two panels: width 18 and height 5, with a small horizontal gap.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
fig.subplots_adjust(wspace=0.2)


# Left panel: the data as loaded
axs[0].set_title("Original Data")
axs[0].scatter(data[:, 0], data[:, 1])


# Right panel: the data expressed in the PCA basis
axs[1].scatter(Z[:, 0], Z[:, 1])
axs[1].set_title("Data after PCA")


To reduce the dimensionality of our 2D data to 1D, we remove the principal component that captures the least variation. Our principal components, which are the eigenvectors of the covariance matrix, are U[:,0] and U[:,1]. By projecting our data Xnorm onto just U[:,0], we get a reduced Z in 1D.

In general, we decide to keep the $k$ eigenvectors in $U$ that capture the maximum variation. Then our reduced data $Z_{red}$ becomes:

$$Z_{red} = X_{norm}\, U_{red}, \qquad U_{red} = U[:,\, 0{:}k]$$

In this case, $k = 1$.

In [None]:
k = 1  # how many principal components to keep

# Keep the first k columns of U.
# NOTE(review): this assumes U's columns are ordered by decreasing eigenvalue.
Ured = U[:, :k]

# Project the normalised data onto the retained components.
Zred = Xnorm.dot(Ured)

print(Zred.shape)
print(Ured.shape)


In [None]:

# Map the reduced coordinates back into the original (normalised) space.
Xrec = Zred.dot(Ured.T)
print(Xrec.shape)


# Compare the normalised data with its reconstruction, side by side.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))  # (width, height) of the figure
fig.subplots_adjust(wspace=0.2)                 # gap between the two panels


# Left panel: Xnorm
axs[0].set_title("Normalised Original Data")
axs[0].scatter(Xnorm[:, 0], Xnorm[:, 1])


# Right panel: Xrec
axs[1].scatter(Xrec[:, 0], Xrec[:, 1])
axs[1].set_title("Recovered data after dimensionality reduction")


To make this error term lie between 0 and 1, we divide it by the Frobenius norm of the original data Xnorm. The Frobenius norm of a matrix is defined as the square root of the sum of the absolute squares of its elements.

You can get a formal definition or watch a video illustrating a simple example if you need more information.

In Python, the Frobenius norm is implemented in NumPy's linear algebra package. You can call it as `np.linalg.norm(A, 'fro')`, where `A` is the matrix.

In [None]:
# Relative reconstruction error: Frobenius norm of the residual divided by
# the Frobenius norm of the normalised data.
residual_norm = np.linalg.norm(Xnorm - Xrec, ord='fro')
reference_norm = np.linalg.norm(Xnorm, ord='fro')
rec_err = residual_norm / reference_norm
print("The reconstruction error is: {}".format(rec_err))

# PCA USING INBUILT FUNCTIONS 

In [None]:
# Load the training table and discard every row that has a missing value.
data = pd.read_csv('train_wbcd.csv').dropna(axis=0, how='any')
data.shape

## Data normalization

In [None]:
# Standardise every column from index 2 onward, working on a copy so that
# `data` itself is left untouched.
# NOTE(review): presumably the first two columns are an identifier and the
# label - confirm against the CSV.
data_norm = data.copy()
features = data_norm.iloc[:, 2:]
mu = features.mean(axis=0)    # per-column mean
sigma = features.std(axis=0)  # per-column standard deviation
data_norm.iloc[:, 2:] = (features - mu) / sigma

## Step 2: Implement PCA

In [None]:
# Fit scikit-learn's PCA, keeping two components, on the normalised columns

from sklearn.decomposition import PCA
Xnorm = data_norm.iloc[:, 2:].copy().to_numpy()
pca = PCA(n_components=2)
pca.fit(Xnorm)


In [None]:
# Fraction of the total variance explained by each retained component
var = pca.explained_variance_ratio_
print(var)

In [None]:
# Cumulative explained variance in percent. Each ratio is rounded to four
# decimals and scaled by 100 before the running sum is taken.
pct_per_component = np.round(pca.explained_variance_ratio_, decimals=4) * 100
var1 = np.cumsum(pct_per_component)
print(var1)
plt.plot(var1)
plt.xlabel("Principal components")
plt.ylabel("Variance captured")
