<a href="https://colab.research.google.com/github/meetsomto/dbs/blob/main/20b_mnist_PCA_TSNE_DV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cluster analysis of the MNIST dataset using PCA and t-SNE

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [7]:
# Mount Google Drive inside the Colab runtime (only needed when the data is hosted on Drive).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [8]:
# Load the MNIST CSV into a DataFrame.
# The copy used here lives on the mounted Google Drive; a shareable link to it is
# https://drive.google.com/file/d/19U19vIeLkSkNettmGr-4-mWJFC4mHq0h/view?usp=sharing
# If the file sits next to the notebook instead, point the path at "mnist.csv".
MNIST_CSV_PATH = "/content/drive/My Drive/DBS/Semester 2/Data Visualization/Datasets/mnist.csv"

dataset = pd.read_csv(MNIST_CSV_PATH)

In [9]:
# Quick structural overview of the raw frame: first rows, dimensions,
# column dtypes / memory footprint, and per-column summary statistics.
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
dataset.info()
print(dataset.describe())

   label  1x1  1x2  1x3  1x4  1x5  ...  28x23  28x24  28x25  28x26  28x27  28x28
0      7    0    0    0    0    0  ...      0      0      0      0      0      0
1      2    0    0    0    0    0  ...      0      0      0      0      0      0
2      1    0    0    0    0    0  ...      0      0      0      0      0      0
3      0    0    0    0    0    0  ...      0      0      0      0      0      0
4      4    0    0    0    0    0  ...      0      0      0      0      0      0

[5 rows x 785 columns]
(10000, 785)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 59.9 MB
None
              label      1x1      1x2  ...    28x26    28x27    28x28
count  10000.000000  10000.0  10000.0  ...  10000.0  10000.0  10000.0
mean       4.443400      0.0      0.0  ...      0.0      0.0      0.0
std        2.895865      0.0      0.0  ...      0.0      0.0      0.0
min        0.000000      0.0      0.0  

In [3]:
# Split the frame into the target column and the remaining feature columns.
Y = dataset['label']                 # Labels
X = dataset.drop(columns=['label'])  # Features (every column except 'label')

# Sanity check: container types first, then shapes, features before labels.
for part in (X, Y):
    print(type(part))
for part in (X, Y):
    print(part.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(10000, 784)
(10000,)


In [4]:
# Standardize the feature columns (StandardScaler: remove the mean, scale to unit variance).
# The fitted scaler is kept in `feature_scaler` so the same transformation can be reused.
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)


In [5]:
# Project the standardized features onto the first two principal components and plot them.
# random_state pins the result in case scikit-learn picks its randomized SVD solver,
# so the scatter plot is identical after every Restart & Run All.
pca = PCA(n_components=2, random_state=42)
# fit_transform fits the components and projects the same data in a single pass.
x_pca = pca.fit_transform(X_scaled)
print("Variance explained by each of the n_components: ",pca.explained_variance_ratio_)
print("Total variance explained by the n_components: ",sum(pca.explained_variance_ratio_))

# Plain Python list of the labels; used for the per-point hover text.
digits = Y.tolist()
data = [go.Scatter(
    x=x_pca[:, 0],
    y=x_pca[:, 1],
    mode='markers',
    # Points are coloured by their label value on a continuous colour scale.
    marker=dict(color=Y, colorscale='Rainbow', opacity=0.5),
    text=[f'digit: {a}' for a in digits],
    hoverinfo='text',
)]

layout = go.Layout(
    title='PCA Dimensionality Reduction',
    width=700,
    height=700,
    xaxis=dict(title='First Principal Component'),
    yaxis=dict(title='Second Principal Component'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()


Variance explained by each of the n_components:  [0.06196825 0.04243746]
Total variance explained by the n_components:  0.10440571292880177


In [6]:
# Embed the standardized features in two dimensions with t-SNE and plot them.
# t-SNE is stochastic, so random_state is fixed to make the embedding reproducible.
tsne_params = dict(n_components=2, perplexity=20, random_state=42)
# scikit-learn 1.5 renamed `n_iter` to `max_iter` and later releases drop the old name;
# pick whichever keyword the installed version exposes so the cell runs on both.
iter_param = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne_params[iter_param] = 2000
tsne = TSNE(**tsne_params)
x_tsne = tsne.fit_transform(X_scaled)

# Hover labels are derived from Y here rather than from a list built in another cell,
# so this cell only needs X_scaled and Y to exist.
hover_text = [f'digit: {a}' for a in Y.tolist()]
data = [go.Scatter(
    x=x_tsne[:, 0],
    y=x_tsne[:, 1],
    mode='markers',
    # Points are coloured by their label value on a continuous colour scale.
    marker=dict(color=Y, colorscale='Rainbow', opacity=0.5),
    text=hover_text,
    hoverinfo='text',
)]

layout = go.Layout(
    title='t-SNE Dimensionality Reduction',
    width=700,
    height=700,
    xaxis=dict(title='First Dimension'),
    yaxis=dict(title='Second Dimension'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()