[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/niteshjindal170988/unsupervised-learning/blob/main/clustering/k_means_clustering.ipynb)



# Import Packages

In [1]:
import sklearn
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import time
import math
from copy import deepcopy
import warnings
from scipy.stats import multivariate_normal
from sklearn.datasets import make_blobs
import os
warnings.filterwarnings("ignore")
#os.getcwd()

In this exercise, we will examine Principal Component Analysis and apply it to the [Digit Recognizer Dataset](https://www.kaggle.com/c/digit-recognizer) to generate the top-30 projections that capture the maximum variance in the data.<br>
Then, we will apply K-means Clustering to the projected data.


# PCA on the Digit-Recognizer Data

## Download Dataset from Google Drive


In [None]:
# Install a pinned gdown release from inside the notebook (IPython shell magic).
# NOTE(review): the --trusted-host flags make pip trust the listed hosts even
# without valid HTTPS; confirm this is still required in your environment.
!pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org gdown==4.2.0
import gdown
warnings.filterwarnings("ignore")
# Load digit-recognizer (Train Data)
# Google Drive location of the training CSV and the local file name to save it under.
url = 'https://drive.google.com/uc?id=1SfSO5ZloHH3W6GJa5rfy9-qwjG4YPbM4'
output = 'train.csv'
# quiet=False keeps the progress output visible.
# NOTE(review): verify=False presumably disables TLS certificate verification
# for the download; confirm this is intentional before reusing elsewhere.
gdown.download(url, output, quiet=False, verify=False)



You should consider upgrading via the 'c:\users\ag89382\appdata\local\programs\python\python37\deepenv\scripts\python.exe -m pip install --upgrade pip' command.
Downloading...
From: https://drive.google.com/uc?id=1SfSO5ZloHH3W6GJa5rfy9-qwjG4YPbM4
To: C:\Users\AG89382\AppData\Local\Programs\Python\Python37\deepenv\tutorials\unsupervised-learning\clustering\train.csv
  7%|█████▎                                                                        | 5.24M/76.8M [00:13<02:34, 464kB/s]

# Read Dataset

In [None]:
# Read the CSV named "train.csv" from the current working directory.
data = pd.read_csv("train.csv")
display(data.head())  # Preview the first rows (digit label / pixel data)
# Report the (rows, columns) size of the loaded table.
print(data.shape)

The data set `train.csv` has 785 columns. The first column, called `label`, is the digit that was drawn by the user. The remaining columns contain the pixel values of the associated image.
There are 10 labels (0 to 9).

Let's get started!

# Define a Class PCA to get the Projections

In [None]:
class PCA:
    """Principal Component Analysis helper for a labelled pandas DataFrame.

    The DataFrame passed to the constructor holds one label column (selected
    by position) and any number of numeric feature columns. The methods
    standardise the features, build their covariance matrix, eigen-decompose
    it and project the data onto the leading eigenvectors.

    The class relies on ``np``, ``pd``, ``plt`` and ``StandardScaler`` being
    available at module level.
    """

    def __init__(self, df, lbl_col_index=0):
        """Store the data and record the distinct target labels.

        Args:
            df: DataFrame containing the label column and the feature columns.
            lbl_col_index: Positional index of the label column in ``df``.
                Negative indices (e.g. ``-1`` for the last column) are accepted.
        """
        self.df = df
        self.lbl_col_ind = lbl_col_index
        # Distinct labels in order of first appearance (NOT sorted).
        self.labels = self.df.iloc[:, self.lbl_col_ind].unique()

    def drop_target_column(self, dataframe):
        """Return a copy of ``dataframe`` without the label column.

        The column is looked up by position and dropped by name, which also
        works for negative positions; slicing ``i:i+1`` would be empty for
        ``i == -1`` and silently keep the label column.
        """
        label_column = dataframe.columns[self.lbl_col_ind]
        return dataframe.drop(columns=[label_column])

    def subset_data(self, uniq_trgt_lbl: int):
        """Select the rows belonging to one target label.

        Args:
            uniq_trgt_lbl: Position in ``self.labels`` (order of first
                appearance in the data) of the label to keep. This is an
                index into the unique labels, not the label value itself.

        Returns:
            A tuple ``(features, catg)`` where ``features`` is the unscaled
            DataFrame of the matching rows without the label column and
            ``catg`` is the Series of their labels.
        """
        wanted_label = self.labels[uniq_trgt_lbl]
        label_values = self.df.iloc[:, self.lbl_col_ind]
        subdf = self.df[label_values == wanted_label]
        catg = subdf.iloc[:, self.lbl_col_ind]
        features = self.drop_target_column(subdf)
        return features, catg

    def feature_scaling(self, features):
        """Standardise every feature column with ``StandardScaler``.

        Returns the array produced by ``fit_transform``.
        """
        scaler = StandardScaler(copy=True, with_mean=True)
        return scaler.fit_transform(features)

    def cov_mat(self, uniq_trgt_lbl: int = None):
        """Scale the features and compute their covariance matrix.

        Args:
            uniq_trgt_lbl: Optional position in ``self.labels`` used to
                restrict the computation to a single label (see
                ``subset_data``). When ``None`` all rows are used.

        Returns:
            A tuple ``(scaled_feat, covmat)``: the scaled data with one row
            per sample and one column per feature, and the square covariance
            matrix with one row/column per feature.
        """
        if uniq_trgt_lbl is None:
            features = self.drop_target_column(self.df)
        else:
            features, _ = self.subset_data(uniq_trgt_lbl)
        scaled_feat = self.feature_scaling(features)
        # rowvar=False: columns are the variables; bias=False: divide by N - 1.
        covmat = np.cov(scaled_feat, rowvar=False, bias=False)
        return scaled_feat, covmat

    def eig_val_eig_vec(self, covariance_matrix):
        """Eigen-decompose a symmetric (covariance) matrix.

        Args:
            covariance_matrix: Square symmetric array.

        Returns:
            A list of ``(abs(eigenvalue), eigenvector)`` tuples ordered by
            descending absolute eigenvalue.
        """
        # eigh is the solver for symmetric matrices: unlike eig it always
        # returns real eigenvalues and real orthonormal eigenvectors, so the
        # projections can never become complex through round-off. It reads
        # only the lower triangle of the input.
        eigval, eigvec = np.linalg.eigh(covariance_matrix)
        magnitudes = np.abs(eigval)
        # Stable sort keeps the solver's order among equal eigenvalues.
        order = np.argsort(-magnitudes, kind="stable")
        return [(magnitudes[k], eigvec[:, k]) for k in order]

    def visualize_explained_variance(self, covariance_matrix):
        """Plot the cumulative explained-variance ratio per component."""
        pairs = self.eig_val_eig_vec(covariance_matrix)
        # Eigenvalues are already sorted in descending order.
        eigval = np.array([pair[0] for pair in pairs])
        # Total is computed once instead of once per component.
        var_exp = eigval / eigval.sum()
        cum_sum_exp = np.cumsum(var_exp)  # cumulative explained variance

        plt.step(range(len(cum_sum_exp)),
                 cum_sum_exp,
                 where='mid',
                 label='Cumulative explained variance')

        plt.ylabel('Explained variance ratio')
        plt.xlabel('Principal component index')
        plt.legend(loc='best')
        plt.tight_layout()
        plt.show()

    def extract_n_principal_components(self, covariance_matrix, reqno_of_pcs):
        """Stack the leading eigenvectors column-wise.

        Args:
            covariance_matrix: Square symmetric array of size D x D.
            reqno_of_pcs: Number of leading components to keep.

        Returns:
            Array of shape (D, reqno_of_pcs) whose columns are the
            eigenvectors with the largest absolute eigenvalues, in
            descending order. D is taken from the matrix itself, so any
            number of features is supported.
        """
        srtd_eg_ev_pair = self.eig_val_eig_vec(covariance_matrix)
        top_vectors = [np.ravel(vec) for _, vec in srtd_eg_ev_pair[:reqno_of_pcs]]
        # Rows are eigenvectors at this point; transpose to get them as columns.
        return np.asarray(top_vectors).T

    def get_projected_data(self, scaleddata, covariance_matrix, reqno_of_pcs):
        """Project scaled data onto the leading principal components.

        Args:
            scaleddata: Scaled samples of shape (m, D); must provide ``.dot``.
            covariance_matrix: Covariance matrix of shape (D, D).
            reqno_of_pcs: Number of principal components to project onto.

        Returns:
            DataFrame of shape (m, reqno_of_pcs) with columns named
            ``PC_0``, ``PC_1``, ...
        """
        stcked_cmpnts = self.extract_n_principal_components(covariance_matrix, reqno_of_pcs)
        projecteddata = pd.DataFrame(scaleddata.dot(stcked_cmpnts))
        projecteddata.columns = ["PC_" + str(col) for col in projecteddata.columns]
        return projecteddata



# Extract the Top-30 Principal Components 

In [None]:
# Distinct digit labels in ascending order.
uniq_categories = sorted(data["label"].unique())

# Build the PCA helper on the full table; the labels live in column 0.
pca_instance = PCA(df=data, lbl_col_index=0)

# No label index is passed, so every row is scaled and used for the covariance.
scaled_dat, covmatr = pca_instance.cov_mat()

# Project the scaled data onto the 30 leading principal components.
projecteddata_30d = pca_instance.get_projected_data(
    scaleddata=scaled_dat,
    covariance_matrix=covmatr,
    reqno_of_pcs=30,
)
projecteddata_30d