## Dimension Reduction
### ID: eo9232
### Name: Md Reza
### CSC 5825 - Fall 2021


In [1]:
# Import requires packages and libraries
import csv
import numpy as np
import random
import math
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder,scale
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

##### Load the training and test data sets

In [2]:
def load_training_set():
    df_train = pd.read_csv("/u/mreza6/5825/Data/fashion-mnist_train.csv")
    return df_train

In [3]:
def load_test_set():
    df_test = pd.read_csv("/u/mreza6/5825/Data/fashion-mnist_test.csv")
    return df_test

In [4]:
train_data = load_training_set()
test_data = load_test_set()

##### Create training and test features where each row uses a vector of dimension 784 with values between 0 (black) and 255 (white) on the gray color scale

In [5]:
# Training
X_Training_features = train_data.iloc[:,1:785]
Y_Training_features = train_data.iloc[:,0]

# Test
X_Test_features =test_data.iloc[:,1:785]
Y_Test_features = test_data.iloc[:,0]

##### Scaling the data before performing SVD

In [6]:
# Training set
scaler = StandardScaler()
X_Training_features = scaler.fit_transform(X_Training_features)

# Test set 
X_Test_features= scaler.fit_transform(X_Test_features)

##### Use SVD function to reduce the number of dimensions of training & test data set & report how many components are selected and their variance ratios

In [7]:
# Training:
svd_train = TruncatedSVD(n_components=140).fit(X_Training_features)
print("Sum of Training Variance:", svd_train.explained_variance_ratio_.sum(), "\n")
print("Training Set Variance Ratio:", "\n", svd_train.explained_variance_ratio_, "\n")
SVD_train=svd_train.transform(X_Training_features)
print("Training SVD:", SVD_train.shape, "\n")
print("==================================================================", "\n")
# Test:
svd_test= TruncatedSVD(n_components=140).fit(X_Test_features)
print("Sum of Test Variance:", svd_test.explained_variance_ratio_.sum(), "\n")
print("Test Set Variance Ratio:",  "\n", svd_test.explained_variance_ratio_, "\n")
SVD_Test= svd_test.transform(X_Test_features)
print("Test SVD:", SVD_Test.shape)

Sum of Training Variance: 0.9020745118802875 

Training Set Variance Ratio: 
 [0.22057176 0.14395563 0.05458802 0.05116399 0.04069273 0.03012154
 0.02750481 0.02325871 0.01694438 0.01309833 0.01161488 0.00963108
 0.00890642 0.00856623 0.00743049 0.00730157 0.00657498 0.00632692
 0.00623657 0.0058046  0.00515707 0.00511548 0.00472764 0.00453557
 0.00438423 0.00416784 0.00395056 0.00393244 0.00378355 0.00374478
 0.00368528 0.00353646 0.00336422 0.00330015 0.00329434 0.00319736
 0.00305999 0.00293714 0.00289203 0.00280918 0.00271998 0.00265986
 0.00255732 0.00253773 0.00245191 0.00243254 0.0023872  0.00228125
 0.00223339 0.00215709 0.00212645 0.00209085 0.0020228  0.00201541
 0.00199305 0.00195331 0.00191017 0.00185777 0.00181595 0.00178263
 0.00175762 0.00173527 0.00170987 0.00167605 0.00161352 0.00157431
 0.00154654 0.00150989 0.00148746 0.00146461 0.00144439 0.00143185
 0.00142552 0.00140258 0.00137435 0.00137034 0.00134203 0.00130682
 0.00129961 0.00127365 0.00125027 0.00122548 0.0012

##### Train generative classifiers (Naive Bayes and KNN) and discriminative classifier (multinomial logistic regression) 

In [8]:
def NaiveBayesAlgorithm(X,Y):
    tmp = GaussianNB()
    model=tmp.fit(X,Y)
    return model

##### Test without SVD

In [9]:
NaiveBayes=NaiveBayesAlgorithm(X_Training_features,Y_Training_features)
PredNB= NaiveBayes.predict(X_Test_features)
print(accuracy_score(Y_Test_features, PredNB))

0.3178


##### Test with SVD

In [10]:
NBA=NaiveBayesAlgorithm(SVD_train,Y_Training_features)
p=NBA.predict(SVD_Test)
print(accuracy_score(Y_Test_features, p))

0.5555


##### Find the shape and optimal value of k

In [11]:
K_T ,K_V, Y_T, Y_V = train_test_split(SVD_train, Y_Training_features, test_size=0.20)
optimal_K=0
top_acc=0

print("Shape of K:", K_V.shape, "\n")
for j in range (3,21):
    model=KNeighborsClassifier(n_neighbors=j)
    model.fit(K_T,Y_T)
    pred=model.predict(K_V)
    acc=accuracy_score(Y_V, pred)
    if(top_acc<acc):
        top_acc=acc
        optimal_K=j
        j=j+1
        
print('Optimal K is: ',optimal_K)

Shape of K: (12000, 140) 

Optimal K is:  6


##### Predict kNN accuracy without SVD

In [12]:
KNN_classify= KNeighborsClassifier(n_neighbors=optimal_K)
KNN_classify.fit(X_Training_features,Y_Training_features)
KNN_pred=KNN_classify.predict(X_Test_features)
print('KNN accuracy without SVD:', accuracy_score(Y_Test_features, KNN_pred))

KNN accuracy without SVD: 0.8602


##### Predict kNN accuracy with SVD

In [13]:
KNN_classify.fit(SVD_train,Y_Training_features)
KNN_pred_with_svd=KNN_classify.predict(SVD_Test)
print('KNN accuracy with SVD:', accuracy_score(Y_Test_features, KNN_pred_with_svd))

KNN accuracy with SVD: 0.7241


##### Predict MLR accuracy without SVD

In [14]:
MLR = LogisticRegression(random_state=0, multi_class='multinomial', penalty='none', solver='newton-cg')
MLR.fit(X_Training_features,Y_Training_features)
MLR_pred=MLR.predict(X_Test_features)
print('MLR accuracy without SVD', accuracy_score(Y_Test_features, MLR_pred))

MLR accuracy without SVD 0.8357




##### Predict MLR accuracy with SVD

In [15]:
MLR.fit(SVD_train,Y_Training_features)
MLR_with_svd=MLR.predict(SVD_Test)
print('MLR accuracy with SVD', accuracy_score(Y_Test_features, MLR_with_svd))

MLR accuracy with SVD 0.4808




## Comparison of NB, kNN, & MLR with brief descriptions:
##### It’s turned out that the kNN has the highest accuracy with/without SVD on the test data set compared to Naive Bayes and Multinomial Logistic Regression with/without SVD. Though from the execution point-view, the kNN took a little longer compared other two classifiers. While Naive Bayes was quick but the accuracy was lower compared to Multinomial Logistic Regression.

##### Note: For convergence warning, as suggested, it could have been removed by increasing the number of iterations parameter. But there are possibilities that a large dataset like this generally might not be fitted by a logistic model.