In [4]:
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_mldata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn import metrics
import pandas as pd

In [5]:
mnist = fetch_mldata('MNIST original')
normalized_dataset = np.divide(mnist.data,255)
mnist_data = normalized_dataset
mnist_target_data = mnist.target


In [20]:
def PCA(data,d):
    mean_vec = np.mean(data, axis=0)
    cov_mat = (data - mean_vec).T.dot((data - mean_vec)) / (data.shape[0]-1)
    
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    l = []
    for i in range(1,d):
        l.append(eig_pairs[i][1].reshape(784,1))
    matrix_w = np.hstack(l)
    return matrix_w
    

In [21]:
def Logistic_Regression(X_train,X_test,y_train,y_test,d):
    
    logreg = LogisticRegression()
     
    matrix_w = PCA(X_train,d)
    X = X_train.dot(matrix_w)
    
    logreg.fit(X,y_train)

    # transform new data using already fitted pca
    # (don't re-fit the pca)
    newdata_transformed = X_test.dot(matrix_w)

    # predict labels using the trained classifier

    y_pred = logreg.predict(newdata_transformed)
    print("LR accuracy with own PCA implementation d = %d : %f" % (d,metrics.accuracy_score(y_test, y_pred)))

In [22]:
def Decision_Trees(X_train,X_test,y_train,y_test,d):
    dectree = DecisionTreeClassifier(random_state=42)
    
    matrix_w = PCA(X_train,d)
    X = X_train.dot(matrix_w)
    dectree.fit(X,y_train)

    # transform new data using already fitted pca
    # (don't re-fit the pca)
    newdata_transformed = X_test.dot(matrix_w)

    # predict labels using the trained classifier

    y_pred = dectree.predict(newdata_transformed)
    print("DT accuracy with own PCA implementation d = %d : %f" % (d,metrics.accuracy_score(y_test, y_pred)))

In [23]:
print("MNIST logistic regression with PCA d = 5")
print("LR accuracy without PCA:0.9189")
print("LR accuracy with library PCA:0.648571")
mnistX_train, mnistX_test, mnisty_train, mnisty_test = train_test_split(mnist_data, mnist_target_data, test_size=0.33, random_state=42)
Logistic_Regression(mnistX_train, mnistX_test, mnisty_train, mnisty_test,5)

MNIST logistic regression with PCA d = 5
LR accuracy without PCA:0.9189
LR accuracy with library PCA:0.648571


  array = np.array(array, dtype=dtype, order=order, copy=copy)


LR accuracy with own PCA implementation d = 5 : 0.566104


In [24]:
print("MNIST logistic regression with PCA d = 20")
print("LR accuracy without PCA:0.9189")
print("LR accuracy with library PCA:0.8637")
mnistX_train, mnistX_test, mnisty_train, mnisty_test = train_test_split(mnist_data, mnist_target_data, test_size=0.33, random_state=42)
Logistic_Regression(mnistX_train, mnistX_test, mnisty_train, mnisty_test,20)

MNIST logistic regression with PCA d = 20
LR accuracy without PCA:0.9189
LR accuracy with library PCA:0.8637


  array = np.array(array, dtype=dtype, order=order, copy=copy)


LR accuracy with own PCA implementation d = 20 : 0.821558


In [25]:
print("MNIST Decision Trees with PCA d = 5")
print("DT accuracy without PCA: 0.9189")
print("DT accuracy with library PCA: 0.663377")
Decision_Trees(mnistX_train, mnistX_test, mnisty_train, mnisty_test,5)

MNIST Decision Trees with PCA d = 5
DT accuracy without PCA: 0.9189
DT accuracy with library PCA: 0.663377


  array = np.array(array, dtype=dtype, order=order, copy=copy)


DT accuracy with own PCA implementation d = 5 : 0.588052


In [26]:
print("MNIST Decision Trees with PCA d = 20")
print("DT accuracy without PCA: 0.868")
print("DT accuracy with library PCA: 0.841602")
Decision_Trees(mnistX_train, mnistX_test, mnisty_train, mnisty_test,20)

MNIST Decision Trees with PCA d = 20
DT accuracy without PCA: 0.868
DT accuracy with library PCA: 0.841602


  array = np.array(array, dtype=dtype, order=order, copy=copy)


DT accuracy with own PCA implementation d = 20 : 0.833680
