In [23]:
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_mldata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn import metrics
import pandas as pd

In [26]:
def Logistic_Regression(X_train,X_test,y_train,y_test,d):
    
    logreg = LogisticRegression()
    pca = PCA(n_components=d)
    X = pca.fit_transform(X_train)
    logreg.fit(X,y_train)

    # transform new data using already fitted pca
    # (don't re-fit the pca)
    newdata_transformed = pca.transform(X_test)

    # predict labels using the trained classifier

    y_pred = logreg.predict(newdata_transformed)
    print("LR accuracy with pca d = %d : %f" % (d,metrics.accuracy_score(y_test, y_pred)))

In [27]:
def Decision_Trees(X_train,X_test,y_train,y_test,d):
    dectree = DecisionTreeClassifier(random_state=42)
    
    pca = PCA(n_components=d)
    X = pca.fit_transform(X_train)
    dectree.fit(X,y_train)

    # transform new data using already fitted pca
    # (don't re-fit the pca)
    newdata_transformed = pca.transform(X_test)

    # predict labels using the trained classifier

    y_pred = dectree.predict(newdata_transformed)
    print("DT accuracy with pca d = %d : %f" % (d,metrics.accuracy_score(y_test, y_pred)))

In [28]:
mnist = fetch_mldata('MNIST original')
normalized_dataset = np.divide(mnist.data,255)
mnist_data = normalized_dataset
mnist_target_data = mnist.target


In [30]:
print("MNIST logistic regression with PCA d = 5")
print("LR accuracy without PCA:0.9189")
mnistX_train, mnistX_test, mnisty_train, mnisty_test = train_test_split(mnist_data, mnist_target_data, test_size=0.33, random_state=42)
Logistic_Regression(mnistX_train, mnistX_test, mnisty_train, mnisty_test,5)


MNIST logistic regression with PCA d = 5
LR accuracy without PCA:0.9189
LR accuracy with pca d = 5 : 0.648571


In [17]:
print("MNIST logistic regression with PCA d = 20")
print("LR accuracy without PCA:0.9189")
mnistX_train, mnistX_test, mnisty_train, mnisty_test = train_test_split(mnist_data, mnist_target_data, test_size=0.33, random_state=42)
Logistic_Regression(mnistX_train, mnistX_test, mnisty_train, mnisty_test,20)



MNIST logistic regression with PCA d = 20
accuracy without PCA:0.9189
accuracy with pca d = 20 : 0.863723


In [31]:
print("MNIST Decision Trees with PCA d = 5")
print("DT accuracy without PCA:0.9189")
Decision_Trees(mnistX_train, mnistX_test, mnisty_train, mnisty_test,5)

MNIST Decision Trees with PCA d = 5
DT accuracy without PCA:0.9189
DT accuracy with pca d = 5 : 0.663377


In [32]:
print("MNIST Decision Trees with PCA d = 20")
print("DT accuracy without PCA:0.868")
Decision_Trees(mnistX_train, mnistX_test, mnisty_train, mnisty_test,20)

MNIST Decision Trees with PCA d = 20
DT accuracy without PCA:0.868
DT accuracy with pca d = 20 : 0.841602


In [33]:
#load spamdata
X = pd.read_csv('spambase/spambase.data', header=None)
spam_X, spam_y = X.iloc[:,:X.shape[1]-1], X.iloc[:,-1]

In [34]:
spamX_train, spamX_test, spamy_train, spamy_test = train_test_split(spam_X, spam_y, test_size=0.33, random_state=42)
print("Spambase data with PCA")
print("LR accuracy without PCA:0.930")
Logistic_Regression(spamX_train, spamX_test, spamy_train, spamy_test,5)

Spambase data with PCA
LR accuracy without PCA:0.930
LR accuracy with pca d = 5 : 0.784068


In [37]:
for i in range(1,25):
    Logistic_Regression(spamX_train, spamX_test, spamy_train, spamy_test,i)

LR accuracy with pca d = 1 : 0.639895
LR accuracy with pca d = 2 : 0.718894
LR accuracy with pca d = 3 : 0.718236
LR accuracy with pca d = 4 : 0.777485
LR accuracy with pca d = 5 : 0.784068
LR accuracy with pca d = 6 : 0.826860
LR accuracy with pca d = 7 : 0.849243
LR accuracy with pca d = 8 : 0.845293
LR accuracy with pca d = 9 : 0.850560
LR accuracy with pca d = 10 : 0.855168
LR accuracy with pca d = 11 : 0.854510
LR accuracy with pca d = 12 : 0.867676
LR accuracy with pca d = 13 : 0.890718
LR accuracy with pca d = 14 : 0.895326
LR accuracy with pca d = 15 : 0.896643
LR accuracy with pca d = 16 : 0.897959
LR accuracy with pca d = 17 : 0.897959
LR accuracy with pca d = 18 : 0.902567
LR accuracy with pca d = 19 : 0.901909
LR accuracy with pca d = 20 : 0.908492
LR accuracy with pca d = 21 : 0.905859
LR accuracy with pca d = 22 : 0.907176
LR accuracy with pca d = 23 : 0.905859
LR accuracy with pca d = 24 : 0.907834


In [None]:
## with d = 20, the accuracies are comparable without PCA for Logistic regression

In [43]:
print("Spambase data with PCA")
print("DT accuracy without PCA:0.9117")
for i in range(1,58):
    Decision_Trees(spamX_train, spamX_test, spamy_train, spamy_test,i)

Spambase data with PCA
DT accuracy without PCA:0.9117
DT accuracy with pca d = 1 : 0.677419
DT accuracy with pca d = 2 : 0.731402
DT accuracy with pca d = 3 : 0.753127
DT accuracy with pca d = 4 : 0.815010
DT accuracy with pca d = 5 : 0.845951
DT accuracy with pca d = 6 : 0.859776
DT accuracy with pca d = 7 : 0.847926
DT accuracy with pca d = 8 : 0.854510
DT accuracy with pca d = 9 : 0.846610
DT accuracy with pca d = 10 : 0.851218
DT accuracy with pca d = 11 : 0.839368
DT accuracy with pca d = 12 : 0.867676
DT accuracy with pca d = 13 : 0.876234
DT accuracy with pca d = 14 : 0.870968
DT accuracy with pca d = 15 : 0.870968
DT accuracy with pca d = 16 : 0.873601
DT accuracy with pca d = 17 : 0.870968
DT accuracy with pca d = 18 : 0.877551
DT accuracy with pca d = 19 : 0.863726
DT accuracy with pca d = 20 : 0.857143
DT accuracy with pca d = 21 : 0.869651
DT accuracy with pca d = 22 : 0.855826
DT accuracy with pca d = 23 : 0.859776
DT accuracy with pca d = 24 : 0.859776
DT accuracy with pc

In [None]:
## the results with d in [13,18] show highest accuracy not as close to the accuracy for decision trees without PCA