In [2]:
from sklearn.datasets import fetch_20newsgroups, fetch_mldata
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EmpiricalCovariance
from numpy.linalg import eig
from numpy import linalg as LA
import numpy as np
import scipy

In [3]:
mnist_full = fetch_mldata('MNIST original', data_home='../data/')
mnist_data = mnist_full.data
mnist_target = mnist_full.target
mnist_train, mnist_test, mnist_train_target, mnist_test_target = train_test_split(mnist_full.data, mnist_full.target, test_size=0.2)

In [4]:
spam_file = open("../data/spambase.data")
spam_list = []
spam_target = []
spam_col = []
for line in spam_file.readlines():
    line = line.split(',')
    line = [float(x) for x in line]
    spam_list.append(line[:-1])
    spam_target.append(line[-1])
spam_data = np.array(spam_list)
spam_train, spam_test, spam_train_target, spam_test_target = train_test_split(spam_data, spam_target, test_size=0.2)
spam_col_file = open("../data/spambase.names")
for line in spam_col_file.readlines()[33:]:
    spam_col.append(line.split(':')[0])

In [5]:
def calc_PCA(d, train, test, train_target, test_target):
    pca_ob = PCA(n_components=d)
    train_data = pca_ob.fit_transform(train)
    test_data = pca_ob.transform(test)
    
    lr = LogisticRegression()
    lr.fit(train_data, train_target)
    np.fliplr(np.argsort(np.absolute(lr.coef_)))
    predicted_lr = lr.predict(test_data)
    print('Accuaracy Score LR : ', accuracy_score(test_target, predicted_lr))
    
    dt = DecisionTreeClassifier()
    dt.fit(train_data, train_target)
    np.flip(np.argsort(np.absolute(dt.feature_importances_)), axis=0)
    predicted_dt = dt.predict(test_data)
    print('Accuaracy Score DT : ', accuracy_score(test_target, predicted_dt))
    del pca_ob, lr, dt, train_data, test_data

In [6]:
calc_PCA(5, mnist_train, mnist_test, mnist_train_target, mnist_test_target)

Accuaracy Score LR :  0.639857142857
Accuaracy Score DT :  0.668071428571


In [7]:
calc_PCA(20, mnist_train, mnist_test, mnist_train_target, mnist_test_target)

Accuaracy Score LR :  0.863642857143
Accuaracy Score DT :  0.851071428571


In [8]:
calc_PCA(13, spam_train, spam_test, spam_train_target, spam_test_target)

Accuaracy Score LR :  0.869706840391
Accuaracy Score DT :  0.882736156352


In [9]:
class my_PCA:
    def __init__(self, n_features):
        self.n_features = n_features
        
    def fit(self, data):
        sc = StandardScaler()
        data = sc.fit_transform(data)
        ec = EmpiricalCovariance()
        ec.fit(data)
        
        self.eig_values, self.eig_vectors = eig(ec.covariance_)
        self.index = np.flip(np.argsort(self.eig_values), axis=0)[:self.n_features]

    def transform(self, data):
        sc = StandardScaler()
        data = sc.fit_transform(data)
        result = np.dot(data, self.eig_vectors[:,self.index])
        return result
        
    def fit_transform(self, data):
        self.fit(data)
        result = self.transform(data)
        return result
        

In [10]:
def calc_myPCA(d, train, test, train_target, test_target):
    pca_ob = my_PCA(n_features=d)
    train_data = pca_ob.fit_transform(train)
    test_data = pca_ob.transform(test)
    
    lr = LogisticRegression()
    lr.fit(train_data, train_target)
    np.fliplr(np.argsort(np.absolute(lr.coef_)))
    predicted_lr = lr.predict(test_data)
    print('Accuaracy Score LR : ', accuracy_score(test_target, predicted_lr))
    
    dt = DecisionTreeClassifier()
    dt.fit(train_data, train_target)
    np.flip(np.argsort(np.absolute(dt.feature_importances_)), axis=0)
    predicted_dt = dt.predict(test_data)
    print('Accuaracy Score DT : ', accuracy_score(test_target, predicted_dt))
    del pca_ob, lr, dt, train_data, test_data

In [11]:
calc_myPCA(5, mnist_train, mnist_test, mnist_train_target, mnist_test_target)



Accuaracy Score LR :  0.6415
Accuaracy Score DT :  0.673571428571


In [12]:
calc_myPCA(20, mnist_train, mnist_test, mnist_train_target, mnist_test_target)



Accuaracy Score LR :  0.850214285714
Accuaracy Score DT :  0.832714285714
