In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import scipy.linalg as la
import time
%matplotlib inline

# Problem 1
Apply PCA to the cancer dataset to reduce the dimension of the feature space to each of 15, 10, and 5.    Are there any features or combinations of features for which PCA is not a suitable method to use?  Explain.  WARNING: remember to center your data (subtract the mean) and also normalize it. 

In [2]:
#load data set
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# center and normalize the data
X = X - X.mean(axis=0)
X = X / X.std(axis=0)
U, s, Vh = la.svd(X)

xtrain,xtest,ytrain,ytest = train_test_split(X,y, test_size=.3)
    
xtrain5, xtest5 = xtrain.dot(Vh[:5].T), xtest.dot(Vh[:5].T)
xtrain10, xtest10 = xtrain.dot(Vh[:10].T), xtest.dot(Vh[:10].T)
xtrain15, xtest15 = xtrain.dot(Vh[:15].T), xtest.dot(Vh[:15].T)

In [3]:
#are any features or combinations of features not PCA-suitable?
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,1.51187,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,1.298575,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971


No, all are suitable because they are all numeric/continous data and can be centered and normalized by their standard deviations.

# Problem 2
Apply three of your favorite classification methods to the full cancer data set and also to the PCA-reduced data.  Analyze and evaluate the performance (time and accuracy) for each combination.  

In [4]:
# Logistic Regression
log_reg = LogisticRegression()
start = time.clock()
log_reg.fit(xtrain,ytrain)
lr30_time = round(time.clock() - start, 5)
lr30_acc = round((log_reg.predict(xtest) == ytest).mean(), 4)

start = time.clock()
log_reg.fit(xtrain5,ytrain)
lr5_time = round(time.clock() - start, 5)
lr5_acc = round((log_reg.predict(xtest5) == ytest).mean(), 4)

start = time.clock()
log_reg.fit(xtrain10,ytrain)
lr10_time = round(time.clock() - start, 5)
lr10_acc = round((log_reg.predict(xtest10) == ytest).mean(), 4)

start = time.clock()
log_reg.fit(xtrain15,ytrain)
lr15_time = round(time.clock() - start, 5)
lr15_acc = round((log_reg.predict(xtest15) == ytest).mean(), 4)

In [5]:
# SVM
svc = SVC()
start = time.clock()
svc.fit(xtrain,ytrain)
svc30_time = round(time.clock() - start, 5)
svc30_acc = round((svc.predict(xtest) == ytest).mean(), 4)

start = time.clock()
svc.fit(xtrain5,ytrain)
svc5_time = round(time.clock() - start, 5)
svc5_acc = round((svc.predict(xtest5) == ytest).mean(), 4)

start = time.clock()
svc.fit(xtrain10,ytrain)
svc10_time = round(time.clock() - start, 5)
svc10_acc = round((svc.predict(xtest10) == ytest).mean(), 4)

start = time.clock()
svc.fit(xtrain15,ytrain)
svc15_time = round(time.clock() - start, 5)
svc15_acc = round((svc.predict(xtest15) == ytest).mean(), 4)

In [6]:
# Gaussian Naive Bayes
gnb = GaussianNB()
start = time.clock()
gnb.fit(xtrain,ytrain)
gnb30_time = round(time.clock() - start, 5)
gnb30_acc = round((gnb.predict(xtest) == ytest).mean(), 4)

start = time.clock()
gnb.fit(xtrain5,ytrain)
gnb5_time = round(time.clock() - start, 5)
gnb5_acc = round((gnb.predict(xtest5) == ytest).mean(), 4)

start = time.clock()
gnb.fit(xtrain10,ytrain)
gnb10_time = round(time.clock() - start, 5)
gnb10_acc = round((gnb.predict(xtest10) == ytest).mean(), 4)

start = time.clock()
gnb.fit(xtrain15,ytrain)
gnb15_time = round(time.clock() - start, 5)
gnb15_acc = round((gnb.predict(xtest15) == ytest).mean(), 4)

In [9]:
#analyze results
print('All Results are printed as "time / accuracy"')
print('\t\tFull Data\t\t15-dimensional\t\t10-dimensional\t\t5-dimensional')
print('Logistic Reg\t{} / {}\t{} / {}\t{} / {}\t{} / {}'.format(lr30_time, lr30_acc, lr15_time, lr15_acc, lr10_time, lr10_acc, lr5_time, lr5_acc))
print('SVM\t\t{} / {}\t\t{} / {}\t{} / {}\t{} / {}'.format(svc30_time, svc30_acc, svc15_time, svc15_acc, svc10_time, svc10_acc, svc5_time, svc5_acc))
print('Naive Bayes\t{} / {}\t\t{} / {}\t{} / {}\t{} / {}'.format(gnb30_time, gnb30_acc, gnb15_time, gnb15_acc, gnb10_time, gnb10_acc, gnb5_time, gnb5_acc))

All Results are printed as "time / accuracy"
		Full Data		15-dimensional		10-dimensional		5-dimensional
Logistic Reg	0.01242 / 0.9883	0.00232 / 0.9883	0.00157 / 0.9883	0.00123 / 0.9591
SVM		0.0319 / 0.9532		0.00877 / 0.9708	0.00681 / 0.9766	0.00602 / 0.9532
Naive Bayes	0.01701 / 0.924		0.00108 / 0.9006	0.00095 / 0.9064	0.00095 / 0.8889


# Problem 3
Find some aspect of your final project for which PCA is an appropriate dimension-reduction method.  Apply PCA and analyze the results and performance.  Compare to your results without PCA.  

In [None]:
# select data that I will reduce
default = pd.read_csv('default.csv')
to_reduce = default[['LIMIT_BAL','BILL_AMT1','BILL_AMT2','BILL_AMT3',
             'BILL_AMT4','BILL_AMT5','BILL_AMT6','PAY_AMT1',
             'PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']].values

# center and normalize the data
X = to_reduce - to_reduce.mean(axis=0)
X = X / X.std(axis=0)
U, s, Vh = la.svd(X)

#xtrain,xtest,ytrain,ytest = train_test_split(X,y, test_size=.3)
x5 = X.dot(Vh[:5].T)

added_data = default[['SEX', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4','PAY_5', 'PAY_6',
       'educ: _1', 'educ: _2', 'educ: _3', 'educ: _4', 'educ: _5',
       'educ: _6', 'marr: _1', 'marr: _2', 'marr: _3']].values
pca_data = np.concatenate((x5, added_data), axis=1)
original_data = np.concatenate((to_reduce, added_data), axis=1)
y = default['default']

pca_train, pca_test, original_train, original_test, ytrain, ytest = train_test_split(pca_data, original_data, y, test_size=.3)

In [None]:
#compare results
model = LogisticRegression()
model.fit(pca_train, )

# Problem 4
Repeat what you did in the previous problem, but replacing PCA by a random projection. Try 5 different random projections and compare the results and performance. 

In [None]:
#apply random projection 1

#apply random projection 2

#apply random projection 3

#apply random projection 4

#apply random projection 5

In [None]:
#compare results and performance

In [10]:
test = np.arange(16).reshape((4,4))
test

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [27]:
test - test.mean(axis=0)

array([[-6., -6., -6., -6.],
       [-2., -2., -2., -2.],
       [ 2.,  2.,  2.,  2.],
       [ 6.,  6.,  6.,  6.]])

In [23]:
X.shape

(569L, 30L)

In [24]:
np.ones_like?

In [59]:
default.columns

Index([u'LIMIT_BAL', u'SEX', u'AGE', u'PAY_0', u'PAY_2', u'PAY_3', u'PAY_4',
       u'PAY_5', u'PAY_6', u'BILL_AMT1', u'BILL_AMT2', u'BILL_AMT3',
       u'BILL_AMT4', u'BILL_AMT5', u'BILL_AMT6', u'PAY_AMT1', u'PAY_AMT2',
       u'PAY_AMT3', u'PAY_AMT4', u'PAY_AMT5', u'PAY_AMT6', u'default',
       u'educ: _1', u'educ: _2', u'educ: _3', u'educ: _4', u'educ: _5',
       u'educ: _6', u'marr: _1', u'marr: _2', u'marr: _3'],
      dtype='object')