In [32]:
import numpy as np
import pandas as pd
import scipy.linalg as la
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import load_breast_cancer
from keras.datasets import cifar10


Using TensorFlow backend.


In [33]:
# Load the CIFAR-10 train/test splits through Keras.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('Training data shape:', x_train.shape)
print('Testing data shape:', x_test.shape)

# Distinct class ids present in the training labels.
classes = np.unique(y_train)
nClasses = len(classes)
print('Total number of outputs : ', nClasses)
print('Output classes : ', classes)

# Human-readable name for each CIFAR-10 class id.
label_dict = {
 0: 'airplane',
 1: 'automobile',
 2: 'bird',
 3: 'cat',
 4: 'deer',
 5: 'dog',
 6: 'frog',
 7: 'horse',
 8: 'ship',
 9: 'truck',
}

# Scale pixel values by 1/255 and flatten every 32x32x3 image into a
# 3072-long row so each pixel value becomes one feature.
x_train = x_train/255.0
x_train_flat = x_train.reshape(-1,3072)

# One named column per pixel value, plus the class id in 'label'.
feat_cols = ['pixel'+str(i) for i in range(x_train_flat.shape[1])]
df_cifar = pd.DataFrame(x_train_flat,columns=feat_cols)
# Keras returns the labels with shape (n, 1); flatten them so a plain 1-D
# column is assigned instead of relying on pandas to squeeze a 2-D array.
df_cifar['label'] = y_train.ravel()
print('Size of the dataframe: {}'.format(df_cifar.shape))


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Traning data shape: (50000, 32, 32, 3)
Testing data shape: (10000, 32, 32, 3)
Total number of outputs :  10
Output classes :  [0 1 2 3 4 5 6 7 8 9]
Size of the dataframe: (50000, 3073)


In [34]:

# Preview the first rows of the flattened pixel table (pixel columns plus 'label').
df_cifar.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel3063,pixel3064,pixel3065,pixel3066,pixel3067,pixel3068,pixel3069,pixel3070,pixel3071,label
0,0.231373,0.243137,0.247059,0.168627,0.180392,0.176471,0.196078,0.188235,0.168627,0.266667,...,0.847059,0.721569,0.54902,0.592157,0.462745,0.329412,0.482353,0.360784,0.282353,6
1,0.603922,0.694118,0.733333,0.494118,0.537255,0.533333,0.411765,0.407843,0.372549,0.4,...,0.560784,0.521569,0.545098,0.560784,0.52549,0.556863,0.560784,0.521569,0.564706,9
2,1.0,1.0,1.0,0.992157,0.992157,0.992157,0.992157,0.992157,0.992157,0.992157,...,0.305882,0.333333,0.32549,0.309804,0.333333,0.32549,0.313725,0.337255,0.329412,9
3,0.109804,0.098039,0.039216,0.145098,0.133333,0.07451,0.14902,0.137255,0.078431,0.164706,...,0.211765,0.184314,0.109804,0.247059,0.219608,0.145098,0.282353,0.254902,0.180392,4
4,0.666667,0.705882,0.776471,0.658824,0.698039,0.768627,0.694118,0.72549,0.796078,0.717647,...,0.294118,0.309804,0.321569,0.278431,0.294118,0.305882,0.286275,0.301961,0.313725,1


In [35]:
# Reduce the pixel columns (everything except the trailing 'label' column)
# to their first two principal components.
pca_cifar = PCA(n_components=2)
principalComponents_cifar = pca_cifar.fit_transform(df_cifar.iloc[:, :-1])

# Wrap the scores in a labelled frame and attach the class ids as 'y'.
principal_cifar_Df = pd.DataFrame(
    principalComponents_cifar,
    columns=['principal component 1', 'principal component 2'],
)
principal_cifar_Df['y'] = y_train

In [36]:
# Preview the two-component projection together with the class id column 'y'.
principal_cifar_Df.head()


Unnamed: 0,principal component 1,principal component 2,y
0,-6.401018,2.729039,6
1,0.829783,-0.949943,9
2,7.7302,-11.522102,9
3,-10.347817,0.010738,4
4,-2.625651,-4.96924,1


In [37]:
# Scale the test images by 1/255, the same scaling applied to x_train.
# NOTE(review): x_test is reassigned from itself, so running this cell twice
# divides by 255 twice; re-run the data-loading cell before repeating it.
x_test = x_test/255.0
# NOTE(review): the printed test shape is already (10000, 32, 32, 3), so this
# reshape looks like a no-op; confirm before removing.
x_test = x_test.reshape(-1,32,32,3)
# Flatten every test image into a 3072-long row, matching x_train_flat.
x_test_flat = x_test.reshape(-1,3072)
# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (0.9 here).
pca = PCA(0.9)
# Fit on the training rows only; the test rows are not used for fitting here.
pca.fit(x_train_flat)


PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [38]:
# Number of components the fitted PCA kept to reach the 0.9 variance target.
pca.n_components_


99

In [39]:
# Project both splits with the same fitted PCA so they share one feature space.
train_img_pca = pca.transform(x_train_flat)
test_img_pca = pca.transform(x_test_flat)

In [40]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from keras.optimizers import RMSprop

# Training hyper-parameters shared by the fit calls in the following cells.
batch_size = 128
num_classes = 10
epochs = 20

# One-hot encode the integer class ids. The guard keeps the cell safe to
# re-run: encoding labels that are already one-hot would add an extra axis
# and break the later fit calls.
if y_train.ndim < 2 or y_train.shape[-1] != num_classes:
    y_train = np_utils.to_categorical(y_train, num_classes)
if y_test.ndim < 2 or y_test.shape[-1] != num_classes:
    y_test = np_utils.to_categorical(y_test, num_classes)

# Fully connected classifier over the PCA features. The input width is read
# from the projected data instead of being hard-coded, because PCA(0.9)
# decides the number of components when it is fitted.
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(train_img_pca.shape[1],)))
model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

In [41]:
# Print the per-layer output shapes and parameter counts of the current model.
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              102400    
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_3 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 10)                2570      
Total params: 1,810,698
Trainable params: 1,810,698
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Configure training: categorical cross-entropy against the one-hot labels,
# RMSprop with its default settings, accuracy as the reported metric.
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

# Train on the PCA-projected features.
# NOTE(review): the test split is passed as validation_data, so the reported
# validation metrics are computed on the test set.
history = model.fit(train_img_pca, y_train,batch_size=batch_size,epochs=epochs,verbose=1,
                    validation_data=(test_img_pca, y_test))

Train on 50000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [43]:
#original
# Baseline: the same dense architecture, fed the raw 3072-value pixel rows
# instead of the PCA features. Rebinds `model` and `history`.
model = Sequential([
    Dense(1024, activation='relu', input_shape=(3072,)),
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train_flat,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(x_test_flat, y_test),
)

Train on 50000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

In [8]:
# Mean of X along axis 0 (one value per column), used to centre the data.
mean = X.mean(axis=0)

In [9]:
# Centre the data by subtracting the column means.
# NOTE(review): X is rebound to the centred values, so the uncentred data is
# no longer reachable under this name.
X= X -mean

In [10]:
# Display the centred data.
X

array([[-6.77204000e+03, -1.30830667e+01,  8.05643333e+00, ...,
         2.47676667e+00,  1.10060000e+00, -2.20947667e+01],
       [-6.78804000e+03, -1.24130667e+01,  7.88643333e+00, ...,
         2.70676667e+00,  1.12060000e+00, -2.18447667e+01],
       [-6.81604000e+03, -1.25530667e+01,  6.61643333e+00, ...,
         2.44676667e+00,  9.60600000e-01, -2.20647667e+01],
       ...,
       [ 1.31979600e+04,  3.64693333e+00, -2.36356667e+00, ...,
        -2.03233333e-01, -2.94000000e-02,  3.11523333e+00],
       [-6.79704000e+03,  6.69693333e+00, -2.94356667e+00, ...,
        -3.13233333e-01, -1.09400000e-01,  1.60523333e+00],
       [-6.79604000e+03,  5.93693333e+00, -3.46356667e+00, ...,
        -3.63233333e-01, -9.94000000e-02,  2.61523333e+00]])

In [13]:
def cov1(a, b):
    """Return the sample covariance of two equal-length 1-D sequences.

    Uses the unbiased estimator, i.e. the sum of products of deviations from
    the means divided by ``len(a) - 1``.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric sequences (lists, NumPy arrays, pandas
        Series, ...). They are converted to float arrays, so the values are
        always read by position regardless of any index the input carries.

    Returns
    -------
    float or None
        The covariance of ``a`` and ``b``. ``None`` is returned when the two
        inputs differ in length (behaviour kept for existing callers).
    """
    if len(a) != len(b):
        return None
    a_values = np.asarray(a, dtype=float)
    b_values = np.asarray(b, dtype=float)
    # A dot product of the centred vectors replaces the per-element Python
    # loop (and no longer shadows the builtin ``sum``).
    deviations_product = np.dot(a_values - a_values.mean(),
                                b_values - b_values.mean())
    return deviations_product / (len(a_values) - 1)

def calc_cov(A):
    """Return the sample covariance matrix of the columns of ``A``.

    Entry ``[m][j]`` is the unbiased (divide by ``N - 1``) covariance between
    column ``m`` and column ``j``. This gives the same values as evaluating
    ``cov1`` on every pair of columns, but centres the data once and uses a
    single matrix product instead of M*M Python-level loops.

    Parameters
    ----------
    A : array-like of shape (N, M)
        Two-dimensional numeric data: N observations (rows) of M variables
        (columns).

    Returns
    -------
    numpy.ndarray of shape (M, M)
        The symmetric covariance matrix.
    """
    A = np.asarray(A, dtype=float)
    # Unpacking enforces that the input is two-dimensional.
    n_rows, _ = A.shape
    centered = A - A.mean(axis=0)
    return centered.T.dot(centered) / (n_rows - 1)
        

In [15]:
# Covariance matrix of the centred data, from the hand-written calc_cov.
ret = calc_cov(X)


In [16]:
# Display the hand-computed covariance matrix.
ret

array([[ 4.84822162e+07,  2.16811389e+03, -3.42671225e+03,
        -3.03738008e+02, -3.01225983e+02, -3.90840562e+01,
         1.87884172e+03],
       [ 2.16811389e+03,  9.12595538e+01,  2.21435833e+01,
        -1.46895678e+01,  3.22109607e+00, -3.61865043e-01,
        -1.01930554e+02],
       [-3.42671225e+03,  2.21435833e+01,  4.14014050e+01,
         2.87610441e+01,  6.73072020e+00,  1.02262723e+00,
        -9.90197003e+01],
       [-3.03738008e+02, -1.46895678e+01,  2.87610441e+01,
         8.05624419e+01,  9.02193931e+00,  3.10256393e+00,
        -1.03608864e+02],
       [-3.01225983e+02,  3.22109607e+00,  6.73072020e+00,
         9.02193931e+00,  1.61219787e+00,  3.80021010e-01,
        -2.05803278e+01],
       [-3.90840562e+01, -3.61865043e-01,  1.02262723e+00,
         3.10256393e+00,  3.80021010e-01,  1.37164856e-01,
        -4.14119479e+00],
       [ 1.87884172e+03, -1.01930554e+02, -9.90197003e+01,
        -1.03608864e+02, -2.05803278e+01, -4.14119479e+00,
         3.2507089

In [17]:
# Reference covariance from NumPy, treating each column of X as a variable;
# printed so it can be compared with the hand-computed matrix.
V = np.cov(X, rowvar=False)
print(V)

[[ 4.84822162e+07  2.16811389e+03 -3.42671225e+03 -3.03738008e+02
  -3.01225983e+02 -3.90840562e+01  1.87884172e+03]
 [ 2.16811389e+03  9.12595538e+01  2.21435833e+01 -1.46895678e+01
   3.22109607e+00 -3.61865043e-01 -1.01930554e+02]
 [-3.42671225e+03  2.21435833e+01  4.14014050e+01  2.87610441e+01
   6.73072020e+00  1.02262723e+00 -9.90197003e+01]
 [-3.03738008e+02 -1.46895678e+01  2.87610441e+01  8.05624419e+01
   9.02193931e+00  3.10256393e+00 -1.03608864e+02]
 [-3.01225983e+02  3.22109607e+00  6.73072020e+00  9.02193931e+00
   1.61219787e+00  3.80021010e-01 -2.05803278e+01]
 [-3.90840562e+01 -3.61865043e-01  1.02262723e+00  3.10256393e+00
   3.80021010e-01  1.37164856e-01 -4.14119479e+00]
 [ 1.87884172e+03 -1.01930554e+02 -9.90197003e+01 -1.03608864e+02
  -2.05803278e+01 -4.14119479e+00  3.25070892e+02]]


In [18]:
# The covariance matrix is symmetric, so use the symmetric eigen-solver:
# eigh always returns real eigenvalues and orthonormal eigenvectors (stored
# as columns), whereas the general eig offers neither guarantee.
# eigh returns eigenvalues in ascending order, and the sign of each
# eigenvector is arbitrary.
eigenValues, eigenVectors = np.linalg.eigh(ret)


In [19]:
# Reorder the eigen-pairs from the largest eigenvalue to the smallest;
# eigenvectors are columns, so the same permutation is applied to the columns.
idx = np.argsort(eigenValues)[::-1]
eigenValues = eigenValues[idx]
eigenVectors = eigenVectors[:, idx]


In [20]:

# Display the eigenvectors (one per column) after sorting by decreasing eigenvalue.
eigenVectors

array([[ 9.99999996e-01, -4.25741414e-05,  4.19864265e-05,
         7.06997017e-05,  9.47762883e-07, -1.02659735e-06,
        -1.67114136e-07],
       [ 4.47197482e-05, -2.77302880e-01, -7.47831961e-01,
        -3.52612629e-01, -1.94427421e-01, -5.97873815e-02,
         4.45127228e-01],
       [-7.06799001e-05, -2.66635312e-01,  5.46489581e-02,
         8.09954769e-01, -2.55619219e-01, -8.43343712e-02,
         4.44340971e-01],
       [-6.26508637e-06, -2.78916930e-01,  6.59914709e-01,
        -4.67208184e-01, -2.61944390e-01, -3.62109086e-02,
         4.45542750e-01],
       [-6.21314779e-06, -5.54118004e-02,  4.06557192e-02,
         2.21611606e-02,  8.72135044e-01,  1.65937088e-01,
         4.54566691e-01],
       [-8.06158001e-07, -1.11385177e-02,  2.38806871e-02,
        -2.64648773e-02,  2.01288016e-01, -9.78320426e-01,
        -3.12655749e-02],
       [ 3.87535401e-05,  8.78077180e-01, -7.08870002e-03,
        -1.27517429e-02, -1.64637573e-01, -5.79308262e-02,
         4.4531572

In [21]:
# Project the centred data onto the sorted eigenvectors. Row k of P holds the
# scores of every sample on component k, so P.T has one row per sample.
P = eigenVectors.T.dot(X.T)


In [22]:
# Keep the scores on the two leading components (one row per sample) and
# print them.
dataset1 = P.T[:, :2]
print(dataset1)

[[-6.77204215e+03 -2.46548986e+01]
 [-6.78804210e+03 -2.43147920e+01]
 [-6.81604203e+03 -2.46402914e+01]
 [-6.82504190e+03 -2.40279294e+01]
 [-6.83604196e+03 -2.35676509e+01]
 [-6.76604188e+03 -2.39767564e+01]
 [-6.75904190e+03 -2.35859216e+01]
 [-6.74404201e+03 -2.20943533e+01]
 [-6.72404205e+03 -2.42130437e+01]
 [-6.70804215e+03 -2.47987411e+01]
 [-6.74004215e+03 -2.45229545e+01]
 [-6.73304214e+03 -2.36441007e+01]
 [-6.67704199e+03 -2.41576202e+01]
 [-6.68704179e+03 -2.25565559e+01]
 [ 3.16395803e+03 -2.44088513e+01]
 [ 3.18495823e+03 -2.40079091e+01]
 [ 3.25295821e+03 -2.33714471e+01]
 [ 3.26695845e+03 -2.37425230e+01]
 [ 3.26095813e+03 -2.36156185e+01]
 [ 3.24095819e+03 -2.32176391e+01]
 [ 1.31759579e+04 -2.49843526e+01]
 [ 1.31789579e+04 -2.45341560e+01]
 [ 3.29495804e+03 -2.44846125e+01]
 [ 3.28095816e+03 -2.55099295e+01]
 [ 3.27395834e+03 -2.40603529e+01]
 [ 1.31709580e+04 -2.55643713e+01]
 [ 1.31649579e+04 -2.52098711e+01]
 [ 3.30495817e+03 -2.37095368e+01]
 [ 3.29695792e+03 -2

In [23]:
# Display the two-component projection obtained from the manual eigen-decomposition.
dataset1

array([[-6.77204215e+03, -2.46548986e+01],
       [-6.78804210e+03, -2.43147920e+01],
       [-6.81604203e+03, -2.46402914e+01],
       [-6.82504190e+03, -2.40279294e+01],
       [-6.83604196e+03, -2.35676509e+01],
       [-6.76604188e+03, -2.39767564e+01],
       [-6.75904190e+03, -2.35859216e+01],
       [-6.74404201e+03, -2.20943533e+01],
       [-6.72404205e+03, -2.42130437e+01],
       [-6.70804215e+03, -2.47987411e+01],
       [-6.74004215e+03, -2.45229545e+01],
       [-6.73304214e+03, -2.36441007e+01],
       [-6.67704199e+03, -2.41576202e+01],
       [-6.68704179e+03, -2.25565559e+01],
       [ 3.16395803e+03, -2.44088513e+01],
       [ 3.18495823e+03, -2.40079091e+01],
       [ 3.25295821e+03, -2.33714471e+01],
       [ 3.26695845e+03, -2.37425230e+01],
       [ 3.26095813e+03, -2.36156185e+01],
       [ 3.24095819e+03, -2.32176391e+01],
       [ 1.31759579e+04, -2.49843526e+01],
       [ 1.31789579e+04, -2.45341560e+01],
       [ 3.29495804e+03, -2.44846125e+01],
       [ 3.

In [24]:
# Fit scikit-learn's PCA with two components on the same X, as a reference
# for the manual eigen-decomposition.
# NOTE(review): this rebinds `pca`, a name the CIFAR cells in this notebook
# also use for the PCA(0.9) model.
pca = PCA(2)
pca.fit(X)


PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [25]:
# Principal axes (one per row) and the variance explained by each component,
# for comparison with the manually computed eigenvectors and eigenvalues.
print(pca.components_)
print(pca.explained_variance_)

[[ 9.99999996e-01  4.47197482e-05 -7.06799001e-05 -6.26508637e-06
  -6.21314779e-06 -8.06158001e-07  3.87535401e-05]
 [-4.25741414e-05 -2.77302880e-01 -2.66635312e-01 -2.78916930e-01
  -5.54118004e-02 -1.11385177e-02  8.78077180e-01]]
[4.84822167e+07 4.21500435e+02]


In [26]:
# Scores of X on the two scikit-learn components, printed for comparison with
# the manual projection stored in dataset1.
B = pca.transform(X)
print(B)

[[-6.77204215e+03 -2.46548986e+01]
 [-6.78804210e+03 -2.43147920e+01]
 [-6.81604203e+03 -2.46402914e+01]
 [-6.82504190e+03 -2.40279294e+01]
 [-6.83604196e+03 -2.35676509e+01]
 [-6.76604188e+03 -2.39767564e+01]
 [-6.75904190e+03 -2.35859216e+01]
 [-6.74404201e+03 -2.20943533e+01]
 [-6.72404205e+03 -2.42130437e+01]
 [-6.70804215e+03 -2.47987411e+01]
 [-6.74004215e+03 -2.45229545e+01]
 [-6.73304214e+03 -2.36441007e+01]
 [-6.67704199e+03 -2.41576202e+01]
 [-6.68704179e+03 -2.25565559e+01]
 [ 3.16395803e+03 -2.44088513e+01]
 [ 3.18495823e+03 -2.40079091e+01]
 [ 3.25295821e+03 -2.33714471e+01]
 [ 3.26695845e+03 -2.37425230e+01]
 [ 3.26095813e+03 -2.36156185e+01]
 [ 3.24095819e+03 -2.32176391e+01]
 [ 1.31759579e+04 -2.49843526e+01]
 [ 1.31789579e+04 -2.45341560e+01]
 [ 3.29495804e+03 -2.44846125e+01]
 [ 3.28095816e+03 -2.55099295e+01]
 [ 3.27395834e+03 -2.40603529e+01]
 [ 1.31709580e+04 -2.55643713e+01]
 [ 1.31649579e+04 -2.52098711e+01]
 [ 3.30495817e+03 -2.37095368e+01]
 [ 3.29695792e+03 -2

In [30]:
#with pca
from sklearn import datasets, linear_model, metrics 
from sklearn.model_selection import train_test_split 

# NOTE(review): `y` is not assigned in any cell above this one; it is set in
# the "#without pca" cell further down, so this cell fails on a fresh
# Restart & Run All unless that cell has been run first.
# NOTE(review): this rebinds y_train / y_test, names the CIFAR cells also use.
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(dataset1, y, test_size = 0.4, random_state = 0) 
# Ordinary least-squares regression on the two PCA scores.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train) 
# variance score (R^2 on the held-out rows): 1 means perfect prediction
print('Variance score: {}'.format(reg.score(X_test, y_test))) 


Variance score: 0.7511515909062724


In [28]:
#without pca
from sklearn import datasets, linear_model, metrics 
from sklearn.model_selection import train_test_split 
# NOTE(review): `dataset` is not defined in the visible part of the notebook;
# presumably a DataFrame loaded in an earlier cell. Confirm.
# NOTE(review): this rebinds X, which the PCA cells above use for the centred
# data.
X = dataset.iloc[:, :2].values #given marks
y = dataset.iloc[:, 2].values  #predicting history
# Same 60/40 split and seed as the "#with pca" cell, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 0) 
# Ordinary least-squares regression on the first two raw columns.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train) 
# variance score (R^2 on the held-out rows): 1 means perfect prediction
print('Variance score: {}'.format(reg.score(X_test, y_test))) 

Variance score: 0.06712912735986787
